diff --git a/.mailmap b/.mailmap index ea69b340168b..a12172b447e5 100644 --- a/.mailmap +++ b/.mailmap @@ -410,6 +410,7 @@ Liam Mark Linas Vepstas Linus Lüssing Linus Lüssing +Linus Lüssing Li Yang Li Yang diff --git a/CREDITS b/CREDITS index cda68f04d5f1..1f9f0f078b4a 100644 --- a/CREDITS +++ b/CREDITS @@ -4339,7 +4339,7 @@ D: Freescale Highspeed USB device driver D: Freescale QE SoC support and Ethernet driver S: B-1206 Jingmao Guojigongyu S: 16 Baliqiao Nanjie, Beijing 101100 -S: People's Repulic of China +S: People's Republic of China N: Vlad Yasevich E: vyasevich@gmail.com diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index f1b90cf1249b..b057eddefbfc 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -355,10 +355,15 @@ Description: If 'target' is written to the 'type' file, writing to or What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//matching Date: Dec 2022 Contact: SeongJae Park -Description: Writing 'Y' or 'N' to this file sets whether to filter out - pages that do or do not match to the 'type' and 'memcg_path', - respectively. Filter out means the action of the scheme will - not be applied to. +Description: Writing 'Y' or 'N' to this file sets whether the filter is for + the memory of the 'type', or all except the 'type'. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//allow +Date: Jan 2025 +Contact: SeongJae Park +Description: Writing 'Y' or 'N' to this file sets whether to allow or reject + applying the scheme's action to the memory that satisfies the + 'type' and the 'matching' of the directory. What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/nr_tried Date: Mar 2022 @@ -384,6 +389,12 @@ Contact: SeongJae Park Description: Reading this file returns the total size of regions that the action of the scheme has successfully applied in bytes. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/sz_ops_filter_passed +Date: Dec 2024 +Contact: SeongJae Park +Description: Reading this file returns the total size of memory that passed + DAMON operations layer-handled filters of the scheme in bytes. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/qt_exceeds Date: Mar 2022 Contact: SeongJae Park @@ -424,3 +435,10 @@ Contact: SeongJae Park Description: Reading this file returns the 'age' of a memory region that corresponding DAMON-based Operation Scheme's action has tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//sz_filter_passed +Date: Dec 2024 +Contact: SeongJae Park +Description: Reading this file returns the size of the memory in the region + that passed DAMON operations layer-handled filters of the + scheme in bytes. 
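For a concrete feel of the new ``allow`` and ``sz_ops_filter_passed`` files described in the ABI entries above, a minimal sketch follows. It assumes a kdamond, context, and scheme have already been created at index 0; the indices and the rejected ``anon`` type are only illustrative. ::

    # reject (do not apply the scheme's action to) memory matching the 'anon' type
    cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0
    echo 1 > filters/nr_filters
    echo anon > filters/0/type
    echo Y > filters/0/matching
    echo N > filters/0/allow

    # refresh the statistics files, then read the new per-scheme stat
    echo update_schemes_stats > /sys/kernel/mm/damon/admin/kdamonds/0/state
    cat stats/sz_ops_filter_passed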
diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index f61c01fc376e..210c194d4a7b 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -100,29 +100,29 @@ Get delays, since system boot, for pid 10:: # ./getdelays -d -p 10 (output similar to next case) -Get sum of delays, since system boot, for all pids with tgid 5:: +Get sum and peak of delays, since system boot, for all pids with tgid 242:: - # ./getdelays -d -t 5 + bash-4.4# ./getdelays -d -t 242 print delayacct stats ON - TGID 5 + TGID 242 - CPU count real total virtual total delay total delay average - 8 7000000 6872122 3382277 0.423ms - IO count delay total delay average - 0 0 0.000ms - SWAP count delay total delay average - 0 0 0.000ms - RECLAIM count delay total delay average - 0 0 0.000ms - THRASHING count delay total delay average - 0 0 0.000ms - COMPACT count delay total delay average - 0 0 0.000ms - WPCOPY count delay total delay average - 0 0 0.000ms - IRQ count delay total delay average - 0 0 0.000ms + CPU count real total virtual total delay total delay average delay max delay min + 39 156000000 156576579 2111069 0.054ms 0.212296ms 0.031307ms + IO count delay total delay average delay max delay min + 0 0 0.000ms 0.000000ms 0.000000ms + SWAP count delay total delay average delay max delay min + 0 0 0.000ms 0.000000ms 0.000000ms + RECLAIM count delay total delay average delay max delay min + 0 0 0.000ms 0.000000ms 0.000000ms + THRASHING count delay total delay average delay max delay min + 0 0 0.000ms 0.000000ms 0.000000ms + COMPACT count delay total delay average delay max delay min + 0 0 0.000ms 0.000000ms 0.000000ms + WPCOPY count delay total delay average delay max delay min + 156 11215873 0.072ms 0.207403ms 0.033913ms + IRQ count delay total delay average delay max delay min + 0 0 0.000ms 0.000000ms 0.000000ms Get IO accounting for pid 1, it works only with -p:: diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3872bc6ec49d..9138fcd18260 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3351,8 +3351,8 @@ [KNL] Set the initial state for the memory hotplug onlining policy. If not specified, the default value is set according to the - CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config - option. + CONFIG_MHP_DEFAULT_ONLINE_TYPE kernel config + options. See Documentation/admin-guide/mm/memory-hotplug.rst. memmap=exactmap [KNL,X86,EARLY] Enable setting of an exact @@ -6992,6 +6992,13 @@ See Documentation/admin-guide/mm/transhuge.rst for more details. + transparent_hugepage_tmpfs= [KNL] + Format: [always|within_size|advise|never] + Can be used to control the default hugepage allocation policy + for the tmpfs mount. + See Documentation/admin-guide/mm/transhuge.rst + for more details. + trusted.source= [KEYS] Format: This parameter identifies the trust source as a backend diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index c4dddf6733cd..ede14b679d02 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -42,32 +42,45 @@ the execution. 
:: $ git clone https://github.com/sjp38/masim; cd masim; make $ sudo damo start "./masim ./configs/stairs.cfg --quiet" - $ sudo ./damo show - 0 addr [85.541 TiB , 85.541 TiB ) (57.707 MiB ) access 0 % age 10.400 s - 1 addr [85.541 TiB , 85.542 TiB ) (413.285 MiB) access 0 % age 11.400 s - 2 addr [127.649 TiB , 127.649 TiB) (57.500 MiB ) access 0 % age 1.600 s - 3 addr [127.649 TiB , 127.649 TiB) (32.500 MiB ) access 0 % age 500 ms - 4 addr [127.649 TiB , 127.649 TiB) (9.535 MiB ) access 100 % age 300 ms - 5 addr [127.649 TiB , 127.649 TiB) (8.000 KiB ) access 60 % age 0 ns - 6 addr [127.649 TiB , 127.649 TiB) (6.926 MiB ) access 0 % age 1 s - 7 addr [127.998 TiB , 127.998 TiB) (120.000 KiB) access 0 % age 11.100 s - 8 addr [127.998 TiB , 127.998 TiB) (8.000 KiB ) access 40 % age 100 ms - 9 addr [127.998 TiB , 127.998 TiB) (4.000 KiB ) access 0 % age 11 s - total size: 577.590 MiB - $ sudo ./damo stop + $ sudo damo report access + heatmap: 641111111000000000000000000000000000000000000000000000[...]33333333333333335557984444[...]7 + # min/max temperatures: -1,840,000,000, 370,010,000, column size: 3.925 MiB + 0 addr 86.182 TiB size 8.000 KiB access 0 % age 14.900 s + 1 addr 86.182 TiB size 8.000 KiB access 60 % age 0 ns + 2 addr 86.182 TiB size 3.422 MiB access 0 % age 4.100 s + 3 addr 86.182 TiB size 2.004 MiB access 95 % age 2.200 s + 4 addr 86.182 TiB size 29.688 MiB access 0 % age 14.100 s + 5 addr 86.182 TiB size 29.516 MiB access 0 % age 16.700 s + 6 addr 86.182 TiB size 29.633 MiB access 0 % age 17.900 s + 7 addr 86.182 TiB size 117.652 MiB access 0 % age 18.400 s + 8 addr 126.990 TiB size 62.332 MiB access 0 % age 9.500 s + 9 addr 126.990 TiB size 13.980 MiB access 0 % age 5.200 s + 10 addr 126.990 TiB size 9.539 MiB access 100 % age 3.700 s + 11 addr 126.990 TiB size 16.098 MiB access 0 % age 6.400 s + 12 addr 127.987 TiB size 132.000 KiB access 0 % age 2.900 s + total size: 314.008 MiB + $ sudo damo stop The first command of the above example downloads and builds an artificial memory access generator program called ``masim``. The second command asks DAMO -to execute the artificial generator process start via the given command and -make DAMON monitors the generator process. The third command retrieves the -current snapshot of the monitored access pattern of the process from DAMON and -shows the pattern in a human readable format. +to start the program via the given command and make DAMON monitor the newly +started process. The third command retrieves the current snapshot of the +monitored access pattern of the process from DAMON and shows the pattern in a +human readable format. -Each line of the output shows which virtual address range (``addr [XX, XX)``) -of the process is how frequently (``access XX %``) accessed for how long time (``age XX``). For example, the fifth region of ~9 MiB size is being most -frequently accessed for last 300 milliseconds. Finally, the fourth command -stops DAMON. +The first line of the output shows the relative access temperature (hotness) of +the regions in a single row heatmap format. Each column on the heatmap +represents regions of the same size on the monitored virtual address space. The +position of the column on the row and the number on the column represent the +relative location and access temperature of the region. ``[...]`` means +unmapped huge regions on the virtual address spaces. The second line shows +additional information for better understanding the heatmap.
+ +Each line of the output from the third line shows which virtual address range +(``addr XX size XX``) of the process is how frequently (``access XX %``) +accessed for how long time (``age XX``). For example, the eleventh region of +~9.5 MiB size is being most frequently accessed for the last 3.7 seconds. Finally, +the fourth command stops DAMON. Note that DAMON can monitor not only virtual address spaces but multiple types of address spaces including the physical address space. @@ -95,7 +108,7 @@ Visualizing Recorded Patterns You can visualize the pattern in a heatmap, showing which memory region (x-axis) got accessed when (y-axis) and how frequently (number).:: - $ sudo damo report heats --heatmap stdout + $ sudo damo report heatmap 22222222222222222222222222222222222222211111111111111111111111111111111111111100 44444444444444444444444444444444444444434444444444444444444444444444444444443200 44444444444444444444444444444444444444433444444444444444444444444444444444444200 @@ -160,6 +173,6 @@ Data Access Pattern Aware Memory Management Below command makes every memory region of size >=4K that has not accessed for >=60 seconds in your workload to be swapped out. :: - $ sudo damo schemes --damos_access_rate 0 0 --damos_sz_region 4K max \ - --damos_age 60s max --damos_action pageout \ - + $ sudo damo start --damos_access_rate 0 0 --damos_sz_region 4K max \ + --damos_age 60s max --damos_action pageout \ + diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index d9be9f7caa7d..47a44bd348ab 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -26,12 +26,6 @@ DAMON provides below interfaces for different users. writing kernel space DAMON application programs for you. You can even extend DAMON for various address spaces. For detail, please refer to the interface :doc:`document `. -- *debugfs interface. (DEPRECATED!)* - :ref:`This ` is almost identical to :ref:`sysfs interface - `. This is deprecated, so users should move to the - :ref:`sysfs interface `. If you depend on this and cannot - move, please report your usecase to damon@lists.linux.dev and - linux-mm@kvack.org. .. _sysfs_interface: @@ -89,10 +83,10 @@ comma (","). │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value │ │ │ │ │ │ │ :ref:`watermarks `/metric,interval_us,high,mid,low │ │ │ │ │ │ │ :ref:`filters `/nr_filters - │ │ │ │ │ │ │ │ 0/type,matching,memcg_id - │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds + │ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx + │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds │ │ │ │ │ │ │ :ref:`tried_regions `/total_bytes - │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age + │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age,sz_filter_passed │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ ... @@ -412,59 +406,62 @@ number (``N``) to the file creates the number of child directories named ``0`` to ``N-1``. Each directory represents each filter. The filters are evaluated in the numeric order. -Each filter directory contains six files, namely ``type``, ``matcing``, -``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``.
To ``type`` -file, you can write one of five special keywords: ``anon`` for anonymous pages, -``memcg`` for specific memory cgroup, ``young`` for young pages, ``addr`` for -specific address range (an open-ended interval), or ``target`` for specific -DAMON monitoring target filtering. In case of the memory cgroup filtering, you -can specify the memory cgroup of the interest by writing the path of the memory -cgroup from the cgroups mount point to ``memcg_path`` file. In case of the -address range filtering, you can specify the start and end address of the range -to ``addr_start`` and ``addr_end`` files, respectively. For the DAMON -monitoring target filtering, you can specify the index of the target between -the list of the DAMON context's monitoring targets list to ``target_idx`` file. -You can write ``Y`` or ``N`` to ``matching`` file to filter out pages that does -or does not match to the type, respectively. Then, the scheme's action will -not be applied to the pages that specified to be filtered out. +Each filter directory contains seven files, namely ``type``, ``matching``, +``allow``, ``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``. +To ``type`` file, you can write one of five special keywords: ``anon`` for +anonymous pages, ``memcg`` for specific memory cgroup, ``young`` for young +pages, ``addr`` for specific address range (an open-ended interval), or +``target`` for specific DAMON monitoring target filtering. The meanings of the +types are the same as the descriptions in the :ref:`design doc +`. + +In case of the memory cgroup filtering, you can specify the memory cgroup of +the interest by writing the path of the memory cgroup from the cgroups mount +point to ``memcg_path`` file. In case of the address range filtering, you can +specify the start and end address of the range to ``addr_start`` and +``addr_end`` files, respectively. For the DAMON monitoring target filtering, +you can specify the index of the target in the DAMON context's list of +monitoring targets to the ``target_idx`` file. + +You can write ``Y`` or ``N`` to ``matching`` file to specify whether the filter +is for memory that matches the ``type``. You can write ``Y`` or ``N`` to +``allow`` file to specify if applying the action to the memory that satisfies +the ``type`` and ``matching`` should be allowed or not. For example, below restricts a DAMOS action to be applied to only non-anonymous pages of all memory cgroups except ``/having_care_already``.:: # echo 2 > nr_filters - # # filter out anonymous pages + # # disallow anonymous pages echo anon > 0/type echo Y > 0/matching + echo N > 0/allow # # further filter out all cgroups except one at '/having_care_already' echo memcg > 1/type echo /having_care_already > 1/memcg_path echo Y > 1/matching + echo N > 1/allow -Note that ``anon`` and ``memcg`` filters are currently supported only when -``paddr`` :ref:`implementation ` is being used. - -Also, memory regions that are filtered out by ``addr`` or ``target`` filters -are not counted as the scheme has tried to those, while regions that filtered -out by other type filters are counted as the scheme has tried to. The -difference is applied to :ref:`stats ` and -:ref:`tried regions `. +Refer to the :ref:`DAMOS filters design documentation +` for more details, including how multiple filters +of different ``allow`` work, when each of the filters is supported, and +differences in stats. ..
_sysfs_schemes_stats: schemes//stats/ ------------------ -DAMON counts the total number and bytes of regions that each scheme is tried to -be applied, the two numbers for the regions that each scheme is successfully -applied, and the total number of the quota limit exceeds. This statistics can -be used for online analysis or tuning of the schemes. +DAMON counts statistics for each scheme. This statistics can be used for +online analysis or tuning of the schemes. Refer to :ref:`design doc +` for more details about the stats. The statistics can be retrieved by reading the files under ``stats`` directory -(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, and -``qt_exceeds``), respectively. The files are not updated in real time, so you -should ask DAMON sysfs interface to update the content of the files for the -stats by writing a special keyword, ``update_schemes_stats`` to the relevant -``kdamonds//state`` file. +(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, +``sz_ops_filter_passed``, and ``qt_exceeds``), respectively. The files are not +updated in real time, so you should ask DAMON sysfs interface to update the +content of the files for the stats by writing a special keyword, +``update_schemes_stats`` to the relevant ``kdamonds//state`` file. .. _sysfs_schemes_tried_regions: @@ -501,10 +498,10 @@ set the ``access pattern`` as their interested pattern that they want to query. tried_regions// ------------------ -In each region directory, you will find four files (``start``, ``end``, -``nr_accesses``, and ``age``). Reading the files will show the start and end -addresses, ``nr_accesses``, and ``age`` of the region that corresponding -DAMON-based operation scheme ``action`` has tried to be applied. +In each region directory, you will find five files (``start``, ``end``, +``nr_accesses``, ``age``, and ``sz_filter_passed``). Reading the files will +show the properties of the region that corresponding DAMON-based operation +scheme ``action`` has tried to be applied. Example ~~~~~~~ @@ -600,306 +597,3 @@ fields are as usual. It shows the index of the DAMON context (``ctx_idx=X``) of the scheme in the list of the contexts of the context's kdamond, the index of the scheme (``scheme_idx=X``) in the list of the schemes of the context, in addition to the output of ``damon_aggregated`` tracepoint. - - -.. _debugfs_interface: - -debugfs Interface (DEPRECATED!) -=============================== - -.. note:: - - THIS IS DEPRECATED! - - DAMON debugfs interface is deprecated, so users should move to the - :ref:`sysfs interface `. If you depend on this and cannot - move, please report your usecase to damon@lists.linux.dev and - linux-mm@kvack.org. - -DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``, -``init_regions``, ``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, -``mk_contexts`` and ``rm_contexts`` under its debugfs directory, -``/damon/``. - - -``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation -notice. Reading it returns the deprecation notice, as below:: - - # cat DEPRECATED - DAMON debugfs interface is deprecated, so users should move to DAMON_SYSFS. If you cannot, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org. - - -Attributes ----------- - -Users can get and set the ``sampling interval``, ``aggregation interval``, -``update interval``, and min/max number of monitoring target regions by -reading from and writing to the ``attrs`` file. 
To know about the monitoring -attributes in detail, please refer to the :doc:`/mm/damon/design`. For -example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and -1000, and then check it again:: - - # cd /damon - # echo 5000 100000 1000000 10 1000 > attrs - # cat attrs - 5000 100000 1000000 10 1000 - - -Target IDs ----------- - -Some types of address spaces supports multiple monitoring target. For example, -the virtual memory address spaces monitoring can have multiple processes as the -monitoring targets. Users can set the targets by writing relevant id values of -the targets to, and get the ids of the current targets by reading from the -``target_ids`` file. In case of the virtual address spaces monitoring, the -values should be pids of the monitoring target processes. For example, below -commands set processes having pids 42 and 4242 as the monitoring targets and -check it again:: - - # cd /damon - # echo 42 4242 > target_ids - # cat target_ids - 42 4242 - -Users can also monitor the physical memory address space of the system by -writing a special keyword, "``paddr\n``" to the file. Because physical address -space monitoring doesn't support multiple targets, reading the file will show a -fake value, ``42``, as below:: - - # cd /damon - # echo paddr > target_ids - # cat target_ids - 42 - -Note that setting the target ids doesn't start the monitoring. - - -Initial Monitoring Target Regions ---------------------------------- - -In case of the virtual address space monitoring, DAMON automatically sets and -updates the monitoring target regions so that entire memory mappings of target -processes can be covered. However, users can want to limit the monitoring -region to specific address ranges, such as the heap, the stack, or specific -file-mapped area. Or, some users can know the initial access pattern of their -workloads and therefore want to set optimal initial regions for the 'adaptive -regions adjustment'. - -In contrast, DAMON do not automatically sets and updates the monitoring target -regions in case of physical memory monitoring. Therefore, users should set the -monitoring target regions by themselves. - -In such cases, users can explicitly set the initial monitoring target regions -as they want, by writing proper values to the ``init_regions`` file. The input -should be a sequence of three integers separated by white spaces that represent -one region in below form.:: - - - -The ``target idx`` should be the index of the target in ``target_ids`` file, -starting from ``0``, and the regions should be passed in address order. For -example, below commands will set a couple of address ranges, ``1-100`` and -``100-200`` as the initial monitoring target region of pid 42, which is the -first one (index ``0``) in ``target_ids``, and another couple of address -ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one -(index ``1``) in ``target_ids``.:: - - # cd /damon - # cat target_ids - 42 4242 - # echo "0 1 100 \ - 0 100 200 \ - 1 20 40 \ - 1 50 100" > init_regions - -Note that this sets the initial monitoring target regions only. In case of -virtual memory monitoring, DAMON will automatically updates the boundary of the -regions after one ``update interval``. Therefore, users should set the -``update interval`` large enough in this case, if they don't want the -update. - - -Schemes -------- - -Users can get and set the DAMON-based operation :ref:`schemes -` by reading from and writing to ``schemes`` debugfs file. 
-Reading the file also shows the statistics of each scheme. To the file, each -of the schemes should be represented in each line in below form:: - - - -You can disable schemes by simply writing an empty string to the file. - -Target Access Pattern -~~~~~~~~~~~~~~~~~~~~~ - -The target access :ref:`pattern ` of the -scheme. The ```` is constructed with three ranges in -below form:: - - min-size max-size min-acc max-acc min-age max-age - -Specifically, bytes for the size of regions (``min-size`` and ``max-size``), -number of monitored accesses per aggregate interval for access frequency -(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of -regions (``min-age`` and ``max-age``) are specified. Note that the ranges are -closed interval. - -Action -~~~~~~ - -The ```` is a predefined integer for memory management :ref:`actions -`. The mapping between the ```` values and -the memory management actions is as below. For the detailed meaning of the -action and DAMON operations set supporting each action, please refer to the -list on :ref:`design doc `. - - - 0: ``willneed`` - - 1: ``cold`` - - 2: ``pageout`` - - 3: ``hugepage`` - - 4: ``nohugepage`` - - 5: ``stat`` - -Quota -~~~~~ - -Users can set the :ref:`quotas ` of the given scheme -via the ```` in below form:: - - - -This makes DAMON to try to use only up to ```` milliseconds for applying -the action to memory regions of the ``target access pattern`` within the -```` milliseconds, and to apply the action to only up to -```` bytes of memory regions within the ````. Setting both -```` and ```` zero disables the quota limits. - -For the :ref:`prioritization `, users -can set the weights for the three properties in ```` in below -form:: - - - -Watermarks -~~~~~~~~~~ - -Users can specify :ref:`watermarks ` of the -given scheme via ```` in below form:: - - - -```` is a predefined integer for the metric to be checked. The -supported numbers and their meanings are as below. - - - 0: Ignore the watermarks - - 1: System's free memory rate (per thousand) - -The value of the metric is checked every ```` microseconds. - -If the value is higher than ```` or lower than ````, the -scheme is deactivated. If the value is lower than ````, the scheme -is activated. - -.. _damos_stats: - -Statistics -~~~~~~~~~~ - -It also counts the total number and bytes of regions that each scheme is tried -to be applied, the two numbers for the regions that each scheme is successfully -applied, and the total number of the quota limit exceeds. This statistics can -be used for online analysis or tuning of the schemes. - -The statistics can be shown by reading the ``schemes`` file. Reading the file -will show each scheme you entered in each line, and the five numbers for the -statistics will be added at the end of each line. - -Example -~~~~~~~ - -Below commands applies a scheme saying "If a memory region of size in [4KiB, -8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate -interval in [10, 20], page out the region. For the paging out, use only up to -10ms per second, and also don't page out more than 1GiB per second. Under the -limitation, page out memory regions having longer age first. 
Also, check the -free memory rate of the system every 5 seconds, start the monitoring and paging -out when the free memory rate becomes lower than 50%, but stop it if the free -memory rate becomes larger than 60%, or lower than 30%".:: - - # cd /damon - # scheme="4096 8192 0 5 10 20 2" # target access pattern and action - # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas - # scheme+=" 0 0 100" # prioritization weights - # scheme+=" 1 5000000 600 500 300" # watermarks - # echo "$scheme" > schemes - - -Turning On/Off --------------- - -Setting the files as described above doesn't incur effect unless you explicitly -start the monitoring. You can start, stop, and check the current status of the -monitoring by writing to and reading from the ``monitor_on_DEPRECATED`` file. -Writing ``on`` to the file starts the monitoring of the targets with the -attributes. Writing ``off`` to the file stops those. DAMON also stops if -every target process is terminated. Below example commands turn on, off, and -check the status of DAMON:: - - # cd /damon - # echo on > monitor_on_DEPRECATED - # echo off > monitor_on_DEPRECATED - # cat monitor_on_DEPRECATED - off - -Please note that you cannot write to the above-mentioned debugfs files while -the monitoring is turned on. If you write to the files while DAMON is running, -an error code such as ``-EBUSY`` will be returned. - - -Monitoring Thread PID ---------------------- - -DAMON does requested monitoring with a kernel thread called ``kdamond``. You -can get the pid of the thread by reading the ``kdamond_pid`` file. When the -monitoring is turned off, reading the file returns ``none``. :: - - # cd /damon - # cat monitor_on_DEPRECATED - off - # cat kdamond_pid - none - # echo on > monitor_on_DEPRECATED - # cat kdamond_pid - 18594 - - -Using Multiple Monitoring Threads ---------------------------------- - -One ``kdamond`` thread is created for each monitoring context. You can create -and remove monitoring contexts for multiple ``kdamond`` required use case using -the ``mk_contexts`` and ``rm_contexts`` files. - -Writing the name of the new context to the ``mk_contexts`` file creates a -directory of the name on the DAMON debugfs directory. The directory will have -DAMON debugfs files for the context. :: - - # cd /damon - # ls foo - # ls: cannot access 'foo': No such file or directory - # echo foo > mk_contexts - # ls foo - # attrs init_regions kdamond_pid schemes target_ids - -If the context is not needed anymore, you can remove it and the corresponding -directory by putting the name of the context to the ``rm_contexts`` file. :: - - # echo foo > rm_contexts - # ls foo - # ls: cannot access 'foo': No such file or directory - -Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files -are in the root directory only. diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index cb2c080f400c..33c886f3d198 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -280,8 +280,8 @@ The following files are currently defined: blocks; configure auto-onlining. The default value depends on the - CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel configuration - option. + CONFIG_MHP_DEFAULT_ONLINE_TYPE kernel configuration + options. See the ``state`` property of memory blocks for details. ``block_size_bytes`` read-only: the size in bytes of a memory block. 
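The auto-onlining policy referenced in the memory-hotplug hunk above can also be inspected and changed at runtime through sysfs. A small sketch, assuming a kernel with memory hotplug support; the value initially reported reflects the CONFIG_MHP_DEFAULT_ONLINE_TYPE choice (or the corresponding boot parameter). ::

    # show the current default for newly added memory blocks
    cat /sys/devices/system/memory/auto_online_blocks
    # one of: offline, online, online_kernel, online_movable

    # auto-online hotplugged memory blocks from now on
    echo online > /sys/devices/system/memory/auto_online_blocks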
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 8872203df088..dff8d5985f0f 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -332,6 +332,12 @@ allocation policy for the internal shmem mount by using the kernel parameter seven valid policies for shmem (``always``, ``within_size``, ``advise``, ``never``, ``deny``, and ``force``). +Similarly to ``transparent_hugepage_shmem``, you can control the default +hugepage allocation policy for the tmpfs mount by using the kernel parameter +``transparent_hugepage_tmpfs=``, where ```` is one of the +four valid policies for tmpfs (``always``, ``within_size``, ``advise``, +``never``). The tmpfs mount default policy is ``never``. + In the same manner as ``thp_anon`` controls each supported anonymous THP size, ``thp_shmem`` controls each supported shmem THP size. ``thp_shmem`` has the same format as ``thp_anon``, but also supports the policy @@ -352,8 +358,21 @@ default to ``never``. Hugepages in tmpfs/shmem ======================== -You can control hugepage allocation policy in tmpfs with mount option -``huge=``. It can have following values: +Traditionally, tmpfs only supported a single huge page size ("PMD"). Today, +it also supports smaller sizes just like anonymous memory, often referred +to as "multi-size THP" (mTHP). Huge pages of any size are commonly +represented in the kernel as "large folios". + +While there is fine control over the huge page sizes to use for the internal +shmem mount (see below), ordinary tmpfs mounts will make use of all available +huge page sizes without any control over the exact sizes, behaving more like +other file systems. + +tmpfs mounts +------------ + +The THP allocation policy for tmpfs mounts can be adjusted using the mount +option: ``huge=``. It can have following values: always Attempt to allocate huge pages every time we need a new page; @@ -363,24 +382,24 @@ never within_size Only allocate huge page if it will be fully within i_size. - Also respect fadvise()/madvise() hints; + Also respect madvise() hints; advise - Only allocate huge pages if requested with fadvise()/madvise(); + Only allocate huge pages if requested with madvise(); -The default policy is ``never``. +Remember, that the kernel may use huge pages of all available sizes, and +that no fine control as for the internal tmpfs mount is available. + +The default policy in the past was ``never``, but it can now be adjusted +using the kernel parameter ``transparent_hugepage_tmpfs=``. ``mount -o remount,huge= /mountpoint`` works fine after mount: remounting ``huge=never`` will not attempt to break up huge pages at all, just stop more from being allocated. -There's also sysfs knob to control hugepage allocation policy for internal -shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount -is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or -MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem. 
- -In addition to policies listed above, shmem_enabled allows two further -values: +In addition to policies listed above, the sysfs knob +/sys/kernel/mm/transparent_hugepage/shmem_enabled will affect the +allocation policy of tmpfs mounts, when set to the following values: deny For use in emergencies, to force the huge option off from @@ -388,13 +407,24 @@ deny force Force the huge option on for all - very useful for testing; -Shmem can also use "multi-size THP" (mTHP) by adding a new sysfs knob to -control mTHP allocation: -'/sys/kernel/mm/transparent_hugepage/hugepages-kB/shmem_enabled', -and its value for each mTHP is essentially consistent with the global -setting. An 'inherit' option is added to ensure compatibility with these -global settings. Conversely, the options 'force' and 'deny' are dropped, -which are rather testing artifacts from the old ages. +shmem / internal tmpfs +---------------------- +The mount internal tmpfs mount is used for SysV SHM, memfds, shared anonymous +mmaps (of /dev/zero or MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem. + +To control the THP allocation policy for this internal tmpfs mount, the +sysfs knob /sys/kernel/mm/transparent_hugepage/shmem_enabled and the knobs +per THP size in +'/sys/kernel/mm/transparent_hugepage/hugepages-kB/shmem_enabled' +can be used. + +The global knob has the same semantics as the ``huge=`` mount options +for tmpfs mounts, except that the different huge page sizes can be controlled +individually, and will only use the setting of the global knob when the +per-size knob is set to 'inherit'. + +The options 'force' and 'deny' are dropped for the individual sizes, which +are rather testing artifacts from the old ages. always Attempt to allocate huge pages every time we need a new page; @@ -408,10 +438,10 @@ never within_size Only allocate huge page if it will be fully within i_size. - Also respect fadvise()/madvise() hints; + Also respect madvise() hints; advise - Only allocate huge pages if requested with fadvise()/madvise(); + Only allocate huge pages if requested with madvise(); Need of application restart =========================== @@ -561,6 +591,16 @@ swpin is incremented every time a huge page is swapped in from a non-zswap swap device in one piece. +swpin_fallback + is incremented if swapin fails to allocate or charge a huge page + and instead falls back to using huge pages with lower orders or + small pages. + +swpin_fallback_charge + is incremented if swapin fails to charge a huge page and instead + falls back to using huge pages with lower orders or small pages + even though the allocation was successful. + swpout is incremented every time a huge page is swapped out to a non-zswap swap device in one piece without splitting. diff --git a/Documentation/core-api/min_heap.rst b/Documentation/core-api/min_heap.rst index 0c636c8b7aa5..683bc6d09f00 100644 --- a/Documentation/core-api/min_heap.rst +++ b/Documentation/core-api/min_heap.rst @@ -4,6 +4,8 @@ Min Heap API ============ +:Author: Kuan-Wei Chiu + Introduction ============ diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst index 77e0ece2b1d6..f6a3eef4fe7f 100644 --- a/Documentation/core-api/xarray.rst +++ b/Documentation/core-api/xarray.rst @@ -42,8 +42,8 @@ call xa_tag_pointer() to create an entry with a tag, xa_untag_pointer() to turn a tagged entry back into an untagged pointer and xa_pointer_tag() to retrieve the tag of an entry. 
Tagged pointers use the same bits that are used to distinguish value entries from normal pointers, so you must -decide whether they want to store value entries or tagged pointers in -any particular XArray. +decide whether you want to store value entries or tagged pointers in any +particular XArray. The XArray does not support storing IS_ERR() pointers as some conflict with value entries or internal entries. @@ -52,8 +52,9 @@ An unusual feature of the XArray is the ability to create entries which occupy a range of indices. Once stored to, looking up any index in the range will return the same entry as looking up any other index in the range. Storing to any index will store to all of them. Multi-index -entries can be explicitly split into smaller entries, or storing ``NULL`` -into any entry will cause the XArray to forget about the range. +entries can be explicitly split into smaller entries. Unsetting (using +xa_erase() or xa_store() with ``NULL``) any entry will cause the XArray +to forget about the range. Normal API ========== @@ -63,13 +64,14 @@ for statically allocated XArrays or xa_init() for dynamically allocated ones. A freshly-initialised XArray contains a ``NULL`` pointer at every index. -You can then set entries using xa_store() and get entries -using xa_load(). xa_store will overwrite any entry with the -new entry and return the previous entry stored at that index. You can -use xa_erase() instead of calling xa_store() with a -``NULL`` entry. There is no difference between an entry that has never -been stored to, one that has been erased and one that has most recently -had ``NULL`` stored to it. +You can then set entries using xa_store() and get entries using +xa_load(). xa_store() will overwrite any entry with the new entry and +return the previous entry stored at that index. You can unset entries +using xa_erase() or by setting the entry to ``NULL`` using xa_store(). +There is no difference between an entry that has never been stored to +and one that has been erased with xa_erase(); an entry that has most +recently had ``NULL`` stored to it is also equivalent except if the +XArray was initialized with ``XA_FLAGS_ALLOC``. You can conditionally replace an entry at an index by using xa_cmpxchg(). Like cmpxchg(), it will only succeed if diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 6a882c57a7e7..238afcb86d1f 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -48,6 +48,7 @@ fixes/update part 1.1 Stefani Seibold June 9 2009 3.11 /proc//patch_state - Livepatch patch operation state 3.12 /proc//arch_status - Task architecture specific information 3.13 /proc//fd - List of symlinks to open files + 3.14 /proc//fd for fast access. ------------------------------------------------------- +3.14 /proc//ksm_merging_pages shows. + +ksm_process_profit +^^^^^^^^^^^^^^^^^^ + +The profit that KSM brings (Saved bytes). KSM can save memory by merging +identical pages, but also can consume additional memory, because it needs +to generate a number of rmap_items to save each scanned page's brief rmap +information. Some of these pages may be merged, but some may not be abled +to be merged after being checked several times, which are unprofitable +memory consumed. + +ksm_merge_any +^^^^^^^^^^^^^ + +It specifies whether the process's mm is added by prctl() into the +candidate list of KSM or not, and KSM scanning is fully enabled at process +level. 
+ +ksm_mergeable +^^^^^^^^^^^^^ + +It specifies whether any VMAs of the process's mm are currently applicable +to KSM. + +More information about KSM can be found at +Documentation/admin-guide/mm/ksm.rst. + Chapter 4: Configuring procfs ============================= @@ -2261,7 +2331,7 @@ arguments are now protected against local eavesdroppers. hidepid=invisible or hidepid=2 means hidepid=1 plus all /proc// will be fully invisible to other users. It doesn't mean that it hides a fact whether a process with a specific pid value exists (it can be learned by other means, e.g. -by "kill -0 $PID"), but it hides process' uid and gid, which may be learned by +by "kill -0 $PID"), but it hides process's uid and gid, which may be learned by stat()'ing /proc// otherwise. It greatly complicates an intruder's task of gathering information about running processes, whether some daemon runs with elevated privileges, whether other user runs some sensitive program, whether diff --git a/Documentation/filesystems/squashfs.rst b/Documentation/filesystems/squashfs.rst index 4af8d6207509..45653b3228f9 100644 --- a/Documentation/filesystems/squashfs.rst +++ b/Documentation/filesystems/squashfs.rst @@ -6,7 +6,7 @@ Squashfs 4.0 Filesystem Squashfs is a compressed read-only filesystem for Linux. -It uses zlib, lz4, lzo, or xz compression to compress files, inodes and +It uses zlib, lz4, lzo, xz or zstd compression to compress files, inodes and directories. Inodes in the system are very small and all blocks are packed to minimise data overhead. Block sizes greater than 4K are supported up to a maximum of 1Mbytes (default block size 128K). @@ -16,8 +16,8 @@ use (i.e. in cases where a .tar.gz file may be used), and in constrained block device/memory systems (e.g. embedded systems) where low overhead is needed. -Mailing list: squashfs-devel@lists.sourceforge.net -Web site: www.squashfs.org +Mailing list (kernel code): linux-fsdevel@vger.kernel.org +Web site: github.com/plougher/squashfs-tools 1. Filesystem Features ---------------------- @@ -58,11 +58,9 @@ inodes have different sizes). As squashfs is a read-only filesystem, the mksquashfs program must be used to create populated squashfs filesystems. This and other squashfs utilities -can be obtained from http://www.squashfs.org. Usage instructions can be -obtained from this site also. - -The squashfs-tools development tree is now located on kernel.org - git://git.kernel.org/pub/scm/fs/squashfs/squashfs-tools.git +are very likely packaged by your linux distribution (called squashfs-tools). +The source code can be obtained from github.com/plougher/squashfs-tools. +Usage instructions can also be obtained from this site. 2.1 Mount options ----------------- diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index f9c50525bdbf..e28c6a1b40ae 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -203,6 +203,8 @@ This scheme, however, cannot preserve the quality of the output if the assumption is not guaranteed. +.. _damon_design_adaptive_regions_adjustment: + Adaptive Regions Adjustment ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -264,6 +266,61 @@ tracepoints. For more details, please refer to the documentations for respectively. +.. _damon_design_monitoring_params_tuning_guide: + +Monitoring Parameters Tuning Guide +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In short, set ``aggregation interval`` to capture meaningful amount of accesses +for the purpose. 
The amount of accesses can be measured using ``nr_accesses`` +and ``age`` of regions in the aggregated monitoring results snapshot. The +default value of the interval, ``100ms``, turns out to be too short in many +cases. Set ``sampling interval`` proportional to ``aggregation interval``. By +default, ``1/20`` is recommended as the ratio. + +``Aggregation interval`` should be set as the time interval within which the workload +can make an amount of accesses meaningful for the monitoring purpose. +If the interval is too short, only a small number of accesses are captured. As a +result, the monitoring results look like everything is accessed only rarely and uniformly. +For many purposes, that would be useless. If it is too long, however, the time +to converge regions with the :ref:`regions adjustment mechanism +` can be too long, depending on the +time scale of the given purpose. This could happen if the workload is actually +making only rare accesses but the user thinks the amount of accesses for the +monitoring purpose is too high. For such cases, the target amount of accesses to +capture per ``aggregation interval`` should be carefully reconsidered. Also, note +that the captured amount of accesses is represented with not only +``nr_accesses``, but also ``age``. For example, even if every region on the +monitoring results shows zero ``nr_accesses``, regions could still be +distinguished using ``age`` values as the recency information. + +Hence the optimum value of ``aggregation interval`` depends on the access +intensiveness of the workload. The user should tune the interval based on the +amount of accesses captured in each aggregated snapshot of the monitoring +results. + +Note that the default value of the interval is 100 milliseconds, which is too +short in many cases, especially on large systems. + +``Sampling interval`` defines the resolution of each aggregation. If it is set +too large, monitoring results will look like every region was equally rarely +accessed, or equally frequently accessed. That is, regions become +indistinguishable based on access pattern, and therefore the results will be +useless in many use cases. If ``sampling interval`` is too small, it will not +degrade the resolution, but will increase the monitoring overhead. If it is +appropriate enough to provide a resolution of the monitoring results that is +sufficient for the given purpose, it shouldn't be unnecessarily further +lowered. It is recommended to be set proportional to ``aggregation interval``. +By default, the ratio is set as ``1/20``, and it is still recommended. + +Refer to the below documents for an example tuning based on the above guide. + +.. toctree:: + :maxdepth: 1 + + monitoring_intervals_tuning_example + + .. _damon_design_damos: Operation Schemes @@ -504,9 +561,34 @@ have a list of latency-critical processes. To let users optimize DAMOS schemes with such special knowledge, DAMOS provides a feature called DAMOS filters. The feature allows users to set an arbitrary -number of filters for each scheme. Each filter specifies the type of target -memory, and whether it should exclude the memory of the type (filter-out), or -all except the memory of the type (filter-in). +number of filters for each scheme. Each filter specifies + +- a type of memory (``type``), +- whether it is for the memory of the type or all except the type + (``matching``), and +- whether it is to allow (include) or reject (exclude) applying + the scheme's action to the memory (``allow``).
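As an illustration of the three per-filter properties listed above, the following sketch installs the allow-then-reject pair discussed in the next paragraphs through the sysfs interface; the kdamond, context, and scheme indices are hypothetical. ::

    cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/filters
    echo 2 > nr_filters

    echo anon > 0/type        # memory matching 'anon' ...
    echo Y > 0/matching
    echo Y > 0/allow          # ... is explicitly allowed

    echo young > 1/type       # memory matching 'young' ...
    echo Y > 1/matching
    echo N > 1/allow          # ... is rejected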
+ +When multiple filters are installed, each filter is evaluated in the installed +order. If a part of memory is matched to one of the filters, the next filters are +ignored. If the memory passes through the filters evaluation stage because it +is not matched to any of the filters, applying the scheme's action to it is +allowed, the same as the behavior when no filter exists. + +For example, let's assume 1) a filter for allowing anonymous pages and 2) +another filter for rejecting young pages are installed in that order. If a page +of a region that is eligible for the scheme's action is an anonymous page, +the scheme's action will be applied to the page regardless of whether it is +young or not, since it matches with the first allow-filter. If the page is +not anonymous but young, the scheme's action will not be applied, since the +second reject-filter blocks it. If the page is neither anonymous nor young, +the page will pass through the filters evaluation stage since there is no +matching filter, and the action will be applied to the page. + +Note that the action is equally applied to memory that is either explicitly +filter-allowed or that passed the filters evaluation stage. It means that installing +allow-filters at the end of the list makes no practical change but only adds +filters-checking overhead. For efficient handling of filters, some types of filters are handled by the core layer, while others are handled by operations set. In the latter case, @@ -516,7 +598,7 @@ filter are not counted as the scheme has tried to the region. In contrast, if a memory regions is filtered by an operations set layer-handled filter, it is counted as the scheme has tried. This difference affects the statistics. -Below types of filters are currently supported. +Below ``type`` of filters are currently supported. - anonymous page - Applied to pages that containing data that not stored in files. @@ -539,6 +621,60 @@ To know how user-space can set the watermarks via :ref:`DAMON sysfs interface `, refer to :ref:`filters ` part of the documentation. +.. _damon_design_damos_stat: + +Statistics +~~~~~~~~~~ + +The statistics of DAMOS behaviors that are designed to help monitoring, tuning and +debugging of DAMOS. + +DAMOS accounts below statistics for each scheme, from the beginning of the +scheme's execution. + +- ``nr_tried``: Total number of regions that the scheme is tried to be applied. +- ``sz_tried``: Total size of regions that the scheme is tried to be applied. +- ``sz_ops_filter_passed``: Total bytes that passed operations set + layer-handled DAMOS filters. +- ``nr_applied``: Total number of regions that the scheme is applied. +- ``sz_applied``: Total size of regions that the scheme is applied. +- ``qt_exceeds``: Total number of times the quota of the scheme has been exceeded. + +"A scheme is tried to be applied to a region" means DAMOS core logic determined +the region is eligible to apply the scheme's :ref:`action +`. The :ref:`access pattern +`, :ref:`quotas +`, :ref:`watermarks +`, and :ref:`filters +` that are handled on core logic could affect this. +The core logic will only ask the underlying :ref:`operation set +` to apply the action to the region, so whether the +action is really applied or not is unclear. That's why it is called "tried". + +"A scheme is applied to a region" means the :ref:`operation set +` has applied the action to at least a part of the +region. The :ref:`filters ` that are handled by the +operation set, and the types of the :ref:`action ` +and the pages of the region can affect this.
For example, if a filter is set +to exclude anonymous pages and the region has only anonymous pages, or if the +action is ``pageout`` while all pages of the region are unreclaimable, applying +the action to the region will fail. + +To know how user-space can read the stats via :ref:`DAMON sysfs interface +`, refer to :ref:`stats ` part of the +documentation. + +Regions Walking +~~~~~~~~~~~~~~~ + +A DAMOS feature that allows users to access each region that a DAMOS action has just +been applied to. Using this feature, DAMON :ref:`API ` allows users +to access the full properties of the regions, including the access monitoring results +and the amount of the region's internal memory that passed the DAMOS filters. +:ref:`DAMON sysfs interface ` also allows users to read the data +via special :ref:`files `. + +.. _damon_design_api: Application Programming Interface --------------------------------- @@ -573,15 +709,11 @@ General Purpose User Interface Modules DAMON modules that provide user space ABIs for general purpose DAMON usage in runtime. -DAMON user interface modules, namely 'DAMON sysfs interface' and 'DAMON debugfs -interface' are DAMON API user kernel modules that provide ABIs to the -user-space. Please note that DAMON debugfs interface is currently deprecated. - -Like many other ABIs, the modules create files on sysfs and debugfs, allow -users to specify their requests to and get the answers from DAMON by writing to -and reading from the files. As a response to such I/O, DAMON user interface -modules control DAMON and retrieve the results as user requested via the DAMON -API, and return the results to the user-space. +Like many other ABIs, the modules create files on pseudo file systems like +'sysfs', allow users to specify their requests to and get the answers from +DAMON by writing to and reading from the files. As a response to such I/O, +DAMON user interface modules control DAMON and retrieve the results as user +requested via the DAMON API, and return the results to the user-space. The ABIs are designed to be used for user space applications development, rather than human beings' fingers. Human users are recommended to use such @@ -590,8 +722,9 @@ Github (https://github.com/damonitor/damo), Pypi (https://pypistats.org/packages/damo), and Fedora (https://packages.fedoraproject.org/pkgs/python-damo/damo/). -Please refer to the ABI :doc:`document ` for -details of the interfaces. +Currently, one module of this type, namely the 'DAMON sysfs interface', is +available. Please refer to the ABI :ref:`doc ` for details of +the interfaces. Special-Purpose Access-aware Kernel Modules ... DAMON modules that provide user space ABI for specific purpose DAMON usage. -DAMON sysfs/debugfs user interfaces are for full control of all DAMON features -in runtime. For each special-purpose system-wide data access-aware system +DAMON user interface modules are for full control of all DAMON features in +runtime. For each special-purpose system-wide data access-aware system operations such as proactive reclamation or LRU lists balancing, the interfaces could be simplified by removing unnecessary knobs for the specific purpose, and extended for boot-time and even compile time control.
Default values of DAMON diff --git a/Documentation/mm/damon/monitoring_intervals_tuning_example.rst b/Documentation/mm/damon/monitoring_intervals_tuning_example.rst new file mode 100644 index 000000000000..334a854efb40 --- /dev/null +++ b/Documentation/mm/damon/monitoring_intervals_tuning_example.rst @@ -0,0 +1,247 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================================== +DAMON Monitoring Interval Parameters Tuning Example +=================================================== + +DAMON's monitoring parameters need tuning based on the given workload and the +monitoring purpose. There is a :ref:`tuning guide +` for that. This document +provides an example tuning based on the guide. + +Setup +===== + +For the below example, DAMON of Linux kernel v6.11 and `damo +`_ (DAMON user-space tool) v2.5.9 were used to +monitor and visualize access patterns on the physical address space of a system +running a real-world server workload. + +5ms/100ms intervals: Too Short Interval +======================================= + +Let's start by capturing the access pattern snapshot on the physical address +space of the system using DAMON, with the default interval parameters (5 +milliseconds and 100 milliseconds for the sampling and the aggregation +intervals, respectively). Wait ten minutes between the start of DAMON and +the capturing of the snapshot, to show meaningful time-wise access patterns. +:: + + # damo start + # sleep 600 + # damo record --snapshot 0 1 + # damo stop + +Then, list the DAMON-found regions of different access patterns, sorted by the +"access temperature". "Access temperature" is a metric representing the +access-hotness of a region. It is calculated as a weighted sum of the access +frequency and the age of the region. If the access frequency is 0 %, the +temperature is multiplied by minus one. That is, if a region is not accessed, +it gets a minus temperature and it gets lower as the region is not accessed for a longer time. +The sorting is in temperature-ascending order, so the region at the top of the +list is the coldest, and the one at the bottom is the hottest one. :: + + # damo report access --sort_regions_by temperature + 0 addr 16.052 GiB size 5.985 GiB access 0 % age 5.900 s # coldest + 1 addr 22.037 GiB size 6.029 GiB access 0 % age 5.300 s + 2 addr 28.065 GiB size 6.045 GiB access 0 % age 5.200 s + 3 addr 10.069 GiB size 5.983 GiB access 0 % age 4.500 s + 4 addr 4.000 GiB size 6.069 GiB access 0 % age 4.400 s + 5 addr 62.008 GiB size 3.992 GiB access 0 % age 3.700 s + 6 addr 56.795 GiB size 5.213 GiB access 0 % age 3.300 s + 7 addr 39.393 GiB size 6.096 GiB access 0 % age 2.800 s + 8 addr 50.782 GiB size 6.012 GiB access 0 % age 2.800 s + 9 addr 34.111 GiB size 5.282 GiB access 0 % age 2.300 s + 10 addr 45.489 GiB size 5.293 GiB access 0 % age 1.800 s # hottest + total size: 62.000 GiB + +The list shows no seemingly hot regions, and only minimal access pattern +diversity. Every region has zero access frequency. The number of regions is +10, which is the default ``min_nr_regions`` value. Size of each region is also +nearly identical. We can suspect this is because the “adaptive regions adjustment” +mechanism was not working well. As the guide suggested, we can get relative +hotness of regions using ``age`` as the recency information. That would be +better than nothing, but given the fact that the longest age is only about 6 +seconds while we waited about ten minutes, it is unclear how useful this will +be.
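The interval values being tuned in this example can also be checked directly through the DAMON sysfs interface while the ``damo`` session above is running; a small sketch, assuming a single kdamond and context at index 0. ::

    cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/monitoring_attrs/intervals
    cat sample_us      # 5000 with the default parameters
    cat aggr_us        # 100000 with the default parameters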
+ +The histogram of total region size over temperature ranges, another +visualization of the results, also shows no interesting distribution pattern. :: + + # damo report access --style temperature-sz-hist + + [-,590,000,000, -,549,000,000) 5.985 GiB |**********          | + [-,549,000,000, -,508,000,000) 12.074 GiB |********************| + [-,508,000,000, -,467,000,000) 0 B |                    | + [-,467,000,000, -,426,000,000) 12.052 GiB |********************| + [-,426,000,000, -,385,000,000) 0 B |                    | + [-,385,000,000, -,344,000,000) 3.992 GiB |*******             | + [-,344,000,000, -,303,000,000) 5.213 GiB |*********           | + [-,303,000,000, -,262,000,000) 12.109 GiB |********************| + [-,262,000,000, -,221,000,000) 5.282 GiB |*********           | + [-,221,000,000, -,180,000,000) 0 B |                    | + [-,180,000,000, -,139,000,000) 5.293 GiB |*********           | + total size: 62.000 GiB + +In short, the parameters provide poor quality monitoring results for hot +regions detection. According to the :ref:`guide +`, this is due to the too short +aggregation interval. + +100ms/2s intervals: Starts Showing Small Hot Regions +==================================================== + +Following the guide, increase the intervals 20 times (100 milliseconds and 2 +seconds for the sampling and aggregation intervals, respectively). :: + + # damo start -s 100ms -a 2s + # sleep 600 + # damo record --snapshot 0 1 + # damo stop + # damo report access --sort_regions_by temperature + 0 addr 10.180 GiB size 6.117 GiB access 0 % age 7 m 8 s # coldest + 1 addr 49.275 GiB size 6.195 GiB access 0 % age 6 m 14 s + 2 addr 62.421 GiB size 3.579 GiB access 0 % age 6 m 4 s + 3 addr 40.154 GiB size 6.127 GiB access 0 % age 5 m 40 s + 4 addr 16.296 GiB size 6.182 GiB access 0 % age 5 m 32 s + 5 addr 34.254 GiB size 5.899 GiB access 0 % age 5 m 24 s + 6 addr 46.281 GiB size 2.995 GiB access 0 % age 5 m 20 s + 7 addr 28.420 GiB size 5.835 GiB access 0 % age 5 m 6 s + 8 addr 4.000 GiB size 6.180 GiB access 0 % age 4 m 16 s + 9 addr 22.478 GiB size 5.942 GiB access 0 % age 3 m 58 s + 10 addr 55.470 GiB size 915.645 MiB access 0 % age 3 m 6 s + 11 addr 56.364 GiB size 6.056 GiB access 0 % age 2 m 8 s + 12 addr 56.364 GiB size 4.000 KiB access 95 % age 16 s + 13 addr 49.275 GiB size 4.000 KiB access 100 % age 8 m 24 s # hottest + total size: 62.000 GiB + # damo report access --style temperature-sz-hist + + [-42,800,000,000, -33,479,999,000) 22.018 GiB |*****************   | + [-33,479,999,000, -24,159,998,000) 27.090 GiB |********************| + [-24,159,998,000, -14,839,997,000) 6.836 GiB |******              | + [-14,839,997,000, -5,519,996,000) 6.056 GiB |*****               | + [-5,519,996,000, 3,800,005,000) 4.000 KiB |*                   | + [3,800,005,000, 13,120,006,000) 0 B |                    | + [13,120,006,000, 22,440,007,000) 0 B |                    | + [22,440,007,000, 31,760,008,000) 0 B |                    | + [31,760,008,000, 41,080,009,000) 0 B |                    | + [41,080,009,000, 50,400,010,000) 0 B |                    | + [50,400,010,000, 59,720,011,000) 4.000 KiB |*                   | + total size: 62.000 GiB + +DAMON found two distinct 4 KiB regions that are pretty hot. The regions are also +well aged. The hottest 4 KiB region had kept its access frequency for about +8 minutes, and the coldest region had seen no access for about 7 minutes. +The distribution on the histogram also appears to have a pattern. + +In particular, the finding of the 4 KiB regions among the 62 GiB total memory +shows DAMON’s adaptive regions adjustment is working as designed. + +Still, the number of regions is close to ``min_nr_regions``, and the sizes of +cold regions are similar.
Apparently it is improved, but it still has +room for improvement. + +400ms/8s intervals: Pretty Improved Results +=========================================== + +Increase the intervals four times (400 milliseconds and 8 seconds +for the sampling and aggregation intervals, respectively). :: + + # damo start -s 400ms -a 8s + # sleep 600 + # damo record --snapshot 0 1 + # damo stop + # damo report access --sort_regions_by temperature + 0 addr 64.492 GiB size 1.508 GiB access 0 % age 6 m 48 s # coldest + 1 addr 21.749 GiB size 5.674 GiB access 0 % age 6 m 8 s + 2 addr 27.422 GiB size 5.801 GiB access 0 % age 6 m + 3 addr 49.431 GiB size 8.675 GiB access 0 % age 5 m 28 s + 4 addr 33.223 GiB size 5.645 GiB access 0 % age 5 m 12 s + 5 addr 58.321 GiB size 6.170 GiB access 0 % age 5 m 4 s + [...] + 25 addr 6.615 GiB size 297.531 MiB access 15 % age 0 ns + 26 addr 9.513 GiB size 12.000 KiB access 20 % age 0 ns + 27 addr 9.511 GiB size 108.000 KiB access 25 % age 0 ns + 28 addr 9.513 GiB size 20.000 KiB access 25 % age 0 ns + 29 addr 9.511 GiB size 12.000 KiB access 30 % age 0 ns + 30 addr 9.520 GiB size 4.000 KiB access 40 % age 0 ns + [...] + 41 addr 9.520 GiB size 4.000 KiB access 80 % age 56 s + 42 addr 9.511 GiB size 12.000 KiB access 100 % age 6 m 16 s + 43 addr 58.321 GiB size 4.000 KiB access 100 % age 6 m 24 s + 44 addr 9.512 GiB size 4.000 KiB access 100 % age 6 m 48 s + 45 addr 58.106 GiB size 4.000 KiB access 100 % age 6 m 48 s # hottest + total size: 62.000 GiB + # damo report access --style temperature-sz-hist + + [-40,800,000,000, -32,639,999,000) 21.657 GiB |********************| + [-32,639,999,000, -24,479,998,000) 17.938 GiB |*****************   | + [-24,479,998,000, -16,319,997,000) 16.885 GiB |****************    | + [-16,319,997,000, -8,159,996,000) 586.879 MiB |*                   | + [-8,159,996,000, 5,000) 4.946 GiB |*****               | + [5,000, 8,160,006,000) 260.000 KiB |*                   | + [8,160,006,000, 16,320,007,000) 0 B |                    | + [16,320,007,000, 24,480,008,000) 0 B |                    | + [24,480,008,000, 32,640,009,000) 0 B |                    | + [32,640,009,000, 40,800,010,000) 16.000 KiB |*                   | + [40,800,010,000, 48,960,011,000) 8.000 KiB |*                   | + total size: 62.000 GiB + +The number of regions having different access patterns has significantly +increased. The size of each region is also more varied. The total size of regions with non-zero +access frequency has also significantly increased. Maybe this is already +good enough to make some meaningful memory management efficiency changes. + +800ms/16s intervals: Another Bias +================================= + +Further double the intervals (800 milliseconds and 16 seconds for the sampling +and aggregation intervals, respectively). The results are further improved for +hot regions detection, but cold regions detection starts looking degraded. :: + + # damo start -s 800ms -a 16s + # sleep 600 + # damo record --snapshot 0 1 + # damo stop + # damo report access --sort_regions_by temperature + 0 addr 64.781 GiB size 1.219 GiB access 0 % age 4 m 48 s + 1 addr 24.505 GiB size 2.475 GiB access 0 % age 4 m 16 s + 2 addr 26.980 GiB size 504.273 MiB access 0 % age 4 m + 3 addr 29.443 GiB size 2.462 GiB access 0 % age 4 m + 4 addr 37.264 GiB size 5.645 GiB access 0 % age 4 m + 5 addr 31.905 GiB size 5.359 GiB access 0 % age 3 m 44 s + [...]
+ 20 addr 8.711 GiB size 40.000 KiB access 5 % age 2 m 40 s + 21 addr 27.473 GiB size 1.970 GiB access 5 % age 4 m + 22 addr 48.185 GiB size 4.625 GiB access 5 % age 4 m + 23 addr 47.304 GiB size 902.117 MiB access 10 % age 4 m + 24 addr 8.711 GiB size 4.000 KiB access 100 % age 4 m + 25 addr 20.793 GiB size 3.713 GiB access 5 % age 4 m 16 s + 26 addr 8.773 GiB size 4.000 KiB access 100 % age 4 m 16 s + total size: 62.000 GiB + # damo report access --style temperature-sz-hist + + [-28,800,000,000, -23,359,999,000) 12.294 GiB |*****************   | + [-23,359,999,000, -17,919,998,000) 9.753 GiB |*************       | + [-17,919,998,000, -12,479,997,000) 15.131 GiB |********************| + [-12,479,997,000, -7,039,996,000) 0 B |                    | + [-7,039,996,000, -1,599,995,000) 7.506 GiB |**********          | + [-1,599,995,000, 3,840,006,000) 6.127 GiB |*********           | + [3,840,006,000, 9,280,007,000) 0 B |                    | + [9,280,007,000, 14,720,008,000) 136.000 KiB |*                   | + [14,720,008,000, 20,160,009,000) 40.000 KiB |*                   | + [20,160,009,000, 25,600,010,000) 11.188 GiB |***************     | + [25,600,010,000, 31,040,011,000) 4.000 KiB |*                   | + total size: 62.000 GiB + +It found more regions with non-zero access frequency. The number of regions is still +much higher than ``min_nr_regions``, but it is reduced from that of the +previous setup. The distribution also seems a bit biased toward hot +regions. + +Conclusion +========== + +With the above experimental tuning results, we can conclude that the theory and the +guide make sense for at least this workload, and could be applied to similar +cases. diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst index 1d416658d7f5..f573de936b5d 100644 --- a/Documentation/mm/process_addrs.rst +++ b/Documentation/mm/process_addrs.rst @@ -531,6 +531,10 @@ are extra requirements for accessing them: new page table has been installed in the same location and filled with entries. Writers normally need to take the PTE lock and revalidate that the PMD entry still refers to the same PTE-level page table. + If the writer does not care whether it is the same PTE-level page table, it + can take the PMD lock and revalidate that the contents of the PMD entry still meet + the requirements. In particular, this also happens in :c:func:`!retract_page_tables` + when handling :c:macro:`!MADV_COLLAPSE`. To access PTE-level page tables, a helper like :c:func:`!pte_offset_map_lock` or :c:func:`!pte_offset_map` can be used depending on stability requirements. @@ -712,9 +716,14 @@ calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`, before releasing the RCU lock via :c:func:`!rcu_read_unlock`. -VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for -their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it -via :c:func:`!vma_end_read`. +In cases when the user already holds the mmap read lock, :c:func:`!vma_start_read_locked` +and :c:func:`!vma_start_read_locked_nested` can be used. These functions do not +fail due to lock contention, but the caller should still check their return values +in case they fail for other reasons. + +VMA read locks increment the :c:member:`!vma.vm_refcnt` reference counter for their +duration, and the caller of :c:func:`!lock_vma_under_rcu` must drop it via +:c:func:`!vma_end_read`.
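As an illustration of the read-lock pattern described above, the following kernel-style fragment is a sketch that assumes normal kernel context (for example ``linux/mm.h``) with the actual lookup or fault-handling work elided; it tries the per-VMA read lock first and falls back to the mmap read lock when that fails. ::

  /* Sketch only; assumes kernel context and elides the real work. */
  static void example_lookup(struct mm_struct *mm, unsigned long address)
  {
          struct vm_area_struct *vma;

          vma = lock_vma_under_rcu(mm, address);
          if (vma) {
                  /* ... operate on the read-locked VMA ... */
                  vma_end_read(vma);      /* drop the vma.vm_refcnt reference */
                  return;
          }

          /* Per-VMA lock not available; fall back to the mmap read lock. */
          mmap_read_lock(mm);
          vma = find_vma(mm, address);
          if (vma) {
                  /* ... operate under the mmap read lock ... */
          }
          mmap_read_unlock(mm);
  }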
VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always @@ -722,9 +731,9 @@ acquired. An mmap write lock **must** be held for the duration of the VMA write lock, releasing or downgrading the mmap write lock also releases the VMA write lock so there is no :c:func:`!vma_end_write` function. -Note that a semaphore write lock is not held across a VMA lock. Rather, a -sequence number is used for serialisation, and the write semaphore is only -acquired at the point of write lock to update this. +Note that when write-locking a VMA, the :c:member:`!vma.vm_refcnt` is temporarily +modified so that readers can detect the presence of a writer. The reference counter is +restored once the vma sequence number used for serialisation is updated. This ensures the semantics we require - VMA write locks provide exclusive write access to the VMA. @@ -734,7 +743,7 @@ Implementation details The VMA lock mechanism is designed to be a lightweight means of avoiding the use of the heavily contended mmap lock. It is implemented using a combination of a -read/write semaphore and sequence numbers belonging to the containing +reference counter and sequence numbers belonging to the containing :c:struct:`!struct mm_struct` and the VMA. Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic @@ -775,28 +784,31 @@ release of any VMA locks on its release makes sense, as you would never want to keep VMAs locked across entirely separate write operations. It also maintains correct lock ordering. -Each time a VMA read lock is acquired, we acquire a read lock on the -:c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that -the sequence count of the VMA does not match that of the mm. +Each time a VMA read lock is acquired, we increment the :c:member:`!vma.vm_refcnt` +reference counter and check that the sequence count of the VMA does not match +that of the mm. -If it does, the read lock fails. If it does not, we hold the lock, excluding -writers, but permitting other readers, who will also obtain this lock under RCU. +If it does, the read lock fails and :c:member:`!vma.vm_refcnt` is dropped. +If it does not, we keep the reference counter raised, excluding writers, but +permitting other readers, who can also obtain this lock under RCU. Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu` are also RCU safe, so the whole read lock operation is guaranteed to function correctly. -On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock` -read/write semaphore, before setting the VMA's sequence number under this lock, -also simultaneously holding the mmap write lock. +On the write side, we set a bit in :c:member:`!vma.vm_refcnt` which can't be +modified by readers and wait for all readers to drop their reference count. +Once there are no readers, the VMA's sequence number is set to match that of the +mm. During this entire operation the mmap write lock is held. This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep until these are finished and mutual exclusion is achieved. -After setting the VMA's sequence number, the lock is released, avoiding -complexity with a long-term held write lock. +After setting the VMA's sequence number, the bit in :c:member:`!vma.vm_refcnt` +indicating a writer is cleared.
From this point on, VMA's sequence number will +indicate VMA's write-locked state until mmap write lock is dropped or downgraded. -This clever combination of a read/write semaphore and sequence count allows for +This clever combination of a reference counter and sequence count allows for fast RCU-based per-VMA lock acquisition (especially on page fault, though utilised elsewhere) with minimal complexity around lock ordering. diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst index 581446d4a4eb..8e1ceb0a6619 100644 --- a/Documentation/mm/split_page_table_lock.rst +++ b/Documentation/mm/split_page_table_lock.rst @@ -62,7 +62,7 @@ Support of split page table lock by an architecture =================================================== There's no need in special enabling of PTE split page table lock: everything -required is done by pagetable_pte_ctor() and pagetable_pte_dtor(), which +required is done by pagetable_pte_ctor() and pagetable_dtor(), which must be called on PTE table allocation / freeing. Make sure the architecture doesn't use slab allocator for page table @@ -73,7 +73,7 @@ PMD split lock only makes sense if you have more than two page table levels. PMD split lock enabling requires pagetable_pmd_ctor() call on PMD table -allocation and pagetable_pmd_dtor() on freeing. +allocation and pagetable_dtor() on freeing. Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst index 50f6f0b6bf11..9d7cb51be493 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst @@ -26,12 +26,7 @@ DAMON 为不同的用户提供了下面这些接口。 使用它,用户可以通过读取和写入特殊的sysfs文件来使用DAMON的主要功能。因此,你可以编写和使 用你个性化的DAMON sysfs包装程序,代替你读/写sysfs文件。 `DAMON用户空间工具 `_ 就是这种程序的一个例子 它同时支持虚拟和物理地址 - 空间的监测。注意,这个界面只提供简单的监测结果 :ref:`统计 `。对于详细的监测 - 结果,DAMON提供了一个:ref:`跟踪点 `。 -- *debugfs interface.* - :ref:`这 ` 几乎与:ref:`sysfs interface ` 接 - 口相同。这将在下一个LTS内核发布后被移除,所以用户应该转移到 - :ref:`sysfs interface `。 + 空间的监测。 - *内核空间编程接口。* :doc:`这 ` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内 核空间的DAMON应用程序,最灵活有效地利用DAMON的每一个功能。你甚至可以为各种地址空间扩展DAMON。 @@ -335,247 +330,6 @@ tried_regions// 请注意,我们强烈建议使用用户空间的工具,如 `damo `_ , 而不是像上面那样手动读写文件。以上只是一个例子。 -debugfs接口 -=========== - -.. note:: - - DAMON debugfs接口将在下一个LTS内核发布后被移除,所以用户应该转移到 - :ref:`sysfs接口`。 - -DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和 -``rm_contexts`` under its debugfs directory, ``/damon/``. 
- - -属性 ----- - -用户可以通过读取和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 、 ``聚集间隔`` 、 ``更新间隔`` -以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/mm/damon/design` 。例如, -下面的命令将这些值设置为5ms、100ms、1000ms、10和1000,然后再次检查:: - - # cd /damon - # echo 5000 100000 1000000 10 1000 > attrs - # cat attrs - 5000 100000 1000000 10 1000 - - -目标ID ------- - -一些类型的地址空间支持多个监测目标。例如,虚拟内存地址空间的监测可以有多个进程作为监测目标。用户 -可以通过写入目标的相关id值来设置目标,并通过读取 ``target_ids`` 文件来获得当前目标的id。在监 -测虚拟地址空间的情况下,这些值应该是监测目标进程的pid。例如,下面的命令将pid为42和4242的进程设 -为监测目标,并再次检查:: - - # cd /damon - # echo 42 4242 > target_ids - # cat target_ids - 42 4242 - -用户还可以通过在文件中写入一个特殊的关键字 "paddr\n" 来监测系统的物理内存地址空间。因为物理地 -址空间监测不支持多个目标,读取文件会显示一个假值,即 ``42`` ,如下图所示:: - - # cd /damon - # echo paddr > target_ids - # cat target_ids - 42 - -请注意,设置目标ID并不启动监测。 - - -初始监测目标区域 ----------------- - -在虚拟地址空间监测的情况下,DAMON自动设置和更新监测的目标区域,这样就可以覆盖目标进程的整个 -内存映射。然而,用户可能希望将监测区域限制在特定的地址范围内,如堆、栈或特定的文件映射区域。 -或者,一些用户可以知道他们工作负载的初始访问模式,因此希望为“自适应区域调整”设置最佳初始区域。 - -相比之下,DAMON在物理内存监测的情况下不会自动设置和更新监测目标区域。因此,用户应该自己设置 -监测目标区域。 - -在这种情况下,用户可以通过在 ``init_regions`` 文件中写入适当的值,明确地设置他们想要的初 -始监测目标区域。输入应该是一个由三个整数组成的队列,用空格隔开,代表一个区域的形式如下:: - - - -目标idx应该是 ``target_ids`` 文件中目标的索引,从 ``0`` 开始,区域应该按照地址顺序传递。 -例如,下面的命令将设置几个地址范围, ``1-100`` 和 ``100-200`` 作为pid 42的初始监测目标 -区域,这是 ``target_ids`` 中的第一个(索引 ``0`` ),另外几个地址范围, ``20-40`` 和 -``50-100`` 作为pid 4242的地址,这是 ``target_ids`` 中的第二个(索引 ``1`` ):: - - # cd /damon - # cat target_ids - 42 4242 - # echo "0 1 100 \ - 0 100 200 \ - 1 20 40 \ - 1 50 100" > init_regions - -请注意,这只是设置了初始的监测目标区域。在虚拟内存监测的情况下,DAMON会在一个 ``更新间隔`` -后自动更新区域的边界。因此,在这种情况下,如果用户不希望更新的话,应该把 ``更新间隔`` 设 -置得足够大。 - - -方案 ----- - -对于通常的基于DAMON的数据访问感知的内存管理优化,用户只是希望系统对特定访问模式的内存区域应用内 -存管理操作。DAMON从用户那里接收这种形式化的操作方案,并将这些方案应用到目标进程中。 - -用户可以通过读取和写入 ``scheme`` debugfs文件来获得和设置这些方案。读取该文件还可以显示每个 -方案的统计数据。在文件中,每一个方案都应该在每一行中以下列形式表示出来:: - - - -你可以通过简单地在文件中写入一个空字符串来禁用方案。 - -目标访问模式 -~~~~~~~~~~~~ - -``<目标访问模式>`` 是由三个范围构成的,形式如下:: - - min-size max-size min-acc max-acc min-age max-age - -具体来说,区域大小的字节数( `min-size` 和 `max-size` ),访问频率的每聚合区间的监测访问次 -数( `min-acc` 和 `max-acc` ),区域年龄的聚合区间数( `min-age` 和 `max-age` )都被指定。 -请注意,这些范围是封闭区间。 - -动作 -~~~~ - -```` 是一个预定义的内存管理动作的整数,DAMON将应用于具有目标访问模式的区域。支持 -的数字和它们的含义如下:: - - - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - - 1: Call ``madvise()`` for the region with ``MADV_COLD`` - - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` - - 5: Do nothing but count the statistics - -配额 -~~~~ - -每个 ``动作`` 的最佳 ``目标访问模式`` 取决于工作负载,所以不容易找到。更糟糕的是,将某个 -动作的方案设置得过于激进会导致严重的开销。为了避免这种开销,用户可以通过下面表格中的 ```` -来限制方案的时间和大小配额:: - - - -这使得DAMON在 ```` 毫秒内,尽量只用 ```` 毫秒的时间对 ``目标访 -问模式`` 的内存区域应用动作,并在 ```` 内只对最多字节的内存区域应 -用动作。将 ```` 和 ```` 都设置为零,可以禁用配额限制。 - -当预计超过配额限制时,DAMON会根据 ``目标访问模式`` 的大小、访问频率和年龄,对发现的内存 -区域进行优先排序。为了实现个性化的优先级,用户可以在 ``<优先级权重>`` 中设置这三个属性的 -权重,具体形式如下:: - - - -水位 -~~~~ - -有些方案需要根据系统特定指标的当前值来运行,如自由内存比率。对于这种情况,用户可以为该条 -件指定水位。:: - - - -```` 是一个预定义的整数,用于要检查的度量。支持的数字和它们的含义如下。 - - - 0: 忽视水位 - - 1: 系统空闲内存率 (千分比) - -每隔 ``<检查间隔>`` 微秒检查一次公制的值。 - -如果该值高于 ``<高标>`` 或低于 ``<低标>`` ,该方案被停用。如果该值低于 ``<中标>`` , -该方案将被激活。 - -统计数据 -~~~~~~~~ - -它还统计每个方案被尝试应用的区域的总数量和字节数,每个方案被成功应用的区域的两个数量,以 -及超过配额限制的总数量。这些统计数据可用于在线分析或调整方案。 - -统计数据可以通过读取方案文件来显示。读取该文件将显示你在每一行中输入的每个 ``方案`` , -统计的五个数字将被加在每一行的末尾。 - -例子 -~~~~ - -下面的命令应用了一个方案:”如果一个大小为[4KiB, 8KiB]的内存区域在[10, 20]的聚合时间 -间隔内显示出每一个聚合时间间隔[0, 5]的访问量,请分页出该区域。对于分页,每秒最多只能使 -用10ms,而且每秒分页不能超过1GiB。在这一限制下,首先分页出具有较长年龄的内存区域。另外, -每5秒钟检查一次系统的可用内存率,当可用内存率低于50%时开始监测和分页,但如果可用内存率 -大于60%,或低于30%,则停止监测“:: - - # 
cd /damon - # scheme="4096 8192 0 5 10 20 2" # target access pattern and action - # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas - # scheme+=" 0 0 100" # prioritization weights - # scheme+=" 1 5000000 600 500 300" # watermarks - # echo "$scheme" > schemes - - -开关 ----- - -除非你明确地启动监测,否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on_DEPRECATED`` -文件来启动、停止和检查监测的当前状态。写入 ``on`` 该文件可以启动对有属性的目标的监测。写入 -``off`` 该文件则停止这些目标。如果每个目标进程被终止,DAMON也会停止。下面的示例命令开启、关 -闭和检查DAMON的状态:: - - # cd /damon - # echo on > monitor_on_DEPRECATED - # echo off > monitor_on_DEPRECATED - # cat monitor_on_DEPRECATED - off - -请注意,当监测开启时,你不能写到上述的debugfs文件。如果你在DAMON运行时写到这些文件,将会返 -回一个错误代码,如 ``-EBUSY`` 。 - - -监测线程PID ------------ - -DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以通过读取 ``kdamond_pid`` 文件获 -得该线程的 ``pid`` 。当监测被 ``关闭`` 时,读取该文件不会返回任何信息:: - - # cd /damon - # cat monitor_on_DEPRECATED - off - # cat kdamond_pid - none - # echo on > monitor_on_DEPRECATED - # cat kdamond_pid - 18594 - - -使用多个监测线程 ----------------- - -每个监测上下文都会创建一个 ``kdamond`` 线程。你可以使用 ``mk_contexts`` 和 ``rm_contexts`` -文件为多个 ``kdamond`` 需要的用例创建和删除监测上下文。 - -将新上下文的名称写入 ``mk_contexts`` 文件,在 ``DAMON debugfs`` 目录上创建一个该名称的目录。 -该目录将有该上下文的 ``DAMON debugfs`` 文件:: - - # cd /damon - # ls foo - # ls: cannot access 'foo': No such file or directory - # echo foo > mk_contexts - # ls foo - # attrs init_regions kdamond_pid schemes target_ids - -如果不再需要上下文,你可以通过把上下文的名字放到 ``rm_contexts`` 文件中来删除它和相应的目录:: - - # echo foo > rm_contexts - # ls foo - # ls: cannot access 'foo': No such file or directory - -注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目录下。 - 监测结果的监测点 ================ diff --git a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst index fbbbbad59ee4..d3fd4f850793 100644 --- a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst @@ -26,12 +26,7 @@ DAMON 爲不同的用戶提供了下面這些接口。 使用它,用戶可以通過讀取和寫入特殊的sysfs文件來使用DAMON的主要功能。因此,你可以編寫和使 用你個性化的DAMON sysfs包裝程序,代替你讀/寫sysfs文件。 `DAMON用戶空間工具 `_ 就是這種程序的一個例子 它同時支持虛擬和物理地址 - 空間的監測。注意,這個界面只提供簡單的監測結果 :ref:`統計 `。對於詳細的監測 - 結果,DAMON提供了一個:ref:`跟蹤點 `。 -- *debugfs interface.* - :ref:`這 ` 幾乎與:ref:`sysfs interface ` 接 - 口相同。這將在下一個LTS內核發佈後被移除,所以用戶應該轉移到 - :ref:`sysfs interface `。 + 空間的監測。 - *內核空間編程接口。* :doc:`這 ` 這是爲內核空間程序員準備的。使用它,用戶可以通過爲你編寫內 核空間的DAMON應用程序,最靈活有效地利用DAMON的每一個功能。你甚至可以爲各種地址空間擴展DAMON。 @@ -335,247 +330,6 @@ tried_regions// 請注意,我們強烈建議使用用戶空間的工具,如 `damo `_ , 而不是像上面那樣手動讀寫文件。以上只是一個例子。 -debugfs接口 -=========== - -.. note:: - - DAMON debugfs接口將在下一個LTS內核發佈後被移除,所以用戶應該轉移到 - :ref:`sysfs接口`。 - -DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和 -``rm_contexts`` under its debugfs directory, ``/damon/``. 
- - -屬性 ----- - -用戶可以通過讀取和寫入 ``attrs`` 文件獲得和設置 ``採樣間隔`` 、 ``聚集間隔`` 、 ``更新間隔`` -以及監測目標區域的最小/最大數量。要詳細瞭解監測屬性,請參考 `:doc:/mm/damon/design` 。例如, -下面的命令將這些值設置爲5ms、100ms、1000ms、10和1000,然後再次檢查:: - - # cd /damon - # echo 5000 100000 1000000 10 1000 > attrs - # cat attrs - 5000 100000 1000000 10 1000 - - -目標ID ------- - -一些類型的地址空間支持多個監測目標。例如,虛擬內存地址空間的監測可以有多個進程作爲監測目標。用戶 -可以通過寫入目標的相關id值來設置目標,並通過讀取 ``target_ids`` 文件來獲得當前目標的id。在監 -測虛擬地址空間的情況下,這些值應該是監測目標進程的pid。例如,下面的命令將pid爲42和4242的進程設 -爲監測目標,並再次檢查:: - - # cd /damon - # echo 42 4242 > target_ids - # cat target_ids - 42 4242 - -用戶還可以通過在文件中寫入一個特殊的關鍵字 "paddr\n" 來監測系統的物理內存地址空間。因爲物理地 -址空間監測不支持多個目標,讀取文件會顯示一個假值,即 ``42`` ,如下圖所示:: - - # cd /damon - # echo paddr > target_ids - # cat target_ids - 42 - -請注意,設置目標ID並不啓動監測。 - - -初始監測目標區域 ----------------- - -在虛擬地址空間監測的情況下,DAMON自動設置和更新監測的目標區域,這樣就可以覆蓋目標進程的整個 -內存映射。然而,用戶可能希望將監測區域限制在特定的地址範圍內,如堆、棧或特定的文件映射區域。 -或者,一些用戶可以知道他們工作負載的初始訪問模式,因此希望爲“自適應區域調整”設置最佳初始區域。 - -相比之下,DAMON在物理內存監測的情況下不會自動設置和更新監測目標區域。因此,用戶應該自己設置 -監測目標區域。 - -在這種情況下,用戶可以通過在 ``init_regions`` 文件中寫入適當的值,明確地設置他們想要的初 -始監測目標區域。輸入應該是一個由三個整數組成的隊列,用空格隔開,代表一個區域的形式如下:: - - - -目標idx應該是 ``target_ids`` 文件中目標的索引,從 ``0`` 開始,區域應該按照地址順序傳遞。 -例如,下面的命令將設置幾個地址範圍, ``1-100`` 和 ``100-200`` 作爲pid 42的初始監測目標 -區域,這是 ``target_ids`` 中的第一個(索引 ``0`` ),另外幾個地址範圍, ``20-40`` 和 -``50-100`` 作爲pid 4242的地址,這是 ``target_ids`` 中的第二個(索引 ``1`` ):: - - # cd /damon - # cat target_ids - 42 4242 - # echo "0 1 100 \ - 0 100 200 \ - 1 20 40 \ - 1 50 100" > init_regions - -請注意,這只是設置了初始的監測目標區域。在虛擬內存監測的情況下,DAMON會在一個 ``更新間隔`` -後自動更新區域的邊界。因此,在這種情況下,如果用戶不希望更新的話,應該把 ``更新間隔`` 設 -置得足夠大。 - - -方案 ----- - -對於通常的基於DAMON的數據訪問感知的內存管理優化,用戶只是希望系統對特定訪問模式的內存區域應用內 -存管理操作。DAMON從用戶那裏接收這種形式化的操作方案,並將這些方案應用到目標進程中。 - -用戶可以通過讀取和寫入 ``scheme`` debugfs文件來獲得和設置這些方案。讀取該文件還可以顯示每個 -方案的統計數據。在文件中,每一個方案都應該在每一行中以下列形式表示出來:: - - - -你可以通過簡單地在文件中寫入一個空字符串來禁用方案。 - -目標訪問模式 -~~~~~~~~~~~~ - -``<目標訪問模式>`` 是由三個範圍構成的,形式如下:: - - min-size max-size min-acc max-acc min-age max-age - -具體來說,區域大小的字節數( `min-size` 和 `max-size` ),訪問頻率的每聚合區間的監測訪問次 -數( `min-acc` 和 `max-acc` ),區域年齡的聚合區間數( `min-age` 和 `max-age` )都被指定。 -請注意,這些範圍是封閉區間。 - -動作 -~~~~ - -```` 是一個預定義的內存管理動作的整數,DAMON將應用於具有目標訪問模式的區域。支持 -的數字和它們的含義如下:: - - - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - - 1: Call ``madvise()`` for the region with ``MADV_COLD`` - - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` - - 5: Do nothing but count the statistics - -配額 -~~~~ - -每個 ``動作`` 的最佳 ``目標訪問模式`` 取決於工作負載,所以不容易找到。更糟糕的是,將某個 -動作的方案設置得過於激進會導致嚴重的開銷。爲了避免這種開銷,用戶可以通過下面表格中的 ```` -來限制方案的時間和大小配額:: - - - -這使得DAMON在 ```` 毫秒內,儘量只用 ```` 毫秒的時間對 ``目標訪 -問模式`` 的內存區域應用動作,並在 ```` 內只對最多字節的內存區域應 -用動作。將 ```` 和 ```` 都設置爲零,可以禁用配額限制。 - -當預計超過配額限制時,DAMON會根據 ``目標訪問模式`` 的大小、訪問頻率和年齡,對發現的內存 -區域進行優先排序。爲了實現個性化的優先級,用戶可以在 ``<優先級權重>`` 中設置這三個屬性的 -權重,具體形式如下:: - - - -水位 -~~~~ - -有些方案需要根據系統特定指標的當前值來運行,如自由內存比率。對於這種情況,用戶可以爲該條 -件指定水位。:: - - - -```` 是一個預定義的整數,用於要檢查的度量。支持的數字和它們的含義如下。 - - - 0: 忽視水位 - - 1: 系統空閒內存率 (千分比) - -每隔 ``<檢查間隔>`` 微秒檢查一次公制的值。 - -如果該值高於 ``<高標>`` 或低於 ``<低標>`` ,該方案被停用。如果該值低於 ``<中標>`` , -該方案將被激活。 - -統計數據 -~~~~~~~~ - -它還統計每個方案被嘗試應用的區域的總數量和字節數,每個方案被成功應用的區域的兩個數量,以 -及超過配額限制的總數量。這些統計數據可用於在線分析或調整方案。 - -統計數據可以通過讀取方案文件來顯示。讀取該文件將顯示你在每一行中輸入的每個 ``方案`` , -統計的五個數字將被加在每一行的末尾。 - -例子 -~~~~ - -下面的命令應用了一個方案:”如果一個大小爲[4KiB, 8KiB]的內存區域在[10, 20]的聚合時間 -間隔內顯示出每一個聚合時間間隔[0, 5]的訪問量,請分頁出該區域。對於分頁,每秒最多隻能使 -用10ms,而且每秒分頁不能超過1GiB。在這一限制下,首先分頁出具有較長年齡的內存區域。另外, -每5秒鐘檢查一次系統的可用內存率,當可用內存率低於50%時開始監測和分頁,但如果可用內存率 -大於60%,或低於30%,則停止監測“:: - - # 
cd /damon - # scheme="4096 8192 0 5 10 20 2" # target access pattern and action - # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas - # scheme+=" 0 0 100" # prioritization weights - # scheme+=" 1 5000000 600 500 300" # watermarks - # echo "$scheme" > schemes - - -開關 ----- - -除非你明確地啓動監測,否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on_DEPRECATED`` -文件來啓動、停止和檢查監測的當前狀態。寫入 ``on`` 該文件可以啓動對有屬性的目標的監測。寫入 -``off`` 該文件則停止這些目標。如果每個目標進程被終止,DAMON也會停止。下面的示例命令開啓、關 -閉和檢查DAMON的狀態:: - - # cd /damon - # echo on > monitor_on_DEPRECATED - # echo off > monitor_on_DEPRECATED - # cat monitor_on_DEPRECATED - off - -請注意,當監測開啓時,你不能寫到上述的debugfs文件。如果你在DAMON運行時寫到這些文件,將會返 -回一個錯誤代碼,如 ``-EBUSY`` 。 - - -監測線程PID ------------ - -DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以通過讀取 ``kdamond_pid`` 文件獲 -得該線程的 ``pid`` 。當監測被 ``關閉`` 時,讀取該文件不會返回任何信息:: - - # cd /damon - # cat monitor_on_DEPRECATED - off - # cat kdamond_pid - none - # echo on > monitor_on_DEPRECATED - # cat kdamond_pid - 18594 - - -使用多個監測線程 ----------------- - -每個監測上下文都會創建一個 ``kdamond`` 線程。你可以使用 ``mk_contexts`` 和 ``rm_contexts`` -文件爲多個 ``kdamond`` 需要的用例創建和刪除監測上下文。 - -將新上下文的名稱寫入 ``mk_contexts`` 文件,在 ``DAMON debugfs`` 目錄上創建一個該名稱的目錄。 -該目錄將有該上下文的 ``DAMON debugfs`` 文件:: - - # cd /damon - # ls foo - # ls: cannot access 'foo': No such file or directory - # echo foo > mk_contexts - # ls foo - # attrs init_regions kdamond_pid schemes target_ids - -如果不再需要上下文,你可以通過把上下文的名字放到 ``rm_contexts`` 文件中來刪除它和相應的目錄:: - - # echo foo > rm_contexts - # ls foo - # ls: cannot access 'foo': No such file or directory - -注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目錄下。 - 監測結果的監測點 ================ diff --git a/MAINTAINERS b/MAINTAINERS index 0fa7c5728f1e..ecb70ad017b8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2827,7 +2827,7 @@ ARM/NXP S32G ARCHITECTURE R: Chester Lin R: Matthias Brugger R: Ghennadi Procopciuc -L: NXP S32 Linux Team +R: NXP S32 Linux Team L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm64/boot/dts/freescale/s32g*.dts* @@ -6327,6 +6327,7 @@ F: Documentation/mm/damon/ F: include/linux/damon.h F: include/trace/events/damon.h F: mm/damon/ +F: samples/damon/ F: tools/testing/selftests/damon/ DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER @@ -15068,7 +15069,15 @@ L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: mm/mlock.c F: mm/mmap.c +F: mm/mprotect.c +F: mm/mremap.c +F: mm/mseal.c +F: mm/vma.c +F: mm/vma.h +F: mm/vma_internal.h +F: tools/testing/vma/ MEMORY TECHNOLOGY DEVICES (MTD) M: Miquel Raynal @@ -16600,8 +16609,8 @@ F: arch/nios2/ NITRO ENCLAVES (NE) M: Alexandru Ciobotaru +R: The AWS Nitro Enclaves Team L: linux-kernel@vger.kernel.org -L: The AWS Nitro Enclaves Team S: Supported W: https://aws.amazon.com/ec2/nitro/nitro-enclaves/ F: Documentation/virt/ne_overview.rst @@ -16612,8 +16621,8 @@ F: samples/nitro_enclaves/ NITRO SECURE MODULE (NSM) M: Alexander Graf +R: The AWS Nitro Enclaves Team L: linux-kernel@vger.kernel.org -L: The AWS Nitro Enclaves Team S: Supported W: https://aws.amazon.com/ec2/nitro/nitro-enclaves/ F: drivers/misc/nsm.c @@ -18425,8 +18434,8 @@ M: Fabio Estevam M: Shawn Guo M: Jacky Bai R: Pengutronix Kernel Team +R: NXP S32 Linux Team L: linux-gpio@vger.kernel.org -L: NXP S32 Linux Team S: Maintained F: Documentation/devicetree/bindings/pinctrl/fsl,* F: Documentation/devicetree/bindings/pinctrl/nxp,s32* @@ -19561,7 +19570,7 @@ F: drivers/ras/amd/fmpm.c RASPBERRY PI PISP BACK END M: Jacopo Mondi -L: Raspberry Pi Kernel 
Maintenance +R: Raspberry Pi Kernel Maintenance L: linux-media@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/media/raspberrypi,pispbe.yaml @@ -25018,21 +25027,6 @@ F: include/uapi/linux/vsockmon.h F: net/vmw_vsock/ F: tools/testing/vsock/ -VMA -M: Andrew Morton -M: Liam R. Howlett -M: Lorenzo Stoakes -R: Vlastimil Babka -R: Jann Horn -L: linux-mm@kvack.org -S: Maintained -W: https://www.linux-mm.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm -F: mm/vma.c -F: mm/vma.h -F: mm/vma_internal.h -F: tools/testing/vma/ - VMALLOC M: Andrew Morton R: Uladzislau Rezki diff --git a/arch/alpha/kernel/core_cia.c b/arch/alpha/kernel/core_cia.c index ca3d9c732b61..6e577228e175 100644 --- a/arch/alpha/kernel/core_cia.c +++ b/arch/alpha/kernel/core_cia.c @@ -331,10 +331,7 @@ cia_prepare_tbia_workaround(int window) long i; /* Use minimal 1K map. */ - ppte = memblock_alloc(CIA_BROKEN_TBIA_SIZE, 32768); - if (!ppte) - panic("%s: Failed to allocate %u bytes align=0x%x\n", - __func__, CIA_BROKEN_TBIA_SIZE, 32768); + ppte = memblock_alloc_or_panic(CIA_BROKEN_TBIA_SIZE, 32768); pte = (virt_to_phys(ppte) >> (PAGE_SHIFT - 1)) | 1; for (i = 0; i < CIA_BROKEN_TBIA_SIZE / sizeof(unsigned long); ++i) diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index b22248044bf0..b1bfbd11980d 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -81,10 +81,7 @@ mk_resource_name(int pe, int port, char *str) char *name; sprintf(tmp, "PCI %s PE %d PORT %d", str, pe, port); - name = memblock_alloc(strlen(tmp) + 1, SMP_CACHE_BYTES); - if (!name) - panic("%s: Failed to allocate %zu bytes\n", __func__, - strlen(tmp) + 1); + name = memblock_alloc_or_panic(strlen(tmp) + 1, SMP_CACHE_BYTES); strcpy(name, tmp); return name; @@ -119,10 +116,7 @@ alloc_io7(unsigned int pe) return NULL; } - io7 = memblock_alloc(sizeof(*io7), SMP_CACHE_BYTES); - if (!io7) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*io7)); + io7 = memblock_alloc_or_panic(sizeof(*io7), SMP_CACHE_BYTES); io7->pe = pe; raw_spin_lock_init(&io7->irq_lock); diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 4458eb7f44f0..8e9b4ac86b7e 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -391,10 +391,7 @@ alloc_pci_controller(void) { struct pci_controller *hose; - hose = memblock_alloc(sizeof(*hose), SMP_CACHE_BYTES); - if (!hose) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*hose)); + hose = memblock_alloc_or_panic(sizeof(*hose), SMP_CACHE_BYTES); *hose_tail = hose; hose_tail = &hose->next; @@ -405,13 +402,7 @@ alloc_pci_controller(void) struct resource * __init alloc_resource(void) { - void *ptr = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); - - if (!ptr) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct resource)); - - return ptr; + return memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES); } diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 7fcf3e9b7103..681f56089d9c 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -71,14 +71,8 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base, if (align < mem_size) align = mem_size; - arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES); - if (!arena) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*arena)); - arena->ptes = memblock_alloc(mem_size, align); - if (!arena->ptes) - panic("%s: Failed to allocate %lu 
bytes align=0x%lx\n", - __func__, mem_size, align); + arena = memblock_alloc_or_panic(sizeof(*arena), SMP_CACHE_BYTES); + arena->ptes = memblock_alloc_or_panic(mem_size, align); spin_lock_init(&arena->lock); arena->hose = hose; diff --git a/arch/alpha/lib/fpreg.c b/arch/alpha/lib/fpreg.c index 9a238e7536ae..3d32165043f8 100644 --- a/arch/alpha/lib/fpreg.c +++ b/arch/alpha/lib/fpreg.c @@ -10,7 +10,6 @@ #include #include #include -#include #if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67) #define STT(reg,val) asm volatile ("ftoit $f"#reg",%0" : "=r"(val)); diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 4fe618446e4c..61c2198b1359 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -42,7 +42,7 @@ pgd_alloc(struct mm_struct *mm) { pgd_t *ret, *init; - ret = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + ret = __pgd_alloc(mm, 0); init = pgd_offset(&init_mm, 0UL); if (ret) { #ifdef CONFIG_ALPHA_LARGE_VMALLOC diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 096b8ef58edb..dfae070fe8d5 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -53,19 +53,14 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *ret = (pgd_t *) __get_free_page(GFP_KERNEL); + pgd_t *ret = __pgd_alloc(mm, 0); if (ret) { int num, num2; - num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE; - memzero(ret, num * sizeof(pgd_t)); + num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE; num2 = VMALLOC_SIZE / PGDIR_SIZE; memcpy(ret + num, swapper_pg_dir + num, num2 * sizeof(pgd_t)); - - memzero(ret + num + num2, - (PTRS_PER_PGD - num - num2) * sizeof(pgd_t)); - } return ret; } diff --git a/arch/arc/kernel/unaligned.c b/arch/arc/kernel/unaligned.c index d2f5ceaaed1b..3b2d8b1bd271 100644 --- a/arch/arc/kernel/unaligned.c +++ b/arch/arc/kernel/unaligned.c @@ -200,7 +200,6 @@ int misaligned_fixup(unsigned long address, struct pt_regs *regs, struct callee_regs *cregs) { struct disasm_state state; - char buf[TASK_COMM_LEN]; /* handle user mode only and only if enabled by sysadmin */ if (!user_mode(regs) || !unaligned_enabled) @@ -212,11 +211,11 @@ int misaligned_fixup(unsigned long address, struct pt_regs *regs, " performance significantly\n. 
To enable further" " logging of such instances, please \n" " echo 0 > /proc/sys/kernel/ignore-unaligned-usertrap\n", - get_task_comm(buf, current), task_pid_nr(current)); + current->comm, task_pid_nr(current)); } else { /* Add rate limiting if it gets down to it */ pr_warn("%s(%d): unaligned access to/from 0x%lx by PC: 0x%lx\n", - get_task_comm(buf, current), task_pid_nr(current), + current->comm, task_pid_nr(current), address, regs->ret); } diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index f40d06ad5d2a..ea4fbe7b17f6 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -26,14 +26,7 @@ #else /* !CONFIG_MMU */ -#include #include - -static inline void __tlb_remove_table(void *_table) -{ - free_page_and_swap_cache((struct page *)_table); -} - #include static inline void @@ -41,8 +34,6 @@ __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { struct ptdesc *ptdesc = page_ptdesc(pte); - pagetable_pte_dtor(ptdesc); - #ifndef CONFIG_ARM_LPAE /* * With the classic ARM MMU, a pte page has two corresponding pmd @@ -61,7 +52,6 @@ __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) #ifdef CONFIG_ARM_LPAE struct ptdesc *ptdesc = virt_to_ptdesc(pmdp); - pagetable_pmd_dtor(ptdesc); tlb_remove_ptdesc(tlb, ptdesc); #endif } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index e6a857bf0ce6..a41c93988d2c 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -880,10 +880,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) */ boot_alias_start = phys_to_idmap(start); if (arm_has_idmap_alias() && boot_alias_start != IDMAP_INVALID_ADDR) { - res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", - __func__, sizeof(*res)); + res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES); res->name = "System RAM (boot alias)"; res->start = boot_alias_start; res->end = phys_to_idmap(res_end); @@ -891,10 +888,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) request_resource(&iomem_resource, res); } - res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*res)); + res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES); res->name = "System RAM"; res->start = start; res->end = res_end; diff --git a/arch/arm/mach-pxa/sharpsl_pm.c b/arch/arm/mach-pxa/sharpsl_pm.c index 0c8d9000df5a..dd930e3a61a4 100644 --- a/arch/arm/mach-pxa/sharpsl_pm.c +++ b/arch/arm/mach-pxa/sharpsl_pm.c @@ -31,10 +31,10 @@ /* * Constants */ -#define SHARPSL_CHARGE_ON_TIME_INTERVAL (msecs_to_jiffies(1*60*1000)) /* 1 min */ -#define SHARPSL_CHARGE_FINISH_TIME (msecs_to_jiffies(10*60*1000)) /* 10 min */ -#define SHARPSL_BATCHK_TIME (msecs_to_jiffies(15*1000)) /* 15 sec */ -#define SHARPSL_BATCHK_TIME_SUSPEND (60*10) /* 10 min */ +#define SHARPSL_CHARGE_ON_TIME_INTERVAL (secs_to_jiffies(60)) +#define SHARPSL_CHARGE_FINISH_TIME (secs_to_jiffies(10*60)) +#define SHARPSL_BATCHK_TIME (secs_to_jiffies(15)) +#define SHARPSL_BATCHK_TIME_SUSPEND (60*10) /* 10 min */ #define SHARPSL_WAIT_CO_TIME 15 /* 15 sec */ #define SHARPSL_WAIT_DISCHARGE_ON 100 /* 100 msec */ diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index f5b7a16c5803..f02f872ea8a9 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -726,13 +726,8 @@ EXPORT_SYMBOL(phys_mem_access_prot); static void __init *early_alloc(unsigned long sz) { - void *ptr = memblock_alloc(sz, sz); 
+ return memblock_alloc_or_panic(sz, sz); - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, sz, sz); - - return ptr; } static void *__init late_alloc(unsigned long sz) @@ -1027,10 +1022,7 @@ void __init iotable_init(struct map_desc *io_desc, int nr) if (!nr) return; - svm = memblock_alloc(sizeof(*svm) * nr, __alignof__(*svm)); - if (!svm) - panic("%s: Failed to allocate %zu bytes align=0x%zx\n", - __func__, sizeof(*svm) * nr, __alignof__(*svm)); + svm = memblock_alloc_or_panic(sizeof(*svm) * nr, __alignof__(*svm)); for (md = io_desc; nr; md++, nr--) { create_mapping(md); @@ -1052,10 +1044,7 @@ void __init vm_reserve_area_early(unsigned long addr, unsigned long size, struct vm_struct *vm; struct static_vm *svm; - svm = memblock_alloc(sizeof(*svm), __alignof__(*svm)); - if (!svm) - panic("%s: Failed to allocate %zu bytes align=0x%zx\n", - __func__, sizeof(*svm), __alignof__(*svm)); + svm = memblock_alloc_or_panic(sizeof(*svm), __alignof__(*svm)); vm = &svm->vm; vm->addr = (void *)addr; diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index c415f3859b20..1a8f6914ee59 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -162,10 +162,7 @@ void __init paging_init(const struct machine_desc *mdesc) mpu_setup(); /* allocate the zero page. */ - zero_page = (void *)memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + zero_page = (void *)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); bootmem_init(); diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index f8e9bc58a84f..4eb81b7ed03a 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -17,11 +17,11 @@ #include "mm.h" #ifdef CONFIG_ARM_LPAE -#define __pgd_alloc() kmalloc_array(PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL) -#define __pgd_free(pgd) kfree(pgd) +#define _pgd_alloc(mm) kmalloc_array(PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL | __GFP_ZERO) +#define _pgd_free(mm, pgd) kfree(pgd) #else -#define __pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL, 2) -#define __pgd_free(pgd) free_pages((unsigned long)pgd, 2) +#define _pgd_alloc(mm) __pgd_alloc(mm, 2) +#define _pgd_free(mm, pgd) __pgd_free(mm, pgd) #endif /* @@ -35,12 +35,10 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *new_pmd, *init_pmd; pte_t *new_pte, *init_pte; - new_pgd = __pgd_alloc(); + new_pgd = _pgd_alloc(mm); if (!new_pgd) goto no_pgd; - memset(new_pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - /* * Copy over the kernel and IO PGD entries */ @@ -134,7 +132,7 @@ no_pmd: no_pud: p4d_free(mm, new_p4d); no_p4d: - __pgd_free(new_pgd); + _pgd_free(mm, new_pgd); no_pgd: return NULL; } @@ -207,5 +205,5 @@ no_pgd: p4d_free(mm, p4d); } #endif - __pgd_free(pgd_base); + _pgd_free(mm, pgd_base); } diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index e75422864d1b..1b4509d3382c 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -85,24 +85,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp) __pgd_populate(pgdp, __pa(p4dp), pgdval); } -static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - gfp_t gfp = GFP_PGTABLE_USER; - - if (mm == &init_mm) - gfp = GFP_PGTABLE_KERNEL; - return (p4d_t *)get_zeroed_page(gfp); -} - -static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) -{ - if (!pgtable_l5_enabled()) - return; - BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - free_page((unsigned long)p4d); -} - -#define __p4d_free_tlb(tlb, p4d, 
addr) p4d_free((tlb)->mm, p4d) #else static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot) { diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index a947c6e784ed..8d762607285c 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -9,12 +9,7 @@ #define __ASM_TLB_H #include -#include -static inline void __tlb_remove_table(void *_table) -{ - free_page_and_swap_cache((struct page *)_table); -} #define tlb_flush tlb_flush static void tlb_flush(struct mmu_gather *tlb); @@ -82,7 +77,6 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, { struct ptdesc *ptdesc = page_ptdesc(pte); - pagetable_pte_dtor(ptdesc); tlb_remove_ptdesc(tlb, ptdesc); } @@ -92,7 +86,6 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, { struct ptdesc *ptdesc = virt_to_ptdesc(pmdp); - pagetable_pmd_dtor(ptdesc); tlb_remove_ptdesc(tlb, ptdesc); } #endif @@ -106,7 +99,19 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp, if (!pgtable_l4_enabled()) return; - pagetable_pud_dtor(ptdesc); + tlb_remove_ptdesc(tlb, ptdesc); +} +#endif + +#if CONFIG_PGTABLE_LEVELS > 4 +static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4dp, + unsigned long addr) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(p4dp); + + if (!pgtable_l5_enabled()) + return; + tlb_remove_ptdesc(tlb, ptdesc); } #endif diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 4f613e8e0745..85104587f849 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -223,9 +223,7 @@ static void __init request_standard_resources(void) num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); - standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES); - if (!standard_resources) - panic("%s: Failed to allocate %zu bytes\n", __func__, res_size); + standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 0c501cabc238..8160cff35089 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -33,7 +33,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) gfp_t gfp = GFP_PGTABLE_USER; if (pgdir_is_page_size()) - return (pgd_t *)__get_free_page(gfp); + return __pgd_alloc(mm, 0); else return kmem_cache_alloc(pgd_cache, gfp); } @@ -41,7 +41,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) void pgd_free(struct mm_struct *mm, pgd_t *pgd) { if (pgdir_is_page_size()) - free_page((unsigned long)pgd); + __pgd_free(mm, pgd); else kmem_cache_free(pgd_cache, pgd); } diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index 9c84c9012e53..bf8400c28b5a 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -44,7 +44,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t *ret; pgd_t *init; - ret = (pgd_t *) __get_free_page(GFP_KERNEL); + ret = __pgd_alloc(mm, 0); if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init((unsigned long *)ret); @@ -63,7 +63,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #define __pte_free_tlb(tlb, pte, address) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc(tlb, page_ptdesc(pte)); \ } while (0) diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index 55988625e6fb..1ee5f5f157ca 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ 
b/arch/hexagon/include/asm/pgalloc.h @@ -22,7 +22,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; - pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + pgd = __pgd_alloc(mm, 0); /* * There may be better ways to do this, but to ensure @@ -89,7 +89,7 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pagetable_pte_dtor((page_ptdesc(pte))); \ + pagetable_dtor((page_ptdesc(pte))); \ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 4dffc90192f7..1cc6e8843680 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -113,7 +113,10 @@ CONFIG_ZBUD=y CONFIG_ZSMALLOC=m # CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y +# CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE is not set +CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO=y +# CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL is not set +# CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE is not set CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index a7b9c9e73593..7211dff8c969 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -57,7 +57,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) #define __pte_free_tlb(tlb, pte, address) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 56934fe58170..edcfdfcad7d2 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -431,7 +431,7 @@ static void __init resource_init(void) num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); - standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES); + standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 188b52bbb254..ca5aa5f46a9f 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -174,9 +174,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) pmd_t *pmd; if (p4d_none(p4dp_get(p4d))) { - pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pud) - panic("%s: Failed to allocate memory\n", __func__); + pud = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); p4d_populate(&init_mm, p4d, pud); #ifndef __PAGETABLE_PUD_FOLDED pud_init(pud); @@ -185,9 +183,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) pud = pud_offset(p4d, addr); if (pud_none(pudp_get(pud))) { - pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pmd) - panic("%s: Failed to allocate memory\n", __func__); + pmd = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pud_populate(&init_mm, pud, pmd); #ifndef __PAGETABLE_PMD_FOLDED pmd_init(pmd); @@ -198,10 +194,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) if (!pmd_present(pmdp_get(pmd))) { pte_t *pte; - pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("%s: Failed to allocate memory\n", __func__); - + pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(&init_mm, pmd, pte); kernel_pte_init(pte); } diff 
--git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 3fa69b23ff84..22a94bb3e6e8 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -23,11 +23,10 @@ EXPORT_SYMBOL(tlb_virt_to_page); pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *init, *ret = NULL; - struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); + pgd_t *init, *ret; - if (ptdesc) { - ret = (pgd_t *)ptdesc_address(ptdesc); + ret = __pgd_alloc(mm, 0); + if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init(ret); memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index c705247e7b5b..581f0080814e 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -629,7 +629,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 6d62b9187a58..25628a1e8fa1 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -586,7 +586,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index c3c644df852d..503a9ea526b5 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -606,7 +606,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 20261f819691..3358349898ef 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -578,7 +578,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index ce4fe93a0f70..a5e933a7fdf9 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -588,7 +588,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 040ae75f47c3..a90676c04da6 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -605,7 +605,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 20d877cb4e30..f28f7783b090 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -692,7 +692,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 5e1c8d0d3da5..61308c2cd96c 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ 
b/arch/m68k/configs/mvme147_defconfig @@ -578,7 +578,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 5d1409e6a137..9ec8eb9ea614 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -579,7 +579,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index e4c30e2b9bbb..5fd094391b9e 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -595,7 +595,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 980843a9ea1e..5e9c9d704c2e 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -575,7 +575,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 38681cc6b598..b2f5f398fe42 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -576,7 +576,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 302c5bf67179..4c648b51e7fd 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -37,7 +37,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable, { struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } @@ -61,7 +61,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable) { struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } @@ -73,7 +73,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - pagetable_free(virt_to_ptdesc(pgd)); + pagetable_dtor_free(virt_to_ptdesc(pgd)); } static inline pgd_t *pgd_alloc(struct mm_struct *mm) @@ -84,6 +84,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) if (!ptdesc) return NULL; + pagetable_pgd_ctor(ptdesc); new_pgd = ptdesc_address(ptdesc); memcpy(new_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t)); diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index 74a817d9387f..5abe7da8ac5a 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -9,9 +9,9 @@ extern void mmu_page_ctor(void *page); extern void mmu_page_dtor(void *page); enum m68k_table_types { - TABLE_PGD = 0, - TABLE_PMD = 0, /* same size as PGD */ - TABLE_PTE = 1, + TABLE_PGD, + TABLE_PMD, + TABLE_PTE, }; extern void init_pointer_table(void *table, int type); diff --git a/arch/m68k/include/asm/sun3_pgalloc.h 
b/arch/m68k/include/asm/sun3_pgalloc.h index 4a137eecb6fe..f1ae4ed890db 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -19,7 +19,7 @@ extern const char bad_pmd_string[]; #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) @@ -43,7 +43,7 @@ static inline pgd_t * pgd_alloc(struct mm_struct *mm) { pgd_t *new_pgd; - new_pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); + new_pgd = __pgd_alloc(mm, 0); memcpy(new_pgd, swapper_pg_dir, PAGE_SIZE); memset(new_pgd, 0, (PAGE_OFFSET >> PGDIR_SHIFT)); return new_pgd; diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 1b47bec15832..8b11d0d545aa 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -68,10 +68,7 @@ void __init paging_init(void) high_memory = (void *) end_mem; - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT; free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 9a6fa342e872..19a75029036c 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -42,20 +42,14 @@ void __init paging_init(void) unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; int i; - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pg_dir = swapper_pg_dir; memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir)); size = num_pages * sizeof(pte_t); size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1); - next_pgtable = (unsigned long) memblock_alloc(size, PAGE_SIZE); - if (!next_pgtable) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, PAGE_SIZE); + next_pgtable = (unsigned long) memblock_alloc_or_panic(size, PAGE_SIZE); pg_dir += PAGE_OFFSET >> PGDIR_SHIFT; diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index c1761d309fc6..73651e093c4d 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -97,17 +97,19 @@ void mmu_page_dtor(void *page) typedef struct list_head ptable_desc; -static struct list_head ptable_list[2] = { +static struct list_head ptable_list[3] = { LIST_HEAD_INIT(ptable_list[0]), LIST_HEAD_INIT(ptable_list[1]), + LIST_HEAD_INIT(ptable_list[2]), }; #define PD_PTABLE(page) ((ptable_desc *)&(virt_to_page((void *)(page))->lru)) #define PD_PAGE(ptable) (list_entry(ptable, struct page, lru)) #define PD_MARKBITS(dp) (*(unsigned int *)&PD_PAGE(dp)->index) -static const int ptable_shift[2] = { - 7+2, /* PGD, PMD */ +static const int ptable_shift[3] = { + 7+2, /* PGD */ + 7+2, /* PMD */ 6+2, /* PTE */ }; @@ -156,12 +158,20 @@ void *get_pointer_table(int type) if (!(page = (void *)get_zeroed_page(GFP_KERNEL))) return NULL; - if (type == TABLE_PTE) { + switch (type) { + case TABLE_PTE: /* * m68k doesn't have SPLIT_PTE_PTLOCKS for not having * SMP. 
*/ pagetable_pte_ctor(virt_to_ptdesc(page)); + break; + case TABLE_PMD: + pagetable_pmd_ctor(virt_to_ptdesc(page)); + break; + case TABLE_PGD: + pagetable_pgd_ctor(virt_to_ptdesc(page)); + break; } mmu_page_ctor(page); @@ -200,8 +210,7 @@ int free_pointer_table(void *table, int type) /* all tables in page are free, free page */ list_del(dp); mmu_page_dtor((void *)page); - if (type == TABLE_PTE) - pagetable_pte_dtor(virt_to_ptdesc((void *)page)); + pagetable_dtor(virt_to_ptdesc((void *)page)); free_page (page); return 1; } else if (ptable_list[type].next != dp) { @@ -491,10 +500,7 @@ void __init paging_init(void) * initialize the bad page table and bad page to point * to a couple of allocated pages */ - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); /* * Set up SFC/DFC registers diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index 494739c1783e..1ecf6bdd08bf 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -44,10 +44,7 @@ void __init paging_init(void) unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; unsigned long size; - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); address = PAGE_OFFSET; pg_dir = swapper_pg_dir; @@ -57,10 +54,7 @@ void __init paging_init(void) size = num_pages * sizeof(pte_t); size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1); - next_pgtable = (unsigned long)memblock_alloc(size, PAGE_SIZE); - if (!next_pgtable) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, PAGE_SIZE); + next_pgtable = (unsigned long)memblock_alloc_or_panic(size, PAGE_SIZE); bootmem_end = (next_pgtable + size + PAGE_SIZE) & PAGE_MASK; /* Map whole memory from PAGE_OFFSET (0x0E000000) */ diff --git a/arch/m68k/sun3/sun3dvma.c b/arch/m68k/sun3/sun3dvma.c index 6ebf52740ad7..225fc735e466 100644 --- a/arch/m68k/sun3/sun3dvma.c +++ b/arch/m68k/sun3/sun3dvma.c @@ -252,12 +252,8 @@ void __init dvma_init(void) list_add(&(hole->list), &hole_list); - iommu_use = memblock_alloc(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long), + iommu_use = memblock_alloc_or_panic(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long), SMP_CACHE_BYTES); - if (!iommu_use) - panic("%s: Failed to allocate %zu bytes\n", __func__, - IOMMU_TOTAL_ENTRIES * sizeof(unsigned long)); - dvma_unmap_iommu(DVMA_START, DVMA_SIZE); sun3_dvma_init(); diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index 6c33b05f730f..084a8a0dc239 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -21,12 +21,7 @@ extern void __bad_pte(pmd_t *pmd); -static inline pgd_t *get_pgd(void) -{ - return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0); -} - -#define pgd_alloc(mm) get_pgd() +#define pgd_alloc(mm) __pgd_alloc(mm, 0) extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index f4440edcd8fe..26c7a6ede983 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -15,7 +15,6 @@ #define __HAVE_ARCH_PMD_ALLOC_ONE #define __HAVE_ARCH_PUD_ALLOC_ONE -#define __HAVE_ARCH_PGD_FREE #include static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t 
*pmd, @@ -49,14 +48,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) extern void pgd_init(void *addr); extern pgd_t *pgd_alloc(struct mm_struct *mm); -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - pagetable_free(virt_to_ptdesc(pgd)); -} - #define __pte_free_tlb(tlb, pte, address) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 12a1a4ffb602..fbfe0771317e 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -704,10 +704,7 @@ static void __init resource_init(void) for_each_mem_range(i, &start, &end) { struct resource *res; - res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct resource)); + res = memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES); res->start = start; /* diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 4c8e3c0aa210..75c9d3618f58 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -97,11 +98,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return -EINTR; if (IS_ENABLED(CONFIG_MIPS_FP_SUPPORT)) { + unsigned long unused; + /* Map delay slot emulation page */ - base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, - VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - 0, NULL); + base = do_mmap(NULL, STACK_TOP, PAGE_SIZE, PROT_READ | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, 0, 0, &unused, + NULL); if (IS_ERR_VALUE(base)) { ret = base; goto out; diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c index 1506e458040d..10835414819f 100644 --- a/arch/mips/mm/pgtable.c +++ b/arch/mips/mm/pgtable.c @@ -10,12 +10,10 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *init, *ret = NULL; - struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, - PGD_TABLE_ORDER); + pgd_t *init, *ret; - if (ptdesc) { - ret = ptdesc_address(ptdesc); + ret = __pgd_alloc(mm, PGD_TABLE_ORDER); + if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init(ret); memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index ce6bb8e74271..12a536b7bfbd 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -30,7 +30,7 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm); #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) diff --git a/arch/nios2/mm/pgtable.c b/arch/nios2/mm/pgtable.c index 7c76e8a7447a..6470ed378782 100644 --- a/arch/nios2/mm/pgtable.c +++ b/arch/nios2/mm/pgtable.c @@ -11,6 +11,7 @@ #include #include +#include /* pteaddr: * ptbase | vpn* | zero @@ -54,7 +55,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret, *init; - ret = (pgd_t *) __get_free_page(GFP_KERNEL); + ret = __pgd_alloc(mm, 0); if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init(ret); diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index c6a73772a546..3372f4e6ab4b 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -41,15 +41,13 @@ static inline void 
pmd_populate(struct mm_struct *mm, pmd_t *pmd, */ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *ret = (pgd_t *)__get_free_page(GFP_KERNEL); + pgd_t *ret = __pgd_alloc(mm, 0); - if (ret) { - memset(ret, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + if (ret) memcpy(ret + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } return ret; } @@ -68,7 +66,7 @@ extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index f59ea4c10b0f..8e63e86251ca 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -38,10 +38,7 @@ pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm) if (likely(mem_init_done)) { pte = (pte_t *)get_zeroed_page(GFP_KERNEL); } else { - pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } return pte; diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index e3e142b1c5c5..2ca74a56415c 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -11,27 +11,12 @@ #include #define __HAVE_ARCH_PMD_ALLOC_ONE -#define __HAVE_ARCH_PMD_FREE -#define __HAVE_ARCH_PGD_FREE #include /* Allocate the top level pgd (page directory) */ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd; - - pgd = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER); - if (unlikely(pgd == NULL)) - return NULL; - - memset(pgd, 0, PAGE_SIZE << PGD_TABLE_ORDER); - - return pgd; -} - -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - free_pages((unsigned long)pgd, PGD_TABLE_ORDER); + return __pgd_alloc(mm, PGD_TABLE_ORDER); } #if CONFIG_PGTABLE_LEVELS == 3 @@ -46,17 +31,19 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) { - pmd_t *pmd; + struct ptdesc *ptdesc; + gfp_t gfp = GFP_PGTABLE_USER; - pmd = (pmd_t *)__get_free_pages(GFP_PGTABLE_KERNEL, PMD_TABLE_ORDER); - if (likely(pmd)) - memset ((void *)pmd, 0, PAGE_SIZE << PMD_TABLE_ORDER); - return pmd; -} - -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) -{ - free_pages((unsigned long)pmd, PMD_TABLE_ORDER); + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + ptdesc = pagetable_alloc(gfp, PMD_TABLE_ORDER); + if (!ptdesc) + return NULL; + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); + return NULL; + } + return ptdesc_address(ptdesc); } #endif diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 96970fa75e4a..61c0a2477072 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -377,10 +377,8 @@ static void __ref map_pages(unsigned long start_vaddr, #if CONFIG_PGTABLE_LEVELS == 3 if (pud_none(*pud)) { - pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER, + pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER, PAGE_SIZE << PMD_TABLE_ORDER); - if (!pmd) - panic("pmd allocation failed.\n"); pud_populate(NULL, pud, pmd); } #endif @@ -388,9 +386,7 @@ static void __ref map_pages(unsigned long start_vaddr, pmd = pmd_offset(pud, vaddr); for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++, pmd++) { if (pmd_none(*pmd)) { 
- pg_table = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pg_table) - panic("page table allocation failed\n"); + pg_table = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(NULL, pmd, pg_table); } @@ -648,9 +644,7 @@ static void __init pagetable_init(void) } #endif - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("zero page allocation failed.\n"); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } @@ -687,19 +681,15 @@ static void __init fixmap_init(void) #if CONFIG_PGTABLE_LEVELS == 3 if (pud_none(*pud)) { - pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER, + pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER, PAGE_SIZE << PMD_TABLE_ORDER); - if (!pmd) - panic("fixmap: pmd allocation failed.\n"); pud_populate(NULL, pud, pmd); } #endif pmd = pmd_offset(pud, addr); do { - pte_t *pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("fixmap: pte allocation failed.\n"); + pte_t *pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(&init_mm, pmd, pte); diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index f39c0d000c43..bc48063fd860 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -451,7 +451,6 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m -CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 1ca7d4c4b90d..2058e8d3e013 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -37,6 +37,7 @@ extern void tlb_flush(struct mmu_gather *tlb); */ #define tlb_needs_table_invalidate() radix_enabled() +#define __HAVE_ARCH_TLB_REMOVE_TABLE /* Get the generic bits... 
*/ #include diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 1bee15c013e7..3af6c06af02f 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -1087,12 +1087,10 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char /* Count and allocate space for cpu features */ of_scan_flat_dt_subnodes(node, count_cpufeatures_subnodes, &nr_dt_cpu_features); - dt_cpu_features = memblock_alloc(sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, PAGE_SIZE); - if (!dt_cpu_features) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, - sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, - PAGE_SIZE); + dt_cpu_features = + memblock_alloc_or_panic( + sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, + PAGE_SIZE); cpufeatures_setup_start(isa); diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index ce0c8623e563..f8a3bd8cfae4 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -213,11 +213,8 @@ pci_create_OF_bus_map(void) struct property* of_prop; struct device_node *dn; - of_prop = memblock_alloc(sizeof(struct property) + 256, + of_prop = memblock_alloc_or_panic(sizeof(struct property) + 256, SMP_CACHE_BYTES); - if (!of_prop) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct property) + 256); dn = of_find_node_by_path("/"); if (dn) { memset(of_prop, -1, sizeof(struct property) + 256); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 6fa179448c33..f3ea1329c566 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -458,11 +458,8 @@ void __init smp_setup_cpu_maps(void) DBG("smp_setup_cpu_maps()\n"); - cpu_to_phys_id = memblock_alloc(nr_cpu_ids * sizeof(u32), + cpu_to_phys_id = memblock_alloc_or_panic(nr_cpu_ids * sizeof(u32), __alignof__(u32)); - if (!cpu_to_phys_id) - panic("%s: Failed to allocate %zu bytes align=0x%zx\n", - __func__, nr_cpu_ids * sizeof(u32), __alignof__(u32)); for_each_node_by_type(dn, "cpu") { const __be32 *intserv; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 75dbf3e0d9c4..5a1bf501fbe1 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -140,13 +140,7 @@ arch_initcall(ppc_init); static void *__init alloc_stack(void) { - void *ptr = memblock_alloc(THREAD_SIZE, THREAD_ALIGN); - - if (!ptr) - panic("cannot allocate %d bytes for stack at %pS\n", - THREAD_SIZE, (void *)_RET_IP_); - - return ptr; + return memblock_alloc_or_panic(THREAD_SIZE, THREAD_ALIGN); } void __init irqstack_early_init(void) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 25429905ae90..86bff159c51e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4957,7 +4957,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, * states are synchronized from L0 to L1. L1 needs to inform L0 about * MER=1 only when there are pending external interrupts. * In the above if check, MER bit is set if there are pending - * external interrupts. Hence, explicity mask off MER bit + * external interrupts. Hence, explicitly mask off MER bit * here as otherwise it may generate spurious interrupts in L2 KVM * causing an endless loop, which results in L2 guest getting hung. 
*/ diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 6978344edcb4..be9c4106e22f 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -377,10 +377,7 @@ void __init MMU_init_hw(void) * Find some memory for the hash table. */ if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); - Hash = memblock_alloc(Hash_size, Hash_size); - if (!Hash) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, Hash_size, Hash_size); + Hash = memblock_alloc_or_panic(Hash_size, Hash_size); _SDR1 = __pa(Hash) | SDR1_LOW_BITS; pr_info("Total memory = %lldMB; using %ldkB for hash table\n", diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 1715b07c630c..4e1e45420bd4 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -253,7 +253,7 @@ static void pmd_frag_destroy(void *pmd_frag) count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { - pagetable_pmd_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 374542528080..ce64abea9e3e 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -330,11 +330,7 @@ void __init mmu_partition_table_init(void) unsigned long ptcr; /* Initialize the Partition Table with no entries */ - partition_tb = memblock_alloc(patb_size, patb_size); - if (!partition_tb) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, patb_size, patb_size); - + partition_tb = memblock_alloc_or_panic(patb_size, patb_size); ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); set_ptcr_when_no_uv(ptcr); powernv_set_nmmu_ptcr(ptcr); @@ -477,7 +473,7 @@ void pmd_fragment_free(unsigned long *pmd) BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { - pagetable_pmd_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c index 43c03b84ff32..60c78aac0f63 100644 --- a/arch/powerpc/mm/kasan/init_book3e_64.c +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -40,19 +40,19 @@ static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgpr pgdp = pgd_offset_k(ea); p4dp = p4d_offset(pgdp, ea); if (kasan_pud_table(*p4dp)) { - pudp = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + pudp = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE); p4d_populate(&init_mm, p4dp, pudp); } pudp = pud_offset(p4dp, ea); if (kasan_pmd_table(*pudp)) { - pmdp = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + pmdp = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE); pud_populate(&init_mm, pudp, pmdp); } pmdp = pmd_offset(pudp, ea); if (kasan_pte_table(*pmdp)) { - ptep = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + ptep = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmdp, ptep); } @@ -74,7 +74,7 @@ static void __init kasan_init_phys_region(void *start, void *end) k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); k_end = ALIGN((unsigned 
long)kasan_mem_to_shadow(end), PAGE_SIZE); - va = memblock_alloc(k_end - k_start, PAGE_SIZE); + va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE); for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); } diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c index 3fb5ce4f48f4..7d959544c077 100644 --- a/arch/powerpc/mm/kasan/init_book3s_64.c +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -32,7 +32,7 @@ static void __init kasan_init_phys_region(void *start, void *end) k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); - va = memblock_alloc(k_end - k_start, PAGE_SIZE); + va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE); for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); } diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c index 0b181da40ddb..a1a4e697251a 100644 --- a/arch/powerpc/mm/nohash/mmu_context.c +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -385,21 +385,11 @@ void __init mmu_context_init(void) /* * Allocate the maps used by context management */ - context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!context_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1), + context_map = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES); + context_mm = memblock_alloc_or_panic(sizeof(void *) * (LAST_CONTEXT + 1), SMP_CACHE_BYTES); - if (!context_mm) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(void *) * (LAST_CONTEXT + 1)); if (IS_ENABLED(CONFIG_SMP)) { - stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!stale_map[boot_cpuid]) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - + stale_map[boot_cpuid] = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES); cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE, "powerpc/mmu/ctx:prepare", mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead); diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index e89f64a0f24a..713268ccb1a0 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -25,7 +25,7 @@ void pte_frag_destroy(void *pte_frag) count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } } @@ -111,7 +111,7 @@ static void pte_free_now(struct rcu_head *head) struct ptdesc *ptdesc; ptdesc = container_of(head, struct ptdesc, pt_rcu_head); - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 787b22206386..15276068f657 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -50,13 +50,8 @@ notrace void __init early_ioremap_init(void) void __init *early_alloc_pgtable(unsigned long size) { - void *ptr = memblock_alloc(size, size); + return memblock_alloc_or_panic(size, size); - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, size); - - return ptr; } pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va) diff --git 
a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index fe2e0249cbc2..a112d26185a0 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -514,10 +514,7 @@ static int __init core99_nvram_setup(struct device_node *dp, unsigned long addr) printk(KERN_ERR "nvram: no address\n"); return -EINVAL; } - nvram_image = memblock_alloc(NVRAM_SIZE, SMP_CACHE_BYTES); - if (!nvram_image) - panic("%s: Failed to allocate %u bytes\n", __func__, - NVRAM_SIZE); + nvram_image = memblock_alloc_or_panic(NVRAM_SIZE, SMP_CACHE_BYTES); nvram_data = ioremap(addr, NVRAM_SIZE*2); nvram_naddrs = 1; /* Make sure we get the correct case */ diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 877720c64515..4ac9808e55a4 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -88,26 +88,6 @@ static void flush_dcache_range_chunked(unsigned long start, unsigned long stop, } } -static void memtrace_clear_range(unsigned long start_pfn, - unsigned long nr_pages) -{ - unsigned long pfn; - - /* As HIGHMEM does not apply, use clear_page() directly. */ - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { - if (IS_ALIGNED(pfn, PAGES_PER_SECTION)) - cond_resched(); - clear_page(__va(PFN_PHYS(pfn))); - } - /* - * Before we go ahead and use this range as cache inhibited range - * flush the cache. - */ - flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn), - (unsigned long)pfn_to_kaddr(start_pfn + nr_pages), - FLUSH_CHUNK_SIZE); -} - static u64 memtrace_alloc_node(u32 nid, u64 size) { const unsigned long nr_pages = PHYS_PFN(size); @@ -119,17 +99,18 @@ static u64 memtrace_alloc_node(u32 nid, u64 size) * by alloc_contig_pages(). */ page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE | - __GFP_NOWARN, nid, NULL); + __GFP_NOWARN | __GFP_ZERO, nid, NULL); if (!page) return 0; start_pfn = page_to_pfn(page); /* - * Clear the range while we still have a linear mapping. - * - * TODO: use __GFP_ZERO with alloc_contig_pages() once supported. + * Before we go ahead and use this range as cache inhibited range + * flush the cache. */ - memtrace_clear_range(start_pfn, nr_pages); + flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn), + (unsigned long)pfn_to_kaddr(start_pfn + nr_pages), + FLUSH_CHUNK_SIZE); /* * Set pages PageOffline(), to indicate that nobody (e.g., hibernation, diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 5d0f35bb917e..09bd93464b4f 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -180,10 +180,7 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node, /* * Allocate a buffer to hold the MC recoverable ranges. 
*/ - mc_recoverable_range = memblock_alloc(size, __alignof__(u64)); - if (!mc_recoverable_range) - panic("%s: Failed to allocate %u bytes align=0x%lx\n", - __func__, size, __alignof__(u64)); + mc_recoverable_range = memblock_alloc_or_panic(size, __alignof__(u64)); for (i = 0; i < mc_recoverable_range_len; i++) { mc_recoverable_range[i].start_addr = diff --git a/arch/powerpc/platforms/ps3/setup.c b/arch/powerpc/platforms/ps3/setup.c index 5144f11359f7..150c09b58ae8 100644 --- a/arch/powerpc/platforms/ps3/setup.c +++ b/arch/powerpc/platforms/ps3/setup.c @@ -115,10 +115,7 @@ static void __init prealloc(struct ps3_prealloc *p) if (!p->size) return; - p->address = memblock_alloc(p->size, p->align); - if (!p->address) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, p->size, p->align); + p->address = memblock_alloc_or_panic(p->size, p->align); printk(KERN_INFO "%s: %lu bytes at %p\n", p->name, p->size, p->address); diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index f84ac9fbe203..f7c9271bda58 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -544,7 +544,7 @@ static int drc_pmem_query_health(struct papr_scm_priv *p) /* Jiffies offset for which the health data is assumed to be same */ cache_timeout = p->lasthealth_jiffies + - msecs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL * 1000); + secs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL); /* Fetch new health info is its older than MIN_HEALTH_QUERY_INTERVAL */ if (time_after(jiffies, cache_timeout)) diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index 0b6e37f3ffb8..456a4f64ae0a 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -124,10 +124,7 @@ int __ref msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count, if (bmp->bitmap_from_slab) bmp->bitmap = kzalloc(size, GFP_KERNEL); else { - bmp->bitmap = memblock_alloc(size, SMP_CACHE_BYTES); - if (!bmp->bitmap) - panic("%s: Failed to allocate %u bytes\n", __func__, - size); + bmp->bitmap = memblock_alloc_or_panic(size, SMP_CACHE_BYTES); /* the bitmap won't be freed from memblock allocator */ kmemleak_not_leak(bmp->bitmap); } diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index f52264304f77..3e2aebea6312 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -12,16 +12,25 @@ #include #ifdef CONFIG_MMU -#define __HAVE_ARCH_PUD_ALLOC_ONE #define __HAVE_ARCH_PUD_FREE #include +/* + * While riscv platforms with riscv_ipi_for_rfence as true require an IPI to + * perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use + * SBI to perform TLB shootdown. To keep software pagetable walkers safe in this + * case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the + * comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h + * for more details. 
+ */ static inline void riscv_tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt) { - if (riscv_use_sbi_for_rfence()) + if (riscv_use_sbi_for_rfence()) { tlb_remove_ptdesc(tlb, pt); - else + } else { + pagetable_dtor(pt); tlb_remove_page_ptdesc(tlb, pt); + } } static inline void pmd_populate_kernel(struct mm_struct *mm, @@ -88,15 +97,6 @@ static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, } } -#define pud_alloc_one pud_alloc_one -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - if (pgtable_l4_enabled) - return __pud_alloc_one(mm, addr); - - return NULL; -} - #define pud_free pud_free static inline void pud_free(struct mm_struct *mm, pud_t *pud) { @@ -107,39 +107,8 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long addr) { - if (pgtable_l4_enabled) { - struct ptdesc *ptdesc = virt_to_ptdesc(pud); - - pagetable_pud_dtor(ptdesc); - riscv_tlb_remove_ptdesc(tlb, ptdesc); - } -} - -#define p4d_alloc_one p4d_alloc_one -static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - if (pgtable_l5_enabled) { - gfp_t gfp = GFP_PGTABLE_USER; - - if (mm == &init_mm) - gfp = GFP_PGTABLE_KERNEL; - return (p4d_t *)get_zeroed_page(gfp); - } - - return NULL; -} - -static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) -{ - BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - free_page((unsigned long)p4d); -} - -#define p4d_free p4d_free -static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) -{ - if (pgtable_l5_enabled) - __p4d_free(mm, p4d); + if (pgtable_l4_enabled) + riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, @@ -161,9 +130,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; - pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + pgd = __pgd_alloc(mm, 0); if (likely(pgd != NULL)) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); /* Copy kernel mappings */ sync_kernel_mappings(pgd); } @@ -175,10 +143,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { - struct ptdesc *ptdesc = virt_to_ptdesc(pmd); - - pagetable_pmd_dtor(ptdesc); - riscv_tlb_remove_ptdesc(tlb, ptdesc); + riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } #endif /* __PAGETABLE_PMD_FOLDED */ @@ -186,10 +151,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - struct ptdesc *ptdesc = page_ptdesc(pte); - - pagetable_pte_dtor(ptdesc); - riscv_tlb_remove_ptdesc(tlb, ptdesc); + riscv_tlb_remove_ptdesc(tlb, page_ptdesc(pte)); } #endif /* CONFIG_MMU */ diff --git a/arch/riscv/include/asm/tlb.h b/arch/riscv/include/asm/tlb.h index 1f6c38420d8e..50b63b5c15bd 100644 --- a/arch/riscv/include/asm/tlb.h +++ b/arch/riscv/include/asm/tlb.h @@ -10,24 +10,6 @@ struct mmu_gather; static void tlb_flush(struct mmu_gather *tlb); -#ifdef CONFIG_MMU -#include - -/* - * While riscv platforms with riscv_ipi_for_rfence as true require an IPI to - * perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use - * SBI to perform TLB shootdown. To keep software pagetable walkers safe in this - * case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). 
See the - * comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h - * for more details. - */ -static inline void __tlb_remove_table(void *table) -{ - free_page_and_swap_cache(table); -} - -#endif /* CONFIG_MMU */ - #define tlb_flush tlb_flush #include diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 45010e71df86..f1793630fc51 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -147,9 +147,7 @@ static void __init init_resources(void) res_idx = num_resources - 1; mem_res_sz = num_resources * sizeof(*mem_res); - mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES); - if (!mem_res) - panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz); + mem_res = memblock_alloc_or_panic(mem_res_sz, SMP_CACHE_BYTES); /* * Start by adding the reserved regions, if they overlap diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 8d167e09f1fe..722178ae3488 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1573,7 +1573,7 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) return; } - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); if (PageReserved(page)) free_reserved_page(page); else @@ -1595,7 +1595,7 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool is_vmemm } if (!is_vmemmap) - pagetable_pmd_dtor(ptdesc); + pagetable_dtor(ptdesc); if (PageReserved(page)) free_reserved_page(page); else diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index c301c8d291d2..41c635d6aca4 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -32,7 +32,7 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned pte_t *ptep, *p; if (pmd_none(pmdp_get(pmd))) { - p = memblock_alloc(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE); set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -54,7 +54,7 @@ static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned unsigned long next; if (pud_none(pudp_get(pud))) { - p = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); set_pud(pud, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -85,7 +85,7 @@ static void __init kasan_populate_pud(p4d_t *p4d, unsigned long next; if (p4d_none(p4dp_get(p4d))) { - p = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); set_p4d(p4d, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -116,7 +116,7 @@ static void __init kasan_populate_p4d(pgd_t *pgd, unsigned long next; if (pgd_none(pgdp_get(pgd))) { - p = memblock_alloc(PTRS_PER_P4D * sizeof(p4d_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_P4D * sizeof(p4d_t), PAGE_SIZE); set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -385,7 +385,7 @@ static void __init kasan_shallow_populate_pud(p4d_t *p4d, next = pud_addr_end(vaddr, end); if (pud_none(pudp_get(pud_k))) { - p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); set_pud(pud_k, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; } @@ -405,7 +405,7 @@ static void __init kasan_shallow_populate_p4d(pgd_t *pgd, next = p4d_addr_end(vaddr, end); if (p4d_none(p4dp_get(p4d_k))) { - p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); set_p4d(p4d_k, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; } @@ -424,7 
+424,7 @@ static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long next = pgd_addr_end(vaddr, end); if (pgd_none(pgdp_get(pgd_k))) { - p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; } diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 7b84ef6dc4b6..b19b6ed2ab53 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -53,29 +53,42 @@ static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) { unsigned long *table = crst_table_alloc(mm); - if (table) - crst_table_init(table, _REGION2_ENTRY_EMPTY); + if (!table) + return NULL; + crst_table_init(table, _REGION2_ENTRY_EMPTY); + pagetable_p4d_ctor(virt_to_ptdesc(table)); + return (p4d_t *) table; } static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { - if (!mm_p4d_folded(mm)) - crst_table_free(mm, (unsigned long *) p4d); + if (mm_p4d_folded(mm)) + return; + + pagetable_dtor(virt_to_ptdesc(p4d)); + crst_table_free(mm, (unsigned long *) p4d); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { unsigned long *table = crst_table_alloc(mm); - if (table) - crst_table_init(table, _REGION3_ENTRY_EMPTY); + + if (!table) + return NULL; + crst_table_init(table, _REGION3_ENTRY_EMPTY); + pagetable_pud_ctor(virt_to_ptdesc(table)); + return (pud_t *) table; } static inline void pud_free(struct mm_struct *mm, pud_t *pud) { - if (!mm_pud_folded(mm)) - crst_table_free(mm, (unsigned long *) pud); + if (mm_pud_folded(mm)) + return; + + pagetable_dtor(virt_to_ptdesc(pud)); + crst_table_free(mm, (unsigned long *) pud); } static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) @@ -96,7 +109,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { if (mm_pmd_folded(mm)) return; - pagetable_pmd_dtor(virt_to_ptdesc(pmd)); + pagetable_dtor(virt_to_ptdesc(pmd)); crst_table_free(mm, (unsigned long *) pmd); } @@ -117,11 +130,18 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return (pgd_t *) crst_table_alloc(mm); + unsigned long *table = crst_table_alloc(mm); + + if (!table) + return NULL; + pagetable_pgd_ctor(virt_to_ptdesc(table)); + + return (pgd_t *) table; } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { + pagetable_dtor(virt_to_ptdesc(pgd)); crst_table_free(mm, (unsigned long *) pgd); } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index e95b2c8081eb..f39f8c4723f1 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -22,7 +22,6 @@ * Pages used for the page tables is a different story. 
FIXME: more */ -void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); @@ -87,7 +86,7 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, tlb->cleared_pmds = 1; if (mm_alloc_pgste(tlb->mm)) gmap_unlink(tlb->mm, (unsigned long *)pte, address); - tlb_remove_ptdesc(tlb, pte); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pte)); } /* @@ -102,12 +101,11 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, { if (mm_pmd_folded(tlb->mm)) return; - pagetable_pmd_dtor(virt_to_ptdesc(pmd)); __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_puds = 1; - tlb_remove_ptdesc(tlb, pmd); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } /* @@ -125,7 +123,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; - tlb_remove_ptdesc(tlb, p4d); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } /* @@ -143,7 +141,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_p4ds = 1; - tlb_remove_ptdesc(tlb, pud); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index cd0c93a8fb8b..dc7328fd2ec4 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -63,9 +63,7 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu) { struct save_area *sa; - sa = memblock_alloc(sizeof(*sa), 8); - if (!sa) - return NULL; + sa = memblock_alloc_or_panic(sizeof(*sa), 8); if (is_boot_cpu) list_add(&sa->list, &dump_save_areas); diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index 6652e54cf3db..6d1ffca5f798 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -166,7 +166,7 @@ static struct timer_list lgr_timer; */ static void lgr_timer_set(void) { - mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC)); + mod_timer(&lgr_timer, jiffies + secs_to_jiffies(LGR_TIMER_INTERVAL_SECS)); } /* diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index ddc1448ea2e1..2fc40f97c0ad 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -21,12 +21,8 @@ void __init numa_setup(void) nodes_clear(node_possible_map); node_set(0, node_possible_map); node_set_online(0); - for (nid = 0; nid < MAX_NUMNODES; nid++) { - NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); - if (!NODE_DATA(nid)) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(pg_data_t), 8); - } + for (nid = 0; nid < MAX_NUMNODES; nid++) + NODE_DATA(nid) = memblock_alloc_or_panic(sizeof(pg_data_t), 8); NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; NODE_DATA(0)->node_id = 0; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index a3fea683b227..f873535eddd2 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -384,11 +384,7 @@ static unsigned long __init stack_alloc_early(void) { unsigned long stack; - stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); - if (!stack) { - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, THREAD_SIZE, THREAD_SIZE); - } + stack = (unsigned long)memblock_alloc_or_panic(THREAD_SIZE, THREAD_SIZE); return stack; } @@ 
-512,10 +508,7 @@ static void __init setup_resources(void) bss_resource.end = __pa_symbol(__bss_stop) - 1; for_each_mem_range(i, &start, &end) { - res = memblock_alloc(sizeof(*res), 8); - if (!res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*res), 8); + res = memblock_alloc_or_panic(sizeof(*res), 8); res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; @@ -534,10 +527,7 @@ static void __init setup_resources(void) std_res->start > res->end) continue; if (std_res->end > res->end) { - sub_res = memblock_alloc(sizeof(*sub_res), 8); - if (!sub_res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*sub_res), 8); + sub_res = memblock_alloc_or_panic(sizeof(*sub_res), 8); *sub_res = *std_res; sub_res->end = res->end; std_res->start = res->end + 1; @@ -824,9 +814,7 @@ static void __init setup_randomness(void) { struct sysinfo_3_2_2 *vmms; - vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!vmms) - panic("Failed to allocate memory for sysinfo structure\n"); + vmms = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); memblock_free(vmms, PAGE_SIZE); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 822d8e6f8717..7b08399b0846 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -611,9 +611,7 @@ void __init smp_save_dump_ipl_cpu(void) if (!dump_available()) return; sa = save_area_alloc(true); - regs = memblock_alloc(512, 8); - if (!sa || !regs) - panic("could not allocate memory for boot CPU save area\n"); + regs = memblock_alloc_or_panic(512, 8); copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); save_area_add_regs(sa, regs); memblock_free(regs, 512); @@ -646,8 +644,6 @@ void __init smp_save_dump_secondary_cpus(void) SIGP_CC_NOT_OPERATIONAL) continue; sa = save_area_alloc(false); - if (!sa) - panic("could not allocate memory for save area\n"); __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page)); save_area_add_regs(sa, page); if (cpu_has_vx()) { @@ -792,10 +788,7 @@ void __init smp_detect_cpus(void) u16 address; /* Get CPU information */ - info = memblock_alloc(sizeof(*info), 8); - if (!info) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*info), 8); + info = memblock_alloc_or_panic(sizeof(*info), 8); smp_get_core_info(info, 1); /* Find boot CPU type */ if (sclp.has_core_type) { diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 34a65c141ea0..e9f47c3a6197 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -662,12 +662,12 @@ static void stp_check_leap(void) if (ret < 0) pr_err("failed to set leap second flags\n"); /* arm Timer to clear leap second flags */ - mod_timer(&stp_timer, jiffies + msecs_to_jiffies(14400 * MSEC_PER_SEC)); + mod_timer(&stp_timer, jiffies + secs_to_jiffies(14400)); } else { /* The day the leap second is scheduled for hasn't been reached. Retry * in one hour. 
*/ - mod_timer(&stp_timer, jiffies + msecs_to_jiffies(3600 * MSEC_PER_SEC)); + mod_timer(&stp_timer, jiffies + secs_to_jiffies(3600)); } } diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 4f9c301a705b..cf5ee6032c0b 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -371,7 +371,7 @@ static void set_topology_timer(void) if (atomic_add_unless(&topology_poll, -1, 0)) mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100)); else - mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC)); + mod_timer(&topology_timer, jiffies + secs_to_jiffies(60)); } void topology_expect_change(void) @@ -548,10 +548,7 @@ static void __init alloc_masks(struct sysinfo_15_1_x *info, nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i]; nr_masks = max(nr_masks, 1); for (i = 0; i < nr_masks; i++) { - mask->next = memblock_alloc(sizeof(*mask->next), 8); - if (!mask->next) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*mask->next), 8); + mask->next = memblock_alloc_or_panic(sizeof(*mask->next), 8); mask = mask->next; } } @@ -569,10 +566,7 @@ void __init topology_init_early(void) } if (!MACHINE_HAS_TOPOLOGY) goto out; - tl_info = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!tl_info) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + tl_info = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); info = tl_info; store_topology(info); pr_info("The CPU configuration topology of the machine is: %d %d %d %d %d %d / %d\n", diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index d01724a715d0..7bf0f691827b 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -204,7 +204,7 @@ static void cmm_set_timer(void) del_timer(&cmm_timer); return; } - mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC)); + mod_timer(&cmm_timer, jiffies + secs_to_jiffies(cmm_timeout_seconds)); } static void cmm_timer_fn(struct timer_list *unused) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 58696a0c4e4a..a4e761902093 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -180,30 +180,11 @@ unsigned long *page_table_alloc(struct mm_struct *mm) return table; } -static void pagetable_pte_dtor_free(struct ptdesc *ptdesc) -{ - pagetable_pte_dtor(ptdesc); - pagetable_free(ptdesc); -} - void page_table_free(struct mm_struct *mm, unsigned long *table) { struct ptdesc *ptdesc = virt_to_ptdesc(table); - pagetable_pte_dtor_free(ptdesc); -} - -void __tlb_remove_table(void *table) -{ - struct ptdesc *ptdesc = virt_to_ptdesc(table); - struct page *page = ptdesc_page(ptdesc); - - if (compound_order(page) == CRST_ALLOC_ORDER) { - /* pmd, pud, or p4d */ - pagetable_free(ptdesc); - return; - } - pagetable_pte_dtor_free(ptdesc); + pagetable_dtor_free(ptdesc); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -211,7 +192,7 @@ static void pte_free_now(struct rcu_head *head) { struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); - pagetable_pte_dtor_free(ptdesc); + pagetable_dtor_free(ptdesc); } void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 5d8577ab1591..96d938fdf224 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -34,7 +34,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ 
tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 2a88b0c9e70f..289a2fecebef 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -137,10 +137,7 @@ static pmd_t * __init one_md_table_init(pud_t *pud) if (pud_none(*pud)) { pmd_t *pmd; - pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pmd) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + pmd = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pud_populate(&init_mm, pud, pmd); BUG_ON(pmd != pmd_offset(pud, 0)); } @@ -153,10 +150,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (pmd_none(*pmd)) { pte_t *pte; - pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(&init_mm, pmd, pte); BUG_ON(pte != pte_offset_kernel(pmd, 0)); } diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h index 3037187482db..1a6e694418e3 100644 --- a/arch/sparc/include/asm/tlb_64.h +++ b/arch/sparc/include/asm/tlb_64.h @@ -33,6 +33,7 @@ void flush_tlb_pending(void); #define tlb_needs_table_invalidate() (false) #endif +#define __HAVE_ARCH_TLB_REMOVE_TABLE #include #endif /* _SPARC64_TLB_H */ diff --git a/arch/sparc/kernel/prom_32.c b/arch/sparc/kernel/prom_32.c index 3df960c137f7..a67dd67f10c8 100644 --- a/arch/sparc/kernel/prom_32.c +++ b/arch/sparc/kernel/prom_32.c @@ -28,9 +28,7 @@ void * __init prom_early_alloc(unsigned long size) { void *ret; - ret = memblock_alloc(size, SMP_CACHE_BYTES); - if (!ret) - panic("%s: Failed to allocate %lu bytes\n", __func__, size); + ret = memblock_alloc_or_panic(size, SMP_CACHE_BYTES); prom_early_allocated += size; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 21f8cbbd0581..05882bca5b73 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2915,7 +2915,7 @@ static void __pte_free(pgtable_t pte) { struct ptdesc *ptdesc = virt_to_ptdesc(pte); - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 9df51a62333d..dd32711022f5 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -277,19 +277,13 @@ static void __init srmmu_nocache_init(void) bitmap_bits = srmmu_nocache_size >> SRMMU_NOCACHE_BITMAP_SHIFT; - srmmu_nocache_pool = memblock_alloc(srmmu_nocache_size, + srmmu_nocache_pool = memblock_alloc_or_panic(srmmu_nocache_size, SRMMU_NOCACHE_ALIGN_MAX); - if (!srmmu_nocache_pool) - panic("%s: Failed to allocate %lu bytes align=0x%x\n", - __func__, srmmu_nocache_size, SRMMU_NOCACHE_ALIGN_MAX); memset(srmmu_nocache_pool, 0, srmmu_nocache_size); srmmu_nocache_bitmap = - memblock_alloc(BITS_TO_LONGS(bitmap_bits) * sizeof(long), + memblock_alloc_or_panic(BITS_TO_LONGS(bitmap_bits) * sizeof(long), SMP_CACHE_BYTES); - if (!srmmu_nocache_bitmap) - panic("%s: Failed to allocate %zu bytes\n", __func__, - BITS_TO_LONGS(bitmap_bits) * sizeof(long)); bit_map_init(&srmmu_nocache_map, srmmu_nocache_bitmap, bitmap_bits); srmmu_swapper_pg_dir = __srmmu_get_nocache(SRMMU_PGD_TABLE_SIZE, SRMMU_PGD_TABLE_SIZE); @@ -372,7 +366,7 @@ void pte_free(struct mm_struct *mm, pgtable_t ptep) page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); spin_lock(&mm->page_table_lock); if (page_ref_dec_return(page) == 1) - pagetable_pte_dtor(page_ptdesc(page)); + pagetable_dtor(page_ptdesc(page)); 
spin_unlock(&mm->page_table_lock); srmmu_free_nocache(ptep, SRMMU_PTE_TABLE_SIZE); @@ -452,9 +446,7 @@ static void __init sparc_context_init(int numctx) unsigned long size; size = numctx * sizeof(struct ctx_list); - ctx_list_pool = memblock_alloc(size, SMP_CACHE_BYTES); - if (!ctx_list_pool) - panic("%s: Failed to allocate %lu bytes\n", __func__, size); + ctx_list_pool = memblock_alloc_or_panic(size, SMP_CACHE_BYTES); for (ctx = 0; ctx < numctx; ctx++) { struct ctx_list *clist; diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 75d04fb4994a..d5a9c5aabaec 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -636,10 +636,7 @@ static int __init eth_setup(char *str) return 1; } - new = memblock_alloc(sizeof(*new), SMP_CACHE_BYTES); - if (!new) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*new)); + new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); INIT_LIST_HEAD(&new->list); new->index = n; diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 64c09db392c1..85b129e2b70b 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1694,10 +1694,7 @@ static int __init vector_setup(char *str) str, error); return 1; } - new = memblock_alloc(sizeof(*new), SMP_CACHE_BYTES); - if (!new) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*new)); + new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); INIT_LIST_HEAD(&new->list); new->unit = n; new->arguments = str; diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 04fb4e6969a4..f0af23c3aeb2 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -27,7 +27,7 @@ extern pgd_t *pgd_alloc(struct mm_struct *); #define __pte_free_tlb(tlb, pte, address) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) @@ -35,7 +35,7 @@ do { \ #define __pmd_free_tlb(tlb, pmd, address) \ do { \ - pagetable_pmd_dtor(virt_to_ptdesc(pmd)); \ + pagetable_dtor(virt_to_ptdesc(pmd)); \ tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ } while (0) @@ -43,7 +43,7 @@ do { \ #define __pud_free_tlb(tlb, pud, address) \ do { \ - pagetable_pud_dtor(virt_to_ptdesc(pud)); \ + pagetable_dtor(virt_to_ptdesc(pud)); \ tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pud)); \ } while (0) diff --git a/arch/um/kernel/load_file.c b/arch/um/kernel/load_file.c index 5cecd0e291fb..cb9d178ab7d8 100644 --- a/arch/um/kernel/load_file.c +++ b/arch/um/kernel/load_file.c @@ -48,9 +48,7 @@ void *uml_load_file(const char *filename, unsigned long long *size) return NULL; } - area = memblock_alloc(*size, SMP_CACHE_BYTES); - if (!area) - panic("%s: Failed to allocate %llu bytes\n", __func__, *size); + area = memblock_alloc_or_panic(*size, SMP_CACHE_BYTES); if (__uml_load_file(filename, area, *size)) { memblock_free(area, *size); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 53248ed04771..d98812907493 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -214,14 +214,13 @@ void free_initmem(void) pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + pgd_t *pgd = __pgd_alloc(mm, 0); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + if (pgd) memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } + return pgd; } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig 
index 9d7bd0ae48c4..2e1a3e4386de 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -322,6 +322,7 @@ config X86 select FUNCTION_ALIGNMENT_4B imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE + select ARCH_SUPPORTS_PT_RECLAIM if X86_64 config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index c5b0148b8c0a..a3c9b7c67640 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1572,9 +1572,7 @@ static void __init alloc_runtime_data(int cpu) struct svsm_ca *caa; /* Allocate the SVSM CA page if an SVSM is present */ - caa = memblock_alloc(sizeof(*caa), PAGE_SIZE); - if (!caa) - panic("Can't allocate SVSM CA page\n"); + caa = memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE); per_cpu(svsm_caa, cpu) = caa; per_cpu(svsm_caa_pa, cpu) = __pa(caa); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e525cd85f999..1ef08289e667 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -73,10 +73,14 @@ unsigned long tcp_ptr__ = raw_cpu_read_long(this_cpu_off); \ \ tcp_ptr__ += (__force unsigned long)(_ptr); \ - (typeof(*(_ptr)) __kernel __force *)tcp_ptr__; \ + (TYPEOF_UNQUAL(*(_ptr)) __force __kernel *)tcp_ptr__; \ }) #else -#define arch_raw_cpu_ptr(_ptr) ({ BUILD_BUG(); (typeof(_ptr))0; }) +#define arch_raw_cpu_ptr(_ptr) \ +({ \ + BUILD_BUG(); \ + (TYPEOF_UNQUAL(*(_ptr)) __force __kernel *)0; \ +}) #endif #define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel @@ -91,9 +95,23 @@ #endif /* CONFIG_SMP */ -#define __my_cpu_type(var) typeof(var) __percpu_seg_override -#define __my_cpu_ptr(ptr) (__my_cpu_type(*(ptr))*)(__force uintptr_t)(ptr) -#define __my_cpu_var(var) (*__my_cpu_ptr(&(var))) +/* + * XXX: Remove test for __CHECKER__ once + * sparse learns about __typeof_unqual__. + */ +#if defined(CONFIG_USE_X86_SEG_SUPPORT) && \ + defined(CONFIG_CC_HAS_TYPEOF_UNQUAL) && !defined(__CHECKER__) +# define __my_cpu_type(var) typeof(var) +# define __my_cpu_ptr(ptr) (ptr) +# define __my_cpu_var(var) (var) + +# define __percpu_qual __percpu_seg_override +#else +# define __my_cpu_type(var) typeof(var) __percpu_seg_override +# define __my_cpu_ptr(ptr) (__my_cpu_type(*(ptr))*)(__force uintptr_t)(ptr) +# define __my_cpu_var(var) (*__my_cpu_ptr(&(var))) +#endif + #define __percpu_arg(x) __percpu_prefix "%" #x #define __force_percpu_arg(x) __force_percpu_prefix "%" #x @@ -180,7 +198,7 @@ do { \ __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \ \ if (0) { \ - typeof(_var) pto_tmp__; \ + TYPEOF_UNQUAL(_var) pto_tmp__; \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ @@ -219,7 +237,7 @@ do { \ __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \ \ if (0) { \ - typeof(_var) pto_tmp__; \ + TYPEOF_UNQUAL(_var) pto_tmp__; \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ @@ -240,7 +258,7 @@ do { \ (val) == (typeof(val))-1)) ? 
(int)(val) : 0; \ \ if (0) { \ - typeof(var) pao_tmp__; \ + TYPEOF_UNQUAL(var) pao_tmp__; \ pao_tmp__ = (val); \ (void)pao_tmp__; \ } \ @@ -273,7 +291,7 @@ do { \ */ #define raw_percpu_xchg_op(_var, _nval) \ ({ \ - typeof(_var) pxo_old__ = raw_cpu_read(_var); \ + TYPEOF_UNQUAL(_var) pxo_old__ = raw_cpu_read(_var); \ \ raw_cpu_write(_var, _nval); \ \ @@ -287,7 +305,7 @@ do { \ */ #define this_percpu_xchg_op(_var, _nval) \ ({ \ - typeof(_var) pxo_old__ = this_cpu_read(_var); \ + TYPEOF_UNQUAL(_var) pxo_old__ = this_cpu_read(_var); \ \ do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \ \ diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dcd836b59beb..dd4841231bb9 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -147,24 +147,6 @@ static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4 set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } -static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT; - - if (mm == &init_mm) - gfp &= ~__GFP_ACCOUNT; - return (p4d_t *)get_zeroed_page(gfp); -} - -static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) -{ - if (!pgtable_l5_enabled()) - return; - - BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - free_page((unsigned long)p4d); -} - extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 4d3c9d00d6b6..77f52bc1578a 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -20,22 +20,9 @@ static inline void tlb_flush(struct mmu_gather *tlb) flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); } -/* - * While x86 architecture in general requires an IPI to perform TLB - * shootdown, enablement code for several hypervisors overrides - * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing - * a hypercall. To keep software pagetable walkers safe in this case we - * switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the comment - * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h - * for more details. - */ -static inline void __tlb_remove_table(void *table) -{ - free_page_and_swap_cache(table); -} - static inline void invlpg(unsigned long addr) { asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); } + #endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3a44a9dc3fb7..7c15d6e83c37 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -911,11 +911,8 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) * the resource tree during the lateinit timeframe. 
*/ #define HPET_RESOURCE_NAME_SIZE 9 - hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, + hpet_res = memblock_alloc_or_panic(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, SMP_CACHE_BYTES); - if (!hpet_res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); hpet_res->name = (void *)&hpet_res[1]; hpet_res->flags = IORESOURCE_MEM; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1029ea4ac8ba..a57d3fa7c6b6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2503,9 +2503,7 @@ static struct resource * __init ioapic_setup_resources(void) n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); n *= nr_ioapics; - mem = memblock_alloc(n, SMP_CACHE_BYTES); - if (!mem) - panic("%s: Failed to allocate %lu bytes\n", __func__, n); + mem = memblock_alloc_or_panic(n, SMP_CACHE_BYTES); res = (void *)mem; mem += sizeof(struct resource) * nr_ioapics; @@ -2564,11 +2562,8 @@ void __init io_apic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long)memblock_alloc(PAGE_SIZE, + ioapic_phys = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - if (!ioapic_phys) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } io_apic_set_fixmap(idx, ioapic_phys); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4893d30ce438..82b96ed9890a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1146,11 +1146,8 @@ void __init e820__reserve_resources(void) struct resource *res; u64 end; - res = memblock_alloc(sizeof(*res) * e820_table->nr_entries, + res = memblock_alloc_or_panic(sizeof(*res) * e820_table->nr_entries, SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*res) * e820_table->nr_entries); e820_res = res; for (i = 0; i < e820_table->nr_entries; i++) { diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 9c9faa1634fb..102641fd2172 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -655,7 +655,7 @@ void kgdb_arch_late(void) if (breakinfo[i].pev) continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL); - if (IS_ERR((void * __force)breakinfo[i].pev)) { + if (IS_ERR_PCPU(breakinfo[i].pev)) { printk(KERN_ERR "kgdb: Could not allocate hw" "breakpoints\nDisabling the kernel debugger\n"); breakinfo[i].pev = NULL; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index fec381533555..70161999c973 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -59,10 +59,20 @@ void __init native_pv_lock_init(void) static_branch_enable(&virt_spin_lock_key); } +#ifndef CONFIG_PT_RECLAIM static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) { - tlb_remove_page(tlb, table); + struct ptdesc *ptdesc = (struct ptdesc *)table; + + pagetable_dtor(ptdesc); + tlb_remove_page(tlb, ptdesc_page(ptdesc)); } +#else +static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_table(tlb, table); +} +#endif struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f1fea506e20f..cebee310e200 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -259,6 +259,7 @@ static void __init relocate_initrd(void) u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = 
get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); + int ret = 0; /* We need to move the initrd down into directly mapped mem */ u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, @@ -272,7 +273,9 @@ static void __init relocate_initrd(void) printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); - copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); + ret = copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); + if (ret) + panic("Copy RAMDISK failed\n"); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e9e803a4d44c..e6cc84143f3e 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -246,9 +246,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */ if (v.flags & VM86_SCREEN_BITMAP) { - char comm[TASK_COMM_LEN]; - - pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current)); + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", + current->comm); return -EINVAL; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5745a354a241..1fef5ad32d5a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -19,12 +19,23 @@ EXPORT_SYMBOL(physical_mask); #endif #ifndef CONFIG_PARAVIRT +#ifndef CONFIG_PT_RECLAIM static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) { - tlb_remove_page(tlb, table); + struct ptdesc *ptdesc = (struct ptdesc *)table; + + pagetable_dtor(ptdesc); + tlb_remove_page(tlb, ptdesc_page(ptdesc)); } -#endif +#else +static inline +void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_table(tlb, table); +} +#endif /* !CONFIG_PT_RECLAIM */ +#endif /* !CONFIG_PARAVIRT */ gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; @@ -52,15 +63,13 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pagetable_pte_dtor(page_ptdesc(pte)); paravirt_release_pte(page_to_pfn(pte)); - paravirt_tlb_remove_table(tlb, pte); + paravirt_tlb_remove_table(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { - struct ptdesc *ptdesc = virt_to_ptdesc(pmd); paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! 
For PAE, any changes to the top page-directory-pointer-table @@ -69,25 +78,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - pagetable_pmd_dtor(ptdesc); - paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); + paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { - struct ptdesc *ptdesc = virt_to_ptdesc(pud); - - pagetable_pud_dtor(ptdesc); paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_page(pud)); + paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); + paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ @@ -222,7 +227,7 @@ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) if (pmds[i]) { ptdesc = virt_to_ptdesc(pmds[i]); - pagetable_pmd_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } @@ -392,15 +397,14 @@ void __init pgtable_cache_init(void) SLAB_PANIC, NULL); } -static inline pgd_t *_pgd_alloc(void) +static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { /* * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) - return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, - PGD_ALLOCATION_ORDER); + return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); /* * Now PAE kernel is not running as a Xen domain. We can allocate @@ -409,24 +413,23 @@ static inline pgd_t *_pgd_alloc(void) return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER); } -static inline void _pgd_free(pgd_t *pgd) +static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { if (!SHARED_KERNEL_PMD) - free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); + __pgd_free(mm, pgd); else kmem_cache_free(pgd_cache, pgd); } #else -static inline pgd_t *_pgd_alloc(void) +static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { - return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, - PGD_ALLOCATION_ORDER); + return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); } -static inline void _pgd_free(pgd_t *pgd) +static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { - free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); + __pgd_free(mm, pgd); } #endif /* CONFIG_X86_PAE */ @@ -436,7 +439,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[MAX_PREALLOCATED_PMDS]; - pgd = _pgd_alloc(); + pgd = _pgd_alloc(mm); if (pgd == NULL) goto out; @@ -479,7 +482,7 @@ out_free_pmds: if (sizeof(pmds) != 0) free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: - _pgd_free(pgd); + _pgd_free(mm, pgd); out: return NULL; } @@ -489,7 +492,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); - _pgd_free(pgd); + _pgd_free(mm, pgd); } /* @@ -856,7 +859,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) free_page((unsigned long)pmd_sv); - pagetable_pmd_dtor(virt_to_ptdesc(pmd)); + pagetable_dtor(virt_to_ptdesc(pmd)); free_page((unsigned long)pmd); return 1; diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 74ebd6882690..cf5dca2dbb91 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -136,11 +136,7 @@ 
void * __init prom_early_alloc(unsigned long size) * fast enough on the platforms we care about while minimizing * wasted bootmem) and hand off chunks of it to callers. */ - res = memblock_alloc(chunk_size, SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - chunk_size); - BUG_ON(!res); + res = memblock_alloc_or_panic(chunk_size, SMP_CACHE_BYTES); prom_early_allocated += chunk_size; memset(res, 0, chunk_size); free_mem = chunk_size; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b52d3e17e2c1..56914e21e303 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -178,13 +178,7 @@ static void p2m_init_identity(unsigned long *p2m, unsigned long pfn) static void * __ref alloc_p2m_page(void) { if (unlikely(!slab_is_available())) { - void *ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); - - return ptr; + return memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } return (void *)__get_free_page(GFP_KERNEL); diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index 7fc0f9126dd3..1919ee9c3dd6 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -29,7 +29,7 @@ static inline pgd_t* pgd_alloc(struct mm_struct *mm) { - return (pgd_t*) __get_free_page(GFP_KERNEL | __GFP_ZERO); + return __pgd_alloc(mm, 0); } static inline void ptes_clear(pte_t *ptep) diff --git a/arch/xtensa/mm/kasan_init.c b/arch/xtensa/mm/kasan_init.c index f00d122aa806..f39c4d83173a 100644 --- a/arch/xtensa/mm/kasan_init.c +++ b/arch/xtensa/mm/kasan_init.c @@ -39,11 +39,7 @@ static void __init populate(void *start, void *end) unsigned long i, j; unsigned long vaddr = (unsigned long)start; pmd_t *pmd = pmd_off_k(vaddr); - pte_t *pte = memblock_alloc(n_pages * sizeof(pte_t), PAGE_SIZE); - - if (!pte) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, n_pages * sizeof(pte_t), PAGE_SIZE); + pte_t *pte = memblock_alloc_or_panic(n_pages * sizeof(pte_t), PAGE_SIZE); pr_debug("%s: %p - %p\n", __func__, start, end); diff --git a/drivers/accel/habanalabs/common/context.c b/drivers/accel/habanalabs/common/context.c index b83141f58319..9f212b17611a 100644 --- a/drivers/accel/habanalabs/common/context.c +++ b/drivers/accel/habanalabs/common/context.c @@ -199,7 +199,6 @@ out_err: int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) { - char task_comm[TASK_COMM_LEN]; int rc = 0, i; ctx->hdev = hdev; @@ -272,7 +271,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) mutex_init(&ctx->ts_reg_lock); dev_dbg(hdev->dev, "create user context, comm=\"%s\", asid=%u\n", - get_task_comm(task_comm, current), ctx->asid); + current->comm, ctx->asid); } return 0; diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index e0cf3b4343bb..30277ae410d4 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -817,7 +817,7 @@ static void device_hard_reset_pending(struct work_struct *work) } queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work, - msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000)); + secs_to_jiffies(HL_PENDING_RESET_PER_SEC)); } } diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index 708dfd10f39c..5ec13f3a46f9 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ 
b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -362,8 +362,7 @@ static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout) * a different default timeout for Gaudi */ if (timeout == HL_DEFAULT_TIMEOUT_LOCKED) - hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED * - MSEC_PER_SEC); + hdev->timeout_jiffies = secs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED); hdev->reset_upon_device_release = 0; break; diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 1dd6e23172ca..8729a0c57d78 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -1279,13 +1279,10 @@ static long _hl_ioctl(struct hl_fpriv *hpriv, unsigned int cmd, unsigned long ar retcode = -EFAULT; out_err: - if (retcode) { - char task_comm[TASK_COMM_LEN]; - + if (retcode) dev_dbg_ratelimited(dev, "error in ioctl: pid=%d, comm=\"%s\", cmd=%#010x, nr=%#04x\n", - task_pid_nr(current), get_task_comm(task_comm, current), cmd, nr); - } + task_pid_nr(current), current->comm, cmd, nr); if (kdata != stack_kdata) kfree(kdata); @@ -1308,11 +1305,9 @@ long hl_ioctl_control(struct file *filep, unsigned int cmd, unsigned long arg) if (nr == _IOC_NR(DRM_IOCTL_HL_INFO)) { ioctl = &hl_ioctls_control[nr - HL_COMMAND_START]; } else { - char task_comm[TASK_COMM_LEN]; - dev_dbg_ratelimited(hdev->dev_ctrl, "invalid ioctl: pid=%d, comm=\"%s\", cmd=%#010x, nr=%#04x\n", - task_pid_nr(current), get_task_comm(task_comm, current), cmd, nr); + task_pid_nr(current), current->comm, cmd, nr); return -ENOTTY; } diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index ca87a0939135..f7fb7205028d 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -251,6 +251,10 @@ static int __init extlog_init(void) } extlog_l1_hdr = acpi_os_map_iomem(l1_dirbase, l1_hdr_size); + if (!extlog_l1_hdr) { + rc = -ENOMEM; + goto err_release_l1_hdr; + } l1_head = (struct extlog_l1_head *)extlog_l1_hdr; l1_size = l1_head->total_len; l1_percpu_entry = l1_head->entries; @@ -268,6 +272,10 @@ static int __init extlog_init(void) goto err; } extlog_l1_addr = acpi_os_map_iomem(l1_dirbase, l1_size); + if (!extlog_l1_addr) { + rc = -ENOMEM; + goto err_release_l1_dir; + } l1_entry_base = (u64 *)((u8 *)extlog_l1_addr + l1_hdr_size); /* remap elog table */ @@ -279,6 +287,10 @@ static int __init extlog_init(void) goto err_release_l1_dir; } elog_addr = acpi_os_map_iomem(elog_base, elog_size); + if (!elog_addr) { + rc = -ENOMEM; + goto err_release_elog; + } rc = -ENOMEM; /* allocate buffer to save elog record */ @@ -300,6 +312,8 @@ err_release_l1_dir: if (extlog_l1_addr) acpi_os_unmap_iomem(extlog_l1_addr, l1_size); release_mem_region(l1_dirbase, l1_size); +err_release_l1_hdr: + release_mem_region(l1_dirbase, l1_hdr_size); err: pr_warn(FW_BUG "Extended error log disabled because of problems parsing f/w tables\n"); return rc; diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index bec0dcd1f9c3..59fffe34c9d0 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -81,6 +81,92 @@ int acpi_map_pxm_to_node(int pxm) } EXPORT_SYMBOL(acpi_map_pxm_to_node); +#ifdef CONFIG_NUMA_EMU +/* + * Take max_nid - 1 fake-numa nodes into account in both + * pxm_to_node_map()/node_to_pxm_map[] tables. + */ +int __init fix_pxm_node_maps(int max_nid) +{ + static int pxm_to_node_map_copy[MAX_PXM_DOMAINS] __initdata + = { [0 ... 
MAX_PXM_DOMAINS - 1] = NUMA_NO_NODE }; + static int node_to_pxm_map_copy[MAX_NUMNODES] __initdata + = { [0 ... MAX_NUMNODES - 1] = PXM_INVAL }; + int i, j, index = -1, count = 0; + nodemask_t nodes_to_enable; + + if (numa_off || srat_disabled()) + return -1; + + /* find fake nodes PXM mapping */ + for (i = 0; i < MAX_NUMNODES; i++) { + if (node_to_pxm_map[i] != PXM_INVAL) { + for (j = 0; j <= max_nid; j++) { + if ((emu_nid_to_phys[j] == i) && + WARN(node_to_pxm_map_copy[j] != PXM_INVAL, + "Node %d is already binded to PXM %d\n", + j, node_to_pxm_map_copy[j])) + return -1; + if (emu_nid_to_phys[j] == i) { + node_to_pxm_map_copy[j] = + node_to_pxm_map[i]; + if (j > index) + index = j; + count++; + } + } + } + } + if (WARN(index != max_nid, "%d max nid when expected %d\n", + index, max_nid)) + return -1; + + nodes_clear(nodes_to_enable); + + /* map phys nodes not used for fake nodes */ + for (i = 0; i < MAX_NUMNODES; i++) { + if (node_to_pxm_map[i] != PXM_INVAL) { + for (j = 0; j <= max_nid; j++) + if (emu_nid_to_phys[j] == i) + break; + /* fake nodes PXM mapping has been done */ + if (j <= max_nid) + continue; + /* find first hole */ + for (j = 0; + j < MAX_NUMNODES && + node_to_pxm_map_copy[j] != PXM_INVAL; + j++) + ; + if (WARN(j == MAX_NUMNODES, + "Number of nodes exceeds MAX_NUMNODES\n")) + return -1; + node_to_pxm_map_copy[j] = node_to_pxm_map[i]; + node_set(j, nodes_to_enable); + count++; + } + } + + /* creating reverse mapping in pxm_to_node_map[] */ + for (i = 0; i < MAX_NUMNODES; i++) + if (node_to_pxm_map_copy[i] != PXM_INVAL && + pxm_to_node_map_copy[node_to_pxm_map_copy[i]] == NUMA_NO_NODE) + pxm_to_node_map_copy[node_to_pxm_map_copy[i]] = i; + + /* overwrite with new mapping */ + for (i = 0; i < MAX_NUMNODES; i++) { + node_to_pxm_map[i] = node_to_pxm_map_copy[i]; + pxm_to_node_map[i] = pxm_to_node_map_copy[i]; + } + + /* enable other nodes found in PXM for hotplug */ + nodes_or(numa_nodes_parsed, nodes_to_enable, numa_nodes_parsed); + + pr_debug("found %d total number of nodes\n", count); + return 0; +} +#endif + static void __init acpi_table_print_srat_entry(struct acpi_subtable_header *header) { diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 67858eeb92ed..348c5dbbfa68 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -512,7 +512,7 @@ static ssize_t auto_online_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", - online_type_to_str[mhp_default_online_type]); + online_type_to_str[mhp_get_default_online_type()]); } static ssize_t auto_online_blocks_store(struct device *dev, @@ -524,7 +524,7 @@ static ssize_t auto_online_blocks_store(struct device *dev, if (online_type < 0) return -EINVAL; - mhp_default_online_type = online_type; + mhp_set_default_online_type(online_type); return count; } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 838064593f62..a7c2b04ab943 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -544,7 +544,7 @@ static void print_stats(struct xen_blkif_ring *ring) ring->st_rd_req, ring->st_wr_req, ring->st_f_req, ring->st_ds_req, ring->persistent_gnt_c, max_pgrants); - ring->st_print = jiffies + msecs_to_jiffies(10 * 1000); + ring->st_print = jiffies + secs_to_jiffies(10); ring->st_rd_req = 0; ring->st_wr_req = 0; ring->st_oo_req = 0; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7903a4da40ac..70ecaee25c20 100644 --- a/drivers/block/zram/zram_drv.c +++ 
b/drivers/block/zram/zram_drv.c @@ -55,8 +55,8 @@ static size_t huge_class_size; static const struct block_device_operations zram_devops; static void zram_free_page(struct zram *zram, size_t index); -static int zram_read_page(struct zram *zram, struct page *page, u32 index, - struct bio *parent); +static int zram_read_from_zspool(struct zram *zram, struct page *page, + u32 index); static int zram_slot_trylock(struct zram *zram, u32 index) { @@ -112,17 +112,6 @@ static void zram_clear_flag(struct zram *zram, u32 index, zram->table[index].flags &= ~BIT(flag); } -static inline void zram_set_element(struct zram *zram, u32 index, - unsigned long element) -{ - zram->table[index].element = element; -} - -static unsigned long zram_get_element(struct zram *zram, u32 index) -{ - return zram->table[index].element; -} - static size_t zram_get_obj_size(struct zram *zram, u32 index) { return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); @@ -143,6 +132,27 @@ static inline bool zram_allocated(struct zram *zram, u32 index) zram_test_flag(zram, index, ZRAM_WB); } +static inline void update_used_max(struct zram *zram, const unsigned long pages) +{ + unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages); + + do { + if (cur_max >= pages) + return; + } while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages, + &cur_max, pages)); +} + +static bool zram_can_store_page(struct zram *zram) +{ + unsigned long alloced_pages; + + alloced_pages = zs_get_total_pages(zram->mem_pool); + update_used_max(zram, alloced_pages); + + return !zram->limit_pages || alloced_pages <= zram->limit_pages; +} + #if PAGE_SIZE != 4096 static inline bool is_partial_io(struct bio_vec *bvec) { @@ -277,18 +287,6 @@ static struct zram_pp_slot *select_pp_slot(struct zram_pp_ctl *ctl) } #endif -static inline void update_used_max(struct zram *zram, - const unsigned long pages) -{ - unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages); - - do { - if (cur_max >= pages) - return; - } while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages, - &cur_max, pages)); -} - static inline void zram_fill_page(void *ptr, unsigned long len, unsigned long value) { @@ -833,13 +831,10 @@ static ssize_t writeback_store(struct device *dev, */ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) goto next; + if (zram_read_from_zspool(zram, page, index)) + goto next; zram_slot_unlock(zram, index); - if (zram_read_page(zram, page, index, NULL)) { - release_pp_slot(zram, pps); - continue; - } - bio_init(&bio, zram->bdev, &bio_vec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); @@ -879,7 +874,7 @@ static ssize_t writeback_store(struct device *dev, zram_free_page(zram, index); zram_set_flag(zram, index, ZRAM_WB); - zram_set_element(zram, index, blk_idx); + zram_set_handle(zram, index, blk_idx); blk_idx = 0; atomic64_inc(&zram->stats.pages_stored); spin_lock(&zram->wb_limit_lock); @@ -889,6 +884,8 @@ static ssize_t writeback_store(struct device *dev, next: zram_slot_unlock(zram, index); release_pp_slot(zram, pps); + + cond_resched(); } if (blk_idx) @@ -1505,7 +1502,7 @@ static void zram_free_page(struct zram *zram, size_t index) if (zram_test_flag(zram, index, ZRAM_WB)) { zram_clear_flag(zram, index, ZRAM_WB); - free_block_bdev(zram, zram_get_element(zram, index)); + free_block_bdev(zram, zram_get_handle(zram, index)); goto out; } @@ -1533,6 +1530,56 @@ out: zram_set_obj_size(zram, index, 0); } +static int read_same_filled_page(struct zram *zram, struct page *page, + u32 index) +{ + void *mem; + + 
mem = kmap_local_page(page); + zram_fill_page(mem, PAGE_SIZE, zram_get_handle(zram, index)); + kunmap_local(mem); + return 0; +} + +static int read_incompressible_page(struct zram *zram, struct page *page, + u32 index) +{ + unsigned long handle; + void *src, *dst; + + handle = zram_get_handle(zram, index); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + dst = kmap_local_page(page); + copy_page(dst, src); + kunmap_local(dst); + zs_unmap_object(zram->mem_pool, handle); + + return 0; +} + +static int read_compressed_page(struct zram *zram, struct page *page, u32 index) +{ + struct zcomp_strm *zstrm; + unsigned long handle; + unsigned int size; + void *src, *dst; + int ret, prio; + + handle = zram_get_handle(zram, index); + size = zram_get_obj_size(zram, index); + prio = zram_get_priority(zram, index); + + zstrm = zcomp_stream_get(zram->comps[prio]); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + dst = kmap_local_page(page); + ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, dst); + kunmap_local(dst); + zs_unmap_object(zram->mem_pool, handle); + zcomp_stream_put(zram->comps[prio]); + + return ret; +} + /* * Reads (decompresses if needed) a page from zspool (zsmalloc). * Corresponding ZRAM slot should be locked. @@ -1540,47 +1587,14 @@ out: static int zram_read_from_zspool(struct zram *zram, struct page *page, u32 index) { - struct zcomp_strm *zstrm; - unsigned long handle; - unsigned int size; - void *src, *dst; - u32 prio; - int ret; + if (zram_test_flag(zram, index, ZRAM_SAME) || + !zram_get_handle(zram, index)) + return read_same_filled_page(zram, page, index); - handle = zram_get_handle(zram, index); - if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) { - unsigned long value; - void *mem; - - value = handle ? zram_get_element(zram, index) : 0; - mem = kmap_local_page(page); - zram_fill_page(mem, PAGE_SIZE, value); - kunmap_local(mem); - return 0; - } - - size = zram_get_obj_size(zram, index); - - if (size != PAGE_SIZE) { - prio = zram_get_priority(zram, index); - zstrm = zcomp_stream_get(zram->comps[prio]); - } - - src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); - if (size == PAGE_SIZE) { - dst = kmap_local_page(page); - copy_page(dst, src); - kunmap_local(dst); - ret = 0; - } else { - dst = kmap_local_page(page); - ret = zcomp_decompress(zram->comps[prio], zstrm, - src, size, dst); - kunmap_local(dst); - zcomp_stream_put(zram->comps[prio]); - } - zs_unmap_object(zram->mem_pool, handle); - return ret; + if (!zram_test_flag(zram, index, ZRAM_HUGE)) + return read_compressed_page(zram, page, index); + else + return read_incompressible_page(zram, page, index); } static int zram_read_page(struct zram *zram, struct page *page, u32 index, @@ -1600,7 +1614,7 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, */ zram_slot_unlock(zram, index); - ret = read_from_bdev(zram, page, zram_get_element(zram, index), + ret = read_from_bdev(zram, page, zram_get_handle(zram, index), parent); } @@ -1638,33 +1652,89 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, return zram_read_page(zram, bvec->bv_page, index, bio); } +static int write_same_filled_page(struct zram *zram, unsigned long fill, + u32 index) +{ + zram_slot_lock(zram, index); + zram_set_flag(zram, index, ZRAM_SAME); + zram_set_handle(zram, index, fill); + zram_slot_unlock(zram, index); + + atomic64_inc(&zram->stats.same_pages); + atomic64_inc(&zram->stats.pages_stored); + + return 0; +} + +static int write_incompressible_page(struct zram *zram, struct page 
*page, + u32 index) +{ + unsigned long handle; + void *src, *dst; + + /* + * This function is called from preemptible context so we don't need + * to do optimistic and fallback to pessimistic handle allocation, + * like we do for compressible pages. + */ + handle = zs_malloc(zram->mem_pool, PAGE_SIZE, + GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); + if (IS_ERR_VALUE(handle)) + return PTR_ERR((void *)handle); + + if (!zram_can_store_page(zram)) { + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); + zs_free(zram->mem_pool, handle); + return -ENOMEM; + } + + dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); + src = kmap_local_page(page); + memcpy(dst, src, PAGE_SIZE); + kunmap_local(src); + zs_unmap_object(zram->mem_pool, handle); + + zram_slot_lock(zram, index); + zram_set_flag(zram, index, ZRAM_HUGE); + zram_set_handle(zram, index, handle); + zram_set_obj_size(zram, index, PAGE_SIZE); + zram_slot_unlock(zram, index); + + atomic64_add(PAGE_SIZE, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.huge_pages); + atomic64_inc(&zram->stats.huge_pages_since); + atomic64_inc(&zram->stats.pages_stored); + + return 0; +} + static int zram_write_page(struct zram *zram, struct page *page, u32 index) { int ret = 0; - unsigned long alloced_pages; unsigned long handle = -ENOMEM; unsigned int comp_len = 0; - void *src, *dst, *mem; + void *dst, *mem; struct zcomp_strm *zstrm; unsigned long element = 0; - enum zram_pageflags flags = 0; + bool same_filled; + + /* First, free memory allocated to this slot (if any) */ + zram_slot_lock(zram, index); + zram_free_page(zram, index); + zram_slot_unlock(zram, index); mem = kmap_local_page(page); - if (page_same_filled(mem, &element)) { - kunmap_local(mem); - /* Free memory associated with this sector now. */ - flags = ZRAM_SAME; - atomic64_inc(&zram->stats.same_pages); - goto out; - } + same_filled = page_same_filled(mem, &element); kunmap_local(mem); + if (same_filled) + return write_same_filled_page(zram, element, index); compress_again: zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); - src = kmap_local_page(page); + mem = kmap_local_page(page); ret = zcomp_compress(zram->comps[ZRAM_PRIMARY_COMP], zstrm, - src, &comp_len); - kunmap_local(src); + mem, &comp_len); + kunmap_local(mem); if (unlikely(ret)) { zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); @@ -1673,8 +1743,11 @@ compress_again: return ret; } - if (comp_len >= huge_class_size) - comp_len = PAGE_SIZE; + if (comp_len >= huge_class_size) { + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); + return write_incompressible_page(zram, page, index); + } + /* * handle allocation has 2 paths: * a) fast path is executed with preemption disabled (for @@ -1690,35 +1763,23 @@ compress_again: */ if (IS_ERR_VALUE(handle)) handle = zs_malloc(zram->mem_pool, comp_len, - __GFP_KSWAPD_RECLAIM | - __GFP_NOWARN | - __GFP_HIGHMEM | - __GFP_MOVABLE); + __GFP_KSWAPD_RECLAIM | + __GFP_NOWARN | + __GFP_HIGHMEM | + __GFP_MOVABLE); if (IS_ERR_VALUE(handle)) { zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); atomic64_inc(&zram->stats.writestall); handle = zs_malloc(zram->mem_pool, comp_len, - GFP_NOIO | __GFP_HIGHMEM | - __GFP_MOVABLE); + GFP_NOIO | __GFP_HIGHMEM | + __GFP_MOVABLE); if (IS_ERR_VALUE(handle)) return PTR_ERR((void *)handle); - if (comp_len != PAGE_SIZE) - goto compress_again; - /* - * If the page is not compressible, you need to acquire the - * lock and execute the code below. The zcomp_stream_get() - * call is needed to disable the cpu hotplug and grab the - * zstrm buffer back. 
It is necessary that the dereferencing - * of the zstrm variable below occurs correctly. - */ - zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); + goto compress_again; } - alloced_pages = zs_get_total_pages(zram->mem_pool); - update_used_max(zram, alloced_pages); - - if (zram->limit_pages && alloced_pages > zram->limit_pages) { + if (!zram_can_store_page(zram)) { zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); zs_free(zram->mem_pool, handle); return -ENOMEM; @@ -1726,41 +1787,19 @@ compress_again: dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); - src = zstrm->buffer; - if (comp_len == PAGE_SIZE) - src = kmap_local_page(page); - memcpy(dst, src, comp_len); - if (comp_len == PAGE_SIZE) - kunmap_local(src); - + memcpy(dst, zstrm->buffer, comp_len); zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); zs_unmap_object(zram->mem_pool, handle); - atomic64_add(comp_len, &zram->stats.compr_data_size); -out: - /* - * Free memory associated with this sector - * before overwriting unused sectors. - */ + zram_slot_lock(zram, index); - zram_free_page(zram, index); - - if (comp_len == PAGE_SIZE) { - zram_set_flag(zram, index, ZRAM_HUGE); - atomic64_inc(&zram->stats.huge_pages); - atomic64_inc(&zram->stats.huge_pages_since); - } - - if (flags) { - zram_set_flag(zram, index, flags); - zram_set_element(zram, index, element); - } else { - zram_set_handle(zram, index, handle); - zram_set_obj_size(zram, index, comp_len); - } + zram_set_handle(zram, index, handle); + zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); /* Update stats */ atomic64_inc(&zram->stats.pages_stored); + atomic64_add(comp_len, &zram->stats.compr_data_size); + return ret; } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 134be414e210..db78d7c01b9a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -62,10 +62,7 @@ enum zram_pageflags { /* Allocated for each disk page */ struct zram_table_entry { - union { - unsigned long handle; - unsigned long element; - }; + unsigned long handle; unsigned int flags; spinlock_t lock; #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c index f2117fef7c7d..9c75dcc9a534 100644 --- a/drivers/clk/ti/clk.c +++ b/drivers/clk/ti/clk.c @@ -449,10 +449,7 @@ void __init omap2_clk_legacy_provider_init(int index, void __iomem *mem) { struct clk_iomap *io; - io = memblock_alloc(sizeof(*io), SMP_CACHE_BYTES); - if (!io) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*io)); + io = memblock_alloc_or_panic(sizeof(*io), SMP_CACHE_BYTES); io->mem = mem; diff --git a/drivers/gpu/drm/i915/display/intel_display_driver.c b/drivers/gpu/drm/i915/display/intel_display_driver.c index 56b78cf6b854..62596424a9aa 100644 --- a/drivers/gpu/drm/i915/display/intel_display_driver.c +++ b/drivers/gpu/drm/i915/display/intel_display_driver.c @@ -391,7 +391,6 @@ void intel_display_driver_resume_access(struct drm_i915_private *i915) */ bool intel_display_driver_check_access(struct drm_i915_private *i915) { - char comm[TASK_COMM_LEN]; char current_task[TASK_COMM_LEN + 16]; char allowed_task[TASK_COMM_LEN + 16] = "none"; @@ -400,12 +399,11 @@ bool intel_display_driver_check_access(struct drm_i915_private *i915) return true; snprintf(current_task, sizeof(current_task), "%s[%d]", - get_task_comm(comm, current), - task_pid_vnr(current)); + current->comm, task_pid_vnr(current)); if (i915->display.access.allowed_task) snprintf(allowed_task, sizeof(allowed_task), "%s[%d]", - get_task_comm(comm, 
i915->display.access.allowed_task), + i915->display.access.allowed_task->comm, task_pid_vnr(i915->display.access.allowed_task)); drm_dbg_kms(&i915->drm, diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c index 2cb2e5675807..cd659b9fd1d9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_chan.c +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c @@ -279,7 +279,6 @@ nouveau_channel_ctor(struct nouveau_cli *cli, bool priv, u64 runm, const u64 plength = 0x10000; const u64 ioffset = plength; const u64 ilength = 0x02000; - char name[TASK_COMM_LEN]; int cid, ret; u64 size; @@ -338,8 +337,7 @@ nouveau_channel_ctor(struct nouveau_cli *cli, bool priv, u64 runm, chan->userd = &chan->user; } - get_task_comm(name, current); - snprintf(args.name, sizeof(args.name), "%s[%d]", name, task_pid_nr(current)); + snprintf(args.name, sizeof(args.name), "%s[%d]", current->comm, task_pid_nr(current)); ret = nvif_object_ctor(&device->object, "abi16ChanUser", 0, hosts[cid].oclass, &args, sizeof(args), &chan->user); diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 107f63f08bd9..ea7206484d22 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -1159,7 +1159,7 @@ nouveau_drm_open(struct drm_device *dev, struct drm_file *fpriv) { struct nouveau_drm *drm = nouveau_drm(dev); struct nouveau_cli *cli; - char name[32], tmpname[TASK_COMM_LEN]; + char name[32]; int ret; /* need to bring up power immediately if opening device */ @@ -1169,10 +1169,9 @@ nouveau_drm_open(struct drm_device *dev, struct drm_file *fpriv) return ret; } - get_task_comm(tmpname, current); rcu_read_lock(); snprintf(name, sizeof(name), "%s[%d]", - tmpname, pid_nr(rcu_dereference(fpriv->pid))); + current->comm, pid_nr(rcu_dereference(fpriv->pid))); rcu_read_unlock(); if (!(cli = kzalloc(sizeof(*cli), GFP_KERNEL))) { diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 06d6db8b50f9..f260e21fa283 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -514,7 +514,7 @@ static int wait_for_lmem_ready(struct xe_device *xe) drm_dbg(&xe->drm, "Waiting for lmem initialization\n"); start = jiffies; - timeout = start + msecs_to_jiffies(60 * 1000); /* 60 sec! */ + timeout = start + secs_to_jiffies(60); /* 60 sec! */ do { if (signal_pending(current)) diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index 49805a24bb0a..7259f4f55700 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -92,7 +92,7 @@ struct iowait_work { * * The lock field is used by waiters to record * the seqlock_t that guards the list head. - * Waiters explicity know that, but the destroy + * Waiters explicitly know that, but the destroy * code that unwaits QPs does not. */ struct iowait { diff --git a/drivers/infiniband/hw/usnic/usnic_abi.h b/drivers/infiniband/hw/usnic/usnic_abi.h index 7fe9502ce8d3..86a82a4da0aa 100644 --- a/drivers/infiniband/hw/usnic/usnic_abi.h +++ b/drivers/infiniband/hw/usnic/usnic_abi.h @@ -72,7 +72,7 @@ struct usnic_ib_create_qp_resp { u64 bar_bus_addr; u32 bar_len; /* - * WQ, RQ, CQ are explicity specified bc exposing a generic resources inteface + * WQ, RQ, CQ are explicitly specified bc exposing a generic resources inteface * expands the scope of ABI to many files. 
*/ u32 wq_cnt; diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index a01bc5090cdf..a1534cc6c641 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -492,11 +492,7 @@ int __init smu_init (void) goto fail_np; } - smu = memblock_alloc(sizeof(struct smu_device), SMP_CACHE_BYTES); - if (!smu) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct smu_device)); - + smu = memblock_alloc_or_panic(sizeof(struct smu_device), SMP_CACHE_BYTES); spin_lock_init(&smu->lock); INIT_LIST_HEAD(&smu->cmd_list); INIT_LIST_HEAD(&smu->cmd_i2c_list); diff --git a/drivers/net/wireless/ath/ath11k/debugfs.c b/drivers/net/wireless/ath/ath11k/debugfs.c index 57281a135dd7..bf192529e3fe 100644 --- a/drivers/net/wireless/ath/ath11k/debugfs.c +++ b/drivers/net/wireless/ath/ath11k/debugfs.c @@ -178,7 +178,7 @@ static int ath11k_debugfs_fw_stats_request(struct ath11k *ar, * received 'update stats' event, we keep a 3 seconds timeout in case, * fw_stats_done is not marked yet */ - timeout = jiffies + msecs_to_jiffies(3 * 1000); + timeout = jiffies + secs_to_jiffies(3); ath11k_debugfs_fw_stats_reset(ar); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwsignal.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwsignal.c index 0949e7975ff1..b70d20128f98 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwsignal.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwsignal.c @@ -1810,7 +1810,7 @@ void brcmf_fws_rxreorder(struct brcmf_if *ifp, struct sk_buff *pkt) rfi->cur_idx = cur_idx; } } else { - /* explicity window move updating the expected index */ + /* explicitly window move updating the expected index */ exp_idx = reorder_data[BRCMF_RXREORDER_EXPIDX_OFFSET]; brcmf_dbg(DATA, "flow-%d (0x%x): change expected: %d -> %d\n", diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 0121100372b4..2eb718fbeffd 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1126,13 +1126,7 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) static void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) { - void *ptr = memblock_alloc(size, align); - - if (!ptr) - panic("%s: Failed to allocate %llu bytes align=0x%llx\n", - __func__, size, align); - - return ptr; + return memblock_alloc_or_panic(size, align); } bool __init early_init_dt_verify(void *dt_virt, phys_addr_t dt_phys) diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index 0fa0c0fd9a6a..c48c630074eb 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -3680,13 +3680,7 @@ static struct device_node *overlay_base_root; static void * __init dt_alloc_memory(u64 size, u64 align) { - void *ptr = memblock_alloc(size, align); - - if (!ptr) - panic("%s: Failed to allocate %llu bytes align=0x%llx\n", - __func__, size, align); - - return ptr; + return memblock_alloc_or_panic(size, align); } /* diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 35860c61468b..fd797e278549 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -1044,7 +1044,7 @@ static void arcmsr_init_get_devmap_timer(struct AdapterControlBlock *pacb) static void arcmsr_init_set_datetime_timer(struct AdapterControlBlock *pacb) { timer_setup(&pacb->refresh_timer, arcmsr_set_iop_datetime, 0); - pacb->refresh_timer.expires = jiffies + msecs_to_jiffies(60 * 1000); + pacb->refresh_timer.expires = jiffies + secs_to_jiffies(60); add_timer(&pacb->refresh_timer); } diff --git a/drivers/scsi/cxlflash/superpipe.c 
b/drivers/scsi/cxlflash/superpipe.c index b375509d1470..97631f48e19d 100644 --- a/drivers/scsi/cxlflash/superpipe.c +++ b/drivers/scsi/cxlflash/superpipe.c @@ -966,7 +966,7 @@ static int cxlflash_disk_detach(struct scsi_device *sdev, void *detach) * * This routine is the release handler for the fops registered with * the CXL services on an initial attach for a context. It is called - * when a close (explicity by the user or as part of a process tear + * when a close (explicitly by the user or as part of a process tear * down) is performed on the adapter file descriptor returned to the * user. The user should be aware that explicitly performing a close * considered catastrophic and subsequent usage of the superpipe API diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 7f57397d91a9..4fed2e1243e0 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -598,7 +598,7 @@ lpfc_config_port_post(struct lpfc_hba *phba) jiffies + msecs_to_jiffies(1000 * timeout)); /* Set up heart beat (HB) timer */ mod_timer(&phba->hb_tmofunc, - jiffies + msecs_to_jiffies(1000 * LPFC_HB_MBOX_INTERVAL)); + jiffies + secs_to_jiffies(LPFC_HB_MBOX_INTERVAL)); clear_bit(HBA_HBEAT_INP, &phba->hba_flag); clear_bit(HBA_HBEAT_TMO, &phba->hba_flag); phba->last_completion_time = jiffies; @@ -1267,7 +1267,7 @@ lpfc_hb_mbox_cmpl(struct lpfc_hba * phba, LPFC_MBOXQ_t * pmboxq) !test_bit(FC_UNLOADING, &phba->pport->load_flag)) mod_timer(&phba->hb_tmofunc, jiffies + - msecs_to_jiffies(1000 * LPFC_HB_MBOX_INTERVAL)); + secs_to_jiffies(LPFC_HB_MBOX_INTERVAL)); return; } @@ -1555,7 +1555,7 @@ lpfc_hb_timeout_handler(struct lpfc_hba *phba) /* If IOs are completing, no need to issue a MBX_HEARTBEAT */ spin_lock_irq(&phba->pport->work_port_lock); if (time_after(phba->last_completion_time + - msecs_to_jiffies(1000 * LPFC_HB_MBOX_INTERVAL), + secs_to_jiffies(LPFC_HB_MBOX_INTERVAL), jiffies)) { spin_unlock_irq(&phba->pport->work_port_lock); if (test_bit(HBA_HBEAT_INP, &phba->hba_flag)) @@ -3354,7 +3354,7 @@ lpfc_block_mgmt_io(struct lpfc_hba *phba, int mbx_action) spin_unlock_irqrestore(&phba->hbalock, iflag); if (mbx_action == LPFC_MBX_NO_WAIT) return; - timeout = msecs_to_jiffies(LPFC_MBOX_TMO * 1000) + jiffies; + timeout = secs_to_jiffies(LPFC_MBOX_TMO) + jiffies; spin_lock_irqsave(&phba->hbalock, iflag); if (phba->sli.mbox_active) { actcmd = phba->sli.mbox_active->u.mb.mbxCommand; @@ -4924,14 +4924,14 @@ int lpfc_scan_finished(struct Scsi_Host *shost, unsigned long time) stat = 1; goto finished; } - if (time >= msecs_to_jiffies(30 * 1000)) { + if (time >= secs_to_jiffies(30)) { lpfc_printf_log(phba, KERN_INFO, LOG_INIT, "0461 Scanning longer than 30 " "seconds. 
Continuing initialization\n"); stat = 1; goto finished; } - if (time >= msecs_to_jiffies(15 * 1000) && + if (time >= secs_to_jiffies(15) && phba->link_state <= LPFC_LINK_DOWN) { lpfc_printf_log(phba, KERN_INFO, LOG_INIT, "0465 Link down longer than 15 " @@ -4945,7 +4945,7 @@ int lpfc_scan_finished(struct Scsi_Host *shost, unsigned long time) if (vport->num_disc_nodes || vport->fc_prli_sent) goto finished; if (!atomic_read(&vport->fc_map_cnt) && - time < msecs_to_jiffies(2 * 1000)) + time < secs_to_jiffies(2)) goto finished; if ((phba->sli.sli_flag & LPFC_SLI_MBOX_ACTIVE) != 0) goto finished; @@ -5179,8 +5179,8 @@ lpfc_vmid_poll(struct timer_list *t) lpfc_worker_wake_up(phba); /* restart the timer for the next iteration */ - mod_timer(&phba->inactive_vmid_poll, jiffies + msecs_to_jiffies(1000 * - LPFC_VMID_TIMER)); + mod_timer(&phba->inactive_vmid_poll, + jiffies + secs_to_jiffies(LPFC_VMID_TIMER)); } /** diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c index 4d88cfe71cae..08a7f5c6157f 100644 --- a/drivers/scsi/lpfc/lpfc_nportdisc.c +++ b/drivers/scsi/lpfc/lpfc_nportdisc.c @@ -906,7 +906,7 @@ lpfc_rcv_logo(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, (ndlp->nlp_state >= NLP_STE_ADISC_ISSUE || ndlp->nlp_state <= NLP_STE_PRLI_ISSUE)) { mod_timer(&ndlp->nlp_delayfunc, - jiffies + msecs_to_jiffies(1000 * 1)); + jiffies + secs_to_jiffies(1)); set_bit(NLP_DELAY_TMO, &ndlp->nlp_flag); ndlp->nlp_last_elscmd = ELS_CMD_PLOGI; lpfc_printf_vlog(vport, KERN_INFO, @@ -1332,7 +1332,7 @@ lpfc_rcv_els_plogi_issue(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, } /* Put ndlp in npr state set plogi timer for 1 sec */ - mod_timer(&ndlp->nlp_delayfunc, jiffies + msecs_to_jiffies(1000 * 1)); + mod_timer(&ndlp->nlp_delayfunc, jiffies + secs_to_jiffies(1)); set_bit(NLP_DELAY_TMO, &ndlp->nlp_flag); ndlp->nlp_last_elscmd = ELS_CMD_PLOGI; ndlp->nlp_prev_state = NLP_STE_PLOGI_ISSUE; @@ -1936,7 +1936,7 @@ lpfc_cmpl_reglogin_reglogin_issue(struct lpfc_vport *vport, /* Put ndlp in npr state set plogi timer for 1 sec */ mod_timer(&ndlp->nlp_delayfunc, - jiffies + msecs_to_jiffies(1000 * 1)); + jiffies + secs_to_jiffies(1)); set_bit(NLP_DELAY_TMO, &ndlp->nlp_flag); ndlp->nlp_last_elscmd = ELS_CMD_PLOGI; @@ -2743,7 +2743,7 @@ lpfc_rcv_prlo_npr_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, if (!test_bit(NLP_DELAY_TMO, &ndlp->nlp_flag)) { mod_timer(&ndlp->nlp_delayfunc, - jiffies + msecs_to_jiffies(1000 * 1)); + jiffies + secs_to_jiffies(1)); set_bit(NLP_DELAY_TMO, &ndlp->nlp_flag); clear_bit(NLP_NPR_ADISC, &ndlp->nlp_flag); ndlp->nlp_last_elscmd = ELS_CMD_PLOGI; diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index 43dc1da4a156..b1adb9f59097 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -2237,7 +2237,7 @@ lpfc_nvme_lport_unreg_wait(struct lpfc_vport *vport, * wait. Print a message if a 10 second wait expires and renew the * wait. This is unexpected. 
*/ - wait_tmo = msecs_to_jiffies(LPFC_NVME_WAIT_TMO * 1000); + wait_tmo = secs_to_jiffies(LPFC_NVME_WAIT_TMO); while (true) { ret = wait_for_completion_timeout(lport_unreg_cmp, wait_tmo); if (unlikely(!ret)) { diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 874644b31a3e..3fd9723cd271 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -9012,7 +9012,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) /* Start heart beat timer */ mod_timer(&phba->hb_tmofunc, - jiffies + msecs_to_jiffies(1000 * LPFC_HB_MBOX_INTERVAL)); + jiffies + secs_to_jiffies(LPFC_HB_MBOX_INTERVAL)); clear_bit(HBA_HBEAT_INP, &phba->hba_flag); clear_bit(HBA_HBEAT_TMO, &phba->hba_flag); phba->last_completion_time = jiffies; @@ -13323,7 +13323,7 @@ lpfc_sli_mbox_sys_shutdown(struct lpfc_hba *phba, int mbx_action) lpfc_sli_mbox_sys_flush(phba); return; } - timeout = msecs_to_jiffies(LPFC_MBOX_TMO * 1000) + jiffies; + timeout = secs_to_jiffies(LPFC_MBOX_TMO) + jiffies; /* Disable softirqs, including timers from obtaining phba->hbalock */ local_bh_disable(); diff --git a/drivers/scsi/lpfc/lpfc_vmid.c b/drivers/scsi/lpfc/lpfc_vmid.c index cc3e4736f2fe..14dbfe954e42 100644 --- a/drivers/scsi/lpfc/lpfc_vmid.c +++ b/drivers/scsi/lpfc/lpfc_vmid.c @@ -278,7 +278,7 @@ int lpfc_vmid_get_appid(struct lpfc_vport *vport, char *uuid, if (!(vport->phba->pport->vmid_flag & LPFC_VMID_TIMER_ENBLD)) { mod_timer(&vport->phba->inactive_vmid_poll, jiffies + - msecs_to_jiffies(1000 * LPFC_VMID_TIMER)); + secs_to_jiffies(LPFC_VMID_TIMER)); vport->phba->pport->vmid_flag |= LPFC_VMID_TIMER_ENBLD; } } diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index f8c81e53e93f..22e0e79e88ab 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -736,7 +736,7 @@ static int pm8001_init_sas_add(struct pm8001_hba_info *pm8001_ha) return -EIO; } time_remaining = wait_for_completion_timeout(&completion, - msecs_to_jiffies(60*1000)); // 1 min + secs_to_jiffies(60)); // 1 min if (!time_remaining) { kfree(payload.func_specific); pm8001_dbg(pm8001_ha, FAIL, "get_nvmd_req timeout\n"); diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index 07ed33464d71..224ca8d42721 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -624,10 +624,10 @@ static int alloc_private_pages(struct hmm_buffer_object *bo) const gfp_t gfp = __GFP_NOWARN | __GFP_RECLAIM | __GFP_FS; int ret; - ret = alloc_pages_bulk_array(gfp, bo->pgnr, bo->pages); + ret = alloc_pages_bulk(gfp, bo->pgnr, bo->pages); if (ret != bo->pgnr) { free_pages_bulk_array(ret, bo->pages); - dev_err(atomisp_dev, "alloc_pages_bulk_array() failed\n"); + dev_err(atomisp_dev, "alloc_pages_bulk() failed\n"); return -ENOMEM; } diff --git a/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c b/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c index dc0d715ed970..0dbe76ee5570 100644 --- a/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c +++ b/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c @@ -59,7 +59,7 @@ static int bcm2835_audio_send_msg_locked(struct bcm2835_audio_instance *instance if (wait) { if (!wait_for_completion_timeout(&instance->msg_avail_comp, - msecs_to_jiffies(10 * 1000))) { + secs_to_jiffies(10))) { dev_err(instance->dev, "vchi message timeout, msg=%d\n", m->type); return -ETIMEDOUT; diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c 
index dcb1769c3625..50c0c23ae678 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2622,14 +2622,13 @@ static int tty_tiocgicount(struct tty_struct *tty, void __user *arg) static int tty_set_serial(struct tty_struct *tty, struct serial_struct *ss) { - char comm[TASK_COMM_LEN]; int flags; flags = ss->flags & ASYNC_DEPRECATED; if (flags) pr_warn_ratelimited("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n", - __func__, get_task_comm(comm, current), flags); + __func__, current->comm, flags); if (!tty->ops->set_serial) return -ENOTTY; diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index eb7387ee6ebd..11eda6b207f1 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -408,7 +408,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) buf->dma_dir, 0); } - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); @@ -431,8 +431,8 @@ static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, return -ENOMEM; do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill, + page_list); if (!filled) { ret = -ENOMEM; goto err; @@ -1342,7 +1342,7 @@ static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) { int i; - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for (i = 0; i < recv_buf->npages; i++) __free_page(recv_buf->page_list[i]); @@ -1361,9 +1361,9 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, return -ENOMEM; for (;;) { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, - npages - done, - recv_buf->page_list + done); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, + npages - done, + recv_buf->page_list + done); if (!filled) goto err; diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index ee54f4c17857..ba92bb4e9af9 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -77,8 +77,8 @@ static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf, return -ENOMEM; do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill, + page_list); if (!filled) { ret = -ENOMEM; goto err; @@ -112,7 +112,7 @@ static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf) { struct sg_page_iter sg_iter; - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index fb02c1c36004..415a5803b8f4 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -586,7 +586,7 @@ do { \ #define per_cpu_sum(_p) \ ({ \ - typeof(*_p) _ret = 0; \ + TYPEOF_UNQUAL(*_p) _ret = 0; \ \ int cpu; \ for_each_possible_cpu(cpu) \ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b923d0cec61c..d70e9461fea8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -632,7 +632,7 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array); + allocated = alloc_pages_bulk(gfp, nr_pages, page_array); if (unlikely(allocated == last)) { /* No 
progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 27b2fe7f735d..3b99b1e19371 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10110,7 +10110,6 @@ out_unlock_mmap: *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; - sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 06ee397e0c3a..d90eda19bcc4 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -166,7 +166,7 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, if (IS_ERR(in)) { doutc(cl, "Can't lookup inode %llx (err: %ld)\n", realm->ino, PTR_ERR(in)); - qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */ + qri->timeout = jiffies + secs_to_jiffies(60); /* XXX */ } else { qri->timeout = 0; qri->inode = in; diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 0dd65cefce33..9c5aa9d53682 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -87,8 +87,8 @@ int z_erofs_gbuf_growsize(unsigned int nrpages) tmp_pages[j] = gbuf->pages[j]; do { last = j; - j = alloc_pages_bulk_array(GFP_KERNEL, nrpages, - tmp_pages); + j = alloc_pages_bulk(GFP_KERNEL, nrpages, + tmp_pages); if (last == j) goto out; } while (j != nrpages); diff --git a/fs/exec.c b/fs/exec.c index 98cb7ba9983c..1e1f79c514de 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it - * by hand ahead of time. + * ahead of time. */ - if (write && pos < vma->vm_start) { - mmap_write_lock(mm); - ret = expand_downwards(vma, pos); - if (unlikely(ret < 0)) { - mmap_write_unlock(mm); - return NULL; - } - mmap_write_downgrade(mm); - } else - mmap_read_lock(mm); + if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) + return NULL; /* * We are doing an exec(). 
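The atomisp, vfio (mlx5 and virtio), btrfs and erofs hunks above only rename alloc_pages_bulk_array() to alloc_pages_bulk(); the calling convention is unchanged. The allocator fills only the NULL slots of a caller-provided page array and returns the number of populated entries, so callers keep looping until the array is full or no forward progress is made. A minimal sketch of that loop, not part of the patch and assuming the array starts out zeroed (names are illustrative):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm_types.h>

/* pages[] must start out NULL-filled, e.g. from kcalloc(). */
static int foo_fill_page_array(struct page **pages, unsigned long nr)
{
	unsigned long filled = 0, last;

	do {
		last = filled;
		/* Skips already-populated slots; returns total populated. */
		filled = alloc_pages_bulk(GFP_KERNEL, nr, pages);
		if (filled == last)
			return -ENOMEM;	/* no forward progress */
	} while (filled < nr);

	return 0;
}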
'current' is the process diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a2478c2afb3a..a9eddd782dbc 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4043,7 +4043,6 @@ retry: cur_lblock = 1; /* force Empty message */ sis->max = cur_lblock; sis->pages = cur_lblock - 1; - sis->highest_bit = cur_lblock - 1; out: if (not_aligned) f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)", diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3cd99e2dc6ac..5980ac24c7a4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2681,6 +2681,9 @@ static void wait_sb_inodes(struct super_block *sb) if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) continue; + if (mapping_writeback_indeterminate(mapping)) + continue; + spin_unlock_irq(&sb->s_inode_wblist_lock); spin_lock(&inode->i_lock); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7d92a5479998..082ee374f694 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -415,89 +415,11 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct rb_node writepages_entry; struct list_head queue_entry; - struct fuse_writepage_args *next; struct inode *inode; struct fuse_sync_bucket *bucket; }; -static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, - pgoff_t idx_from, pgoff_t idx_to) -{ - struct rb_node *n; - - n = fi->writepages.rb_node; - - while (n) { - struct fuse_writepage_args *wpa; - pgoff_t curr_index; - - wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); - WARN_ON(get_fuse_inode(wpa->inode) != fi); - curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_folios) - n = n->rb_right; - else if (idx_to < curr_index) - n = n->rb_left; - else - return wpa; - } - return NULL; -} - -/* - * Check if any page in a range is under writeback - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - bool found; - - if (RB_EMPTY_ROOT(&fi->writepages)) - return false; - - spin_lock(&fi->lock); - found = fuse_find_writeback(fi, idx_from, idx_to); - spin_unlock(&fi->lock); - - return found; -} - -static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) -{ - return fuse_range_is_writeback(inode, index, index); -} - -/* - * Wait for page writeback to be completed. - * - * Since fuse doesn't rely on the VM writeback tracking, this has to - * use some other means. - */ -static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); -} - -static inline bool fuse_folio_is_writeback(struct inode *inode, - struct folio *folio) -{ - pgoff_t last = folio_next_index(folio) - 1; - return fuse_range_is_writeback(inode, folio_index(folio), last); -} - -static void fuse_wait_on_folio_writeback(struct inode *inode, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); -} - /* * Wait for all pending writepages on the inode to finish. * @@ -886,13 +808,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) ssize_t res; u64 attr_ver; - /* - * With the temporary pages that are used to complete writeback, we can - * have writeback that extends beyond the lifetime of the folio. 
So - * make sure we read a properly synced folio. - */ - fuse_wait_on_folio_writeback(inode, folio); - attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ @@ -1003,17 +918,12 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); unsigned int max_pages, nr_pages; - pgoff_t first = readahead_index(rac); - pgoff_t last = first + readahead_count(rac) - 1; if (fuse_is_bad(inode)) return; - wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); - max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); @@ -1172,7 +1082,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, int err; for (i = 0; i < ap->num_folios; i++) - fuse_wait_on_folio_writeback(inode, ap->folios[i]); + folio_wait_writeback(ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1629,7 +1539,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, return res; } } - if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) { if (!write) inode_lock(inode); fuse_sync_writes(inode); @@ -1826,38 +1736,34 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; - int i; if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_folios; i++) - folio_put(ap->folios[i]); - fuse_file_put(wpa->ia.ff, false); kfree(ap->folios); kfree(wpa); } -static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) -{ - struct backing_dev_info *bdi = inode_to_bdi(inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); -} - static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) - fuse_writepage_finish_stat(inode, ap->folios[i]); + for (i = 0; i < ap->num_folios; i++) { + /* + * Benchmarks showed that ending writeback within the + * scope of the fi->lock alleviates xarray lock + * contention and noticeably improves performance. 
+ */ + folio_end_writeback(ap->folios[i]); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + wb_writeout_inc(&bdi->wb); + } wake_up(&fi->page_waitq); } @@ -1868,7 +1774,6 @@ static void fuse_send_writepage(struct fuse_mount *fm, __releases(fi->lock) __acquires(fi->lock) { - struct fuse_writepage_args *aux, *next; struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &wpa->ia.ap.args; @@ -1905,19 +1810,8 @@ __acquires(fi->lock) out_free: fi->writectr--; - rb_erase(&wpa->writepages_entry, &fi->writepages); fuse_writepage_finish(wpa); spin_unlock(&fi->lock); - - /* After rb_erase() aux request list is private */ - for (aux = wpa->next; aux; aux = next) { - next = aux->next; - aux->next = NULL; - fuse_writepage_finish_stat(aux->inode, - aux->ia.ap.folios[0]); - fuse_writepage_free(aux); - } - fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1945,43 +1839,6 @@ __acquires(fi->lock) } } -static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, - struct fuse_writepage_args *wpa) -{ - pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - - WARN_ON(!wpa->ia.ap.num_folios); - while (*p) { - struct fuse_writepage_args *curr; - pgoff_t curr_index; - - parent = *p; - curr = rb_entry(parent, struct fuse_writepage_args, - writepages_entry); - WARN_ON(curr->inode != wpa->inode); - curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - - if (idx_from >= curr_index + curr->ia.ap.num_folios) - p = &(*p)->rb_right; - else if (idx_to < curr_index) - p = &(*p)->rb_left; - else - return curr; - } - - rb_link_node(&wpa->writepages_entry, parent, p); - rb_insert_color(&wpa->writepages_entry, root); - return NULL; -} - -static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) -{ - WARN_ON(fuse_insert_writeback(root, wpa)); -} - static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { @@ -2001,41 +1858,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, if (!fc->writeback_cache) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); - rb_erase(&wpa->writepages_entry, &fi->writepages); - while (wpa->next) { - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_writepage_args *next = wpa->next; - - wpa->next = next->next; - next->next = NULL; - tree_insert(&fi->writepages, next); - - /* - * Skip fuse_flush_writepages() to make it easy to crop requests - * based on primary request size. - * - * 1st case (trivial): there are no concurrent activities using - * fuse_set/release_nowrite. Then we're on safe side because - * fuse_flush_writepages() would call fuse_send_writepage() - * anyway. - * - * 2nd case: someone called fuse_set_nowrite and it is waiting - * now for completion of all in-flight requests. This happens - * rarely and no more than once per page, so this should be - * okay. - * - * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle - * of fuse_set_nowrite..fuse_release_nowrite section. The fact - * that fuse_set_nowrite returned implies that all in-flight - * requests were completed along with all of their secondary - * requests. Further primary requests are blocked by negative - * writectr. 
Hence there cannot be any in-flight requests and - * no invocations of fuse_writepage_end() while we're in - * fuse_set_nowrite..fuse_release_nowrite section. - */ - fuse_send_writepage(fm, next, inarg->offset + inarg->size); - } fi->writectr--; fuse_writepage_finish(wpa); spin_unlock(&fi->lock); @@ -2122,19 +1944,16 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, - struct folio *tmp_folio, uint32_t folio_index) + uint32_t folio_index) { struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; - folio_copy(tmp_folio, folio); - - ap->folios[folio_index] = tmp_folio; + ap->folios[folio_index] = folio; ap->descs[folio_index].offset = 0; ap->descs[folio_index].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, @@ -2169,18 +1988,12 @@ static int fuse_writepage_locked(struct folio *folio) struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct folio *tmp_folio; struct fuse_file *ff; - int error = -ENOMEM; + int error = -EIO; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto err; - - error = -EIO; ff = fuse_write_file_get(fi); if (!ff) - goto err_nofile; + goto err; wpa = fuse_writepage_args_setup(folio, ff); error = -ENOMEM; @@ -2191,22 +2004,17 @@ static int fuse_writepage_locked(struct folio *folio) ap->num_folios = 1; folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); + fuse_writepage_args_page_fill(wpa, folio, 0); spin_lock(&fi->lock); - tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - folio_end_writeback(folio); - return 0; err_writepage_args: fuse_file_put(ff, false); -err_nofile: - folio_put(tmp_folio); err: mapping_set_error(folio->mapping, error); return error; @@ -2216,7 +2024,6 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct folio **orig_folios; unsigned int max_folios; }; @@ -2251,69 +2058,11 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_folios = wpa->ia.ap.num_folios; - int i; spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - - for (i = 0; i < num_folios; i++) - folio_end_writeback(data->orig_folios[i]); -} - -/* - * Check under fi->lock if the page is under writeback, and insert it onto the - * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's - * one already added for a page at this offset. If there's none, then insert - * this new request onto the auxiliary list, otherwise reuse the existing one by - * swapping the new temp page with the old one. 
- */ -static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); - struct fuse_writepage_args *tmp; - struct fuse_writepage_args *old_wpa; - struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - - WARN_ON(new_ap->num_folios != 0); - new_ap->num_folios = 1; - - spin_lock(&fi->lock); - old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); - if (!old_wpa) { - spin_unlock(&fi->lock); - return true; - } - - for (tmp = old_wpa->next; tmp; tmp = tmp->next) { - pgoff_t curr_index; - - WARN_ON(tmp->inode != new_wpa->inode); - curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == folio->index) { - WARN_ON(tmp->ia.ap.num_folios != 1); - swap(tmp->ia.ap.folios[0], new_ap->folios[0]); - break; - } - } - - if (!tmp) { - new_wpa->next = old_wpa->next; - old_wpa->next = new_wpa; - } - - spin_unlock(&fi->lock); - - if (tmp) { - fuse_writepage_finish_stat(new_wpa->inode, - folio); - fuse_writepage_free(new_wpa); - } - - return false; } static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, @@ -2322,15 +2071,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, { WARN_ON(!ap->num_folios); - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. - */ - if (fuse_folio_is_writeback(data->inode, folio)) - return true; - /* Reached max pages */ if (ap->num_folios == fc->max_pages) return true; @@ -2340,7 +2080,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, return true; /* Discontinuity */ - if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) + if (ap->folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) return true; /* Need to grow the pages array? If so, did the expansion fail? */ @@ -2359,7 +2099,6 @@ static int fuse_writepages_fill(struct folio *folio, struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct folio *tmp_folio; int err; if (!data->ff) { @@ -2374,54 +2113,23 @@ static int fuse_writepages_fill(struct folio *folio, data->wpa = NULL; } - err = -ENOMEM; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto out_unlock; - - /* - * The page must not be redirtied until the writeout is completed - * (i.e. userspace has sent a reply to the write request). Otherwise - * there could be more than one temporary page instance for each real - * page. - * - * This is ensured by holding the page lock in page_mkwrite() while - * checking fuse_page_is_writeback(). We already hold the page lock - * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_folios. - * After this fuse_page_is_writeback() will indicate that the page is - * under writeback, so we can release the page lock. 
- */ if (data->wpa == NULL) { err = -ENOMEM; wpa = fuse_writepage_args_setup(folio, data->ff); - if (!wpa) { - folio_put(tmp_folio); + if (!wpa) goto out_unlock; - } fuse_file_get(wpa->ia.ff); data->max_folios = 1; ap = &wpa->ia.ap; } folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); - data->orig_folios[ap->num_folios] = folio; + fuse_writepage_args_page_fill(wpa, folio, ap->num_folios); err = 0; - if (data->wpa) { - /* - * Protected by fi->lock against concurrent access by - * fuse_page_is_writeback(). - */ - spin_lock(&fi->lock); - ap->num_folios++; - spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, folio)) { + ap->num_folios++; + if (!data->wpa) data->wpa = wpa; - } else { - folio_end_writeback(folio); - } out_unlock: folio_unlock(folio); @@ -2448,13 +2156,6 @@ static int fuse_writepages(struct address_space *mapping, data.wpa = NULL; data.ff = NULL; - err = -ENOMEM; - data.orig_folios = kcalloc(fc->max_pages, - sizeof(struct folio *), - GFP_NOFS); - if (!data.orig_folios) - goto out; - err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { WARN_ON(!data.wpa->ia.ap.num_folios); @@ -2463,7 +2164,6 @@ static int fuse_writepages(struct address_space *mapping, if (data.ff) fuse_file_put(data.ff, false); - kfree(data.orig_folios); out: return err; } @@ -2488,8 +2188,6 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, if (IS_ERR(folio)) goto error; - fuse_wait_on_page_writeback(mapping->host, folio->index); - if (folio_test_uptodate(folio) || len >= folio_size(folio)) goto success; /* @@ -2552,13 +2250,9 @@ static int fuse_launder_folio(struct folio *folio) { int err = 0; if (folio_clear_dirty_for_io(folio)) { - struct inode *inode = folio->mapping->host; - - /* Serialize with pending writeback for the same page */ - fuse_wait_on_page_writeback(inode, folio->index); err = fuse_writepage_locked(folio); if (!err) - fuse_wait_on_page_writeback(inode, folio->index); + folio_wait_writeback(folio); } return err; } @@ -2602,7 +2296,7 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_NOPAGE; } - fuse_wait_on_folio_writeback(inode, folio); + folio_wait_writeback(folio); return VM_FAULT_LOCKED; } @@ -3420,9 +3114,12 @@ static const struct address_space_operations fuse_file_aops = { void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; + if (fc->writeback_cache) + mapping_set_writeback_indeterminate(&inode->i_data); INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); @@ -3430,7 +3127,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); init_waitqueue_head(&fi->direct_io_waitq); - fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 74744c6f2860..23736c5c64c1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -141,9 +141,6 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; - - /* List of writepage requestst (pending or sent) */ - struct rb_root writepages; }; /* readdir cache (directory only) */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index fc1ae5132127..0fc179a59830 100644 --- a/fs/hugetlbfs/inode.c +++ 
b/fs/hugetlbfs/inode.c @@ -99,7 +99,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); @@ -116,10 +115,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; - /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. This can @@ -819,7 +814,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, false); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 5fc0ac36dee3..b90d0eda9e51 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -189,7 +189,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; sis->max = isi.nr_pages; sis->pages = isi.nr_pages - 1; - sis->highest_bit = isi.nr_pages - 1; return isi.nr_extents; } EXPORT_SYMBOL_GPL(iomap_swapfile_activate); diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index ba3e1f591f36..6b506995818d 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -21,6 +21,8 @@ * nilfs_palloc_groups_per_desc_block - get the number of groups that a group * descriptor block can maintain * @inode: inode of metadata file using this allocator + * + * Return: Number of groups that a group descriptor block can maintain. */ static inline unsigned long nilfs_palloc_groups_per_desc_block(const struct inode *inode) @@ -32,6 +34,8 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode) /** * nilfs_palloc_groups_count - get maximum number of groups * @inode: inode of metadata file using this allocator + * + * Return: Maximum number of groups. */ static inline unsigned long nilfs_palloc_groups_count(const struct inode *inode) @@ -43,6 +47,8 @@ nilfs_palloc_groups_count(const struct inode *inode) * nilfs_palloc_init_blockgroup - initialize private variables for allocator * @inode: inode of metadata file using this allocator * @entry_size: size of the persistent object + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size) { @@ -78,6 +84,9 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size) * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * @offset: pointer to store offset number in the group + * + * Return: Number of the group that contains the entry with the index + * specified by @nr. */ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, unsigned long *offset) @@ -93,8 +102,8 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, * @inode: inode of metadata file using this allocator * @group: group number * - * nilfs_palloc_desc_blkoff() returns block offset of the descriptor - * block which contains a descriptor of the specified group. 
+ * Return: Index number in the metadata file of the descriptor block of + * the group specified by @group. */ static unsigned long nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) @@ -111,6 +120,9 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) * * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap * block used to allocate/deallocate entries in the specified group. + * + * Return: Index number in the metadata file of the bitmap block of + * the group specified by @group. */ static unsigned long nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) @@ -125,6 +137,8 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group * @desc: pointer to descriptor structure for the group * @lock: spin lock protecting @desc + * + * Return: Number of free entries written in the group descriptor @desc. */ static unsigned long nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, @@ -143,6 +157,9 @@ nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, * @desc: pointer to descriptor structure for the group * @lock: spin lock protecting @desc * @n: delta to be added + * + * Return: Number of free entries after adjusting the group descriptor + * @desc. */ static u32 nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, @@ -161,6 +178,9 @@ nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, * nilfs_palloc_entry_blkoff - get block offset of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) + * + * Return: Index number in the metadata file of the block containing + * the entry specified by @nr. */ static unsigned long nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) @@ -238,6 +258,12 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, * @blkoff: block offset * @prev: nilfs_bh_assoc struct of the last used buffer * @lock: spin lock protecting @prev + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - Non-existent block. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff, struct nilfs_bh_assoc *prev, @@ -258,6 +284,8 @@ static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff, * @group: group number * @create: create flag * @bhp: pointer to store the resultant buffer head + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_get_desc_block(struct inode *inode, unsigned long group, @@ -277,6 +305,8 @@ static int nilfs_palloc_get_desc_block(struct inode *inode, * @group: group number * @create: create flag * @bhp: pointer to store the resultant buffer head + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_get_bitmap_block(struct inode *inode, unsigned long group, @@ -294,6 +324,8 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode, * nilfs_palloc_delete_bitmap_block - delete a bitmap block * @inode: inode of metadata file using this allocator * @group: group number + * + * Return: 0 on success, or a negative error code on failure. 
*/ static int nilfs_palloc_delete_bitmap_block(struct inode *inode, unsigned long group) @@ -312,6 +344,8 @@ static int nilfs_palloc_delete_bitmap_block(struct inode *inode, * @nr: serial number of the entry (e.g. inode number) * @create: create flag * @bhp: pointer to store the resultant buffer head + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, int create, struct buffer_head **bhp) @@ -328,6 +362,8 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, * nilfs_palloc_delete_entry_block - delete an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr) { @@ -397,6 +433,9 @@ size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, * @bsize: size in bits * @lock: spin lock protecting @bitmap * @wrap: whether to wrap around + * + * Return: Offset number within the group of the found free entry, or + * %-ENOSPC if not found. */ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, @@ -438,6 +477,9 @@ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, * @inode: inode of metadata file using this allocator * @curr: current group number * @max: maximum number of groups + * + * Return: Number of remaining descriptors (= groups) managed by the descriptor + * block. */ static unsigned long nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, @@ -453,6 +495,8 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, * nilfs_palloc_count_desc_blocks - count descriptor blocks number * @inode: inode of metadata file using this allocator * @desc_blocks: descriptor blocks number [out] + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_count_desc_blocks(struct inode *inode, unsigned long *desc_blocks) @@ -473,6 +517,8 @@ static int nilfs_palloc_count_desc_blocks(struct inode *inode, * MDT file growing * @inode: inode of metadata file using this allocator * @desc_blocks: known current descriptor blocks count + * + * Return: true if a group can be added in the metadata file, false if not. */ static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode, unsigned long desc_blocks) @@ -487,6 +533,12 @@ static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode, * @inode: inode of metadata file using this allocator * @nused: current number of used entries * @nmaxp: max number of entries [out] + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ERANGE - Number of entries in use is out of range. */ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) { @@ -518,6 +570,13 @@ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation * @wrap: whether to wrap around + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - Entries exhausted (No entries available for allocation). 
+ * * %-EROFS - Read only filesystem */ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req, bool wrap) @@ -710,6 +769,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the removal + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_palloc_prepare_free_entry(struct inode *inode, struct nilfs_palloc_req *req) @@ -754,6 +815,8 @@ void nilfs_palloc_abort_free_entry(struct inode *inode, * @inode: inode of metadata file using this allocator * @entry_nrs: array of entry numbers to be deallocated * @nitems: number of entries stored in @entry_nrs + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) { diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 3f115ab7e9a7..046d876ea3e0 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -21,6 +21,8 @@ * * The number of entries per group is defined by the number of bits * that a bitmap block can maintain. + * + * Return: Number of entries per group. */ static inline unsigned long nilfs_palloc_entries_per_group(const struct inode *inode) diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index c9e8d9a7d820..ccc1a7aa52d2 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -47,17 +47,14 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, * @ptrp: place to store the value associated to @key * * Description: nilfs_bmap_lookup_at_level() finds a record whose key - * matches @key in the block at @level of the bmap. + * matches @key in the block at @level of the bmap. The record associated + * with @key is stored in the place pointed to by @ptrp. * - * Return Value: On success, 0 is returned and the record associated with @key - * is stored in the place pointed by @ptrp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - A record associated with @key does not exist. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - A record associated with @key does not exist. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, __u64 *ptrp) @@ -138,14 +135,11 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) * Description: nilfs_bmap_insert() inserts the new key-record pair specified * by @key and @rec into @bmap. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EEXIST - A record associated with @key already exist. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EEXIST - A record associated with @key already exists. + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec) { @@ -193,14 +187,11 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) * Description: nilfs_bmap_seek_key() seeks a valid key on @bmap * starting from @start, and stores it to @keyp if found. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - No valid entry was found + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - No valid entry was found. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp) { @@ -236,14 +227,11 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp) * Description: nilfs_bmap_delete() deletes the key-record pair specified by * @key from @bmap. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - A record associated with @key does not exist. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - A record associated with @key does not exist. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key) { @@ -290,12 +278,10 @@ static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key) * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are * greater than or equal to @key from @bmap. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key) { @@ -330,12 +316,10 @@ void nilfs_bmap_clear(struct nilfs_bmap *bmap) * Description: nilfs_bmap_propagate() marks the buffers that directly or * indirectly refer to the block specified by @bh dirty. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) { @@ -362,22 +346,22 @@ void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, /** * nilfs_bmap_assign - assign a new block number to a block - * @bmap: bmap - * @bh: pointer to buffer head + * @bmap: bmap + * @bh: place to store a pointer to the buffer head to which a block + * address is assigned (in/out) * @blocknr: block number - * @binfo: block information + * @binfo: block information * * Description: nilfs_bmap_assign() assigns the block number @blocknr to the - * buffer specified by @bh. + * buffer specified by @bh. 
The block information is stored in the memory + * pointed to by @binfo, and the buffer head may be replaced as a block + * address is assigned, in which case a pointer to the new buffer head is + * stored in the memory pointed to by @bh. * - * Return Value: On success, 0 is returned and the buffer head of a newly - * create buffer and the block information associated with the buffer are - * stored in the place pointed by @bh and @binfo, respectively. On error, one - * of the following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_assign(struct nilfs_bmap *bmap, struct buffer_head **bh, @@ -402,12 +386,10 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap, * Description: nilfs_bmap_mark() marks the block specified by @key and @level * as dirty. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) { @@ -430,7 +412,7 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) * Description: nilfs_test_and_clear() is the atomic operation to test and * clear the dirty state of @bmap. * - * Return Value: 1 is returned if @bmap is dirty, or 0 if clear. + * Return: 1 if @bmap is dirty, or 0 if clear. */ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) { @@ -490,10 +472,10 @@ static struct lock_class_key nilfs_bmap_mdt_lock_key; * * Description: nilfs_bmap_read() initializes the bmap @bmap. * - * Return Value: On success, 0 is returned. On error, the following negative - * error code is returned. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (corrupted bmap). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) { diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 54a3fa0cf67e..568367129092 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -201,7 +201,8 @@ void nilfs_btnode_delete(struct buffer_head *bh) * Note that the current implementation does not support folio sizes larger * than the page size. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EIO - I/O error (metadata corruption). * * %-ENOMEM - Insufficient memory available. */ diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index ef5061bb56da..0d8f7fb15c2e 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -334,7 +334,7 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, * @inode: host inode of btree * @blocknr: block number * - * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. + * Return: 0 if normal, 1 if the node is broken. 
*/ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, size_t size, struct inode *inode, @@ -366,7 +366,7 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, * @node: btree root node to be examined * @inode: host inode of btree * - * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. + * Return: 0 if normal, 1 if the root node is broken. */ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, struct inode *inode) @@ -652,8 +652,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree, * @minlevel: start level * @nextkey: place to store the next valid key * - * Return Value: If a next key was found, 0 is returned. Otherwise, - * -ENOENT is returned. + * Return: 0 if the next key was found, %-ENOENT if not found. */ static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path, diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index c20207d7a989..bcc7d76269ac 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -191,14 +191,11 @@ static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, * @cnop: place to store the next checkpoint number * @bhp: place to store a pointer to buffer_head struct * - * Return Value: On success, it returns 0. On error, the following negative - * error code is returned. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error - * - * %-ENOENT - no block exists in the range. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - no block exists in the range. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile, __u64 start_cno, __u64 end_cno, @@ -239,7 +236,8 @@ static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, * stores it to the inode file given by @ifile and the nilfs root object * given by @root. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EINVAL - Invalid checkpoint. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). @@ -307,7 +305,8 @@ out_sem: * In either case, the buffer of the block containing the checkpoint entry * and the cpfile inode are made dirty for inclusion in the write log. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). * * %-EROFS - Read only filesystem @@ -376,7 +375,8 @@ out_sem: * cpfile with the data given by the arguments @root, @blkinc, @ctime, and * @minor. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). */ @@ -447,14 +447,11 @@ error: * the period from @start to @end, excluding @end itself. The checkpoints * which have been already deleted are ignored. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. 
- * - * %-EINVAL - invalid checkpoints. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Invalid checkpoints. + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, __u64 start, @@ -718,7 +715,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, * number to continue searching. * * Return: Count of checkpoint info items stored in the output buffer on - * success, or the following negative error code on failure. + * success, or one of the following negative error codes on failure: * * %-EINVAL - Invalid checkpoint mode. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). @@ -743,7 +740,8 @@ ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, * @cpfile: checkpoint file inode * @cno: checkpoint number to delete * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EBUSY - Checkpoint in use (snapshot specified). * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - No valid checkpoint found. @@ -1011,7 +1009,7 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) * @cno: checkpoint number * * Return: 1 if the checkpoint specified by @cno is a snapshot, 0 if not, or - * the following negative error code on failure. + * one of the following negative error codes on failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - No such checkpoint. * * %-ENOMEM - Insufficient memory available. @@ -1058,14 +1056,11 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) * Description: nilfs_change_cpmode() changes the mode of the checkpoint * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - No such checkpoint. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - No such checkpoint. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) { @@ -1097,14 +1092,12 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) * @cpstat: pointer to a structure of checkpoint statistics * * Description: nilfs_cpfile_get_stat() returns information about checkpoints. + * The checkpoint statistics are stored in the location pointed to by @cpstat. * - * Return Value: On success, 0 is returned, and checkpoints information is - * stored in the place pointed by @cpstat. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) { @@ -1135,6 +1128,8 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) * @cpsize: size of a checkpoint entry * @raw_inode: on-disk cpfile inode * @inodep: buffer to store the inode + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, struct nilfs_inode *raw_inode, struct inode **inodep) diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index e220dcb08aa6..c664daba56ae 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -276,7 +276,8 @@ void nilfs_dat_abort_update(struct inode *dat, * @dat: DAT file inode * @vblocknr: virtual block number * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EINVAL - Invalid DAT entry (internal code). * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. @@ -302,14 +303,11 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr) * Description: nilfs_dat_freev() frees the virtual block numbers specified by * @vblocknrs and @nitems. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - The virtual block number have not been allocated. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - The virtual block number have not been allocated. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) { @@ -325,12 +323,10 @@ int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) * Description: nilfs_dat_move() changes the block number associated with * @vblocknr to @blocknr. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) { @@ -390,17 +386,14 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) * @blocknrp: pointer to a block number * * Description: nilfs_dat_translate() maps the virtual block number @vblocknr - * to the corresponding block number. + * to the corresponding block number. The block number associated with + * @vblocknr is stored in the place pointed to by @blocknrp. * - * Return Value: On success, 0 is returned and the block number associated - * with @vblocknr is stored in the place pointed by @blocknrp. On error, one - * of the following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - A block number associated with @vblocknr does not exist. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - A block number associated with @vblocknr does not exist. + * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) { @@ -489,6 +482,8 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, * @entry_size: size of a dat entry * @raw_inode: on-disk dat inode * @inodep: buffer to store the inode + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_dat_read(struct super_block *sb, size_t entry_size, struct nilfs_inode *raw_inode, struct inode **inodep) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 0a3aea6c416b..9b7f8e9655a2 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -400,7 +400,7 @@ int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino) return 0; } -void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, +int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct folio *folio, struct inode *inode) { size_t from = offset_in_folio(folio, de); @@ -410,11 +410,15 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, folio_lock(folio); err = nilfs_prepare_chunk(folio, from, to); - BUG_ON(err); + if (unlikely(err)) { + folio_unlock(folio); + return err; + } de->inode = cpu_to_le64(inode->i_ino); de->file_type = fs_umode_to_ftype(inode->i_mode); nilfs_commit_chunk(folio, mapping, from, to); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + return 0; } /* @@ -543,7 +547,10 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct folio *folio) from = (char *)pde - kaddr; folio_lock(folio); err = nilfs_prepare_chunk(folio, from, to); - BUG_ON(err); + if (unlikely(err)) { + folio_unlock(folio); + goto out; + } if (pde) pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 2dbb15767df1..561c220799c7 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -46,14 +46,11 @@ * specified by @pbn to the GC pagecache with the key @blkoff. * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer. * - * Return Value: On success, 0 is returned. On Error, one of the following - * negative error code is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - The block specified with @pbn does not exist. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - The block specified with @pbn does not exist. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, sector_t pbn, __u64 vbn, @@ -114,12 +111,11 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, * specified by @vbn to the GC pagecache. @pbn can be supplied by the * caller to avoid translation of the disk block address. * - * Return Value: On success, 0 is returned. On Error, one of the following - * negative error code is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - Invalid virtual block address. + * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, __u64 vbn, struct buffer_head **out_bh) diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index e7339eb3c08a..c4cd4a4dedd0 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -38,17 +38,16 @@ static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile) * @out_ino: pointer to a variable to store inode number * @out_bh: buffer_head contains newly allocated disk inode * - * Return Value: On success, 0 is returned and the newly allocated inode - * number is stored in the place pointed by @ino, and buffer_head pointer - * that contains newly allocated disk inode structure is stored in the - * place pointed by @out_bh - * On error, one of the following negative error codes is returned. + * nilfs_ifile_create_inode() allocates a new inode in the ifile metadata + * file and stores the inode number in the variable pointed to by @out_ino, + * as well as storing the ifile's buffer with the disk inode in the location + * pointed to by @out_bh. * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOSPC - No inode left. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No inode left. */ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, struct buffer_head **out_bh) @@ -83,14 +82,11 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, * @ifile: ifile inode * @ino: inode number * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - The inode number @ino have not been allocated. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - Inode number unallocated. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) { @@ -150,6 +146,8 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, * @ifile: ifile inode * @nmaxinodes: current maximum of available inodes count [out] * @nfreeinodes: free inodes count [out] + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_ifile_count_free_inodes(struct inode *ifile, u64 *nmaxinodes, u64 *nfreeinodes) @@ -174,7 +172,8 @@ int nilfs_ifile_count_free_inodes(struct inode *ifile, * @cno: number of checkpoint entry to read * @inode_size: size of an inode * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EINVAL - Invalid checkpoint. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 23f3a75edd50..e8015d24a82c 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -68,6 +68,8 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n) * * This function does not issue actual read request of the specified data * block. It is done by VFS. + * + * Return: 0 on success, or a negative error code on failure. 
*/ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct buffer_head *bh_result, int create) @@ -141,6 +143,8 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, * address_space_operations. * @file: file struct of the file to be read * @folio: the folio to be read + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_read_folio(struct file *file, struct folio *folio) { @@ -598,10 +602,7 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, * or does nothing if the inode already has it. This function allocates * an additional inode to maintain page cache of B-tree nodes one-on-one. * - * Return Value: On success, 0 is returned. On errors, one of the following - * negative error code is returned. - * - * %-ENOMEM - Insufficient memory available. + * Return: 0 on success, or %-ENOMEM if memory is insufficient. */ int nilfs_attach_btree_node_cache(struct inode *inode) { @@ -660,11 +661,8 @@ void nilfs_detach_btree_node_cache(struct inode *inode) * in one inode and the one for b-tree node pages is set up in the * other inode, which is attached to the former inode. * - * Return Value: On success, a pointer to the inode for data pages is - * returned. On errors, one of the following negative error code is returned - * in a pointer type. - * - * %-ENOMEM - Insufficient memory available. + * Return: a pointer to the inode for data pages on success, or %-ENOMEM + * if memory is insufficient. */ struct inode *nilfs_iget_for_shadow(struct inode *inode) { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index fa77f78df681..a66d62a51f77 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -33,17 +33,14 @@ * @dofunc: concrete function of get/set metadata info * * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by means of - * calling dofunc() function on the basis of @argv argument. + * calling dofunc() function on the basis of @argv argument. If successful, + * the requested metadata information is copied to userspace memory. * - * Return Value: On success, 0 is returned and requested metadata info - * is copied into userspace. On error, one of the following - * negative error codes is returned. - * - * %-EINVAL - Invalid arguments from userspace. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EFAULT - Failure during execution of requested operation. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during execution of requested operation. + * * %-EINVAL - Invalid arguments from userspace. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, struct nilfs_argv *argv, int dir, @@ -190,13 +187,10 @@ static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) * given checkpoint between checkpoint and snapshot state. This ioctl * is used in chcp and mkcp utilities. * - * Return Value: On success, 0 is returned and mode of a checkpoint is - * changed. On error, one of the following negative error codes - * is returned. - * - * %-EPERM - Operation not permitted. - * - * %-EFAULT - Failure during checkpoint mode changing. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * %-EFAULT - Failure during checkpoint mode changing. + * %-EPERM - Operation not permitted. 
*/ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -244,13 +238,10 @@ out: * checkpoint from NILFS2 file system. This ioctl is used in rmcp * utility. * - * Return Value: On success, 0 is returned and a checkpoint is - * removed. On error, one of the following negative error codes - * is returned. - * - * %-EPERM - Operation not permitted. - * - * %-EFAULT - Failure during checkpoint removing. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * %-EFAULT - Failure during checkpoint removing. + * %-EPERM - Operation not permitted. */ static int nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, @@ -296,7 +287,7 @@ out: * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in * lscp utility and by nilfs_cleanerd daemon. * - * Return value: count of nilfs_cpinfo structures in output buffer. + * Return: Count of nilfs_cpinfo structures in output buffer. */ static ssize_t nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, @@ -320,17 +311,14 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, * * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints. * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities - * and by nilfs_cleanerd daemon. + * and by nilfs_cleanerd daemon. The checkpoint statistics are copied to + * the userspace memory pointed to by @argp. * - * Return Value: On success, 0 is returned, and checkpoints information is - * copied into userspace pointer @argp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EFAULT - Failure during getting checkpoints statistics. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during getting checkpoints statistics. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -363,7 +351,8 @@ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon. * - * Return value: count of nilfs_suinfo structures in output buffer. + * Return: Count of nilfs_suinfo structures in output buffer on success, + * or a negative error code on failure. */ static ssize_t nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, @@ -387,17 +376,14 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, * * Description: nilfs_ioctl_get_sustat() returns segment usage statistics. * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities - * and by nilfs_cleanerd daemon. + * and by nilfs_cleanerd daemon. The requested segment usage information is + * copied to the userspace memory pointed to by @argp. * - * Return Value: On success, 0 is returned, and segment usage information is - * copied into userspace pointer @argp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EFAULT - Failure during getting segment usage statistics. 
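Several of the rewritten ioctl comments above (nilfs_ioctl_get_cpstat() and similar) list %-EFAULT as "failure during getting ... statistics"; in these ioctls that code ultimately comes from a failed copy of the result to the user buffer. A hedged sketch of that pattern, with an illustrative helper name rather than the exact nilfs2 code:

    #include <linux/uaccess.h>
    #include <linux/nilfs2_api.h>

    /* Illustrative only: hand a checkpoint-statistics result back to user space. */
    static int example_copy_cpstat_to_user(void __user *argp,
                                           const struct nilfs_cpstat *cpstat)
    {
            if (copy_to_user(argp, cpstat, sizeof(*cpstat)))
                    return -EFAULT;         /* the %-EFAULT case documented above */
            return 0;
    }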
+ * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during getting segment usage statistics. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -430,7 +416,8 @@ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used * by nilfs_cleanerd daemon. * - * Return value: count of nilfs_vinfo structures in output buffer. + * Return: Count of nilfs_vinfo structures in output buffer on success, or + * a negative error code on failure. */ static ssize_t nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, @@ -457,7 +444,8 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl * is used by nilfs_cleanerd daemon. * - * Return value: count of nilfs_bdescs structures in output buffer. + * Return: Count of nilfs_bdescs structures in output buffer on success, or + * a negative error code on failure. */ static ssize_t nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, @@ -494,19 +482,15 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, * * Description: nilfs_ioctl_do_get_bdescs() function returns information * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl - * is used by nilfs_cleanerd daemon. + * is used by nilfs_cleanerd daemon. If successful, disk block descriptors + * are copied to userspace pointer @argp. * - * Return Value: On success, 0 is returned, and disk block descriptors are - * copied into userspace pointer @argp. On error, one of the following - * negative error codes is returned. - * - * %-EINVAL - Invalid arguments from userspace. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EFAULT - Failure during getting disk block descriptors. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during getting disk block descriptors. + * * %-EINVAL - Invalid arguments from userspace. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -540,16 +524,12 @@ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, * Description: nilfs_ioctl_move_inode_block() function registers data/node * buffer in the GC pagecache and submit read request. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - Requested block doesn't exist. - * - * %-EEXIST - Blocks conflict is detected. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EEXIST - Block conflict detected. + * * %-EIO - I/O error. + * * %-ENOENT - Requested block doesn't exist. + * * %-ENOMEM - Insufficient memory available. 
*/ static int nilfs_ioctl_move_inode_block(struct inode *inode, struct nilfs_vdesc *vdesc, @@ -604,8 +584,8 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, * blocks that garbage collector specified with the array of nilfs_vdesc * structures and stores them into page caches of GC inodes. * - * Return Value: Number of processed nilfs_vdesc structures or - * error code, otherwise. + * Return: Number of processed nilfs_vdesc structures on success, or + * a negative error code on failure. */ static int nilfs_ioctl_move_blocks(struct super_block *sb, struct nilfs_argv *argv, void *buf) @@ -682,14 +662,11 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, * in the period from p_start to p_end, excluding p_end itself. The checkpoints * which have been already deleted are ignored. * - * Return Value: Number of processed nilfs_period structures or - * error code, otherwise. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - invalid checkpoints. + * Return: Number of processed nilfs_period structures on success, or one of + * the following negative error codes on failure: + * * %-EINVAL - invalid checkpoints. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) @@ -717,14 +694,11 @@ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, * Description: nilfs_ioctl_free_vblocknrs() function frees * the virtual block numbers specified by @buf and @argv->v_nmembs. * - * Return Value: Number of processed virtual block numbers or - * error code, otherwise. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - The virtual block number have not been allocated. + * Return: Number of processed virtual block numbers on success, or one of the + * following negative error codes on failure: + * * %-EIO - I/O error. + * * %-ENOENT - Unallocated virtual block number. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) @@ -746,14 +720,11 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, * Description: nilfs_ioctl_mark_blocks_dirty() function marks * metadata file or data blocks as dirty. * - * Return Value: Number of processed block descriptors or - * error code, otherwise. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error - * - * %-ENOENT - the specified block does not exist (hole block) + * Return: Number of processed block descriptors on success, or one of the + * following negative error codes on failure: + * * %-EIO - I/O error. + * * %-ENOENT - Non-existent block (hole block). + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) @@ -852,7 +823,7 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, * from userspace. The NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by * nilfs_cleanerd daemon. * - * Return Value: On success, 0 is returned or error code, otherwise. + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -976,20 +947,14 @@ out: * and metadata are written out to the device when it successfully * returned. * - * Return Value: On success, 0 is retured. 
On errors, one of the following - * negative error code is returned. - * - * %-EROFS - Read only filesystem. - * - * %-EIO - I/O error - * - * %-ENOSPC - No space left on device (only in a panic state). - * - * %-ERESTARTSYS - Interrupted. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EFAULT - Failure during execution of requested operation. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during execution of requested operation. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No space left on device (only in a panic state). + * * %-ERESTARTSYS - Interrupted. + * * %-EROFS - Read only filesystem. */ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -1023,7 +988,7 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, * @filp: file object * @argp: pointer on argument from userspace * - * Return Value: On success, 0 is returned or error code, otherwise. + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, void __user *argp) @@ -1059,7 +1024,7 @@ out: * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which * performs the actual trim operation. * - * Return Value: On success, 0 is returned or negative error code, otherwise. + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp) { @@ -1101,7 +1066,7 @@ static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp) * of segments in bytes and upper limit of segments in bytes. * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility. * - * Return Value: On success, 0 is returned or error code, otherwise. + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) { @@ -1152,17 +1117,15 @@ out: * @dofunc: concrete function of getting metadata info * * Description: nilfs_ioctl_get_info() gets metadata info by means of - * calling dofunc() function. + * calling dofunc() function. The requested metadata information is copied + * to userspace memory @argp. * - * Return Value: On success, 0 is returned and requested metadata info - * is copied into userspace. On error, one of the following - * negative error codes is returned. - * - * %-EINVAL - Invalid arguments from userspace. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EFAULT - Failure during execution of requested operation. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EFAULT - Failure during execution of requested operation. + * * %-EINVAL - Invalid arguments from userspace. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp, @@ -1202,18 +1165,14 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, * encapsulated in nilfs_argv and updates the segment usage info * according to the flags in nilfs_suinfo_update. * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EPERM - Not enough permissions - * - * %-EFAULT - Error copying input data - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. 
- * - * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EEXIST - Block conflict detected. + * * %-EFAULT - Error copying input data. + * * %-EINVAL - Invalid values in input (segment number, flags or nblocks). + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. + * * %-EPERM - Not enough permissions. */ static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -1309,7 +1268,8 @@ static int nilfs_ioctl_get_fslabel(struct super_block *sb, void __user *argp) * @filp: file object * @argp: pointer to userspace memory that contains the volume name * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EFAULT - Error copying input data. * * %-EINVAL - Label length exceeds record size in superblock. * * %-EIO - I/O error. diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 965b5ad1c0df..2f850a18d6e7 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -226,20 +226,21 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, * @out_bh: output of a pointer to the buffer_head * * nilfs_mdt_get_block() looks up the specified buffer and tries to create - * a new buffer if @create is not zero. On success, the returned buffer is - * assured to be either existing or formatted using a buffer lock on success. - * @out_bh is substituted only when zero is returned. + * a new buffer if @create is not zero. If (and only if) this function + * succeeds, it stores a pointer to the retrieved buffer head in the location + * pointed to by @out_bh. * - * Return Value: On success, it returns 0. On error, the following negative - * error code is returned. + * The retrieved buffer may be either an existing one or a newly allocated one. + * For a newly created buffer, if the callback function argument @init_block + * is non-NULL, the callback will be called with the buffer locked to format + * the block. * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error - * - * %-ENOENT - the specified block does not exist (hole block) - * - * %-EROFS - Read only filesystem (for create mode) + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - The specified block does not exist (hole block). + * * %-ENOMEM - Insufficient memory available. + * * %-EROFS - Read only filesystem (for create mode). */ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, void (*init_block)(struct inode *, @@ -275,14 +276,11 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, * @out_bh, and block offset to @blkoff, respectively. @out_bh and * @blkoff are substituted only when zero is returned. * - * Return Value: On success, it returns 0. On error, the following negative - * error code is returned. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error - * - * %-ENOENT - no block was found in the range + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - No block was found in the range. + * * %-ENOMEM - Insufficient memory available. 
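The rewritten nilfs_mdt_get_block() comment above now states explicitly that @out_bh is set only on success and that a non-NULL @init_block callback formats a newly created block under the buffer lock. A hedged caller sketch under those rules; the tail of the prototype is assumed from context and the function name is hypothetical:

    /* Illustrative only: look up an existing metadata block and read it. */
    static int example_peek_mdt_block(struct inode *mdt_inode,
                                      unsigned long blkoff)
    {
            struct buffer_head *bh;
            int err;

            /* create == 0 and no init_block: only existing blocks are returned */
            err = nilfs_mdt_get_block(mdt_inode, blkoff, 0, NULL, &bh);
            if (err)
                    return err;     /* -EIO, -ENOENT or -ENOMEM per the list above */

            /* ... inspect bh->b_data while holding the reference ... */
            brelse(bh);             /* drop the reference taken for @out_bh */
            return 0;
    }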
*/ int nilfs_mdt_find_block(struct inode *inode, unsigned long start, unsigned long end, unsigned long *blkoff, @@ -321,12 +319,11 @@ out: * @inode: inode of the meta data file * @block: block offset * - * Return Value: On success, zero is returned. - * On error, one of the following negative error code is returned. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - Non-existent block. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) { @@ -349,12 +346,10 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and * tries to release the page including the buffer from a page cache. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error code is returned. - * - * %-EBUSY - page has an active buffer. - * - * %-ENOENT - page cache has no page addressed by the offset. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EBUSY - Page has an active buffer. + * * %-ENOENT - Page cache has no page addressed by the offset. */ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) { @@ -524,6 +519,8 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size, * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file * @inode: inode of the metadata file * @shadow: shadow mapping + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) @@ -545,6 +542,8 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, /** * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map * @inode: inode of the metadata file + * + * Return: 0 on success, or a negative error code on failure. 
*/ int nilfs_mdt_save_to_shadow_map(struct inode *inode) { diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 1d836a5540f3..953fbd5f0851 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -370,6 +370,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, struct folio *old_folio; struct nilfs_dir_entry *old_de; struct nilfs_transaction_info ti; + bool old_is_dir = S_ISDIR(old_inode->i_mode); int err; if (flags & ~RENAME_NOREPLACE) @@ -385,7 +386,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, goto out; } - if (S_ISDIR(old_inode->i_mode)) { + if (old_is_dir && old_dir != new_dir) { err = -EIO; dir_de = nilfs_dotdot(old_inode, &dir_folio); if (!dir_de) @@ -397,7 +398,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, struct nilfs_dir_entry *new_de; err = -ENOTEMPTY; - if (dir_de && !nilfs_empty_dir(new_inode)) + if (old_is_dir && !nilfs_empty_dir(new_inode)) goto out_dir; new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, @@ -406,11 +407,13 @@ static int nilfs_rename(struct mnt_idmap *idmap, err = PTR_ERR(new_de); goto out_dir; } - nilfs_set_link(new_dir, new_de, new_folio, old_inode); + err = nilfs_set_link(new_dir, new_de, new_folio, old_inode); folio_release_kmap(new_folio, new_de); + if (unlikely(err)) + goto out_dir; nilfs_mark_inode_dirty(new_dir); inode_set_ctime_current(new_inode); - if (dir_de) + if (old_is_dir) drop_nlink(new_inode); drop_nlink(new_inode); nilfs_mark_inode_dirty(new_inode); @@ -418,7 +421,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, err = nilfs_add_link(new_dentry, old_inode); if (err) goto out_dir; - if (dir_de) { + if (old_is_dir) { inc_nlink(new_dir); nilfs_mark_inode_dirty(new_dir); } @@ -430,28 +433,28 @@ static int nilfs_rename(struct mnt_idmap *idmap, */ inode_set_ctime_current(old_inode); - nilfs_delete_entry(old_de, old_folio); - - if (dir_de) { - nilfs_set_link(old_inode, dir_de, dir_folio, new_dir); - folio_release_kmap(dir_folio, dir_de); - drop_nlink(old_dir); + err = nilfs_delete_entry(old_de, old_folio); + if (likely(!err)) { + if (old_is_dir) { + if (old_dir != new_dir) + err = nilfs_set_link(old_inode, dir_de, + dir_folio, new_dir); + drop_nlink(old_dir); + } + nilfs_mark_inode_dirty(old_dir); } - folio_release_kmap(old_folio, old_de); - - nilfs_mark_inode_dirty(old_dir); nilfs_mark_inode_dirty(old_inode); - err = nilfs_transaction_commit(old_dir->i_sb); - return err; - out_dir: if (dir_de) folio_release_kmap(dir_folio, dir_de); out_old: folio_release_kmap(old_folio, old_de); out: - nilfs_transaction_abort(old_dir->i_sb); + if (likely(!err)) + err = nilfs_transaction_commit(old_dir->i_sb); + else + nilfs_transaction_abort(old_dir->i_sb); return err; } diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index dff241c53fc5..cb6ed54accd7 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -261,8 +261,8 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *, int nilfs_delete_entry(struct nilfs_dir_entry *, struct folio *); int nilfs_empty_dir(struct inode *); struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct folio **); -void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, - struct folio *, struct inode *); +int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, + struct folio *folio, struct inode *inode); /* file.c */ extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 9de2a494a069..806b056d2260 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -135,8 +135,7 @@ void 
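With nilfs_set_link() now returning an error and nilfs_delete_entry() failures no longer hitting a BUG_ON(), the reworked nilfs_rename() above routes every outcome through one exit point that commits the transaction when err is still zero and aborts it otherwise. The same begin/commit-or-abort shape, reduced to a sketch around a hypothetical helper (do_the_actual_update() is not a real function):

    static int example_nilfs_dir_op(struct inode *dir)
    {
            struct nilfs_transaction_info ti;
            int err;

            err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
            if (unlikely(err))
                    return err;

            err = do_the_actual_update(dir);        /* hypothetical helper */

            if (likely(!err))
                    err = nilfs_transaction_commit(dir->i_sb);
            else
                    nilfs_transaction_abort(dir->i_sb);
            return err;
    }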
nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) * nilfs_folio_buffers_clean - Check if a folio has dirty buffers or not. * @folio: Folio to be checked. * - * nilfs_folio_buffers_clean() returns false if the folio has dirty buffers. - * Otherwise, it returns true. + * Return: false if the folio has dirty buffers, true otherwise. */ bool nilfs_folio_buffers_clean(struct folio *folio) { @@ -392,6 +391,11 @@ void nilfs_clear_dirty_pages(struct address_space *mapping) /** * nilfs_clear_folio_dirty - discard dirty folio * @folio: dirty folio that will be discarded + * + * nilfs_clear_folio_dirty() clears working states including dirty state for + * the folio and its buffers. If the folio has buffers, clear only if it is + * confirmed that none of the buffer heads are busy (none have valid + * references and none are locked). */ void nilfs_clear_folio_dirty(struct folio *folio) { @@ -399,10 +403,6 @@ void nilfs_clear_folio_dirty(struct folio *folio) BUG_ON(!folio_test_locked(folio)); - folio_clear_uptodate(folio); - folio_clear_mappedtodisk(folio); - folio_clear_checked(folio); - head = folio_buffers(folio); if (head) { const unsigned long clear_bits = @@ -410,6 +410,25 @@ void nilfs_clear_folio_dirty(struct folio *folio) BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) | BIT(BH_Delay)); + bool busy, invalidated = false; + +recheck_buffers: + busy = false; + bh = head; + do { + if (atomic_read(&bh->b_count) | buffer_locked(bh)) { + busy = true; + break; + } + } while (bh = bh->b_this_page, bh != head); + + if (busy) { + if (invalidated) + return; + invalidate_bh_lrus(); + invalidated = true; + goto recheck_buffers; + } bh = head; do { @@ -419,6 +438,9 @@ void nilfs_clear_folio_dirty(struct folio *folio) } while (bh = bh->b_this_page, bh != head); } + folio_clear_uptodate(folio); + folio_clear_mappedtodisk(folio); + folio_clear_checked(folio); __nilfs_clear_folio_dirty(folio); } @@ -477,8 +499,9 @@ void __nilfs_clear_folio_dirty(struct folio *folio) * This function searches an extent of buffers marked "delayed" which * starts from a block offset equal to or larger than @start_blk. If * such an extent was found, this will store the start offset in - * @blkoff and return its length in blocks. Otherwise, zero is - * returned. + * @blkoff and return its length in blocks. + * + * Return: Length in blocks of found extent, 0 otherwise. */ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index e43405bf521e..22aecf6e2344 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -88,6 +88,8 @@ static int nilfs_warn_segment_error(struct super_block *sb, int err) * @check_bytes: number of bytes to be checked * @start: DBN of start block * @nblock: number of blocks to be checked + * + * Return: 0 on success, or %-EIO if an I/O error occurs. */ static int nilfs_compute_checksum(struct the_nilfs *nilfs, struct buffer_head *bhs, u32 *sum, @@ -126,6 +128,11 @@ static int nilfs_compute_checksum(struct the_nilfs *nilfs, * @sr_block: disk block number of the super root block * @pbh: address of a buffer_head pointer to return super root buffer * @check: CRC check flag + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Super root block corrupted. + * * %-EIO - I/O error. 
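The nilfs_clear_folio_dirty() change above clears the folio and buffer state only after confirming that no buffer head is referenced or locked; if one looks busy, it calls invalidate_bh_lrus() once, since the per-CPU buffer-head LRU caches each pin a reference, and then re-checks. The retry loop from that hunk, annotated as a standalone sketch:

    bool busy, invalidated = false;
    struct buffer_head *bh, *head = folio_buffers(folio);

    recheck_buffers:
    busy = false;
    bh = head;
    do {
            /* a nonzero b_count or a locked buffer means someone may still use it */
            if (atomic_read(&bh->b_count) | buffer_locked(bh)) {
                    busy = true;
                    break;
            }
    } while (bh = bh->b_this_page, bh != head);

    if (busy) {
            if (invalidated)
                    return;                 /* still busy after the LRU flush: bail out */
            invalidate_bh_lrus();           /* drop per-CPU BH LRU references */
            invalidated = true;
            goto recheck_buffers;
    }
    /* only now is it safe to clear the uptodate/dirty state, as the hunk does */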
*/ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, struct buffer_head **pbh, int check) @@ -176,6 +183,8 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: pointer to return segment summary structure + * + * Return: Buffer head pointer, or NULL if an I/O error occurs. */ static struct buffer_head * nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, @@ -195,6 +204,13 @@ nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, * @seg_seq: sequence number of segment * @bh_sum: buffer head of summary block * @sum: segment summary struct + * + * Return: 0 on success, or one of the following internal codes on failure: + * * %NILFS_SEG_FAIL_MAGIC - Magic number mismatch. + * * %NILFS_SEG_FAIL_SEQ - Sequence number mismatch. + * * %NILFS_SEG_FAIL_CONSISTENCY - Block count out of range. + * * %NILFS_SEG_FAIL_IO - I/O error. + * * %NILFS_SEG_FAIL_CHECKSUM_FULL - Full log checksum verification failed. */ static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, struct buffer_head *bh_sum, @@ -238,6 +254,9 @@ out: * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be read + * + * Return: Kernel space address of current segment summary entry, or + * NULL if an I/O error occurs. */ static void *nilfs_read_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, @@ -300,6 +319,11 @@ static void nilfs_skip_summary_info(struct the_nilfs *nilfs, * @start_blocknr: start block number of the log * @sum: log summary information * @head: list head to add nilfs_recovery_block struct + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary *sum, @@ -571,6 +595,12 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, * @sb: super block instance * @root: NILFS root instance * @ri: pointer to a nilfs_recovery_info + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Log format error. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct super_block *sb, @@ -754,18 +784,13 @@ static void nilfs_abort_roll_forward(struct the_nilfs *nilfs) * @sb: super block instance * @ri: pointer to a nilfs_recovery_info struct to store search results. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error code is returned. - * - * %-EINVAL - Inconsistent filesystem state. - * - * %-EIO - I/O error - * - * %-ENOSPC - No space left on device (only in a panic state). - * - * %-ERESTARTSYS - Interrupted. - * - * %-ENOMEM - Insufficient memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Inconsistent filesystem state. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No space left on device (only in a panic state). + * * %-ERESTARTSYS - Interrupted. */ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb, @@ -830,14 +855,11 @@ failed: * segment pointed by the superblock.
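The new nilfs_validate_log() comment lists internal NILFS_SEG_FAIL_* codes rather than errnos; these are turned into ordinary negative error codes (plus a warning) by nilfs_warn_segment_error(), visible at the top of this recovery.c hunk series. The exact mapping below is an assumption, not quoted from the patch, but the translation idea looks roughly like this:

    /* Illustrative only: translate an internal log-validation code to an errno. */
    static int example_segment_error_to_errno(int err)
    {
            switch (err) {
            case NILFS_SEG_FAIL_IO:
                    return -EIO;            /* propagate the I/O error */
            case NILFS_SEG_FAIL_MAGIC:
            case NILFS_SEG_FAIL_SEQ:
            case NILFS_SEG_FAIL_CONSISTENCY:
            case NILFS_SEG_FAIL_CHECKSUM_FULL:
            default:
                    return -EINVAL;         /* treat as a malformed log */
            }
    }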
It sets up struct the_nilfs through * this search. It fills nilfs_recovery_info (ri) required for recovery. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error code is returned. - * - * %-EINVAL - No valid segment found - * - * %-EIO - I/O error - * - * %-ENOMEM - Insufficient memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - No valid segment found. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. */ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index e08cab03366b..a8bdf3d318ea 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -406,12 +406,7 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, * @segbuf: buffer storing a log to be written * @nilfs: nilfs object * - * Return Value: On Success, 0 is returned. On Error, one of the following - * negative error code is returned. - * - * %-EIO - I/O error - * - * %-ENOMEM - Insufficient memory available. + * Return: Always 0. */ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, struct the_nilfs *nilfs) @@ -452,10 +447,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * nilfs_segbuf_wait - wait for completion of requested BIOs * @segbuf: segment buffer * - * Return Value: On Success, 0 is returned. On Error, one of the following - * negative error code is returned. - * - * %-EIO - I/O error + * Return: 0 on success, or %-EIO if I/O error is detected. */ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) { diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 587251830897..3a202e51b360 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -191,12 +191,10 @@ static int nilfs_prepare_segment_lock(struct super_block *sb, * When @vacancy_check flag is set, this function will check the amount of * free space, and will wait for the GC to reclaim disk space if low capacity. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error code is returned. - * - * %-ENOMEM - Insufficient memory available. - * - * %-ENOSPC - No space left on device + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No space left on device (if checking free space). */ int nilfs_transaction_begin(struct super_block *sb, struct nilfs_transaction_info *ti, @@ -252,6 +250,8 @@ int nilfs_transaction_begin(struct super_block *sb, * nilfs_transaction_commit() sets a timer to start the segment * constructor. If a sync flag is set, it starts construction * directly. + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_transaction_commit(struct super_block *sb) { @@ -407,6 +407,8 @@ static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, /** * nilfs_segctor_reset_segment_buffer - reset the current segment buffer * @sci: nilfs_sc_info + * + * Return: 0 on success, or a negative error code on failure. 
*/ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) { @@ -734,7 +736,6 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, if (!head) head = create_empty_buffers(folio, i_blocksize(inode), 0); - folio_unlock(folio); bh = head; do { @@ -744,11 +745,14 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, list_add_tail(&bh->b_assoc_buffers, listp); ndirties++; if (unlikely(ndirties >= nlimit)) { + folio_unlock(folio); folio_batch_release(&fbatch); cond_resched(); return ndirties; } } while (bh = bh->b_this_page, bh != head); + + folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); @@ -1118,7 +1122,8 @@ static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci, * a super root block containing this sufile change is complete, and it can * be canceled with nilfs_sufile_cancel_freev() until then. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EINVAL - Invalid segment number. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. @@ -1315,6 +1320,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) * nilfs_segctor_begin_construction - setup segment buffer to make a new log * @sci: nilfs_sc_info * @nilfs: nilfs object + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) @@ -2312,18 +2319,13 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err, bool force) * nilfs_construct_segment - construct a logical segment * @sb: super block * - * Return Value: On success, 0 is returned. On errors, one of the following - * negative error code is returned. - * - * %-EROFS - Read only filesystem. - * - * %-EIO - I/O error - * - * %-ENOSPC - No space left on device (only in a panic state). - * - * %-ERESTARTSYS - Interrupted. - * - * %-ENOMEM - Insufficient memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No space left on device (only in a panic state). + * * %-ERESTARTSYS - Interrupted. + * * %-EROFS - Read only filesystem. */ int nilfs_construct_segment(struct super_block *sb) { @@ -2347,18 +2349,13 @@ int nilfs_construct_segment(struct super_block *sb) * @start: start byte offset * @end: end byte offset (inclusive) * - * Return Value: On success, 0 is returned. On errors, one of the following - * negative error code is returned. - * - * %-EROFS - Read only filesystem. - * - * %-EIO - I/O error - * - * %-ENOSPC - No space left on device (only in a panic state). - * - * %-ERESTARTSYS - Interrupted. - * - * %-ENOMEM - Insufficient memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No space left on device (only in a panic state). + * * %-ERESTARTSYS - Interrupted. + * * %-EROFS - Read only filesystem. 
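The nilfs_lookup_dirty_data_buffers() hunk above moves folio_unlock() so the folio stays locked for the whole walk over its buffer ring, and is unlocked on both the early-return path (dirty-buffer limit reached) and the normal path. The locking rule in isolation, with the list and limit bookkeeping elided (illustrative, not the full function):

    struct buffer_head *bh, *head;

    folio_lock(folio);
    head = folio_buffers(folio);
    if (!head)
            head = create_empty_buffers(folio, i_blocksize(inode), 0);

    bh = head;
    do {
            if (buffer_dirty(bh)) {
                    get_bh(bh);
                    /* ... queue bh; if the dirty limit is hit, call
                     *     folio_unlock(folio) before returning early ... */
            }
    } while (bh = bh->b_this_page, bh != head);

    folio_unlock(folio);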
*/ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, loff_t start, loff_t end) @@ -2464,6 +2461,8 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err) * nilfs_segctor_construct - form logs and write them to disk * @sci: segment constructor object * @mode: mode of log forming + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) { @@ -2836,7 +2835,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) * This allocates a log writer object, initializes it, and starts the * log writer. * - * Return: 0 on success, or the following negative error code on failure. + * Return: 0 on success, or one of the following negative error codes on + * failure: * * %-EINTR - Log writer thread creation failed due to interruption. * * %-ENOMEM - Insufficient memory available. */ diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index d3ecc813d633..330f269abedf 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -133,6 +133,8 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, /** * nilfs_sufile_get_ncleansegs - return the number of clean segments * @sufile: inode of segment usage file + * + * Return: Number of clean segments. */ unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) { @@ -155,17 +157,13 @@ unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) * of successfully modified segments from the head is stored in the * place @ndone points to. * - * Return Value: On success, zero is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - Given segment usage is in hole block (may be returned if - * @create is zero) - * - * %-EINVAL - Invalid segment usage number + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Invalid segment usage number + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - Given segment usage is in hole block (may be returned if + * @create is zero) + * * %-ENOMEM - Insufficient memory available. */ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs, int create, size_t *ndone, @@ -272,10 +270,7 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, * @start: minimum segment number of allocatable region (inclusive) * @end: maximum segment number of allocatable region (inclusive) * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-ERANGE - invalid segment region + * Return: 0 on success, or %-ERANGE if segment range is invalid. */ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) { @@ -300,17 +295,14 @@ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) * @sufile: inode of segment usage file * @segnump: pointer to segment number * - * Description: nilfs_sufile_alloc() allocates a clean segment. + * Description: nilfs_sufile_alloc() allocates a clean segment, and stores + * its segment number in the place pointed to by @segnump. * - * Return Value: On success, 0 is returned and the segment number of the - * allocated segment is stored in the place pointed by @segnump. On error, one - * of the following negative error codes is returned. - * - * %-EIO - I/O error. 
- * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOSPC - No clean segment left. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - No clean segment left. */ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) { @@ -510,6 +502,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty * @sufile: inode of segment usage file * @segnum: segment number + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { @@ -569,6 +563,8 @@ out_sem: * @segnum: segment number * @nblocks: number of live blocks in the segment * @modtime: modification time (option) + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, unsigned long nblocks, time64_t modtime) @@ -610,16 +606,13 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, * @sufile: inode of segment usage file * @sustat: pointer to a structure of segment usage statistics * - * Description: nilfs_sufile_get_stat() returns information about segment - * usage. + * Description: nilfs_sufile_get_stat() retrieves segment usage statistics + * and stores them in the location pointed to by @sustat. * - * Return Value: On success, 0 is returned, and segment usage information is - * stored in the place pointed by @sustat. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) { @@ -683,16 +676,12 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, * @start: start segment number (inclusive) * @end: end segment number (inclusive) * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid number of segments specified - * - * %-EBUSY - Dirty or active segments are present in the range + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EBUSY - Dirty or active segments are present in the range. + * * %-EINVAL - Invalid number of segments specified. + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ static int nilfs_sufile_truncate_range(struct inode *sufile, __u64 start, __u64 end) @@ -787,16 +776,12 @@ out: * @sufile: inode of segment usage file * @newnsegs: new number of segments * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. 
- * - * %-ENOSPC - Enough free space is not left for shrinking - * - * %-EBUSY - Dirty or active segments exist in the region to be truncated + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EBUSY - Dirty or active segments exist in the region to be truncated. + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. + * * %-ENOSPC - Enough free space is not left for shrinking. */ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) { @@ -865,7 +850,7 @@ out: * @nsi: size of suinfo array * * Return: Count of segment usage info items stored in the output buffer on - * success, or the following negative error code on failure. + * success, or one of the following negative error codes on failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ @@ -939,14 +924,11 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, * segment usage accordingly. Only the fields indicated by the sup_flags * are updated. * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Invalid values in input (segment number, flags or nblocks). + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, unsigned int supsz, size_t nsup) @@ -1073,7 +1055,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, * and start+len is rounded down. For each clean segment blkdev_issue_discard * function is invoked. * - * Return Value: On success, 0 is returned or negative error code, otherwise. + * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) { @@ -1219,6 +1201,8 @@ out_sem: * @susize: size of a segment usage entry * @raw_inode: on-disk sufile inode * @inodep: buffer to store the inode + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_read(struct super_block *sb, size_t susize, struct nilfs_inode *raw_inode, struct inode **inodep) diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 8e8a1a5a0402..cd6f28ab3521 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -58,6 +58,8 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range); * nilfs_sufile_scrap - make a segment garbage * @sufile: inode of segment usage file * @segnum: segment number to be freed + * + * Return: 0 on success, or a negative error code on failure. */ static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum) { @@ -68,6 +70,8 @@ static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum) * nilfs_sufile_free - free segment * @sufile: inode of segment usage file * @segnum: segment number to be freed + * + * Return: 0 on success, or a negative error code on failure. 
*/ static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum) { @@ -80,6 +84,8 @@ static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum) * @segnumv: array of segment numbers * @nsegs: size of @segnumv array * @ndone: place to store the number of freed segments + * + * Return: 0 on success, or a negative error code on failure. */ static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv, size_t nsegs, size_t *ndone) @@ -95,8 +101,7 @@ static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv, * @nsegs: size of @segnumv array * @ndone: place to store the number of cancelled segments * - * Return Value: On success, 0 is returned. On error, a negative error codes - * is returned. + * Return: 0 on success, or a negative error code on failure. */ static inline int nilfs_sufile_cancel_freev(struct inode *sufile, __u64 *segnumv, size_t nsegs, @@ -114,14 +119,11 @@ static inline int nilfs_sufile_cancel_freev(struct inode *sufile, * Description: nilfs_sufile_set_error() marks the segment specified by * @segnum as erroneous. The error segment will never be used again. * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid segment usage number. + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - Invalid segment usage number. + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOMEM - Insufficient memory available. */ static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index eca79cca3803..badc2cbc895e 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -309,6 +309,8 @@ int nilfs_commit_super(struct super_block *sb, int flag) * This function restores state flags in the on-disk super block. * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the * filesystem was not clean previously. + * + * Return: 0 on success, %-EIO if I/O error or superblock is corrupted. */ int nilfs_cleanup_super(struct super_block *sb) { @@ -339,6 +341,8 @@ int nilfs_cleanup_super(struct super_block *sb) * nilfs_move_2nd_super - relocate secondary super block * @sb: super block instance * @sb2off: new offset of the secondary super block (in bytes) + * + * Return: 0 on success, or a negative error code on failure. */ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) { @@ -420,6 +424,8 @@ out: * nilfs_resize_fs - resize the filesystem * @sb: super block instance * @newsize: new size of the filesystem (in bytes) + * + * Return: 0 on success, or a negative error code on failure. */ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) { @@ -987,7 +993,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, * nilfs_tree_is_busy() - try to shrink dentries of a checkpoint * @root_dentry: root dentry of the tree to be shrunk * - * This function returns true if the tree was in-use. + * Return: true if the tree was in-use, false otherwise. */ static bool nilfs_tree_is_busy(struct dentry *root_dentry) { @@ -1033,6 +1039,8 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) * * This function is called exclusively by nilfs->ns_mount_mutex. * So, the recovery process is protected from other simultaneous mounts. + * + * Return: 0 on success, or a negative error code on failure. 
*/ static int nilfs_fill_super(struct super_block *sb, struct fs_context *fc) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index ac03fd3c330c..cb01ea81724d 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -49,8 +49,8 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs, * alloc_nilfs - allocate a nilfs object * @sb: super block instance * - * Return Value: On success, pointer to the_nilfs is returned. - * On error, NULL is returned. + * Return: a pointer to the allocated nilfs object on success, or NULL on + * failure. */ struct the_nilfs *alloc_nilfs(struct super_block *sb) { @@ -165,6 +165,9 @@ static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) * containing a super root from a given super block, and initializes * relevant information on the nilfs object preparatory for log * scanning and recovery. + * + * Return: 0 on success, or %-EINVAL if current segment number is out + * of range. */ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) @@ -200,8 +203,7 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, * exponent information written in @sbp and stores it in @blocksize, * or aborts with an error message if it's too large. * - * Return Value: On success, 0 is returned. If the block size is too - * large, -EINVAL is returned. + * Return: 0 on success, or %-EINVAL if the block size is too large. */ static int nilfs_get_blocksize(struct super_block *sb, struct nilfs_super_block *sbp, int *blocksize) @@ -226,6 +228,13 @@ static int nilfs_get_blocksize(struct super_block *sb, * load_nilfs() searches and load the latest super root, * attaches the last segment, and does recovery if needed. * The caller must call this exclusively for simultaneous mounts. + * + * Return: 0 on success, or one of the following negative error codes on + * failure: + * * %-EINVAL - No valid segment found. + * * %-EIO - I/O error. + * * %-ENOMEM - Insufficient memory available. + * * %-EROFS - Read only device or RO compat mode (if recovery is required) */ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) { @@ -395,6 +404,8 @@ static unsigned long long nilfs_max_size(unsigned int blkbits) * nilfs_nrsvsegs - calculate the number of reserved segments * @nilfs: nilfs object * @nsegs: total number of segments + * + * Return: Number of reserved segments. */ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) { @@ -406,6 +417,8 @@ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) /** * nilfs_max_segment_count - calculate the maximum number of segments * @nilfs: nilfs object + * + * Return: Maximum number of segments */ static u64 nilfs_max_segment_count(struct the_nilfs *nilfs) { @@ -538,7 +551,7 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) * area, or if the parameters themselves are not normal, it is * determined to be invalid. * - * Return Value: true if invalid, false if valid. + * Return: true if invalid, false if valid. */ static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) { @@ -684,8 +697,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, * reading the super block, getting disk layout information, initializing * shared fields in the_nilfs). * - * Return Value: On success, 0 is returned. On error, a negative error - * code is returned. + * Return: 0 on success, or a negative error code on failure. 
*/ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb) { diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 395e23920632..4414743b638e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -566,7 +566,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec); /* - * Reset the actual path elements so that we can re-use the structure + * Reset the actual path elements so that we can reuse the structure * to build another path. Generally, this involves freeing the buffer * heads. */ @@ -1182,7 +1182,7 @@ static int ocfs2_add_branch(handle_t *handle, /* * If there is a gap before the root end and the real end - * of the righmost leaf block, we need to remove the gap + * of the rightmost leaf block, we need to remove the gap * between new_cpos and root_end first so that the tree * is consistent after we add a new branch(it will start * from new_cpos). @@ -1238,7 +1238,7 @@ static int ocfs2_add_branch(handle_t *handle, /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. - * conversly, new_eb_bhs[0] is the new bottommost leaf. + * conversely, new_eb_bhs[0] is the new bottommost leaf. * * when we leave the loop, new_last_eb_blk will point to the * newest leaf, and next_blkno will point to the topmost extent @@ -3712,7 +3712,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, * update split_index here. * * When the split_index is zero, we need to merge it to the - * prevoius extent block. It is more efficient and easier + * previous extent block. It is more efficient and easier * if we do merge_right first and merge_left later. */ ret = ocfs2_merge_rec_right(path, handle, et, split_rec, @@ -4517,7 +4517,7 @@ static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, } /* - * This should only be called against the righmost leaf extent list. + * This should only be called against the rightmost leaf extent list. * * ocfs2_figure_appending_type() will figure out whether we'll have to * insert at the tail of the rightmost leaf. 
@@ -6154,6 +6154,9 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, int status; struct inode *inode = NULL; struct buffer_head *bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + unsigned int tl_count; inode = ocfs2_get_system_file_inode(osb, TRUNCATE_LOG_SYSTEM_INODE, @@ -6171,6 +6174,18 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, goto bail; } + di = (struct ocfs2_dinode *)bh->b_data; + tl = &di->id2.i_dealloc; + tl_count = le16_to_cpu(tl->tl_count); + if (unlikely(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || + tl_count == 0)) { + status = -EFSCORRUPTED; + iput(inode); + brelse(bh); + mlog_errno(status); + goto bail; + } + *tl_inode = inode; *tl_bh = bh; bail: @@ -6808,27 +6823,27 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) return 0; } -void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys) +void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t *handle, + size_t from, size_t to, struct folio *folio, int zero, + u64 *phys) { int ret, partial = 0; - loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from; + loff_t start_byte = folio_pos(folio) + from; loff_t length = to - from; - ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); + ret = ocfs2_map_folio_blocks(folio, phys, inode, from, to, 0); if (ret) mlog_errno(ret); if (zero) - zero_user_segment(page, from, to); + folio_zero_segment(folio, from, to); /* * Need to set the buffers we zero'd into uptodate * here if they aren't - ocfs2_map_page_blocks() * might've skipped some */ - ret = walk_page_buffers(handle, page_buffers(page), + ret = walk_page_buffers(handle, folio_buffers(folio), from, to, &partial, ocfs2_zero_func); if (ret < 0) @@ -6841,92 +6856,88 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, } if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); - flush_dcache_page(page); + flush_dcache_folio(folio); } -static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, - loff_t end, struct page **pages, - int numpages, u64 phys, handle_t *handle) +static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start, + loff_t end, struct folio **folios, int numfolios, + u64 phys, handle_t *handle) { int i; - struct page *page; - unsigned int from, to = PAGE_SIZE; struct super_block *sb = inode->i_sb; BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); - if (numpages == 0) + if (numfolios == 0) goto out; - to = PAGE_SIZE; - for(i = 0; i < numpages; i++) { - page = pages[i]; + for (i = 0; i < numfolios; i++) { + struct folio *folio = folios[i]; + size_t to = folio_size(folio); + size_t from = offset_in_folio(folio, start); - from = start & (PAGE_SIZE - 1); - if ((end >> PAGE_SHIFT) == page->index) - to = end & (PAGE_SIZE - 1); + if (to > end - folio_pos(folio)) + to = end - folio_pos(folio); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1, + &phys); - ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, - &phys); - - start = (page->index + 1) << PAGE_SHIFT; + start = folio_next_index(folio) << PAGE_SHIFT; } out: - if (pages) - ocfs2_unlock_and_free_pages(pages, numpages); + if (folios) + ocfs2_unlock_and_free_folios(folios, numfolios); } -int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +static int ocfs2_grab_folios(struct inode *inode, loff_t start, 
loff_t end, + struct folio **folios, int *num) { - int numpages, ret = 0; + int numfolios, ret = 0; struct address_space *mapping = inode->i_mapping; unsigned long index; loff_t last_page_bytes; BUG_ON(start > end); - numpages = 0; + numfolios = 0; last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_SHIFT; do { - pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); - if (!pages[numpages]) { - ret = -ENOMEM; + folios[numfolios] = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folios[numfolios])) { + ret = PTR_ERR(folios[numfolios]); mlog_errno(ret); goto out; } - numpages++; - index++; + index = folio_next_index(folios[numfolios]); + numfolios++; } while (index < (last_page_bytes >> PAGE_SHIFT)); out: if (ret != 0) { - if (pages) - ocfs2_unlock_and_free_pages(pages, numpages); - numpages = 0; + if (folios) + ocfs2_unlock_and_free_folios(folios, numfolios); + numfolios = 0; } - *num = numpages; + *num = numfolios; return ret; } -static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +static int ocfs2_grab_eof_folios(struct inode *inode, loff_t start, loff_t end, + struct folio **folios, int *num) { struct super_block *sb = inode->i_sb; BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); - return ocfs2_grab_pages(inode, start, end, pages, num); + return ocfs2_grab_folios(inode, start, end, folios, num); } /* @@ -6940,8 +6951,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, u64 range_start, u64 range_end) { - int ret = 0, numpages; - struct page **pages = NULL; + int ret = 0, numfolios; + struct folio **folios = NULL; u64 phys; unsigned int ext_flags; struct super_block *sb = inode->i_sb; @@ -6954,17 +6965,17 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, return 0; /* - * Avoid zeroing pages fully beyond current i_size. It is pointless as - * underlying blocks of those pages should be already zeroed out and + * Avoid zeroing folios fully beyond current i_size. It is pointless as + * underlying blocks of those folios should be already zeroed out and * page writeback will skip them anyway. */ range_end = min_t(u64, range_end, i_size_read(inode)); if (range_start >= range_end) return 0; - pages = kcalloc(ocfs2_pages_per_cluster(sb), - sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { + folios = kcalloc(ocfs2_pages_per_cluster(sb), + sizeof(struct folio *), GFP_NOFS); + if (folios == NULL) { ret = -ENOMEM; mlog_errno(ret); goto out; @@ -6985,18 +6996,18 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN) goto out; - ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, - &numpages); + ret = ocfs2_grab_eof_folios(inode, range_start, range_end, folios, + &numfolios); if (ret) { mlog_errno(ret); goto out; } - ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, - numpages, phys, handle); + ocfs2_zero_cluster_folios(inode, range_start, range_end, folios, + numfolios, phys, handle); /* - * Initiate writeout of the pages we zero'd here. We don't + * Initiate writeout of the folios we zero'd here. We don't * wait on them - the truncate_inode_pages() call later will * do that for us. 
*/ @@ -7006,7 +7017,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, mlog_errno(ret); out: - kfree(pages); + kfree(folios); return ret; } @@ -7059,7 +7070,7 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { - int ret, has_data, num_pages = 0; + int ret, has_data, num_folios = 0; int need_free = 0; u32 bit_off, num; handle_t *handle; @@ -7068,7 +7079,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_alloc_context *data_ac = NULL; - struct page *page = NULL; + struct folio *folio = NULL; struct ocfs2_extent_tree et; int did_quota = 0; @@ -7119,12 +7130,12 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, /* * Save two copies, one for insert, and one that can - * be changed by ocfs2_map_and_dirty_page() below. + * be changed by ocfs2_map_and_dirty_folio() below. */ block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); - ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page, - &num_pages); + ret = ocfs2_grab_eof_folios(inode, 0, page_end, &folio, + &num_folios); if (ret) { mlog_errno(ret); need_free = 1; @@ -7135,15 +7146,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * This should populate the 1st page for us and mark * it up to date. */ - ret = ocfs2_read_inline_data(inode, page, di_bh); + ret = ocfs2_read_inline_data(inode, folio, di_bh); if (ret) { mlog_errno(ret); need_free = 1; goto out_unlock; } - ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0, - &phys); + ocfs2_map_and_dirty_folio(inode, handle, 0, page_end, folio, 0, + &phys); } spin_lock(&oi->ip_lock); @@ -7174,8 +7185,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } out_unlock: - if (page) - ocfs2_unlock_and_free_pages(&page, num_pages); + if (folio) + ocfs2_unlock_and_free_folios(&folio, num_folios); out_commit: if (ret < 0 && did_quota) diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 4af7abaa6e40..1c0c83362904 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -254,11 +254,9 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) return !rec->e_leaf_clusters; } -int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num); -void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys); +void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t *handle, + size_t from, size_t to, struct folio *folio, int zero, + u64 *phys); /* * Structures which describe a path through a btree, and functions to * manipulate them. 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index db72b3e924b3..5bbeb6fbb1ac 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -215,10 +215,9 @@ bail: return err; } -int ocfs2_read_inline_data(struct inode *inode, struct page *page, +int ocfs2_read_inline_data(struct inode *inode, struct folio *folio, struct buffer_head *di_bh) { - void *kaddr; loff_t size; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; @@ -230,7 +229,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, size = i_size_read(inode); - if (size > PAGE_SIZE || + if (size > folio_size(folio) || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, "Inode %llu has with inline data has bad size: %Lu\n", @@ -239,25 +238,18 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, return -EROFS; } - kaddr = kmap_atomic(page); - if (size) - memcpy(kaddr, di->id2.i_data.id_data, size); - /* Clear the remaining part of the page */ - memset(kaddr + size, 0, PAGE_SIZE - size); - flush_dcache_page(page); - kunmap_atomic(kaddr); - - SetPageUptodate(page); + folio_fill_tail(folio, 0, di->id2.i_data.id_data, size); + folio_mark_uptodate(folio); return 0; } -static int ocfs2_readpage_inline(struct inode *inode, struct page *page) +static int ocfs2_readpage_inline(struct inode *inode, struct folio *folio) { int ret; struct buffer_head *di_bh = NULL; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); ret = ocfs2_read_inode_block(inode, &di_bh); @@ -266,9 +258,9 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) goto out; } - ret = ocfs2_read_inline_data(inode, page, di_bh); + ret = ocfs2_read_inline_data(inode, folio, di_bh); out: - unlock_page(page); + folio_unlock(folio); brelse(di_bh); return ret; @@ -283,7 +275,7 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, folio->index); - ret = ocfs2_inode_lock_with_page(inode, NULL, 0, &folio->page); + ret = ocfs2_inode_lock_with_folio(inode, NULL, 0, folio); if (ret != 0) { if (ret == AOP_TRUNCATED_PAGE) unlock = 0; @@ -305,7 +297,7 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) } /* - * i_size might have just been updated as we grabed the meta lock. We + * i_size might have just been updated as we grabbed the meta lock. We * might now be discovering a truncate that hit on another node. * block_read_full_folio->get_block freaks out if it is asked to read * beyond the end of a file, so we check here. 
Callers @@ -322,7 +314,7 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) } if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - ret = ocfs2_readpage_inline(inode, &folio->page); + ret = ocfs2_readpage_inline(inode, folio); else ret = block_read_full_folio(folio, ocfs2_get_block); unlock = 0; @@ -534,7 +526,7 @@ static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, * * from == to == 0 is code for "zero the entire cluster region" */ -static void ocfs2_clear_page_regions(struct page *page, +static void ocfs2_clear_folio_regions(struct folio *folio, struct ocfs2_super *osb, u32 cpos, unsigned from, unsigned to) { @@ -543,7 +535,7 @@ static void ocfs2_clear_page_regions(struct page *page, ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); if (from || to) { if (from > cluster_start) @@ -554,13 +546,13 @@ static void ocfs2_clear_page_regions(struct page *page, memset(kaddr + cluster_start, 0, cluster_end - cluster_start); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); } /* * Nonsparse file systems fully allocate before we get to the write * code. This prevents ocfs2_write() from tagging the write as an - * allocating one, which means ocfs2_map_page_blocks() might try to + * allocating one, which means ocfs2_map_folio_blocks() might try to * read-in the blocks at the tail of our file. Avoid reading them by * testing i_size against each block offset. */ @@ -585,11 +577,10 @@ static int ocfs2_should_read_blk(struct inode *inode, struct folio *folio, * * This will also skip zeroing, which is handled externally. */ -int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, +int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new) { - struct folio *folio = page_folio(page); int ret = 0; struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; unsigned int block_end, block_start; @@ -729,24 +720,24 @@ struct ocfs2_write_ctxt { unsigned int w_large_pages; /* - * Pages involved in this write. + * Folios involved in this write. * - * w_target_page is the page being written to by the user. + * w_target_folio is the folio being written to by the user. * - * w_pages is an array of pages which always contains - * w_target_page, and in the case of an allocating write with + * w_folios is an array of folios which always contains + * w_target_folio, and in the case of an allocating write with * page_size < cluster size, it will contain zero'd and mapped - * pages adjacent to w_target_page which need to be written + * pages adjacent to w_target_folio which need to be written * out in so that future reads from that region will get * zero's. */ - unsigned int w_num_pages; - struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; - struct page *w_target_page; + unsigned int w_num_folios; + struct folio *w_folios[OCFS2_MAX_CTXT_PAGES]; + struct folio *w_target_folio; /* * w_target_locked is used for page_mkwrite path indicating no unlocking - * against w_target_page in ocfs2_write_end_nolock. + * against w_target_folio in ocfs2_write_end_nolock. 
*/ unsigned int w_target_locked:1; @@ -771,40 +762,40 @@ struct ocfs2_write_ctxt { unsigned int w_unwritten_count; }; -void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) +void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios) { int i; - for(i = 0; i < num_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - mark_page_accessed(pages[i]); - put_page(pages[i]); - } + for(i = 0; i < num_folios; i++) { + if (!folios[i]) + continue; + folio_unlock(folios[i]); + folio_mark_accessed(folios[i]); + folio_put(folios[i]); } } -static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) +static void ocfs2_unlock_folios(struct ocfs2_write_ctxt *wc) { int i; /* * w_target_locked is only set to true in the page_mkwrite() case. * The intent is to allow us to lock the target page from write_begin() - * to write_end(). The caller must hold a ref on w_target_page. + * to write_end(). The caller must hold a ref on w_target_folio. */ if (wc->w_target_locked) { - BUG_ON(!wc->w_target_page); - for (i = 0; i < wc->w_num_pages; i++) { - if (wc->w_target_page == wc->w_pages[i]) { - wc->w_pages[i] = NULL; + BUG_ON(!wc->w_target_folio); + for (i = 0; i < wc->w_num_folios; i++) { + if (wc->w_target_folio == wc->w_folios[i]) { + wc->w_folios[i] = NULL; break; } } - mark_page_accessed(wc->w_target_page); - put_page(wc->w_target_page); + folio_mark_accessed(wc->w_target_folio); + folio_put(wc->w_target_folio); } - ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); + ocfs2_unlock_and_free_folios(wc->w_folios, wc->w_num_folios); } static void ocfs2_free_unwritten_list(struct inode *inode, @@ -826,7 +817,7 @@ static void ocfs2_free_write_ctxt(struct inode *inode, struct ocfs2_write_ctxt *wc) { ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list); - ocfs2_unlock_pages(wc); + ocfs2_unlock_folios(wc); brelse(wc->w_di_bh); kfree(wc); } @@ -869,29 +860,30 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit. 
*/ -static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) +static void ocfs2_zero_new_buffers(struct folio *folio, size_t from, size_t to) { unsigned int block_start, block_end; struct buffer_head *head, *bh; - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) + BUG_ON(!folio_test_locked(folio)); + head = folio_buffers(folio); + if (!head) return; - bh = head = page_buffers(page); + bh = head; block_start = 0; do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { unsigned start, end; start = max(from, block_start); end = min(to, block_end); - zero_user_segment(page, start, end); + folio_zero_segment(folio, start, end); set_buffer_uptodate(bh); } @@ -916,29 +908,26 @@ static void ocfs2_write_failure(struct inode *inode, int i; unsigned from = user_pos & (PAGE_SIZE - 1), to = user_pos + user_len; - struct page *tmppage; - if (wc->w_target_page) - ocfs2_zero_new_buffers(wc->w_target_page, from, to); + if (wc->w_target_folio) + ocfs2_zero_new_buffers(wc->w_target_folio, from, to); - for(i = 0; i < wc->w_num_pages; i++) { - tmppage = wc->w_pages[i]; + for (i = 0; i < wc->w_num_folios; i++) { + struct folio *folio = wc->w_folios[i]; - if (tmppage && page_has_buffers(tmppage)) { + if (folio && folio_buffers(folio)) { if (ocfs2_should_order_data(inode)) ocfs2_jbd2_inode_add_write(wc->w_handle, inode, user_pos, user_len); - block_commit_write(tmppage, from, to); + block_commit_write(&folio->page, from, to); } } } -static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, - struct ocfs2_write_ctxt *wc, - struct page *page, u32 cpos, - loff_t user_pos, unsigned user_len, - int new) +static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno, + struct ocfs2_write_ctxt *wc, struct folio *folio, u32 cpos, + loff_t user_pos, unsigned user_len, int new) { int ret; unsigned int map_from = 0, map_to = 0; @@ -951,20 +940,19 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, /* treat the write as new if the a hole/lseek spanned across * the page boundary. */ - new = new | ((i_size_read(inode) <= page_offset(page)) && - (page_offset(page) <= user_pos)); + new = new | ((i_size_read(inode) <= folio_pos(folio)) && + (folio_pos(folio) <= user_pos)); - if (page == wc->w_target_page) { + if (folio == wc->w_target_folio) { map_from = user_pos & (PAGE_SIZE - 1); map_to = map_from + user_len; if (new) - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - cluster_start, cluster_end, - new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + cluster_start, cluster_end, new); else - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - map_from, map_to, new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + map_from, map_to, new); if (ret) { mlog_errno(ret); goto out; @@ -978,7 +966,7 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, } } else { /* - * If we haven't allocated the new page yet, we + * If we haven't allocated the new folio yet, we * shouldn't be writing it out without copying user * data. This is likely a math error from the caller. 
*/ @@ -987,8 +975,8 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, map_from = cluster_start; map_to = cluster_end; - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - cluster_start, cluster_end, new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + cluster_start, cluster_end, new); if (ret) { mlog_errno(ret); goto out; @@ -996,20 +984,20 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, } /* - * Parts of newly allocated pages need to be zero'd. + * Parts of newly allocated folios need to be zero'd. * * Above, we have also rewritten 'to' and 'from' - as far as * the rest of the function is concerned, the entire cluster - * range inside of a page needs to be written. + * range inside of a folio needs to be written. * - * We can skip this if the page is up to date - it's already + * We can skip this if the folio is uptodate - it's already * been zero'd from being read in as a hole. */ - if (new && !PageUptodate(page)) - ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), + if (new && !folio_test_uptodate(folio)) + ocfs2_clear_folio_regions(folio, OCFS2_SB(inode->i_sb), cpos, user_data_from, user_data_to); - flush_dcache_page(page); + flush_dcache_folio(folio); out: return ret; @@ -1018,11 +1006,9 @@ out: /* * This function will only grab one clusters worth of pages. */ -static int ocfs2_grab_pages_for_write(struct address_space *mapping, - struct ocfs2_write_ctxt *wc, - u32 cpos, loff_t user_pos, - unsigned user_len, int new, - struct page *mmap_page) +static int ocfs2_grab_folios_for_write(struct address_space *mapping, + struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, + unsigned user_len, int new, struct folio *mmap_folio) { int ret = 0, i; unsigned long start, target_index, end_index, index; @@ -1039,7 +1025,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, * last page of the write. */ if (new) { - wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); + wc->w_num_folios = ocfs2_pages_per_cluster(inode->i_sb); start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); /* * We need the index *past* the last page we could possibly @@ -1049,15 +1035,15 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, last_byte = max(user_pos + user_len, i_size_read(inode)); BUG_ON(last_byte < 1); end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1; - if ((start + wc->w_num_pages) > end_index) - wc->w_num_pages = end_index - start; + if ((start + wc->w_num_folios) > end_index) + wc->w_num_folios = end_index - start; } else { - wc->w_num_pages = 1; + wc->w_num_folios = 1; start = target_index; } end_index = (user_pos + user_len - 1) >> PAGE_SHIFT; - for(i = 0; i < wc->w_num_pages; i++) { + for(i = 0; i < wc->w_num_folios; i++) { index = start + i; if (index >= target_index && index <= end_index && @@ -1067,37 +1053,38 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, * and wants us to directly use the page * passed in. 
*/ - lock_page(mmap_page); + folio_lock(mmap_folio); /* Exit and let the caller retry */ - if (mmap_page->mapping != mapping) { - WARN_ON(mmap_page->mapping); - unlock_page(mmap_page); + if (mmap_folio->mapping != mapping) { + WARN_ON(mmap_folio->mapping); + folio_unlock(mmap_folio); ret = -EAGAIN; goto out; } - get_page(mmap_page); - wc->w_pages[i] = mmap_page; + folio_get(mmap_folio); + wc->w_folios[i] = mmap_folio; wc->w_target_locked = true; } else if (index >= target_index && index <= end_index && wc->w_type == OCFS2_WRITE_DIRECT) { /* Direct write has no mapping page. */ - wc->w_pages[i] = NULL; + wc->w_folios[i] = NULL; continue; } else { - wc->w_pages[i] = find_or_create_page(mapping, index, - GFP_NOFS); - if (!wc->w_pages[i]) { - ret = -ENOMEM; + wc->w_folios[i] = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + GFP_NOFS); + if (IS_ERR(wc->w_folios[i])) { + ret = PTR_ERR(wc->w_folios[i]); mlog_errno(ret); goto out; } } - wait_for_stable_page(wc->w_pages[i]); + folio_wait_stable(wc->w_folios[i]); if (index == target_index) - wc->w_target_page = wc->w_pages[i]; + wc->w_target_folio = wc->w_folios[i]; } out: if (ret) @@ -1181,19 +1168,18 @@ static int ocfs2_write_cluster(struct address_space *mapping, if (!should_zero) p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1); - for(i = 0; i < wc->w_num_pages; i++) { + for (i = 0; i < wc->w_num_folios; i++) { int tmpret; /* This is the direct io target page. */ - if (wc->w_pages[i] == NULL) { + if (wc->w_folios[i] == NULL) { p_blkno += (1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits)); continue; } - tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, - wc->w_pages[i], cpos, - user_pos, user_len, - should_zero); + tmpret = ocfs2_prepare_folio_for_write(inode, &p_blkno, wc, + wc->w_folios[i], cpos, user_pos, user_len, + should_zero); if (tmpret) { mlog_errno(tmpret); if (ret == 0) @@ -1472,7 +1458,7 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct page *page; + struct folio *folio; handle_t *handle; struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; @@ -1483,19 +1469,21 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, goto out; } - page = find_or_create_page(mapping, 0, GFP_NOFS); - if (!page) { + folio = __filemap_get_folio(mapping, 0, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { ocfs2_commit_trans(osb, handle); - ret = -ENOMEM; + ret = PTR_ERR(folio); mlog_errno(ret); goto out; } /* - * If we don't set w_num_pages then this page won't get unlocked + * If we don't set w_num_folios then this folio won't get unlocked * and freed on cleanup of the write context. 
*/ - wc->w_pages[0] = wc->w_target_page = page; - wc->w_num_pages = 1; + wc->w_target_folio = folio; + wc->w_folios[0] = folio; + wc->w_num_folios = 1; ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1509,8 +1497,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) ocfs2_set_inode_data_inline(inode, di); - if (!PageUptodate(page)) { - ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh); + if (!folio_test_uptodate(folio)) { + ret = ocfs2_read_inline_data(inode, folio, wc->w_di_bh); if (ret) { ocfs2_commit_trans(osb, handle); @@ -1533,9 +1521,8 @@ int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size) } static int ocfs2_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, loff_t pos, - unsigned len, struct page *mmap_page, - struct ocfs2_write_ctxt *wc) + struct inode *inode, loff_t pos, size_t len, + struct folio *mmap_folio, struct ocfs2_write_ctxt *wc) { int ret, written = 0; loff_t end = pos + len; @@ -1550,7 +1537,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, * Handle inodes which already have inline data 1st. */ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - if (mmap_page == NULL && + if (mmap_folio == NULL && ocfs2_size_fits_inline_data(wc->w_di_bh, end)) goto do_inline_write; @@ -1574,7 +1561,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, * Check whether the write can fit. */ di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; - if (mmap_page || + if (mmap_folio || end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) return 0; @@ -1641,9 +1628,9 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, } int ocfs2_write_begin_nolock(struct address_space *mapping, - loff_t pos, unsigned len, ocfs2_write_type_t type, - struct folio **foliop, void **fsdata, - struct buffer_head *di_bh, struct page *mmap_page) + loff_t pos, unsigned len, ocfs2_write_type_t type, + struct folio **foliop, void **fsdata, + struct buffer_head *di_bh, struct folio *mmap_folio) { int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0; @@ -1666,7 +1653,7 @@ try_again: if (ocfs2_supports_inline_data(osb)) { ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len, - mmap_page, wc); + mmap_folio, wc); if (ret == 1) { ret = 0; goto success; @@ -1718,7 +1705,7 @@ try_again: (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), - pos, len, type, mmap_page, + pos, len, type, mmap_folio, clusters_to_alloc, extents_to_split); /* @@ -1789,21 +1776,21 @@ try_again: } /* - * Fill our page array first. That way we've grabbed enough so + * Fill our folio array first. That way we've grabbed enough so * that we can zero and flush if we error after adding the * extent. */ - ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, - cluster_of_pages, mmap_page); + ret = ocfs2_grab_folios_for_write(mapping, wc, wc->w_cpos, pos, len, + cluster_of_pages, mmap_folio); if (ret) { /* - * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock - * the target page. In this case, we exit with no error and no target - * page. This will trigger the caller, page_mkwrite(), to re-try - * the operation. + * ocfs2_grab_folios_for_write() returns -EAGAIN if it + * could not lock the target folio. 
In this case, we exit + * with no error and no target folio. This will trigger + * the caller, page_mkwrite(), to re-try the operation. */ if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) { - BUG_ON(wc->w_target_page); + BUG_ON(wc->w_target_folio); ret = 0; goto out_quota; } @@ -1826,7 +1813,7 @@ try_again: success: if (foliop) - *foliop = page_folio(wc->w_target_page); + *foliop = wc->w_target_folio; *fsdata = wc; return 0; out_quota: @@ -1845,7 +1832,7 @@ out: * to VM code. */ if (wc->w_target_locked) - unlock_page(mmap_page); + folio_unlock(mmap_folio); ocfs2_free_write_ctxt(inode, wc); @@ -1924,18 +1911,15 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, struct ocfs2_dinode *di, struct ocfs2_write_ctxt *wc) { - void *kaddr; - if (unlikely(*copied < len)) { - if (!PageUptodate(wc->w_target_page)) { + if (!folio_test_uptodate(wc->w_target_folio)) { *copied = 0; return; } } - kaddr = kmap_atomic(wc->w_target_page); - memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); - kunmap_atomic(kaddr); + memcpy_from_folio(di->id2.i_data.id_data + pos, wc->w_target_folio, + pos, *copied); trace_ocfs2_write_end_inline( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1944,17 +1928,16 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, le16_to_cpu(di->i_dyn_features)); } -int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, void *fsdata) +int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, + unsigned len, unsigned copied, void *fsdata) { int i, ret; - unsigned from, to, start = pos & (PAGE_SIZE - 1); + size_t from, to, start = pos & (PAGE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_write_ctxt *wc = fsdata; struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; handle_t *handle = wc->w_handle; - struct page *tmppage; BUG_ON(!list_empty(&wc->w_unwritten_list)); @@ -1973,44 +1956,44 @@ int ocfs2_write_end_nolock(struct address_space *mapping, goto out_write_size; } - if (unlikely(copied < len) && wc->w_target_page) { + if (unlikely(copied < len) && wc->w_target_folio) { loff_t new_isize; - if (!PageUptodate(wc->w_target_page)) + if (!folio_test_uptodate(wc->w_target_folio)) copied = 0; new_isize = max_t(loff_t, i_size_read(inode), pos + copied); - if (new_isize > page_offset(wc->w_target_page)) - ocfs2_zero_new_buffers(wc->w_target_page, start+copied, + if (new_isize > folio_pos(wc->w_target_folio)) + ocfs2_zero_new_buffers(wc->w_target_folio, start+copied, start+len); else { /* - * When page is fully beyond new isize (data copy - * failed), do not bother zeroing the page. Invalidate + * When folio is fully beyond new isize (data copy + * failed), do not bother zeroing the folio. Invalidate * it instead so that writeback does not get confused * put page & buffer dirty bits into inconsistent * state. */ - block_invalidate_folio(page_folio(wc->w_target_page), - 0, PAGE_SIZE); + block_invalidate_folio(wc->w_target_folio, 0, + folio_size(wc->w_target_folio)); } } - if (wc->w_target_page) - flush_dcache_page(wc->w_target_page); + if (wc->w_target_folio) + flush_dcache_folio(wc->w_target_folio); - for(i = 0; i < wc->w_num_pages; i++) { - tmppage = wc->w_pages[i]; + for (i = 0; i < wc->w_num_folios; i++) { + struct folio *folio = wc->w_folios[i]; - /* This is the direct io target page. 
*/ - if (tmppage == NULL) + /* This is the direct io target folio */ + if (folio == NULL) continue; - if (tmppage == wc->w_target_page) { + if (folio == wc->w_target_folio) { from = wc->w_target_from; to = wc->w_target_to; - BUG_ON(from > PAGE_SIZE || - to > PAGE_SIZE || + BUG_ON(from > folio_size(folio) || + to > folio_size(folio) || to < from); } else { /* @@ -2019,19 +2002,17 @@ int ocfs2_write_end_nolock(struct address_space *mapping, * to flush their entire range. */ from = 0; - to = PAGE_SIZE; + to = folio_size(folio); } - if (page_has_buffers(tmppage)) { + if (folio_buffers(folio)) { if (handle && ocfs2_should_order_data(inode)) { - loff_t start_byte = - ((loff_t)tmppage->index << PAGE_SHIFT) + - from; + loff_t start_byte = folio_pos(folio) + from; loff_t length = to - from; ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); } - block_commit_write(tmppage, from, to); + block_commit_write(&folio->page, from, to); } } @@ -2060,7 +2041,7 @@ out: * this lock and will ask for the page lock when flushing the data. * put it here to preserve the unlock order. */ - ocfs2_unlock_pages(wc); + ocfs2_unlock_folios(wc); if (handle) ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 1d1b4b7edba0..114efc9111e4 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -8,16 +8,11 @@ #include -handle_t *ocfs2_start_walk_page_trans(struct inode *inode, - struct page *page, - unsigned from, - unsigned to); - -int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, +int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new); -void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages); +void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios); int walk_page_buffers( handle_t *handle, struct buffer_head *head, @@ -37,11 +32,11 @@ typedef enum { } ocfs2_write_type_t; int ocfs2_write_begin_nolock(struct address_space *mapping, - loff_t pos, unsigned len, ocfs2_write_type_t type, - struct folio **foliop, void **fsdata, - struct buffer_head *di_bh, struct page *mmap_page); + loff_t pos, unsigned len, ocfs2_write_type_t type, + struct folio **foliop, void **fsdata, + struct buffer_head *di_bh, struct folio *mmap_folio); -int ocfs2_read_inline_data(struct inode *inode, struct page *page, +int ocfs2_read_inline_data(struct inode *inode, struct folio *folio, struct buffer_head *di_bh); int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4200a0341343..724350925aff 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -3,6 +3,7 @@ * Copyright (C) 2004, 2005 Oracle. All rights reserved. */ +#include "linux/kstrtox.h" #include #include #include @@ -1020,7 +1021,7 @@ fire_callbacks: if (list_empty(&slot->ds_live_item)) goto out; - /* live nodes only go dead after enough consequtive missed + /* live nodes only go dead after enough consecutive missed * samples.. reset the missed counter whenever we see * activity */ if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { @@ -1535,10 +1536,11 @@ static int o2hb_read_block_input(struct o2hb_region *reg, { unsigned long bytes; char *p = (char *)page; + int ret; - bytes = simple_strtoul(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 0, &bytes); + if (ret) + return ret; /* Heartbeat and fs min / max block sizes are the same. 
*/ if (bytes > 4096 || bytes < 512) @@ -1622,13 +1624,14 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item, struct o2hb_region *reg = to_o2hb_region(item); unsigned long tmp; char *p = (char *)page; + int ret; if (reg->hr_bdev_file) return -EINVAL; - tmp = simple_strtoul(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 0, &tmp); + if (ret) + return ret; if (tmp > O2NM_MAX_NODES || tmp == 0) return -ERANGE; @@ -1776,8 +1779,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, if (o2nm_this_node() == O2NM_MAX_NODES) return -EINVAL; - fd = simple_strtol(p, &p, 0); - if (!p || (*p && (*p != '\n'))) + ret = kstrtol(p, 0, &fd); + if (ret < 0) return -EINVAL; if (fd < 0 || fd >= INT_MAX) @@ -2136,10 +2139,11 @@ static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *ite { unsigned long tmp; char *p = (char *)page; + int ret; - tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 10, &tmp); + if (ret) + return ret; /* this will validate ranges for us. */ o2hb_dead_threshold_set((unsigned int) tmp); diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index b73fc42e46ff..630bd5a3dd0d 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -29,7 +29,7 @@ * just calling printk() so that this can eventually make its way through * relayfs along with the debugging messages. Everything else gets KERN_DEBUG. * The inline tests and macro dance give GCC the opportunity to quite cleverly - * only emit the appropriage printk() when the caller passes in a constant + * only emit the appropriate printk() when the caller passes in a constant * mask, as is almost always the case. * * All this bitmask nonsense is managed from the files under diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 8bf17231d7b7..bfb8b456876c 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -23,7 +23,7 @@ * race between when we see a node start heartbeating and when we connect * to it. * - * So nodes that are in this transtion put a hold on the quorum decision + * So nodes that are in this transition put a hold on the quorum decision * with a counter. As they fall out of this transition they drop the count * and if they're the last, they fire off the decision. */ @@ -189,7 +189,7 @@ static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) } /* as a node comes up we delay the quorum decision until we know the fate of - * the connection. the hold will be droped in conn_up or hb_down. it might be + * the connection. the hold will be dropped in conn_up or hb_down. it might be * perpetuated by con_err until hb_down. if we already have a conn, we might * be dropping a hold that conn_up got. */ void o2quo_hb_up(u8 node) @@ -256,7 +256,7 @@ void o2quo_hb_still_up(u8 node) } /* This is analogous to hb_up. as a node's connection comes up we delay the - * quorum decision until we see it heartbeating. the hold will be droped in + * quorum decision until we see it heartbeating. the hold will be dropped in * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if * it's already heartbeating we might be dropping a hold that conn_up got. 
* */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2b8fa3e782fb..0f46b22561d6 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -5,13 +5,13 @@ * * ---- * - * Callers for this were originally written against a very simple synchronus + * Callers for this were originally written against a very simple synchronous * API. This implementation reflects those simple callers. Some day I'm sure * we'll need to move to a more robust posting/callback mechanism. * * Transmit calls pass in kernel virtual addresses and block copying this into * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting - * for a failed socket to timeout. TX callers can also pass in a poniter to an + * for a failed socket to timeout. TX callers can also pass in a pointer to an * 'int' which gets filled with an errno off the wire in response to the * message they send. * @@ -101,7 +101,7 @@ static struct socket *o2net_listen_sock; * o2net_wq. teardown detaches the callbacks before destroying the workqueue. * quorum work is queued as sock containers are shutdown.. stop_listening * tears down all the node's sock containers, preventing future shutdowns - * and queued quroum work, before canceling delayed quorum work and + * and queued quorum work, before canceling delayed quorum work and * destroying the work queue. */ static struct workqueue_struct *o2net_wq; @@ -1419,7 +1419,7 @@ out: return ret; } -/* this work func is triggerd by data ready. it reads until it can read no +/* this work func is triggered by data ready. it reads until it can read no * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing * our work the work struct will be marked and we'll be called again. */ static void o2net_rx_until_empty(struct work_struct *work) diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h index 847a52dcbe7d..1969db8ffa9c 100644 --- a/fs/ocfs2/dlm/dlmapi.h +++ b/fs/ocfs2/dlm/dlmapi.h @@ -118,7 +118,7 @@ struct dlm_lockstatus { #define LKM_VALBLK 0x00000100 /* lock value block request */ #define LKM_NOQUEUE 0x00000200 /* non blocking request */ #define LKM_CONVERT 0x00000400 /* conversion request */ -#define LKM_NODLCKWT 0x00000800 /* this lock wont deadlock (U) */ +#define LKM_NODLCKWT 0x00000800 /* this lock won't deadlock (U) */ #define LKM_UNLOCK 0x00001000 /* deallocate this lock */ #define LKM_CANCEL 0x00002000 /* cancel conversion request */ #define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */ diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 50da8af988c1..54c548ef037a 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -207,7 +207,7 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) * 1) all recovery threads cluster wide will work on recovering * ONE node at a time * 2) negotiate who will take over all the locks for the dead node. - * thats right... ALL the locks. + * that's right... ALL the locks. * 3) once a new master is chosen, everyone scans all locks * and moves aside those mastered by the dead guy * 4) each of these locks should be locked until recovery is done @@ -1469,7 +1469,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, * The first one is handled at the end of this function. The * other two are handled in the worker thread after locks have * been attached. Yes, we don't wait for purge time to match - * kref_init. The lockres will still have atleast one ref + * kref_init. 
The lockres will still have at least one ref * added because it is in the hash __dlm_insert_lockres() */ extra_refs++; @@ -1735,7 +1735,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&res->spinlock); } } else { - /* put.. incase we are not the master */ + /* put.. in case we are not the master */ spin_unlock(&res->spinlock); dlm_lockres_put(res); } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 7fc0e920eda7..2a7f36643895 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -506,9 +507,7 @@ bail: return status; } -static int dlmfs_fill_super(struct super_block * sb, - void * data, - int silent) +static int dlmfs_fill_super(struct super_block *sb, struct fs_context *fc) { sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; @@ -556,17 +555,27 @@ static const struct inode_operations dlmfs_file_inode_operations = { .setattr = dlmfs_file_setattr, }; -static struct dentry *dlmfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int dlmfs_get_tree(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, dlmfs_fill_super); + return get_tree_nodev(fc, dlmfs_fill_super); +} + +static const struct fs_context_operations dlmfs_context_ops = { + .get_tree = dlmfs_get_tree, +}; + +static int dlmfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &dlmfs_context_ops; + + return 0; } static struct file_system_type dlmfs_fs_type = { .owner = THIS_MODULE, .name = "ocfs2_dlmfs", - .mount = dlmfs_mount, .kill_sb = kill_litter_super, + .init_fs_context = dlmfs_init_fs_context, }; MODULE_ALIAS_FS("ocfs2_dlmfs"); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 764ecbd5ad41..c9b62a6d8673 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -794,7 +794,7 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) /* * Keep a list of processes who have interest in a lockres. - * Note: this is now only uesed for check recursive cluster locking. + * Note: this is now only used for check recursive cluster locking. */ static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, struct ocfs2_lock_holder *oh) @@ -2529,30 +2529,28 @@ bail: /* * This is working around a lock inversion between tasks acquiring DLM - * locks while holding a page lock and the downconvert thread which - * blocks dlm lock acquiry while acquiring page locks. + * locks while holding a folio lock and the downconvert thread which + * blocks dlm lock acquiry while acquiring folio locks. * - * ** These _with_page variantes are only intended to be called from aop - * methods that hold page locks and return a very specific *positive* error + * ** These _with_folio variants are only intended to be called from aop + * methods that hold folio locks and return a very specific *positive* error * code that aop methods pass up to the VFS -- test for errors with != 0. ** * * The DLM is called such that it returns -EAGAIN if it would have * blocked waiting for the downconvert thread. In that case we unlock - * our page so the downconvert thread can make progress. Once we've + * our folio so the downconvert thread can make progress. Once we've * done this we have to return AOP_TRUNCATED_PAGE so the aop method * that called us can bubble that back up into the VFS who will then * immediately retry the aop call. 
*/ -int ocfs2_inode_lock_with_page(struct inode *inode, - struct buffer_head **ret_bh, - int ex, - struct page *page) +int ocfs2_inode_lock_with_folio(struct inode *inode, + struct buffer_head **ret_bh, int ex, struct folio *folio) { int ret; ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { - unlock_page(page); + folio_unlock(folio); /* * If we can't get inode lock immediately, we should not return * directly here, since this will lead to a softlockup problem. @@ -2630,7 +2628,7 @@ void ocfs2_inode_unlock(struct inode *inode, } /* - * This _tracker variantes are introduced to deal with the recursive cluster + * This _tracker variants are introduced to deal with the recursive cluster * locking issue. The idea is to keep track of a lock holder on the stack of * the current process. If there's a lock holder on the stack, we know the * task context is already protected by cluster locking. Currently, they're @@ -2735,7 +2733,7 @@ void ocfs2_inode_unlock_tracker(struct inode *inode, struct ocfs2_lock_res *lockres; lockres = &OCFS2_I(inode)->ip_inode_lockres; - /* had_lock means that the currect process already takes the cluster + /* had_lock means that the current process already takes the cluster * lock previously. * If had_lock is 1, we have nothing to do here. * If had_lock is 0, we will release the lock. @@ -3802,9 +3800,9 @@ recheck: * set when the ast is received for an upconvert just before the * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast * on the heels of the ast, we want to delay the downconvert just - * enough to allow the up requestor to do its task. Because this + * enough to allow the up requester to do its task. Because this * lock is in the blocked queue, the lock will be downconverted - * as soon as the requestor is done with the lock. + * as soon as the requester is done with the lock. 
*/ if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) goto leave_requeue; diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index e5da5809ed95..a3ebd7303ea2 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -137,10 +137,8 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, int ex, int arg_flags, int subclass); -int ocfs2_inode_lock_with_page(struct inode *inode, - struct buffer_head **ret_bh, - int ex, - struct page *page); +int ocfs2_inode_lock_with_folio(struct inode *inode, + struct buffer_head **ret_bh, int ex, struct folio *folio); /* Variants without special locking class or flags */ #define ocfs2_inode_lock_full(i, r, e, f)\ ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL) diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f7672472fa82..930150ed5db1 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -435,6 +435,16 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, } } + if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { + ocfs2_error(inode->i_sb, + "Inode %lu has an invalid extent (next_free_rec %u, count %u)\n", + inode->i_ino, + le16_to_cpu(el->l_next_free_rec), + le16_to_cpu(el->l_count)); + ret = -EROFS; + goto out; + } + i = ocfs2_search_extent_list(el, v_cluster); if (i == -1) { /* diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 957ced628eb1..e54f2c4b5a90 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -782,11 +782,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, goto out_commit_trans; } - /* Get the offsets within the page that we want to zero */ - zero_from = abs_from & (PAGE_SIZE - 1); - zero_to = abs_to & (PAGE_SIZE - 1); + /* Get the offsets within the folio that we want to zero */ + zero_from = offset_in_folio(folio, abs_from); + zero_to = offset_in_folio(folio, abs_to); if (!zero_to) - zero_to = PAGE_SIZE; + zero_to = folio_size(folio); trace_ocfs2_write_zero_page( (unsigned long long)OCFS2_I(inode)->ip_blkno, diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 2cc5c99fe941..12e5d1f73325 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -200,6 +200,20 @@ bail: return inode; } +static int ocfs2_dinode_has_extents(struct ocfs2_dinode *di) +{ + /* inodes flagged with other stuff in id2 */ + if (di->i_flags & (OCFS2_SUPER_BLOCK_FL | OCFS2_LOCAL_ALLOC_FL | + OCFS2_CHAIN_FL | OCFS2_DEALLOC_FL)) + return 0; + /* i_flags doesn't indicate when id2 is a fast symlink */ + if (S_ISLNK(di->i_mode) && di->i_size && di->i_clusters == 0) + return 0; + if (di->i_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + return 1; +} /* * here's how inodes get read from disk: @@ -1122,7 +1136,7 @@ static void ocfs2_clear_inode(struct inode *inode) dquot_drop(inode); - /* To preven remote deletes we hold open lock before, now it + /* To prevent remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. */ ocfs2_open_unlock(inode); @@ -1437,7 +1451,7 @@ static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, * Call ocfs2_validate_meta_ecc() first since it has ecc repair * function, but we should not return error immediately when ecc * validation fails, because the reason is quite likely the invalid - * inode number inputed. + * inode number inputted. 
*/ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); if (rc) { @@ -1547,6 +1561,16 @@ static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, le32_to_cpu(di->i_fs_generation)); } + if (ocfs2_dinode_has_extents(di) && + le16_to_cpu(di->id2.i_list.l_next_free_rec) > le16_to_cpu(di->id2.i_list.l_count)) { + di->id2.i_list.l_next_free_rec = di->id2.i_list.l_count; + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: l_next_free_rec to %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(di->id2.i_list.l_next_free_rec)); + } + if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) { ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check); mark_buffer_dirty(bh); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 71beef7f8a60..7ae96fb8807a 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -796,7 +796,7 @@ bail: /* * OCFS2_IOC_INFO handles an array of requests passed from userspace. * - * ocfs2_info_handle() recevies a large info aggregation, grab and + * ocfs2_info_handle() receives a large info aggregation, grab and * validate the request count from header, then break it into small * pieces, later specific handlers can handle them one by one. * diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1bf188b6866a..f1b4b3e611cb 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1956,7 +1956,7 @@ bail: /* * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some - * randomness to the timeout to minimize multple nodes firing the timer at the + * randomness to the timeout to minimize multiple nodes firing the timer at the * same time. */ static inline unsigned long ocfs2_orphan_scan_timeout(void) diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 6ef4cb045ccd..6a314e9f2b49 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -44,13 +44,13 @@ static vm_fault_t ocfs2_fault(struct vm_fault *vmf) } static vm_fault_t __ocfs2_page_mkwrite(struct file *file, - struct buffer_head *di_bh, struct page *page) + struct buffer_head *di_bh, struct folio *folio) { int err; vm_fault_t ret = VM_FAULT_NOPAGE; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; - loff_t pos = page_offset(page); + loff_t pos = folio_pos(folio); unsigned int len = PAGE_SIZE; pgoff_t last_index; struct folio *locked_folio = NULL; @@ -72,9 +72,9 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, * * Let VM retry with these cases. */ - if ((page->mapping != inode->i_mapping) || - (!PageUptodate(page)) || - (page_offset(page) >= size)) + if ((folio->mapping != inode->i_mapping) || + !folio_test_uptodate(folio) || + (pos >= size)) goto out; /* @@ -87,11 +87,11 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, * worry about ocfs2_write_begin() skipping some buffer reads * because the "write" would invalidate their data. 
*/ - if (page->index == last_index) + if (folio->index == last_index) len = ((size - 1) & ~PAGE_MASK) + 1; err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, - &locked_folio, &fsdata, di_bh, page); + &locked_folio, &fsdata, di_bh, folio); if (err) { if (err != -ENOSPC) mlog_errno(err); @@ -112,7 +112,7 @@ out: static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; @@ -141,7 +141,7 @@ static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, page); + ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, folio); up_write(&OCFS2_I(inode)->ip_alloc_sem); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index f9d6a4f9ca92..369c7d27befd 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -492,7 +492,7 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, bg = (struct ocfs2_group_desc *)gd_bh->b_data; /* - * moving goal is not allowd to start with a group desc blok(#0 blk) + * moving goal is not allowed to start with a group desc blok(#0 blk) * let's compromise to the latter cluster. */ if (range->me_goal == le64_to_cpu(bg->bg_blkno)) @@ -658,7 +658,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, /* * probe the victim cluster group to find a proper - * region to fit wanted movement, it even will perfrom + * region to fit wanted movement, it even will perform * a best-effort attempt by compromising to a threshold * around the goal. */ @@ -920,7 +920,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) } /* - * rememer ip_xattr_sem also needs to be held if necessary + * remember ip_xattr_sem also needs to be held if necessary */ down_write(&OCFS2_I(inode)->ip_alloc_sem); @@ -1022,7 +1022,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) context->range = ⦥ /* - * ok, the default theshold for the defragmentation + * ok, the default threshold for the defragmentation * is 1M, since our maximum clustersize was 1M also. * any thought? */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 5550f8afa438..0ec63a1a94b8 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -508,7 +508,6 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct inode *inode, dev_t dev, struct buffer_head **new_fe_bh, - struct buffer_head *parent_fe_bh, handle_t *handle, struct ocfs2_alloc_context *inode_ac, u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) @@ -641,8 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, } return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, - parent_fe_bh, handle, inode_ac, - fe_blkno, suballoc_loc, suballoc_bit); + handle, inode_ac, fe_blkno, + suballoc_loc, suballoc_bit); } static int ocfs2_mkdir(struct mnt_idmap *idmap, @@ -2576,7 +2575,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, clear_nlink(inode); /* do the real work now. */ status = __ocfs2_mknod_locked(dir, inode, - 0, &new_di_bh, parent_di_bh, handle, + 0, &new_di_bh, handle, inode_ac, di_blkno, suballoc_loc, suballoc_bit); if (status < 0) { diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index c93689b568fe..e8e94599e907 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -132,7 +132,7 @@ * well as the name of the cluster being joined. 
* mount.ocfs2 must pass in a matching stack name. * - * If not set, the classic stack will be used. This is compatbile with + * If not set, the classic stack will be used. This is compatible with * all older versions. */ #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 @@ -143,7 +143,7 @@ /* Support for extended attributes */ #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 -/* Support for indexed directores */ +/* Support for indexed directories */ #define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 /* Metadata checksum and error correction */ @@ -156,7 +156,7 @@ #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 /* - * Incompat bit to indicate useable clusterinfo with stackflags for all + * Incompat bit to indicate usable clusterinfo with stackflags for all * cluster stacks (userspace adnd o2cb). If this bit is set, * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. */ @@ -1083,7 +1083,7 @@ struct ocfs2_xattr_block { struct ocfs2_xattr_header xb_header; /* xattr header if this block contains xattr */ struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this - block cotains xattr + block contains xattr tree. */ } xb_attrs; }; diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 9680797bc531..2de2f8733283 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -215,7 +215,7 @@ struct ocfs2_move_extents { movement less likely to fail, may make fs even more fragmented */ -#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation +#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation completely gets done. */ diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 8ac357ce6a30..9b234c03d693 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -93,7 +93,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_DATA] = "Data", [OCFS2_LOCK_TYPE_SUPER] = "Super", [OCFS2_LOCK_TYPE_RENAME] = "Rename", - /* Need to differntiate from [R]ename.. serializing writes is the + /* Need to differentiate from [R]ename.. serializing writes is the * important job it does, anyway. 
*/ [OCFS2_LOCK_TYPE_RW] = "Write/Read", [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 0511c69c9fde..54ed1495de9a 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1658,34 +1658,34 @@ TRACE_EVENT(ocfs2_remount, ); TRACE_EVENT(ocfs2_fill_super, - TP_PROTO(void *sb, void *data, int silent), - TP_ARGS(sb, data, silent), + TP_PROTO(void *sb, void *fc, int silent), + TP_ARGS(sb, fc, silent), TP_STRUCT__entry( __field(void *, sb) - __field(void *, data) + __field(void *, fc) __field(int, silent) ), TP_fast_assign( __entry->sb = sb; - __entry->data = data; + __entry->fc = fc; __entry->silent = silent; ), TP_printk("%p %p %d", __entry->sb, - __entry->data, __entry->silent) + __entry->fc, __entry->silent) ); TRACE_EVENT(ocfs2_parse_options, - TP_PROTO(int is_remount, char *options), - TP_ARGS(is_remount, options), + TP_PROTO(int is_remount, const char *option), + TP_ARGS(is_remount, option), TP_STRUCT__entry( __field(int, is_remount) - __string(options, options) + __string(option, option) ), TP_fast_assign( __entry->is_remount = is_remount; - __assign_str(options); + __assign_str(option); ), - TP_printk("%d %s", __entry->is_remount, __get_str(options)) + TP_printk("%d %s", __entry->is_remount, __get_str(option)) ); DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 3404e7a30c33..15d9acd456ec 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -761,6 +761,11 @@ static int ocfs2_release_dquot(struct dquot *dquot) handle = ocfs2_start_trans(osb, ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type)); if (IS_ERR(handle)) { + /* + * Mark dquot as inactive to avoid endless cycle in + * quota_release_workfn(). + */ + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); status = PTR_ERR(handle); mlog_errno(status); goto out_ilock; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 004393b13c0a..8f732742b26e 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2420,7 +2420,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, * * If we will insert a new one, this is easy and only happens * during adding refcounted flag to the extent, so we don't - * have a chance of spliting. We just need one record. + * have a chance of splitting. We just need one record. * * If the refcount rec already exists, that would be a little * complicated. we may have to: @@ -2610,11 +2610,11 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, /* * Calculate out the start and number of virtual clusters we need to CoW. * - * cpos is vitual start cluster position we want to do CoW in a + * cpos is virtual start cluster position we want to do CoW in a * file and write_len is the cluster length. * max_cpos is the place where we want to stop CoW intentionally. * - * Normal we will start CoW from the beginning of extent record cotaining cpos. + * Normal we will start CoW from the beginning of extent record containing cpos. * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we * get good I/O from the resulting extent tree. 
*/ @@ -2902,7 +2902,6 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, int ret = 0, partial; struct super_block *sb = inode->i_sb; u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); - struct page *page; pgoff_t page_index; unsigned int from, to; loff_t offset, end, map_end; @@ -2921,6 +2920,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, end = i_size_read(inode); while (offset < end) { + struct folio *folio; page_index = offset >> PAGE_SHIFT; map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; if (map_end > end) @@ -2933,9 +2933,10 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, to = map_end & (PAGE_SIZE - 1); retry: - page = find_or_create_page(mapping, page_index, GFP_NOFS); - if (!page) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, page_index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); mlog_errno(ret); break; } @@ -2945,9 +2946,9 @@ retry: * page, so write it back. */ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { - if (PageDirty(page)) { - unlock_page(page); - put_page(page); + if (folio_test_dirty(folio)) { + folio_unlock(folio); + folio_put(folio); ret = filemap_write_and_wait_range(mapping, offset, map_end - 1); @@ -2955,9 +2956,7 @@ retry: } } - if (!PageUptodate(page)) { - struct folio *folio = page_folio(page); - + if (!folio_test_uptodate(folio)) { ret = block_read_full_folio(folio, ocfs2_get_block); if (ret) { mlog_errno(ret); @@ -2966,8 +2965,8 @@ retry: folio_lock(folio); } - if (page_has_buffers(page)) { - ret = walk_page_buffers(handle, page_buffers(page), + if (folio_buffers(folio)) { + ret = walk_page_buffers(handle, folio_buffers(folio), from, to, &partial, ocfs2_clear_cow_buffer); if (ret) { @@ -2976,14 +2975,12 @@ retry: } } - ocfs2_map_and_dirty_page(inode, - handle, from, to, - page, 0, &new_block); - mark_page_accessed(page); + ocfs2_map_and_dirty_folio(inode, handle, from, to, + folio, 0, &new_block); + folio_mark_accessed(folio); unlock: - unlock_page(page); - put_page(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); offset = map_end; if (ret) break; diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index ec8101ef5717..4fce17180342 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -31,7 +31,7 @@ struct ocfs2_alloc_reservation { #define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */ #define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be - * destroyed immedately after use */ + * destroyed immediately after use */ #define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed * directory btree */ @@ -125,7 +125,7 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, /** * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used. * @resmap: reservations bitmap - * @resv: optional reservation to recalulate based on new bitmap + * @resv: optional reservation to recalculate based on new bitmap * @cstart: start of allocation in clusters * @clen: end of allocation in clusters. * diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 10157d9d7a9c..f58e891aa2da 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -227,7 +227,7 @@ static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) } /* - * o2dlm aways has a "valid" LVB. If the dlm loses track of the LVB + * o2dlm always has a "valid" LVB. If the dlm loses track of the LVB * contents, it will zero out the LVB. Thus the caller can always trust * the contents. 
*/ diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 02ab072c528a..5486a6dce70a 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -210,7 +210,7 @@ struct ocfs2_stack_operations { struct file_lock *fl); /* - * This is an optoinal debugging hook. If provided, the + * This is an optional debugging hook. If provided, the * stack can dump debugging information about this lock. */ void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c79b4291777f..e0b91dbaa0ac 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -19,10 +19,10 @@ #include #include #include -#include +#include +#include #include #include -#include #include #include #include @@ -80,17 +80,15 @@ struct mount_options unsigned int resv_level; int dir_resv_level; char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + bool user_stack; }; -static int ocfs2_parse_options(struct super_block *sb, char *options, - struct mount_options *mopt, - int is_remount); +static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options); static int ocfs2_show_options(struct seq_file *s, struct dentry *root); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); -static int ocfs2_remount(struct super_block *sb, int *flags, char *data); static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); static int ocfs2_initialize_mem_caches(void); static void ocfs2_free_mem_caches(void); @@ -135,7 +133,6 @@ static const struct super_operations ocfs2_sops = { .evict_inode = ocfs2_evict_inode, .sync_fs = ocfs2_sync_fs, .put_super = ocfs2_put_super, - .remount_fs = ocfs2_remount, .show_options = ocfs2_show_options, .quota_read = ocfs2_quota_read, .quota_write = ocfs2_quota_write, @@ -144,15 +141,10 @@ static const struct super_operations ocfs2_sops = { enum { Opt_barrier, - Opt_err_panic, - Opt_err_ro, + Opt_errors, Opt_intr, - Opt_nointr, - Opt_hb_none, - Opt_hb_local, - Opt_hb_global, - Opt_data_ordered, - Opt_data_writeback, + Opt_heartbeat, + Opt_data, Opt_atime_quantum, Opt_slot, Opt_commit, @@ -160,52 +152,64 @@ enum { Opt_localflocks, Opt_stack, Opt_user_xattr, - Opt_nouser_xattr, Opt_inode64, Opt_acl, - Opt_noacl, Opt_usrquota, Opt_grpquota, - Opt_coherency_buffered, - Opt_coherency_full, + Opt_coherency, Opt_resv_level, Opt_dir_resv_level, Opt_journal_async_commit, - Opt_err_cont, - Opt_err, }; -static const match_table_t tokens = { - {Opt_barrier, "barrier=%u"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_intr, "intr"}, - {Opt_nointr, "nointr"}, - {Opt_hb_none, OCFS2_HB_NONE}, - {Opt_hb_local, OCFS2_HB_LOCAL}, - {Opt_hb_global, OCFS2_HB_GLOBAL}, - {Opt_data_ordered, "data=ordered"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_atime_quantum, "atime_quantum=%u"}, - {Opt_slot, "preferred_slot=%u"}, - {Opt_commit, "commit=%u"}, - {Opt_localalloc, "localalloc=%d"}, - {Opt_localflocks, "localflocks"}, - {Opt_stack, "cluster_stack=%s"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_inode64, "inode64"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_usrquota, "usrquota"}, - {Opt_grpquota, "grpquota"}, - {Opt_coherency_buffered, "coherency=buffered"}, - {Opt_coherency_full, "coherency=full"}, - {Opt_resv_level, "resv_level=%u"}, - {Opt_dir_resv_level, "dir_resv_level=%u"}, - {Opt_journal_async_commit, "journal_async_commit"}, - {Opt_err_cont, 
"errors=continue"}, - {Opt_err, NULL} +static const struct constant_table ocfs2_param_errors[] = { + {"panic", OCFS2_MOUNT_ERRORS_PANIC}, + {"remount-ro", OCFS2_MOUNT_ERRORS_ROFS}, + {"continue", OCFS2_MOUNT_ERRORS_CONT}, + {} +}; + +static const struct constant_table ocfs2_param_heartbeat[] = { + {"local", OCFS2_MOUNT_HB_LOCAL}, + {"none", OCFS2_MOUNT_HB_NONE}, + {"global", OCFS2_MOUNT_HB_GLOBAL}, + {} +}; + +static const struct constant_table ocfs2_param_data[] = { + {"writeback", OCFS2_MOUNT_DATA_WRITEBACK}, + {"ordered", 0}, + {} +}; + +static const struct constant_table ocfs2_param_coherency[] = { + {"buffered", OCFS2_MOUNT_COHERENCY_BUFFERED}, + {"full", 0}, + {} +}; + +static const struct fs_parameter_spec ocfs2_param_spec[] = { + fsparam_u32 ("barrier", Opt_barrier), + fsparam_enum ("errors", Opt_errors, ocfs2_param_errors), + fsparam_flag_no ("intr", Opt_intr), + fsparam_enum ("heartbeat", Opt_heartbeat, ocfs2_param_heartbeat), + fsparam_enum ("data", Opt_data, ocfs2_param_data), + fsparam_u32 ("atime_quantum", Opt_atime_quantum), + fsparam_u32 ("preferred_slot", Opt_slot), + fsparam_u32 ("commit", Opt_commit), + fsparam_s32 ("localalloc", Opt_localalloc), + fsparam_flag ("localflocks", Opt_localflocks), + fsparam_string ("cluster_stack", Opt_stack), + fsparam_flag_no ("user_xattr", Opt_user_xattr), + fsparam_flag ("inode64", Opt_inode64), + fsparam_flag_no ("acl", Opt_acl), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_enum ("coherency", Opt_coherency, ocfs2_param_coherency), + fsparam_u32 ("resv_level", Opt_resv_level), + fsparam_u32 ("dir_resv_level", Opt_dir_resv_level), + fsparam_flag ("journal_async_commit", Opt_journal_async_commit), + {} }; #ifdef CONFIG_DEBUG_FS @@ -600,32 +604,32 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, return (((unsigned long long)bytes) << bitshift) - trim; } -static int ocfs2_remount(struct super_block *sb, int *flags, char *data) +static int ocfs2_reconfigure(struct fs_context *fc) { int incompat_features; int ret = 0; - struct mount_options parsed_options; + struct mount_options *parsed_options = fc->fs_private; + struct super_block *sb = fc->root->d_sb; struct ocfs2_super *osb = OCFS2_SB(sb); u32 tmp; sync_filesystem(sb); - if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || - !ocfs2_check_set_options(sb, &parsed_options)) { + if (!ocfs2_check_set_options(sb, parsed_options)) { ret = -EINVAL; goto out; } tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; - if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + if ((osb->s_mount_opt & tmp) != (parsed_options->mount_opt & tmp)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); goto out; } if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != - (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { + (parsed_options->mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change data mode on remount\n"); goto out; @@ -634,16 +638,16 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) /* Probably don't want this on remount; it might * mess with other nodes */ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && - (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { + (parsed_options->mount_opt & OCFS2_MOUNT_INODE64)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); goto out; } /* We're going to/from readonly mode. 
*/ - if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { + if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { /* Disable quota accounting before remounting RO */ - if (*flags & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { ret = ocfs2_susp_quotas(osb, 0); if (ret < 0) goto out; @@ -657,7 +661,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto unlock_osb; } - if (*flags & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; } else { @@ -678,11 +682,11 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~SB_RDONLY; osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; } - trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); + trace_ocfs2_remount(sb->s_flags, osb->osb_flags, fc->sb_flags); unlock_osb: spin_unlock(&osb->osb_lock); /* Enable quota accounting after remounting RW */ - if (!ret && !(*flags & SB_RDONLY)) { + if (!ret && !(fc->sb_flags & SB_RDONLY)) { if (sb_any_quota_suspended(sb)) ret = ocfs2_susp_quotas(osb, 1); else @@ -701,11 +705,11 @@ unlock_osb: if (!ret) { /* Only save off the new mount options in case of a successful * remount. */ - osb->s_mount_opt = parsed_options.mount_opt; - osb->s_atime_quantum = parsed_options.atime_quantum; - osb->preferred_slot = parsed_options.slot; - if (parsed_options.commit_interval) - osb->osb_commit_interval = parsed_options.commit_interval; + osb->s_mount_opt = parsed_options->mount_opt; + osb->s_atime_quantum = parsed_options->atime_quantum; + osb->preferred_slot = parsed_options->slot; + if (parsed_options->commit_interval) + osb->osb_commit_interval = parsed_options->commit_interval; if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); @@ -966,23 +970,18 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) } } -static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) +static int ocfs2_fill_super(struct super_block *sb, struct fs_context *fc) { struct dentry *root; int status, sector_size; - struct mount_options parsed_options; + struct mount_options *parsed_options = fc->fs_private; struct inode *inode = NULL; struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; char nodestr[12]; struct ocfs2_blockcheck_stats stats; - trace_ocfs2_fill_super(sb, data, silent); - - if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { - status = -EINVAL; - goto out; - } + trace_ocfs2_fill_super(sb, fc, fc->sb_flags & SB_SILENT); /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats); @@ -999,24 +998,24 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb = OCFS2_SB(sb); - if (!ocfs2_check_set_options(sb, &parsed_options)) { + if (!ocfs2_check_set_options(sb, parsed_options)) { status = -EINVAL; goto out_super; } - osb->s_mount_opt = parsed_options.mount_opt; - osb->s_atime_quantum = parsed_options.atime_quantum; - osb->preferred_slot = parsed_options.slot; - osb->osb_commit_interval = parsed_options.commit_interval; + osb->s_mount_opt = parsed_options->mount_opt; + osb->s_atime_quantum = parsed_options->atime_quantum; + osb->preferred_slot = parsed_options->slot; + osb->osb_commit_interval = parsed_options->commit_interval; - ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); - osb->osb_resv_level = parsed_options.resv_level; - osb->osb_dir_resv_level = parsed_options.resv_level; - if (parsed_options.dir_resv_level == -1) - osb->osb_dir_resv_level = parsed_options.resv_level; + ocfs2_la_set_sizes(osb,
parsed_options->localalloc_opt); + osb->osb_resv_level = parsed_options->resv_level; + osb->osb_dir_resv_level = parsed_options->resv_level; + if (parsed_options->dir_resv_level == -1) + osb->osb_dir_resv_level = parsed_options->resv_level; else - osb->osb_dir_resv_level = parsed_options.dir_resv_level; + osb->osb_dir_resv_level = parsed_options->dir_resv_level; - status = ocfs2_verify_userspace_stack(osb, &parsed_options); + status = ocfs2_verify_userspace_stack(osb, parsed_options); if (status) goto out_super; @@ -1180,27 +1179,72 @@ out: return status; } -static struct dentry *ocfs2_mount(struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *data) +static int ocfs2_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); + return get_tree_bdev(fc, ocfs2_fill_super); +} + +static void ocfs2_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations ocfs2_context_ops = { + .parse_param = ocfs2_parse_param, + .get_tree = ocfs2_get_tree, + .reconfigure = ocfs2_reconfigure, + .free = ocfs2_free_fc, +}; + +static int ocfs2_init_fs_context(struct fs_context *fc) +{ + struct mount_options *mopt; + + mopt = kzalloc(sizeof(struct mount_options), GFP_KERNEL); + if (!mopt) + return -EINVAL; + + mopt->commit_interval = 0; + mopt->mount_opt = OCFS2_MOUNT_NOINTR; + mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; + mopt->slot = OCFS2_INVALID_SLOT; + mopt->localalloc_opt = -1; + mopt->cluster_stack[0] = '\0'; + mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; + mopt->dir_resv_level = -1; + + fc->fs_private = mopt; + fc->ops = &ocfs2_context_ops; + + return 0; } static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", - .mount = ocfs2_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, - .next = NULL + .next = NULL, + .init_fs_context = ocfs2_init_fs_context, + .parameters = ocfs2_param_spec, }; MODULE_ALIAS_FS("ocfs2"); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options) { + if (options->user_stack == 0) { + u32 tmp; + + /* Ensure only one heartbeat mode */ + tmp = options->mount_opt & (OCFS2_MOUNT_HB_LOCAL | + OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + return 0; + } + } if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { @@ -1232,241 +1276,142 @@ static int ocfs2_check_set_options(struct super_block *sb, return 1; } -static int ocfs2_parse_options(struct super_block *sb, - char *options, - struct mount_options *mopt, - int is_remount) +static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) { - int status, user_stack = 0; - char *p; - u32 tmp; - int token, option; - substring_t args[MAX_OPT_ARGS]; + struct fs_parse_result result; + int opt; + struct mount_options *mopt = fc->fs_private; + bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE); - trace_ocfs2_parse_options(is_remount, options ? 
options : "(none)"); + trace_ocfs2_parse_options(is_remount, param->key); - mopt->commit_interval = 0; - mopt->mount_opt = OCFS2_MOUNT_NOINTR; - mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; - mopt->slot = OCFS2_INVALID_SLOT; - mopt->localalloc_opt = -1; - mopt->cluster_stack[0] = '\0'; - mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; - mopt->dir_resv_level = -1; + opt = fs_parse(fc, ocfs2_param_spec, param, &result); + if (opt < 0) + return opt; - if (!options) { - status = 1; - goto bail; - } - - while ((p = strsep(&options, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_hb_local: - mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; - break; - case Opt_hb_none: - mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; - break; - case Opt_hb_global: - mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; - break; - case Opt_barrier: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option) - mopt->mount_opt |= OCFS2_MOUNT_BARRIER; - else - mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; - break; - case Opt_intr: - mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; - break; - case Opt_nointr: + switch (opt) { + case Opt_heartbeat: + mopt->mount_opt |= result.uint_32; + break; + case Opt_barrier: + if (result.uint_32) + mopt->mount_opt |= OCFS2_MOUNT_BARRIER; + else + mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; + break; + case Opt_intr: + if (result.negated) mopt->mount_opt |= OCFS2_MOUNT_NOINTR; - break; - case Opt_err_panic: - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; - mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; - break; - case Opt_err_ro: - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; - mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS; - break; - case Opt_err_cont: - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; - mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT; - break; - case Opt_data_ordered: - mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; - break; - case Opt_data_writeback: - mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; - break; - case Opt_user_xattr: - mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; - break; - case Opt_nouser_xattr: + else + mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; + break; + case Opt_errors: + mopt->mount_opt &= ~(OCFS2_MOUNT_ERRORS_CONT | + OCFS2_MOUNT_ERRORS_ROFS | + OCFS2_MOUNT_ERRORS_PANIC); + mopt->mount_opt |= result.uint_32; + break; + case Opt_data: + mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; + mopt->mount_opt |= result.uint_32; + break; + case Opt_user_xattr: + if (result.negated) mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; - break; - case Opt_atime_quantum: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= 0) - mopt->atime_quantum = option; - break; - case Opt_slot: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option) - mopt->slot = (u16)option; - break; - case Opt_commit: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option < 0) - return 0; - if (option == 0) - option = JBD2_DEFAULT_MAX_COMMIT_AGE; - mopt->commit_interval = HZ * option; - break; - case Opt_localalloc: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= 0) - mopt->localalloc_opt = option; - break; - case Opt_localflocks: - /* - * Changing this during remount could race - * flock() requests, or "unbalance" existing - * ones (e.g., a lock is taken in one mode but - * dropped in the other). 
If users care enough - * to flip locking modes during remount, we - * could add a "local" flag to individual - * flock structures for proper tracking of - * state. - */ - if (!is_remount) - mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; - break; - case Opt_stack: - /* Check both that the option we were passed - * is of the right length and that it is a proper - * string of the right length. - */ - if (((args[0].to - args[0].from) != - OCFS2_STACK_LABEL_LEN) || - (strnlen(args[0].from, - OCFS2_STACK_LABEL_LEN) != - OCFS2_STACK_LABEL_LEN)) { - mlog(ML_ERROR, - "Invalid cluster_stack option\n"); - status = 0; - goto bail; - } - memcpy(mopt->cluster_stack, args[0].from, - OCFS2_STACK_LABEL_LEN); - mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; - /* - * Open code the memcmp here as we don't have - * an osb to pass to - * ocfs2_userspace_stack(). - */ - if (memcmp(mopt->cluster_stack, - OCFS2_CLASSIC_CLUSTER_STACK, - OCFS2_STACK_LABEL_LEN)) - user_stack = 1; - break; - case Opt_inode64: - mopt->mount_opt |= OCFS2_MOUNT_INODE64; - break; - case Opt_usrquota: - mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; - break; - case Opt_grpquota: - mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; - break; - case Opt_coherency_buffered: - mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; - break; - case Opt_coherency_full: - mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; - break; - case Opt_acl: - mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; - mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; - break; - case Opt_noacl: + else + mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_atime_quantum: + mopt->atime_quantum = result.uint_32; + break; + case Opt_slot: + if (result.uint_32) + mopt->slot = (u16)result.uint_32; + break; + case Opt_commit: + if (result.uint_32 == 0) + mopt->commit_interval = HZ * JBD2_DEFAULT_MAX_COMMIT_AGE; + else + mopt->commit_interval = HZ * result.uint_32; + break; + case Opt_localalloc: + if (result.int_32 >= 0) + mopt->localalloc_opt = result.int_32; + break; + case Opt_localflocks: + /* + * Changing this during remount could race flock() requests, or + * "unbalance" existing ones (e.g., a lock is taken in one mode + * but dropped in the other). If users care enough to flip + * locking modes during remount, we could add a "local" flag to + * individual flock structures for proper tracking of state. + */ + if (!is_remount) + mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; + break; + case Opt_stack: + /* Check both that the option we were passed is of the right + * length and that it is a proper string of the right length. + */ + if (strlen(param->string) != OCFS2_STACK_LABEL_LEN) { + mlog(ML_ERROR, "Invalid cluster_stack option\n"); + return -EINVAL; + } + memcpy(mopt->cluster_stack, param->string, OCFS2_STACK_LABEL_LEN); + mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + /* + * Open code the memcmp here as we don't have an osb to pass + * to ocfs2_userspace_stack(). 
+ */ + if (memcmp(mopt->cluster_stack, + OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + mopt->user_stack = 1; + break; + case Opt_inode64: + mopt->mount_opt |= OCFS2_MOUNT_INODE64; + break; + case Opt_usrquota: + mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; + break; + case Opt_grpquota: + mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; + break; + case Opt_coherency: + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; + mopt->mount_opt |= result.uint_32; + break; + case Opt_acl: + if (result.negated) { mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; - break; - case Opt_resv_level: - if (is_remount) - break; - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= OCFS2_MIN_RESV_LEVEL && - option < OCFS2_MAX_RESV_LEVEL) - mopt->resv_level = option; - break; - case Opt_dir_resv_level: - if (is_remount) - break; - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= OCFS2_MIN_RESV_LEVEL && - option < OCFS2_MAX_RESV_LEVEL) - mopt->dir_resv_level = option; - break; - case Opt_journal_async_commit: - mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; - break; - default: - mlog(ML_ERROR, - "Unrecognized mount option \"%s\" " - "or missing value\n", p); - status = 0; - goto bail; + } else { + mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; } + break; + case Opt_resv_level: + if (is_remount) + break; + if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL && + result.uint_32 < OCFS2_MAX_RESV_LEVEL) + mopt->resv_level = result.uint_32; + break; + case Opt_dir_resv_level: + if (is_remount) + break; + if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL && + result.uint_32 < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = result.uint_32; + break; + case Opt_journal_async_commit: + mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; + break; + default: + return -EINVAL; } - if (user_stack == 0) { - /* Ensure only one heartbeat mode */ - tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | - OCFS2_MOUNT_HB_GLOBAL | - OCFS2_MOUNT_HB_NONE); - if (hweight32(tmp) != 1) { - mlog(ML_ERROR, "Invalid heartbeat mount options\n"); - status = 0; - goto bail; - } - } - - status = 1; - -bail: - return status; + return 0; } static int ocfs2_show_options(struct seq_file *s, struct dentry *root) @@ -1858,7 +1803,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); - /* Remove file check sysfs related directores/files, + /* Remove file check sysfs related directories/files, * and wait for the pending file check operations */ ocfs2_filecheck_remove_sysfs(osb); diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index d4c5fdcfa1e4..ad8be3300b49 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -54,31 +54,27 @@ static int ocfs2_fast_symlink_read_folio(struct file *f, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct buffer_head *bh = NULL; int status = ocfs2_read_inode_block(inode, &bh); struct ocfs2_dinode *fe; const char *link; - void *kaddr; size_t len; if (status < 0) { mlog_errno(status); - return status; + goto out; } fe = (struct ocfs2_dinode *) bh->b_data; link = (char *) fe->id2.i_symlink; /* will be less than a page size */ len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb)); - kaddr = kmap_atomic(page); - memcpy(kaddr, link, len + 1); - kunmap_atomic(kaddr); - SetPageUptodate(page); - unlock_page(page); + 
memcpy_to_folio(folio, 0, link, len + 1); +out: + folio_end_read(folio, status == 0); brelse(bh); - return 0; + return status; } const struct address_space_operations ocfs2_fast_symlink_aops = { diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 73a6f6fd8a8e..d70a20d29e3e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -648,7 +648,7 @@ int ocfs2_calc_xattr_init(struct inode *dir, * 256(name) + 80(value) + 16(entry) = 352 bytes, * The max space of acl xattr taken inline is * 80(value) + 16(entry) * 2(if directory) = 192 bytes, - * when blocksize = 512, may reserve one more cluser for + * when blocksize = 512, may reserve one more cluster for * xattr bucket, otherwise reserve one metadata block * for them is ok. * If this is a new directory with inline data, @@ -4371,7 +4371,7 @@ static int cmp_xe_offset(const void *a, const void *b) /* * defrag a xattr bucket if we find that the bucket has some - * holes beteen name/value pairs. + * holes between name/value pairs. * We will move all the name/value pairs to the end of the bucket * so that we can spare some space for insertion. */ @@ -5011,7 +5011,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, * 2. If cluster_size == bucket_size: * a) If the previous extent rec has more than one cluster and the insert * place isn't in the last cluster, copy the entire last cluster to the - * new one. This time, we don't need to upate the first_bh and header_bh + * new one. This time, we don't need to update the first_bh and header_bh * since they will not be moved into the new cluster. * b) Otherwise, move the bottom half of the xattrs in the last cluster into * the new one. And we set the extend flag to zero if the insert place is @@ -6189,7 +6189,7 @@ struct ocfs2_xattr_reflink { /* * Given a xattr header and xe offset, * return the proper xv and the corresponding bh. - * xattr in inode, block and xattr tree have different implementaions. + * xattr in inode, block and xattr tree have different implementations. */ typedef int (get_xattr_value_root)(struct super_block *sb, struct buffer_head *bh, @@ -6269,7 +6269,7 @@ static int ocfs2_get_xattr_value_root(struct super_block *sb, } /* - * Lock the meta_ac and caculate how much credits we need for reflink xattrs. + * Lock the meta_ac and calculate how much credits we need for reflink xattrs. * It is only used for inline xattr and xattr block. 
*/ static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb, diff --git a/fs/open.c b/fs/open.c index e6911101fe71..0f75e220b700 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1504,7 +1504,7 @@ static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; - if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, + if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp, "VFS: Close: file count is 0 (f_op=%ps)", filp->f_op)) { return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 0edf14a9840e..a50b222a5917 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3269,6 +3269,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; + int ret = 0; mm = get_task_mm(task); if (mm) { @@ -3276,6 +3277,16 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); + seq_printf(m, "ksm_merge_any: %s\n", + test_bit(MMF_VM_MERGE_ANY, &mm->flags) ? "yes" : "no"); + ret = mmap_read_lock_killable(mm); + if (ret) { + mmput(mm); + return ret; + } + seq_printf(m, "ksm_mergeable: %s\n", + ksm_process_mergeable(mm) ? "yes" : "no"); + mmap_read_unlock(mm); mmput(mm); } diff --git a/fs/splice.c b/fs/splice.c index 2898fa1e9e63..28cfa63aa236 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -342,7 +342,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, return -ENOMEM; pages = (struct page **)(bv + npages); - npages = alloc_pages_bulk_array(GFP_USER, npages, pages); + npages = alloc_pages_bulk(GFP_USER, npages, pages); if (!npages) { kfree(bv); return -ENOMEM; diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 60fc98bdf421..b1091e70434a 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -5,8 +5,8 @@ config SQUASHFS help Saying Y here includes support for SquashFS 4.0 (a Compressed Read-Only File System). Squashfs is a highly compressed read-only - filesystem for Linux. It uses zlib, lzo or xz compression to - compress both files, inodes and directories. Inodes in the system + filesystem for Linux. It uses zlib, lz4, lzo, xz or zstd compression + to compress both files, inodes and directories. Inodes in the system are very small and all blocks are packed to minimise data overhead. Block sizes greater than 4K are supported up to a maximum of 1 Mbytes (default block size 128K). SquashFS 4.0 supports 64 bit filesystems @@ -16,7 +16,7 @@ config SQUASHFS Squashfs is intended for general read-only filesystem use, for archival use (i.e. in cases where a .tar.gz file may be used), and in embedded systems where low overhead is needed. Further information - and tools are available from http://squashfs.sourceforge.net. + and tools are available from github.com/plougher/squashfs-tools. 
If you want to compile this as a module ( = code which can be inserted in and removed from the running kernel whenever you want), diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 5062326d0efb..4db0d2b0aab8 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -224,11 +224,15 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, int block_size) { int i, j; - struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); + struct squashfs_cache *cache; + if (entries == 0) + return NULL; + + cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) { ERROR("Failed to allocate %s cache\n", name); - return NULL; + return ERR_PTR(-ENOMEM); } cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL); @@ -281,7 +285,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, cleanup: squashfs_cache_delete(cache); - return NULL; + return ERR_PTR(-ENOMEM); } diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 21aaa96856c1..5ca2baa16dc2 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -362,29 +362,33 @@ static int read_blocklist(struct inode *inode, int index, u64 *block) return squashfs_block_size(size); } -void squashfs_fill_page(struct page *page, struct squashfs_cache_entry *buffer, int offset, int avail) +static bool squashfs_fill_page(struct folio *folio, + struct squashfs_cache_entry *buffer, size_t offset, + size_t avail) { - int copied; + size_t copied; void *pageaddr; - pageaddr = kmap_atomic(page); + pageaddr = kmap_local_folio(folio, 0); copied = squashfs_copy_data(pageaddr, buffer, offset, avail); memset(pageaddr + copied, 0, PAGE_SIZE - copied); - kunmap_atomic(pageaddr); + kunmap_local(pageaddr); - flush_dcache_page(page); - if (copied == avail) - SetPageUptodate(page); + flush_dcache_folio(folio); + + return copied == avail; } /* Copy data into page cache */ -void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, - int bytes, int offset) +void squashfs_copy_cache(struct folio *folio, + struct squashfs_cache_entry *buffer, size_t bytes, + size_t offset) { - struct inode *inode = page->mapping->host; + struct address_space *mapping = folio->mapping; + struct inode *inode = mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; int i, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; - int start_index = page->index & ~mask, end_index = start_index | mask; + int start_index = folio->index & ~mask, end_index = start_index | mask; /* * Loop copying datablock into pages. As the datablock likely covers @@ -394,32 +398,35 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, */ for (i = start_index; i <= end_index && bytes > 0; i++, bytes -= PAGE_SIZE, offset += PAGE_SIZE) { - struct page *push_page; - int avail = buffer ? min_t(int, bytes, PAGE_SIZE) : 0; + struct folio *push_folio; + size_t avail = buffer ? min(bytes, PAGE_SIZE) : 0; + bool updated = false; - TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); + TRACE("bytes %zu, i %d, available_bytes %zu\n", bytes, i, avail); - push_page = (i == page->index) ? page : - grab_cache_page_nowait(page->mapping, i); + push_folio = (i == folio->index) ? 
folio : + __filemap_get_folio(mapping, i, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + mapping_gfp_mask(mapping)); - if (!push_page) + if (IS_ERR(push_folio)) continue; - if (PageUptodate(push_page)) - goto skip_page; + if (folio_test_uptodate(push_folio)) + goto skip_folio; - squashfs_fill_page(push_page, buffer, offset, avail); -skip_page: - unlock_page(push_page); - if (i != page->index) - put_page(push_page); + updated = squashfs_fill_page(push_folio, buffer, offset, avail); +skip_folio: + folio_end_read(push_folio, updated); + if (i != folio->index) + folio_put(push_folio); } } /* Read datablock stored packed inside a fragment (tail-end packed block) */ -static int squashfs_readpage_fragment(struct page *page, int expected) +static int squashfs_readpage_fragment(struct folio *folio, int expected) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, squashfs_i(inode)->fragment_block, squashfs_i(inode)->fragment_size); @@ -430,36 +437,34 @@ static int squashfs_readpage_fragment(struct page *page, int expected) squashfs_i(inode)->fragment_block, squashfs_i(inode)->fragment_size); else - squashfs_copy_cache(page, buffer, expected, + squashfs_copy_cache(folio, buffer, expected, squashfs_i(inode)->fragment_offset); squashfs_cache_put(buffer); return res; } -static int squashfs_readpage_sparse(struct page *page, int expected) +static int squashfs_readpage_sparse(struct folio *folio, int expected) { - squashfs_copy_cache(page, NULL, expected, 0); + squashfs_copy_cache(folio, NULL, expected, 0); return 0; } static int squashfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int index = page->index >> (msblk->block_log - PAGE_SHIFT); + int index = folio->index >> (msblk->block_log - PAGE_SHIFT); int file_end = i_size_read(inode) >> msblk->block_log; int expected = index == file_end ? 
(i_size_read(inode) & (msblk->block_size - 1)) : msblk->block_size; int res = 0; - void *pageaddr; TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", - page->index, squashfs_i(inode)->start); + folio->index, squashfs_i(inode)->start); - if (page->index >= ((i_size_read(inode) + PAGE_SIZE - 1) >> + if (folio->index >= ((i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT)) goto out; @@ -472,23 +477,18 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) goto out; if (res == 0) - res = squashfs_readpage_sparse(page, expected); + res = squashfs_readpage_sparse(folio, expected); else - res = squashfs_readpage_block(page, block, res, expected); + res = squashfs_readpage_block(folio, block, res, expected); } else - res = squashfs_readpage_fragment(page, expected); + res = squashfs_readpage_fragment(folio, expected); if (!res) return 0; out: - pageaddr = kmap_atomic(page); - memset(pageaddr, 0, PAGE_SIZE); - kunmap_atomic(pageaddr); - flush_dcache_page(page); - if (res == 0) - SetPageUptodate(page); - unlock_page(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_end_read(folio, res == 0); return res; } diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c index 54c17b7c85fd..40e59a43d098 100644 --- a/fs/squashfs/file_cache.c +++ b/fs/squashfs/file_cache.c @@ -18,9 +18,9 @@ #include "squashfs.h" /* Read separately compressed datablock and memcopy into page cache */ -int squashfs_readpage_block(struct page *page, u64 block, int bsize, int expected) +int squashfs_readpage_block(struct folio *folio, u64 block, int bsize, int expected) { - struct inode *i = page->mapping->host; + struct inode *i = folio->mapping->host; struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, block, bsize); int res = buffer->error; @@ -29,7 +29,7 @@ int squashfs_readpage_block(struct page *page, u64 block, int bsize, int expecte ERROR("Unable to read page, block %llx, size %x\n", block, bsize); else - squashfs_copy_cache(page, buffer, expected, 0); + squashfs_copy_cache(folio, buffer, expected, 0); squashfs_cache_put(buffer); return res; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index d19d4db74af8..2c3e809d6891 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -19,12 +19,11 @@ #include "page_actor.h" /* Read separately compressed datablock directly into page cache */ -int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, - int expected) - +int squashfs_readpage_block(struct folio *folio, u64 block, int bsize, + int expected) { - struct folio *folio = page_folio(target_page); - struct inode *inode = target_page->mapping->host; + struct page *target_page = &folio->page; + struct inode *inode = folio->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; @@ -48,7 +47,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, /* Try to grab all the pages covered by the Squashfs block */ for (i = 0, index = start_index; index <= end_index; index++) { page[i] = (index == folio->index) ? 
target_page : - grab_cache_page_nowait(target_page->mapping, index); + grab_cache_page_nowait(folio->mapping, index); if (page[i] == NULL) continue; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 5a756e6790b5..218868b20f16 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -14,6 +14,12 @@ #define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args) +#ifdef CONFIG_SQUASHFS_FILE_CACHE +#define SQUASHFS_READ_PAGES msblk->max_thread_num +#else +#define SQUASHFS_READ_PAGES 0 +#endif + /* block.c */ extern int squashfs_read_data(struct super_block *, u64, int, u64 *, struct squashfs_page_actor *); @@ -67,12 +73,11 @@ extern __le64 *squashfs_read_fragment_index_table(struct super_block *, u64, u64, unsigned int); /* file.c */ -void squashfs_fill_page(struct page *, struct squashfs_cache_entry *, int, int); -void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int, - int); +void squashfs_copy_cache(struct folio *, struct squashfs_cache_entry *, + size_t bytes, size_t offset); /* file_xxx.c */ -extern int squashfs_readpage_block(struct page *, u64, int, int); +int squashfs_readpage_block(struct folio *, u64 block, int bsize, int expected); /* id.c */ extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 22e812808e5c..67c55fe32ce8 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -314,26 +314,29 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_RDONLY; sb->s_op = &squashfs_super_ops; - err = -ENOMEM; - msblk->block_cache = squashfs_cache_init("metadata", SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); - if (msblk->block_cache == NULL) + if (IS_ERR(msblk->block_cache)) { + err = PTR_ERR(msblk->block_cache); goto failed_mount; + } /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", - msblk->max_thread_num, msblk->block_size); - if (msblk->read_page == NULL) { + SQUASHFS_READ_PAGES, msblk->block_size); + if (IS_ERR(msblk->read_page)) { errorf(fc, "Failed to allocate read_page block"); + err = PTR_ERR(msblk->read_page); goto failed_mount; } if (msblk->devblksize == PAGE_SIZE) { struct inode *cache = new_inode(sb); - if (cache == NULL) + if (cache == NULL) { + err = -ENOMEM; goto failed_mount; + } set_nlink(cache, 1); cache->i_size = OFFSET_MAX; @@ -405,9 +408,9 @@ handle_fragments: goto check_directory_table; msblk->fragment_cache = squashfs_cache_init("fragment", - SQUASHFS_CACHED_FRAGMENTS, msblk->block_size); - if (msblk->fragment_cache == NULL) { - err = -ENOMEM; + min(SQUASHFS_CACHED_FRAGMENTS, fragments), msblk->block_size); + if (IS_ERR(msblk->fragment_cache)) { + err = PTR_ERR(msblk->fragment_cache); goto failed_mount; } diff --git a/fs/super.c b/fs/super.c index c9c7223bc2a2..5a7db4a556e3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -647,7 +647,7 @@ void generic_shutdown_super(struct super_block *sb) */ fscrypt_destroy_keyring(sb); - if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), + if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), NULL, "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index aa63b8efd782..82db3ab0e8b4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -395,8 +395,8 @@ xfs_buf_alloc_pages( for (;;) { long last = filled; - filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count, - bp->b_pages); + filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, + 
bp->b_pages); if (filled == bp->b_page_count) { XFS_STATS_INC(bp->b_mount, xb_page_found); break; diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h index b5f594754a9e..99b960bd473c 100644 --- a/include/acpi/acpi_numa.h +++ b/include/acpi/acpi_numa.h @@ -17,11 +17,16 @@ extern int node_to_pxm(int); extern int acpi_map_pxm_to_node(int); extern unsigned char acpi_srat_revision; extern void disable_srat(void); +extern int fix_pxm_node_maps(int max_nid); extern void bad_srat(void); extern int srat_disabled(void); #else /* CONFIG_ACPI_NUMA */ +static inline int fix_pxm_node_maps(int max_nid) +{ + return 0; +} static inline void disable_srat(void) { } diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index 9d0479f50f97..5db59a1efb65 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -35,7 +35,7 @@ extern void early_ioremap_reset(void); /* * Early copy from unmapped memory to kernel mapped memory. */ -extern void copy_from_early_mem(void *dest, phys_addr_t src, +extern int copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size); #else diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 94cbd50cc870..02aeca21479a 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -6,6 +6,19 @@ #include #include +/* + * __percpu_qual is the qualifier for the percpu named address space. + * + * Most arches use generic named address space for percpu variables but + * some arches define percpu variables in different named address space + * (on the x86 arch, percpu variable may be declared as being relative + * to the %fs or %gs segments using __seg_fs or __seg_gs named address + * space qualifier). + */ +#ifndef __percpu_qual +# define __percpu_qual +#endif + #ifdef CONFIG_SMP /* @@ -74,7 +87,7 @@ do { \ #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ \ *__p += val; \ *__p; \ @@ -82,8 +95,8 @@ do { \ #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __ret; \ __ret = *__p; \ *__p = nval; \ __ret; \ @@ -91,7 +104,7 @@ do { \ #define __cpu_fallback_try_cmpxchg(pcp, ovalp, nval, _cmpxchg) \ ({ \ - typeof(pcp) __val, __old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) __val, __old = *(ovalp); \ __val = _cmpxchg(pcp, __old, nval); \ if (__val != __old) \ *(ovalp) = __val; \ @@ -100,8 +113,8 @@ do { \ #define raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __val = *__p, ___old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __val = *__p, ___old = *(ovalp); \ bool __ret; \ if (__val == ___old) { \ *__p = nval; \ @@ -115,14 +128,14 @@ do { \ #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __old = (oval); \ + TYPEOF_UNQUAL(pcp) __old = (oval); \ raw_cpu_generic_try_cmpxchg(pcp, &__old, nval); \ __old; \ }) #define __this_cpu_generic_read_nopreempt(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ preempt_disable_notrace(); \ ___ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \ preempt_enable_notrace(); \ @@ -131,7 +144,7 @@ do { \ #define __this_cpu_generic_read_noirq(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ unsigned long ___flags; \ raw_local_irq_save(___flags); \ ___ret = 
raw_cpu_generic_read(pcp); \ @@ -141,7 +154,7 @@ do { \ #define this_cpu_generic_read(pcp) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ if (__native_word(pcp)) \ __ret = __this_cpu_generic_read_nopreempt(pcp); \ else \ @@ -160,7 +173,7 @@ do { \ #define this_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_add_return(pcp, val); \ @@ -170,7 +183,7 @@ do { \ #define this_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_xchg(pcp, nval); \ @@ -190,7 +203,7 @@ do { \ #define this_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \ diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 7c48f5fbf8aa..892ece4558a2 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -109,8 +109,7 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); - pagetable_pte_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } @@ -153,8 +152,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - pagetable_pmd_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #endif @@ -202,8 +200,7 @@ static inline void __pud_free(struct mm_struct *mm, pud_t *pud) struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - pagetable_pud_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PUD_FREE @@ -215,10 +212,82 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) #endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#if CONFIG_PGTABLE_LEVELS > 4 + +static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + gfp &= ~__GFP_HIGHMEM; + + ptdesc = pagetable_alloc_noprof(gfp, 0); + if (!ptdesc) + return NULL; + + pagetable_p4d_ctor(ptdesc); + return ptdesc_address(ptdesc); +} +#define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) + +#ifndef __HAVE_ARCH_P4D_ALLOC_ONE +static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) +{ + return __p4d_alloc_one_noprof(mm, addr); +} +#define p4d_alloc_one(...) 
alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) +#endif + +static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(p4d); + + BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); + pagetable_dtor_free(ptdesc); +} + +#ifndef __HAVE_ARCH_P4D_FREE +static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + if (!mm_p4d_folded(mm)) + __p4d_free(mm, p4d); +} +#endif + +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ + +static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order) +{ + gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + gfp &= ~__GFP_HIGHMEM; + + ptdesc = pagetable_alloc_noprof(gfp, order); + if (!ptdesc) + return NULL; + + pagetable_pgd_ctor(ptdesc); + return ptdesc_address(ptdesc); +} +#define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) + +static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); + + BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); + pagetable_dtor_free(ptdesc); +} + #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - pagetable_free(virt_to_ptdesc(pgd)); + __pgd_free(mm, pgd); } #endif diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 5a80fe728dc8..182b039ce5fa 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. * * This file is a stub providing documentation for what functions - * asm-ARCH/syscall.h files need to define. Most arch definitions + * arch/ARCH/include/asm/syscall.h files need to define. Most arch definitions * will be simple inlines. * * All of these functions expect to be called with no locks, diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 709830274b75..e402aef79c93 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -153,8 +153,9 @@ * * Useful if your architecture has non-page page directories. * - * When used, an architecture is expected to provide __tlb_remove_table() - * which does the actual freeing of these pages. + * When used, an architecture is expected to provide __tlb_remove_table() or + * use the generic __tlb_remove_table(), which does the actual freeing of these + * pages. * * MMU_GATHER_RCU_TABLE_FREE * @@ -207,16 +208,31 @@ struct mmu_table_batch { #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) +#ifndef __HAVE_ARCH_TLB_REMOVE_TABLE +static inline void __tlb_remove_table(void *table) +{ + struct ptdesc *ptdesc = (struct ptdesc *)table; + + pagetable_dtor_free(ptdesc); +} +#endif + extern void tlb_remove_table(struct mmu_gather *tlb, void *table); -#else /* !CONFIG_MMU_GATHER_HAVE_TABLE_FREE */ +#else /* !CONFIG_MMU_GATHER_TABLE_FREE */ +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. 
*/ -#define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page)) +static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct page *page = (struct page *)table; + pagetable_dtor(page_ptdesc(page)); + tlb_remove_page(tlb, page); +} #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 0bbbe537c5f9..a946e0203e6d 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -224,9 +224,14 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} #define alloc_hooks_tag(_tag, _do_alloc) \ ({ \ - struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag); \ - typeof(_do_alloc) _res = _do_alloc; \ - alloc_tag_restore(_tag, _old); \ + typeof(_do_alloc) _res; \ + if (mem_alloc_profiling_enabled()) { \ + struct alloc_tag * __maybe_unused _old; \ + _old = alloc_tag_save(_tag); \ + _res = _do_alloc; \ + alloc_tag_restore(_tag, _old); \ + } else \ + _res = _do_alloc; \ _res; \ }) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 262b6596eca5..2026953e2c4e 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -23,7 +23,7 @@ struct device; * * Function implementations generic to all architectures are in * lib/bitmap.c. Functions implementations that are architecture - * specific are in various include/asm-/bitops.h headers + * specific are in various arch//include/asm/bitops.h headers * and other arch/ specific files. * * See lib/bitmap.c for more details. diff --git a/include/linux/bug.h b/include/linux/bug.h index 348acf2558f3..a9948a9f1093 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -73,15 +73,23 @@ static inline void generic_bug_clear_once(void) {} #endif /* CONFIG_GENERIC_BUG */ +#ifdef CONFIG_PRINTK +void mem_dump_obj(void *object); +#else +static inline void mem_dump_obj(void *object) {} +#endif + /* * Since detected data corruption should stop operation on the affected * structures. Return value must be checked and sanely acted on by caller. */ static inline __must_check bool check_data_corruption(bool v) { return v; } -#define CHECK_DATA_CORRUPTION(condition, fmt, ...) \ +#define CHECK_DATA_CORRUPTION(condition, addr, fmt, ...) \ check_data_corruption(({ \ bool corruption = unlikely(condition); \ if (corruption) { \ + if (addr) \ + mem_dump_obj(addr); \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ pr_err(fmt, ##__VA_ARGS__); \ BUG(); \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 240c632c5b95..567a7af257d1 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -336,6 +336,19 @@ static inline void *offset_to_ptr(const int *off) */ #define prevent_tail_call_optimization() mb() +/* + * Define TYPEOF_UNQUAL() to use __typeof_unqual__() as typeof + * operator when available, to return unqualified type of the exp. + * + * XXX: Remove test for __CHECKER__ once + * sparse learns about __typeof_unqual__. 
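TYPEOF_UNQUAL(), defined just below, strips qualifiers such as const, volatile and the per-CPU address space from the deduced type, which is why the this_cpu_generic_*() accessors earlier in this patch switch to it for their local temporaries. A small illustrative sketch (when the compiler lacks __typeof_unqual__, the fallback keeps the qualifier):

static int demo_typeof_unqual(void)
{
        volatile int counter = 42;

        __typeof__(counter) t1 = counter;       /* t1 is 'volatile int' */
        TYPEOF_UNQUAL(counter) t2 = counter;    /* t2 is plain 'int' */

        return t1 + t2;
}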
+ */ +#if defined(CONFIG_CC_HAS_TYPEOF_UNQUAL) && !defined(__CHECKER__) +# define TYPEOF_UNQUAL(exp) __typeof_unqual__(exp) +#else +# define TYPEOF_UNQUAL(exp) __typeof__(exp) +#endif + #include #endif /* __LINUX_COMPILER_H */ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 981cc3d7e3aa..5d6544545658 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -57,7 +57,7 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __user BTF_TYPE_TAG(user) # endif # define __iomem -# define __percpu BTF_TYPE_TAG(percpu) +# define __percpu __percpu_qual BTF_TYPE_TAG(percpu) # define __rcu BTF_TYPE_TAG(rcu) # define __chk_user_ptr(x) (void)0 diff --git a/include/linux/damon.h b/include/linux/damon.h index a67f2c4940e9..af525252b853 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -193,11 +193,16 @@ struct damos_quota_goal { * size quota is set, DAMON tries to apply the action only up to &sz bytes * within &reset_interval. * - * Internally, the time quota is transformed to a size quota using estimated - * throughput of the scheme's action. DAMON then compares it against &sz and - * uses smaller one as the effective quota. + * To convince the different types of quotas and goals, DAMON internally + * converts those into one single size quota called "effective quota". DAMON + * internally uses it as the only one real quota. The conversion is made as + * follows. * - * If @goals is not empt, DAMON calculates yet another size quota based on the + * The time quota is transformed to a size quota using estimated throughput of + * the scheme's action. DAMON then compares it against &sz and uses smaller + * one as the effective quota. + * + * If @goals is not empty, DAMON calculates yet another size quota based on the * goals using its internal feedback loop algorithm, for every @reset_interval. * Then, if the new size quota is smaller than the effective quota, it uses the * new size quota as the effective quota. @@ -286,13 +291,33 @@ struct damos_watermarks { * @sz_tried: Total size of regions that the scheme is tried to be applied. * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. + * @sz_ops_filter_passed: + * Total bytes that passed ops layer-handled DAMOS filters. * @qt_exceeds: Total number of times the quota of the scheme has exceeded. + * + * "Tried an action to a region" in this context means the DAMOS core logic + * determined the region as eligible to apply the action. The access pattern + * (&struct damos_access_pattern), quotas (&struct damos_quota), watermarks + * (&struct damos_watermarks) and filters (&struct damos_filter) that handled + * on core logic can affect this. The core logic asks the operation set + * (&struct damon_operations) to apply the action to the region. + * + * "Applied an action to a region" in this context means the operation set + * (&struct damon_operations) successfully applied the action to the region, at + * least to a part of the region. The filters (&struct damos_filter) that + * handled on operation set layer and type of the action and pages of the + * region can affect this. For example, if a filter is set to exclude + * anonymous pages and the region has only anonymous pages, the region will be + * failed at applying the action. If the action is &DAMOS_PAGEOUT and all + * pages of the region are already paged out, the region will be failed at + * applying the action. 
*/ struct damos_stat { unsigned long nr_tried; unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; }; @@ -327,8 +352,9 @@ enum damos_filter_type { /** * struct damos_filter - DAMOS action target memory filter. - * @type: Type of the page. - * @matching: If the matching page should filtered out or in. + * @type: Type of the target memory. + * @matching: Whether this is for @type-matching memory. + * @allow: Whether to include or exclude the @matching memory. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. * @target_idx: Index of the &struct damon_target of @@ -337,13 +363,15 @@ enum damos_filter_type { * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each - * page of the region matches to this and avoid applying the action if so. - * Support of each filter type depends on the running &struct damon_operations - * and the type. Refer to &enum damos_filter_type for more detai. + * byte of the region matches to this given condition and avoid applying the + * action if so. Support of each filter type depends on the running &struct + * damon_operations and the type. Refer to &enum damos_filter_type for more + * details. */ struct damos_filter { enum damos_filter_type type; bool matching; + bool allow; union { unsigned short memcg_id; struct damon_addr_range addr_range; @@ -352,6 +380,31 @@ struct damos_filter { struct list_head list; }; +struct damon_ctx; +struct damos; + +/** + * struct damos_walk_control - Control damos_walk(). + * + * @walk_fn: Function to be called back for each region. + * @data: Data that will be passed to walk functions. + * + * Control damos_walk(), which requests specific kdamond to invoke the given + * function to each region that eligible to apply actions of the kdamond's + * schemes. Refer to damos_walk() for more details. + */ +struct damos_walk_control { + void (*walk_fn)(void *data, struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s, unsigned long sz_filter_passed); + void *data; +/* private: internal use only */ + /* informs if the kdamond finished handling of the walk request */ + struct completion completion; + /* informs if the walk is canceled. */ + bool canceled; +}; + /** * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. @@ -415,6 +468,8 @@ struct damos { * @action */ unsigned long next_apply_sis; + /* informs if ongoing DAMOS walk for this scheme is finished */ + bool walk_completed; /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; @@ -442,8 +497,6 @@ enum damon_ops_id { NR_DAMON_OPS, }; -struct damon_ctx; - /** * struct damon_operations - Monitoring operations for given use cases. * @@ -487,7 +540,8 @@ struct damon_ctx; * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action * to the region and return bytes of the region that the action is successfully - * applied. + * applied. It should also report how many bytes of the region has passed + * filters (&struct damos_filter) that handled by itself. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. 
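A rough usage sketch for the new @allow semantics documented above, assuming the three-argument damos_new_filter() prototype updated later in this patch and the DAMOS_FILTER_TYPE_ANON filter type (not shown in this hunk): matching=true selects anonymous pages and allow=false rejects them.

static int demo_add_anon_reject_filter(struct damos *scheme)
{
        struct damos_filter *filter;

        /* exclude anonymous pages from the scheme's action */
        filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
        if (!filter)
                return -ENOMEM;
        damos_add_filter(scheme, filter);
        return 0;
}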
@@ -504,7 +558,7 @@ struct damon_operations { struct damos *scheme); unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, - struct damos *scheme); + struct damos *scheme, unsigned long *sz_filter_passed); bool (*target_valid)(struct damon_target *t); void (*cleanup)(struct damon_ctx *context); }; @@ -552,6 +606,27 @@ struct damon_callback { void (*before_terminate)(struct damon_ctx *context); }; +/* + * struct damon_call_control - Control damon_call(). + * + * @fn: Function to be called back. + * @data: Data that will be passed to @fn. + * @return_code: Return code from @fn invocation. + * + * Control damon_call(), which requests specific kdamond to invoke a given + * function. Refer to damon_call() for more details. + */ +struct damon_call_control { + int (*fn)(void *data); + void *data; + int return_code; +/* private: internal use only */ + /* informs if the kdamond finished handling of the request */ + struct completion completion; + /* informs if the kdamond canceled @fn infocation */ + bool canceled; +}; + /** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * @@ -632,6 +707,12 @@ struct damon_ctx { /* for scheme quotas prioritization */ unsigned long *regions_score_histogram; + struct damon_call_control *call_control; + struct mutex call_control_lock; + + struct damos_walk_control *walk_control; + struct mutex walk_control_lock; + /* public: */ struct task_struct *kdamond; struct mutex kdamond_lock; @@ -725,7 +806,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); struct damos_filter *damos_new_filter(enum damos_filter_type type, - bool matching); + bool matching, bool allow); void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); @@ -779,6 +860,9 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); +int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); +int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); + int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end); diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 6639f48dac36..800dcc360db2 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -29,25 +29,39 @@ struct task_delay_info { * XXX_delay contains the accumulated delay time in nanoseconds. 
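The damon_call_control structure and damon_call() declared above let a caller have a function executed from the kdamond's own context; a minimal sketch with a hypothetical callback:

/* Hypothetical callback, run by the kdamond. */
static int demo_count_invocations(void *data)
{
        unsigned int *nr = data;

        (*nr)++;
        return 0;
}

static int demo_damon_call(struct damon_ctx *ctx)
{
        unsigned int nr = 0;
        struct damon_call_control control = {
                .fn = demo_count_invocations,
                .data = &nr,
        };
        int err = damon_call(ctx, &control);

        return err ? err : control.return_code;
}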
*/ u64 blkio_start; + u64 blkio_delay_max; + u64 blkio_delay_min; u64 blkio_delay; /* wait for sync block io completion */ u64 swapin_start; + u64 swapin_delay_max; + u64 swapin_delay_min; u64 swapin_delay; /* wait for swapin */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ u32 swapin_count; /* total count of swapin */ u64 freepages_start; + u64 freepages_delay_max; + u64 freepages_delay_min; u64 freepages_delay; /* wait for memory reclaim */ u64 thrashing_start; + u64 thrashing_delay_max; + u64 thrashing_delay_min; u64 thrashing_delay; /* wait for thrashing page */ u64 compact_start; + u64 compact_delay_max; + u64 compact_delay_min; u64 compact_delay; /* wait for memory compact */ u64 wpcopy_start; + u64 wpcopy_delay_max; + u64 wpcopy_delay_min; u64 wpcopy_delay; /* wait for write-protect copy */ + u64 irq_delay_max; + u64 irq_delay_min; u64 irq_delay; /* wait for IRQ/SOFTIRQ */ u32 freepages_count; /* total count of memory reclaim */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..58a618853574 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -322,6 +322,7 @@ struct readahead_control; #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -356,7 +357,8 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ - { IOCB_ATOMIC, "ATOMIC"}, \ + { IOCB_ATOMIC, "ATOMIC" }, \ + { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -2127,6 +2129,8 @@ struct file_operations { #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) +/* File system supports uncached read/write buffered IO */ +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, @@ -2874,6 +2878,8 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end); static inline int file_write_and_wait(struct file *file) { @@ -2906,6 +2912,11 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) (iocb->ki_flags & IOCB_SYNC) ? 
0 : 1); if (ret) return ret; + } else if (iocb->ki_flags & IOCB_DONTCACHE) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, + iocb->ki_pos + count); } return count; @@ -3614,6 +3625,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b0fe9f62d15b..6bb1a5a7a4ae 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -212,35 +212,31 @@ struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) -unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); -#define alloc_pages_bulk_array_mempolicy(...) \ - alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_mempolicy(...) \ + alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ -#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \ - __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL) - -#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ - __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array) +#define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \ + __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long -alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, +alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array); + return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } -#define alloc_pages_bulk_array_node(...) \ - alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_node(...) 
\ + alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__)) static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { @@ -300,8 +296,6 @@ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); -struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); @@ -312,11 +306,6 @@ static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order { return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } -static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid) -{ - return alloc_pages_noprof(gfp, order); -} static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node_noprof(gfp, order, numa_node_id()); @@ -331,7 +320,6 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) -#define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) #define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b94c2e8ee918..93e509b6c00e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -121,6 +121,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_ZSWPOUT, MTHP_STAT_SWPIN, + MTHP_STAT_SWPIN_FALLBACK, + MTHP_STAT_SWPIN_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ae4fe8615bb6..ec8c0ccc8f95 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -153,11 +153,11 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -bool isolate_hugetlb(struct folio *folio, struct list_head *list); +bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void folio_putback_active_hugetlb(struct folio *folio); +void folio_putback_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -414,7 +414,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) +static inline bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { return false; } @@ -430,7 +430,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void folio_putback_active_hugetlb(struct folio *folio) +static inline void folio_putback_hugetlb(struct folio *folio) { } @@ -681,8 +681,9 @@ struct 
huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); +int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); + unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); @@ -1059,9 +1060,15 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } +static inline int replace_free_hugepage_folios(unsigned long start_pfn, + unsigned long end_pfn) +{ + return 0; +} + static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, - int avoid_reserve) + bool cow_from_owner) { return NULL; } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6bbfc8aa42e8..890011071f2b 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -153,7 +153,7 @@ static __always_inline void kasan_unpoison_new_object(struct kmem_cache *cache, void __kasan_poison_new_object(struct kmem_cache *cache, void *object); /** - * kasan_unpoison_new_object - Repoison a new slab object. + * kasan_poison_new_object - Repoison a new slab object. * @cache: Cache the object belong to. * @object: Pointer to the object. * @@ -491,7 +491,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); -void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ @@ -509,7 +508,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache, static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} -static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 6a53ac4885bb..d73095b5cd96 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -93,6 +93,7 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); +bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 05c166811f6b..fe739d35a864 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -91,13 +91,24 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * @memcg: the cgroup of the sublist to add the item to. * * If the element is already part of a list, this function returns doing - * nothing. Therefore the caller does not need to keep state about whether or - * not the element already belongs in the list and is allowed to lazy update - * it. Note however that this is valid for *a* list, not *this* list. If - * the caller organize itself in a way that elements can be in more than - * one type of list, it is up to the caller to fully remove the item from - * the previous list (with list_lru_del() for instance) before moving it - * to @lru. + * nothing. This means that it is not necessary to keep state about whether or + * not the element already belongs in the list. 
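The isolate_hugetlb() -> folio_isolate_hugetlb() and folio_putback_active_hugetlb() -> folio_putback_hugetlb() renames earlier in this hunk follow the usual folio naming; a caller collecting hugetlb folios for migration would now look roughly like this sketch (error handling abbreviated):

static bool demo_isolate_for_migration(struct folio *folio,
                                       struct list_head *pagelist)
{
        if (!folio_test_hugetlb(folio))
                return false;
        if (!folio_isolate_hugetlb(folio, pagelist))
                return false;
        /* a later failure path would undo this with folio_putback_hugetlb(folio) */
        return true;
}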
That said, this logic only + * works if the item is in *this* list. If the item might be in some other + * list, then you cannot rely on this check and you must remove it from the + * other list before trying to insert it. + * + * The lru list consists of many sublists internally; the @nid and @memcg + * parameters are used to determine which sublist to insert the item into. + * It's important to use the right value of @nid and @memcg when deleting the + * item, since it might otherwise get deleted from the wrong sublist. + * + * This also applies when attempting to insert the item multiple times - if + * the item is currently in one sublist and you call list_lru_add() again, you + * must pass the right @nid and @memcg parameters so that the same sublist is + * used. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). * * Return: true if the list was updated, false otherwise */ @@ -113,7 +124,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); @@ -125,8 +136,19 @@ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); * @memcg: the cgroup of the sublist to delete the item from. * * This function works analogously as list_lru_add() in terms of list - * manipulation. The comments about an element already pertaining to - * a list are also valid for list_lru_del(). + * manipulation. + * + * The comments in list_lru_add() about an element already being in a list are + * also valid for list_lru_del(), that is, you can delete an item that has + * already been removed or never been added. However, if the item is in a + * list, it must be in *this* list, and you must pass the right value of @nid + * and @memcg so that the right sublist is used. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). When a memcg is deleted, list_lru entries + * are automatically moved to the parent memcg. This is done in a race-free + * way, so during deletion of an memcg both the old and new memcg will resolve + * to the same sublist internally. * * Return: true if the list was updated, false otherwise */ @@ -142,7 +164,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * - * Return value: true if the list was updated, false otherwise. + * Return: true if the list was updated, false otherwise. */ bool list_lru_del_obj(struct list_lru *lru, struct list_head *item); diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 673d5cae7c81..e79eb6ac516f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -378,6 +378,10 @@ static inline int memblock_get_region_node(const struct memblock_region *r) /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 +/* + * MEMBLOCK_ALLOC_NOLEAKTRACE avoids kmemleak tracing. 
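The expanded list_lru_add()/list_lru_del() comments above stress two points: the same @nid/@memcg pair must be used at insertion and deletion time, and the memcg must be kept alive across the call. Stored per object, that discipline looks roughly like the following sketch (the struct and helpers are hypothetical):

/* Hypothetical object that remembers which sublist it was added to. */
struct demo_obj {
        struct list_head lru;
        int nid;
        struct mem_cgroup *memcg;       /* caller holds a css reference */
};

static void demo_obj_add(struct list_lru *lru, struct demo_obj *obj)
{
        list_lru_add(lru, &obj->lru, obj->nid, obj->memcg);
}

static void demo_obj_del(struct list_lru *lru, struct demo_obj *obj)
{
        /* must match the nid/memcg used at insertion time */
        list_lru_del(lru, &obj->lru, obj->nid, obj->memcg);
}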
It implies + * MEMBLOCK_ALLOC_ACCESSIBLE + */ #define MEMBLOCK_ALLOC_NOLEAKTRACE 1 /* We are using top down, so it is safe to use 0 here */ @@ -417,6 +421,12 @@ static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align) MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } +void *__memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func); + +#define memblock_alloc_or_panic(size, align) \ + __memblock_alloc_or_panic(size, align, __func__) + static inline void *memblock_alloc_raw(phys_addr_t size, phys_addr_t align) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5502aa8e138e..6e74b8254d9b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -620,8 +620,6 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, page_counter_read(&memcg->memory); } -void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); - int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** @@ -646,8 +644,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } -int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, - long nr_pages); +int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); @@ -677,7 +674,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) __mem_cgroup_uncharge_folios(folios); } -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); @@ -1046,6 +1042,23 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, int old_order, int new_order); +static inline u64 cgroup_id_from_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + u64 id; + + if (mem_cgroup_disabled()) + return 0; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (!memcg) + memcg = root_mem_cgroup; + id = cgroup_id(memcg->css.cgroup); + rcu_read_unlock(); + return id; +} + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1135,21 +1148,15 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, return false; } -static inline void mem_cgroup_commit_charge(struct folio *folio, - struct mem_cgroup *memcg) -{ -} - static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { return 0; } -static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, - gfp_t gfp, long nr_pages) +static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp) { - return 0; + return 0; } static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, @@ -1170,11 +1177,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } -static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) -{ -} - static inline void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { @@ -1466,6 +1468,11 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } + +static inline u64 cgroup_id_from_mm(struct mm_struct *mm) +{ + return 0; +} #endif /* CONFIG_MEMCG */ /* diff --git a/include/linux/memfd.h 
b/include/linux/memfd.h index d437e3070850..246daadbfde8 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,7 +7,14 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); -unsigned int *memfd_file_seals_ptr(struct file *file); +/* + * Check for any existing seals on mmap, return an error if access is denied due + * to sealing, or 0 otherwise. + * + * We also update VMA flags if appropriate by manipulating the VMA flags pointed + * to by vm_flags_ptr. + */ +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -17,19 +24,11 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } - -static inline unsigned int *memfd_file_seals_ptr(struct file *file) +static inline int memfd_check_seals_mmap(struct file *file, + unsigned long *vm_flags_ptr) { - return NULL; + return 0; } #endif -/* Retrieve memfd seals associated with the file, if any. */ -static inline unsigned int memfd_file_seals(struct file *file) -{ - unsigned int *sealsp = memfd_file_seals_ptr(file); - - return sealsp ? *sealsp : 0; -} - #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index b27ddce5d324..eaac5ae8c05c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -144,8 +144,6 @@ extern u64 max_mem_size; extern int mhp_online_type_from_str(const char *str); -/* Default online_type (MMOP_*) when new memory blocks are added. */ -extern int mhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) @@ -303,6 +301,9 @@ static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_MEMORY_HOTPLUG +/* Default online_type (MMOP_*) when new memory blocks are added. 
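memfd_check_seals_mmap(), which replaces memfd_file_seals_ptr()/memfd_file_seals() above, folds the seal check and the VMA-flag downgrade into one call. A simplified sketch of how a filesystem's ->mmap() handler might consult it (the demo_fs_mmap name is hypothetical, and writing the updated flags back to the VMA is left out):

static int demo_fs_mmap(struct file *file, struct vm_area_struct *vma)
{
        unsigned long vm_flags = vma->vm_flags;
        int err;

        err = memfd_check_seals_mmap(file, &vm_flags);
        if (err)
                return err;     /* e.g. sealed against writable mappings */

        /*
         * memfd_check_seals_mmap() may have cleared VM_MAYWRITE for a
         * write-sealed memfd mapped read-only; the caller then applies
         * the updated vm_flags to the VMA.
         */
        return 0;
}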
*/ +extern int mhp_get_default_online_type(void); +extern void mhp_set_default_online_type(int online_type); extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 002e49b2ebd9..29919faea2f1 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -144,16 +144,14 @@ const struct movable_operations *page_movable_ops(struct page *page) #ifdef CONFIG_NUMA_BALANCING int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node); -int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, - int node); +int migrate_misplaced_folio(struct folio *folio, int node); #else static inline int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node) { return -EAGAIN; /* can't migrate now */ } -static inline int migrate_misplaced_folio(struct folio *folio, - struct vm_area_struct *vma, int node) +static inline int migrate_misplaced_folio(struct folio *folio, int node) { return -EAGAIN; /* can't migrate now */ } diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index e781727c8916..55bfe670bbb9 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -6,6 +6,17 @@ #include #include +/* + * The Min Heap API provides utilities for managing min-heaps, a binary tree + * structure where each node's value is less than or equal to its children's + * values, ensuring the smallest element is at the root. + * + * Users should avoid directly calling functions prefixed with __min_heap_*(). + * Instead, use the provided macro wrappers. + * + * For further details and examples, refer to Documentation/core-api/min_heap.rst. + */ + /** * Data structure to hold a min-heap. * @nr: Number of elements currently in the heap. @@ -218,7 +229,7 @@ void __min_heap_init_inline(min_heap_char *heap, void *data, int size) } #define min_heap_init_inline(_heap, _data, _size) \ - __min_heap_init_inline((min_heap_char *)_heap, _data, _size) + __min_heap_init_inline(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size) /* Get the minimum element from the heap. */ static __always_inline @@ -228,7 +239,8 @@ void *__min_heap_peek_inline(struct min_heap_char *heap) } #define min_heap_peek_inline(_heap) \ - (__minheap_cast(_heap) __min_heap_peek_inline((min_heap_char *)_heap)) + (__minheap_cast(_heap) \ + __min_heap_peek_inline(container_of(&(_heap)->nr, min_heap_char, nr))) /* Check if the heap is full. */ static __always_inline @@ -238,7 +250,7 @@ bool __min_heap_full_inline(min_heap_char *heap) } #define min_heap_full_inline(_heap) \ - __min_heap_full_inline((min_heap_char *)_heap) + __min_heap_full_inline(container_of(&(_heap)->nr, min_heap_char, nr)) /* Sift the element at pos down the heap. */ static __always_inline @@ -277,8 +289,8 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, } #define min_heap_sift_down_inline(_heap, _pos, _func, _args) \ - __min_heap_sift_down_inline((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_sift_down_inline(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \ + __minheap_obj_size(_heap), _func, _args) /* Sift up ith element from the heap, O(log2(nr)). 
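With the mhp_default_online_type variable no longer exported, callers go through the accessor pair declared above; a small sketch, assuming the MMOP_* online-type constants that are not part of this hunk:

static void demo_force_online_movable(void)
{
        if (mhp_get_default_online_type() == MMOP_OFFLINE)
                mhp_set_default_online_type(MMOP_ONLINE_MOVABLE);
}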
*/ static __always_inline @@ -304,8 +316,8 @@ void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx } #define min_heap_sift_up_inline(_heap, _idx, _func, _args) \ - __min_heap_sift_up_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ - _func, _args) + __min_heap_sift_up_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) /* Floyd's approach to heapification that is O(nr). */ static __always_inline @@ -319,7 +331,8 @@ void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size, } #define min_heapify_all_inline(_heap, _func, _args) \ - __min_heapify_all_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heapify_all_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline @@ -340,7 +353,8 @@ bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size, } #define min_heap_pop_inline(_heap, _func, _args) \ - __min_heap_pop_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heap_pop_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) /* * Remove the minimum element and then push the given element. The @@ -356,8 +370,8 @@ void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t } #define min_heap_pop_push_inline(_heap, _element, _func, _args) \ - __min_heap_pop_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_pop_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) /* Push an element on to the heap, O(log2(nr)). */ static __always_inline @@ -382,8 +396,8 @@ bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t ele } #define min_heap_push_inline(_heap, _element, _func, _args) \ - __min_heap_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) /* Remove ith element from the heap, O(log2(nr)). 
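The min-heap wrappers reworked in this hunk keep their calling convention while routing through container_of() internally, so existing usage stays the same shape. A self-contained sketch, assuming the built-in swap that is selected when .swp is NULL (names are illustrative):

DEFINE_MIN_HEAP(int, demo_int_heap);    /* declares struct demo_int_heap */

static __always_inline bool demo_less(const void *lhs, const void *rhs, void *args)
{
        return *(const int *)lhs < *(const int *)rhs;
}

static const struct min_heap_callbacks demo_cb = {
        .less = demo_less,
        .swp  = NULL,                   /* use the default swap */
};

/* Heapify an already-filled array in place and return its smallest element. */
static int demo_heap_min(int *vals, int nr)
{
        struct demo_int_heap heap;

        min_heap_init(&heap, vals, nr); /* vals[] becomes the backing storage */
        heap.nr = nr;                   /* elements are already in place */
        min_heapify_all(&heap, &demo_cb, NULL);

        return *min_heap_peek(&heap);   /* assumes nr > 0 */
}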
*/ static __always_inline @@ -411,8 +425,8 @@ bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, } #define min_heap_del_inline(_heap, _idx, _func, _args) \ - __min_heap_del_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ - _func, _args) + __min_heap_del_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) void __min_heap_init(min_heap_char *heap, void *data, int size); void *__min_heap_peek(struct min_heap_char *heap); @@ -433,25 +447,31 @@ bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, const struct min_heap_callbacks *func, void *args); #define min_heap_init(_heap, _data, _size) \ - __min_heap_init((min_heap_char *)_heap, _data, _size) + __min_heap_init(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size) #define min_heap_peek(_heap) \ - (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) + (__minheap_cast(_heap) __min_heap_peek(container_of(&(_heap)->nr, min_heap_char, nr))) #define min_heap_full(_heap) \ - __min_heap_full((min_heap_char *)_heap) + __min_heap_full(container_of(&(_heap)->nr, min_heap_char, nr)) #define min_heap_sift_down(_heap, _pos, _func, _args) \ - __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) + __min_heap_sift_down(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_sift_up(_heap, _idx, _func, _args) \ - __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + __min_heap_sift_up(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) #define min_heapify_all(_heap, _func, _args) \ - __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heapify_all(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_pop(_heap, _func, _args) \ - __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heap_pop(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_pop_push(_heap, _element, _func, _args) \ - __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_pop_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_push(_heap, _element, _func, _args) \ - __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) + __min_heap_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_del(_heap, _idx, _func, _args) \ - __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + __min_heap_del(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) #endif /* _LINUX_MIN_HEAP_H */ diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 98008dd92153..eaaf5c008e4d 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -8,13 +8,10 @@ #include /* - * min()/max()/clamp() macros must accomplish three things: + * min()/max()/clamp() macros must accomplish several things: * * - Avoid multiple evaluations of the arguments (so side-effects like * "x++" happen only once) when non-constant. 
- * - Retain result as a constant expressions when called with only - * constant expressions (to avoid tripping VLA warnings in stack - * allocation usage). * - Perform signed v unsigned type-checking (to generate compile * errors instead of nasty runtime surprises). * - Unsigned char/short are always promoted to signed int and can be @@ -31,58 +28,54 @@ * bit #0 set if ok for unsigned comparisons * bit #1 set if ok for signed comparisons * - * In particular, statically non-negative signed integer - * expressions are ok for both. + * In particular, statically non-negative signed integer expressions + * are ok for both. * - * NOTE! Unsigned types smaller than 'int' are implicitly - * converted to 'int' in expressions, and are accepted for - * signed conversions for now. This is debatable. + * NOTE! Unsigned types smaller than 'int' are implicitly converted to 'int' + * in expressions, and are accepted for signed conversions for now. + * This is debatable. * - * Note that 'x' is the original expression, and 'ux' is - * the unique variable that contains the value. + * Note that 'x' is the original expression, and 'ux' is the unique variable + * that contains the value. * - * We use 'ux' for pure type checking, and 'x' for when - * we need to look at the value (but without evaluating - * it for side effects! Careful to only ever evaluate it - * with sizeof() or __builtin_constant_p() etc). + * We use 'ux' for pure type checking, and 'x' for when we need to look at the + * value (but without evaluating it for side effects! + * Careful to only ever evaluate it with sizeof() or __builtin_constant_p() etc). * - * Pointers end up being checked by the normal C type - * rules at the actual comparison, and these expressions - * only need to be careful to not cause warnings for - * pointer use. + * Pointers end up being checked by the normal C type rules at the actual + * comparison, and these expressions only need to be careful to not cause + * warnings for pointer use. */ -#define __signed_type_use(x,ux) (2+__is_nonneg(x,ux)) -#define __unsigned_type_use(x,ux) (1+2*(sizeof(ux)<4)) -#define __sign_use(x,ux) (is_signed_type(typeof(ux))? \ - __signed_type_use(x,ux):__unsigned_type_use(x,ux)) +#define __sign_use(ux) (is_signed_type(typeof(ux)) ? \ + (2 + __is_nonneg(ux)) : (1 + 2 * (sizeof(ux) < 4))) /* - * To avoid warnings about casting pointers to integers - * of different sizes, we need that special sign type. + * Check whether a signed value is always non-negative. * - * On 64-bit we can just always use 'long', since any - * integer or pointer type can just be cast to that. + * A cast is needed to avoid any warnings from values that aren't signed + * integer types (in which case the result doesn't matter). * - * This does not work for 128-bit signed integers since - * the cast would truncate them, but we do not use s128 - * types in the kernel (we do use 'u128', but they will - * be handled by the !is_signed_type() case). + * On 64-bit any integer or pointer type can safely be cast to 'long long'. + * But on 32-bit we need to avoid warnings about casting pointers to integers + * of different sizes without truncating 64-bit values so 'long' or 'long long' + * must be used depending on the size of the value. * - * NOTE! The cast is there only to avoid any warnings - * from when values that aren't signed integer types. 
+ * This does not work for 128-bit signed integers since the cast would truncate + * them, but we do not use s128 types in the kernel (we do use 'u128', + * but they are handled by the !is_signed_type() case). */ -#ifdef CONFIG_64BIT - #define __signed_type(ux) long +#if __SIZEOF_POINTER__ == __SIZEOF_LONG_LONG__ +#define __is_nonneg(ux) statically_true((long long)(ux) >= 0) #else - #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux)>4,1LL,1L)) +#define __is_nonneg(ux) statically_true( \ + (typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL, 1L)))(ux) >= 0) #endif -#define __is_nonneg(x,ux) statically_true((__signed_type(ux))(x)>=0) -#define __types_ok(x,y,ux,uy) \ - (__sign_use(x,ux) & __sign_use(y,uy)) +#define __types_ok(ux, uy) \ + (__sign_use(ux) & __sign_use(uy)) -#define __types_ok3(x,y,z,ux,uy,uz) \ - (__sign_use(x,ux) & __sign_use(y,uy) & __sign_use(z,uz)) +#define __types_ok3(ux, uy, uz) \ + (__sign_use(ux) & __sign_use(uy) & __sign_use(uz)) #define __cmp_op_min < #define __cmp_op_max > @@ -97,30 +90,13 @@ #define __careful_cmp_once(op, x, y, ux, uy) ({ \ __auto_type ux = (x); __auto_type uy = (y); \ - BUILD_BUG_ON_MSG(!__types_ok(x,y,ux,uy), \ + BUILD_BUG_ON_MSG(!__types_ok(ux, uy), \ #op"("#x", "#y") signedness error"); \ __cmp(op, ux, uy); }) #define __careful_cmp(op, x, y) \ __careful_cmp_once(op, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) -#define __clamp(val, lo, hi) \ - ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) - -#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({ \ - __auto_type uval = (val); \ - __auto_type ulo = (lo); \ - __auto_type uhi = (hi); \ - static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ - (lo) <= (hi), true), \ - "clamp() low limit " #lo " greater than high limit " #hi); \ - BUILD_BUG_ON_MSG(!__types_ok3(val,lo,hi,uval,ulo,uhi), \ - "clamp("#val", "#lo", "#hi") signedness error"); \ - __clamp(uval, ulo, uhi); }) - -#define __careful_clamp(val, lo, hi) \ - __clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) - /** * min - return minimum of two values of the same or compatible types * @x: first value @@ -154,7 +130,7 @@ #define __careful_op3(op, x, y, z, ux, uy, uz) ({ \ __auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\ - BUILD_BUG_ON_MSG(!__types_ok3(x,y,z,ux,uy,uz), \ + BUILD_BUG_ON_MSG(!__types_ok3(ux, uy, uz), \ #op"3("#x", "#y", "#z") signedness error"); \ __cmp(op, ux, __cmp(op, uy, uz)); }) @@ -176,34 +152,6 @@ #define max3(x, y, z) \ __careful_op3(max, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_)) -/** - * min_not_zero - return the minimum that is _not_ zero, unless both are zero - * @x: value1 - * @y: value2 - */ -#define min_not_zero(x, y) ({ \ - typeof(x) __x = (x); \ - typeof(y) __y = (y); \ - __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) - -/** - * clamp - return a value clamped to a given range with strict typechecking - * @val: current value - * @lo: lowest allowable value - * @hi: highest allowable value - * - * This macro does strict typechecking of @lo/@hi to make sure they are of the - * same type as @val. See the unnecessary pointer comparisons. - */ -#define clamp(val, lo, hi) __careful_clamp(val, lo, hi) - -/* - * ..and if you can't take the strict - * types, you can specify one yourself. - * - * Or not use min/max/clamp at all, of course. 
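The __sign_use()/__is_nonneg() machinery reworked above is what lets min()/max() reject mixed-signedness comparisons at build time; the usual escape hatch remains an explicit min_t()/max_t(). A small sketch:

static unsigned long demo_min(unsigned long a, int b)
{
        /*
         * return min(a, b); would trip the "signedness error"
         * BUILD_BUG_ON_MSG() unless b is provably non-negative,
         * so pick the comparison type explicitly.
         */
        return min_t(unsigned long, a, b);
}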
- */ - /** * min_t - return minimum of two values, using the specified type * @type: data type to use @@ -220,6 +168,68 @@ */ #define max_t(type, x, y) __cmp_once(max, type, x, y) +/** + * min_not_zero - return the minimum that is _not_ zero, unless both are zero + * @x: value1 + * @y: value2 + */ +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) + +#define __clamp(val, lo, hi) \ + ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) + +#define __clamp_once(type, val, lo, hi, uval, ulo, uhi) ({ \ + type uval = (val); \ + type ulo = (lo); \ + type uhi = (hi); \ + BUILD_BUG_ON_MSG(statically_true(ulo > uhi), \ + "clamp() low limit " #lo " greater than high limit " #hi); \ + BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ + "clamp("#val", "#lo", "#hi") signedness error"); \ + __clamp(uval, ulo, uhi); }) + +#define __careful_clamp(type, val, lo, hi) \ + __clamp_once(type, val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) + +/** + * clamp - return a value clamped to a given range with typechecking + * @val: current value + * @lo: lowest allowable value + * @hi: highest allowable value + * + * This macro checks @val/@lo/@hi to make sure they have compatible + * signedness. + */ +#define clamp(val, lo, hi) __careful_clamp(__auto_type, val, lo, hi) + +/** + * clamp_t - return a value clamped to a given range using a given type + * @type: the type of variable to use + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of type + * @type to make all the comparisons. + */ +#define clamp_t(type, val, lo, hi) __careful_clamp(type, val, lo, hi) + +/** + * clamp_val - return a value clamped to a given range using val's type + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of whatever + * type the input argument @val is. This is useful when @val is an unsigned + * type and @lo and @hi are literals that will otherwise be assigned a signed + * integer type. + */ +#define clamp_val(val, lo, hi) __careful_clamp(typeof(val), val, lo, hi) + /* * Do not check the array parameter using __must_be_array(). * In the following legit use-case where the "array" passed is a simple pointer, @@ -263,31 +273,6 @@ */ #define max_array(array, len) __minmax_array(max, array, len) -/** - * clamp_t - return a value clamped to a given range using a given type - * @type: the type of variable to use - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of type - * @type to make all the comparisons. - */ -#define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi)) - -/** - * clamp_val - return a value clamped to a given range using val's type - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of whatever - * type the input argument @val is. This is useful when @val is an unsigned - * type and @lo and @hi are literals that will otherwise be assigned a signed - * integer type. 
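With clamp(), clamp_t() and clamp_val() all funnelled through the new __careful_clamp() above, the three flavours differ only in which type drives the comparison; a brief sketch:

static int demo_clamp(int x)
{
        return clamp(x, 0, 255);        /* signedness-checked; lo <= hi checked when constant */
}

static u8 demo_clamp_t(int x)
{
        return clamp_t(u8, x, 16, 200); /* everything compared as u8 */
}

static u8 demo_clamp_val(u8 x)
{
        return clamp_val(x, 16, 200);   /* limits take x's type, here u8 */
}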
- */ -#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) - static inline bool in_range64(u64 val, u64 start, u64 len) { return (val - start) < len; @@ -326,9 +311,9 @@ static inline bool in_range32(u32 val, u32 start, u32 len) * Use these carefully: no type checking, and uses the arguments * multiple times. Use for obvious constants only. */ -#define MIN(a,b) __cmp(min,a,b) -#define MAX(a,b) __cmp(max,a,b) -#define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b)) -#define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b)) +#define MIN(a, b) __cmp(min, a, b) +#define MAX(a, b) __cmp(max, a, b) +#define MIN_T(type, a, b) __cmp(min, (type)(a), (type)(b)) +#define MAX_T(type, a, b) __cmp(max, (type)(a), (type)(b)) #endif /* _LINUX_MINMAX_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index b1c3db9cf355..ac78425e9838 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -257,8 +258,6 @@ void setup_initial_init_mm(void *start_code, void *end_code, struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); -/* Use only if VMA has no other users */ -void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; @@ -697,13 +696,54 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key lockdep_key; + + lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); +#endif + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); + vma->vm_lock_seq = UINT_MAX; +} + +static inline bool is_vma_writer_only(int refcnt) +{ + /* + * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma + * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on + * a detached vma happens only in vma_mark_detached() and is a rare + * case, therefore most of the time there will be no unnecessary wakeup. + */ + return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; +} + +static inline void vma_refcount_put(struct vm_area_struct *vma) +{ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *mm = vma->vm_mm; + int oldcnt; + + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { + + if (is_vma_writer_only(oldcnt - 1)) + rcuwait_wake_up(&mm->vma_writer_wait); + } +} + /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. */ -static inline bool vma_start_read(struct vm_area_struct *vma) +static inline bool vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) { + int oldcnt; + /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need @@ -711,16 +751,22 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. 
*/ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) - return false; - - if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) return false; /* - * Overflow might produce false locked result. + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited() will fail + * because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + */ + if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) + return false; + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + /* + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are @@ -728,22 +774,51 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { - up_read(&vma->vm_lock->lock); + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { + vma_refcount_put(vma); return false; } + return true; } +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +{ + int oldcnt; + + mmap_assert_locked(vma->vm_mm); + if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) + return false; + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + return true; +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline bool vma_start_read_locked(struct vm_area_struct *vma) +{ + return vma_start_read_locked_nested(vma, 0); +} + static inline void vma_end_read(struct vm_area_struct *vma) { - rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->vm_lock->lock); - rcu_read_unlock(); + vma_refcount_put(vma); } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -751,10 +826,12 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); + /* * Begin writing to a VMA. 
* Exclude concurrent readers under the per-VMA lock until the currently @@ -762,43 +839,51 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->vm_lock->lock); - /* - * We should use WRITE_ONCE() here because we can have concurrent reads - * from the early lockless pessimistic check in vma_start_read(). - * We don't really care about the correctness of that early check, but - * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. - */ - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock->lock); + __vma_start_write(vma, mm_lock_seq); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } static inline void vma_assert_locked(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_lock->lock)) + if (refcount_read(&vma->vm_refcnt) <= 1) vma_assert_write_locked(vma); } -static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ +static inline void vma_assert_attached(struct vm_area_struct *vma) { - /* When detaching vma should be write-locked */ - if (detached) - vma_assert_write_locked(vma); - vma->detached = detached; + VM_BUG_ON_VMA(!refcount_read(&vma->vm_refcnt), vma); } +static inline void vma_assert_detached(struct vm_area_struct *vma) +{ + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt), vma); +} + +static inline void vma_mark_attached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_detached(vma); + refcount_set(&vma->vm_refcnt, 1); +} + +void vma_mark_detached(struct vm_area_struct *vma); + static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) @@ -820,14 +905,17 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ -static inline bool vma_start_read(struct vm_area_struct *vma) +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} +static inline bool vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } -static inline void vma_mark_detached(struct vm_area_struct *vma, - bool detached) {} +static inline void vma_assert_attached(struct vm_area_struct *vma) {} +static inline void vma_assert_detached(struct vm_area_struct *vma) {} +static inline void vma_mark_attached(struct vm_area_struct *vma) {} +static inline void vma_mark_detached(struct vm_area_struct *vma) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) @@ -854,18 +942,13 @@ static inline void assert_fault_locked(struct vm_fault *vmf) extern const struct vm_operations_struct vma_dummy_vm_ops; -/* - * WARNING: vma_init does not initialize vma->vm_lock. - * Use vm_area_alloc()/vm_area_free() if vma needs locking. 
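For illustration only (not from this patch): a hedged sketch of a lockless VMA lookup in the spirit of lock_vma_under_rcu(), showing how the refcount-based vma_start_read()/vma_end_read() pair above is meant to be used. The function name is hypothetical and the real implementation performs additional checks.

#include <linux/mm.h>
#include <linux/maple_tree.h>
#include <linux/rcupdate.h>

static struct vm_area_struct *example_lock_vma(struct mm_struct *mm,
                                               unsigned long address)
{
        MA_STATE(mas, &mm->mm_mt, address, address);
        struct vm_area_struct *vma;

        rcu_read_lock();
        vma = mas_walk(&mas);
        if (!vma || !vma_start_read(mm, vma)) {
                rcu_read_unlock();
                return NULL;            /* fall back to mmap_lock */
        }
        rcu_read_unlock();

        /* vm_refcnt keeps the vma alive, but it may have been reused. */
        if (unlikely(vma->vm_mm != mm ||
                     address < vma->vm_start || address >= vma->vm_end)) {
                vma_end_read(vma);
                return NULL;
        }

        return vma;                     /* caller releases with vma_end_read() */
}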
- */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_mark_detached(vma, false); - vma_numab_state_init(vma); + vma_lock_init(vma, false); } /* Use when VMA is not part of the VMA tree and needs no locking */ @@ -1058,6 +1141,7 @@ static inline int vma_iter_bulk_store(struct vma_iterator *vmi, if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; + vma_mark_attached(vma); return 0; } @@ -2320,6 +2404,7 @@ extern void pagefault_out_of_memory(void); struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ + bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; @@ -2991,18 +3076,15 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) +static inline void __pagetable_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); - if (!ptlock_init(ptdesc)) - return false; __folio_set_pgtable(folio); lruvec_stat_add_folio(folio, NR_PAGETABLE); - return true; } -static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) +static inline void pagetable_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3011,6 +3093,20 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } +static inline void pagetable_dtor_free(struct ptdesc *ptdesc) +{ + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} + +static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) +{ + if (!ptlock_init(ptdesc)) + return false; + __pagetable_ctor(ptdesc); + return true; +} + pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) @@ -3087,14 +3183,6 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) return ptlock_init(ptdesc); } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); -#endif - ptlock_free(ptdesc); -} - #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else @@ -3105,7 +3193,6 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -3120,25 +3207,13 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - if (!pmd_ptlock_init(ptdesc)) return false; - __folio_set_pgtable(folio); ptdesc_pmd_pts_init(ptdesc); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); return true; } -static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - pmd_ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. 
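For illustration only (not from this patch): a hedged sketch of how an architecture's PTE-table allocation might pair pagetable_pte_ctor() with the new pagetable_dtor_free() helper. The function names and the GFP mask are assumptions, not taken from any in-tree architecture.

#include <linux/mm.h>

static struct ptdesc *example_pte_alloc(void)
{
        /* GFP_KERNEL | __GFP_ZERO stands in for the arch's pgtable GFP mask */
        struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);

        if (!ptdesc)
                return NULL;
        if (!pagetable_pte_ctor(ptdesc)) {      /* ptlock allocation can fail */
                pagetable_free(ptdesc);
                return NULL;
        }
        return ptdesc;
}

static void example_pte_free(struct ptdesc *ptdesc)
{
        /* replaces the old pagetable_pte_dtor() + pagetable_free() pair */
        pagetable_dtor_free(ptdesc);
}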
The VM should not be @@ -3160,18 +3235,17 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); } -static inline void pagetable_pud_dtor(struct ptdesc *ptdesc) +static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); + __pagetable_ctor(ptdesc); +} - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); +static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); @@ -3324,6 +3398,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, @@ -3371,9 +3447,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } -extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, @@ -3437,9 +3510,6 @@ extern unsigned long stack_guard_gap; int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); -/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address); - /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, @@ -4096,67 +4166,6 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, extern int sysctl_nr_trim_pages; -#ifdef CONFIG_PRINTK -void mem_dump_obj(void *object); -#else -static inline void mem_dump_obj(void *object) {} -#endif - -static inline bool is_write_sealed(int seals) -{ - return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); -} - -/** - * is_readonly_sealed - Checks whether write-sealed but mapped read-only, - * in which case writes should be disallowing moving - * forwards. - * @seals: the seals to check - * @vm_flags: the VMA flags to check - * - * Returns whether readonly sealed, in which case writess should be disallowed - * going forward. - */ -static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) -{ - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. 
- */ - if (is_write_sealed(seals) && - ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) - return true; - - return false; -} - -/** - * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and - * handle them. - * @seals: the seals to check - * @vma: the vma to operate on - * - * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper - * check/handling on the vma flags. Return 0 if check pass, or <0 for errors. - */ -static inline int seal_check_write(int seals, struct vm_area_struct *vma) -{ - if (!is_write_sealed(seals)) - return 0; - - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - return 0; -} - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 1b6a917fffa4..f9157a0c42a5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -133,31 +133,25 @@ static inline int lru_hist_from_seq(unsigned long seq) return seq % NR_HIST_GENS; } -static inline int lru_tier_from_refs(int refs) +static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); - /* see the comment in folio_lru_refs() */ - return order_base_2(refs + 1); + /* see the comment on MAX_NR_TIERS */ + return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); - bool workingset = flags & BIT(PG_workingset); + if (!(flags & BIT(PG_referenced))) + return 0; /* - * Return the number of accesses beyond PG_referenced, i.e., N-1 if the - * total number of accesses is N>1, since N=0,1 both map to the first - * tier. lru_tier_from_refs() will account for this off-by-one. Also see - * the comment on MAX_NR_TIERS. + * Return the total number of accesses including PG_referenced. Also see + * the comment on LRU_REFS_FLAGS. 
*/ - return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; -} - -static inline void folio_clear_lru_refs(struct folio *folio) -{ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) @@ -223,11 +217,43 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } +static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, + bool reclaiming) +{ + int gen; + int type = folio_is_file_lru(folio); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + + /* + * +-----------------------------------+-----------------------------------+ + * | Accessed through page tables and | Accessed through file descriptors | + * | promoted by folio_update_gen() | and protected by folio_inc_gen() | + * +-----------------------------------+-----------------------------------+ + * | PG_active (set while isolated) | | + * +-----------------+-----------------+-----------------+-----------------+ + * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | + * +-----------------------------------+-----------------------------------+ + * |<---------- MIN_NR_GENS ---------->| | + * |<---------------------------- MAX_NR_GENS ---------------------------->| + */ + if (folio_test_active(folio)) + gen = MIN_NR_GENS - folio_test_workingset(folio); + else if (reclaiming) + gen = MAX_NR_GENS; + else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || + (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio)))) + gen = MIN_NR_GENS; + else + gen = MAX_NR_GENS - folio_test_workingset(folio); + + return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); +} + static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; - unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -237,40 +263,12 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, if (folio_test_unevictable(folio) || !lrugen->enabled) return false; - /* - * There are four common cases for this page: - * 1. If it's hot, i.e., freshly faulted in, add it to the youngest - * generation, and it's protected over the rest below. - * 2. If it can't be evicted immediately, i.e., a dirty page pending - * writeback, add it to the second youngest generation. - * 3. If it should be evicted first, e.g., cold and clean from - * folio_rotate_reclaimable(), add it to the oldest generation. - * 4. Everything else falls between 2 & 3 above and is added to the - * second oldest generation if it's considered inactive, or the - * oldest generation otherwise. See lru_gen_is_active(). 
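A worked example of the new tier mapping, for reference: with the off-by-one removed, refs passed to lru_tier_from_refs() is the total number of accesses through file descriptors, including the one recorded by PG_referenced. The helper below is hypothetical and only restates the mapping.

#include <linux/mm_inline.h>

static int example_tier(int refs, bool workingset)
{
        /*
         * refs == 1      -> tier 0 (order_base_2(1) == 0)
         * refs == 2      -> tier 1
         * refs == 3 or 4 -> tier 2
         * PG_workingset  -> tier 3 (MAX_NR_TIERS - 1), regardless of refs
         */
        return lru_tier_from_refs(refs, workingset);
}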
- */ - if (folio_test_active(folio)) - seq = lrugen->max_seq; - else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || - (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio)))) - seq = lrugen->max_seq - 1; - else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq) - seq = lrugen->min_seq[type]; - else - seq = lrugen->min_seq[type] + 1; + seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - mask = LRU_GEN_MASK; - /* - * Don't clear PG_workingset here because it can affect PSI accounting - * if the activation is due to workingset refault. - */ - if (folio_test_active(folio)) - mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); - set_mask_bits(&folio->flags, mask, flags); + set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ @@ -564,9 +562,9 @@ static inline pte_marker copy_pte_marker( * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * - * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled. + * Returns true if an uffd-wp pte was installed, false otherwise. */ -static inline void +static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { @@ -583,7 +581,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) - return; + return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) @@ -596,10 +594,13 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; - if (unlikely(arm_uffd_pte)) + if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); + return true; + } #endif + return false; } static inline bool vma_has_recency(struct vm_area_struct *vma) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 332cee285662..d366ec6302e6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -438,7 +439,9 @@ FOLIO_MATCH(compound_head, _head_2a); * struct ptdesc - Memory descriptor for page tables. * @__page_flags: Same as page flags. Powerpc only. * @pt_rcu_head: For freeing page table pages. - * @pt_list: List of used page tables. Used for s390 and x86. + * @pt_list: List of used page tables. Used for s390 gmap shadow pages + * (which are not linked into the user page tables) and x86 + * pgds. * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. @@ -571,6 +574,12 @@ static inline void *folio_get_private(struct folio *folio) typedef unsigned long vm_flags_t; +/* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. 
These are held in a global tree and are pinned by the VMAs that @@ -627,9 +636,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) } #endif -struct vma_lock { - struct rw_semaphore lock; -}; +#define VMA_LOCK_OFFSET 0x40000000 +#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1) struct vma_numab_state { /* @@ -675,6 +683,9 @@ struct vma_numab_state { * * Only explicitly marked struct members may be accessed by RCU readers before * getting a stable reference. + * + * WARNING: when adding new members, please update vm_area_init_from() to copy + * them during vm_area_struct content duplication. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -685,9 +696,7 @@ struct vm_area_struct { unsigned long vm_start; unsigned long vm_end; }; -#ifdef CONFIG_PER_VMA_LOCK - struct rcu_head vm_rcu; /* Used for deferred freeing. */ -#endif + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ }; /* @@ -707,19 +716,13 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* - * Flag to indicate areas detached from the mm->mm_mt tree. - * Unstable RCU readers are allowed to read this. - */ - bool detached; - /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_lock->lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_lock->lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -727,21 +730,8 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; - /* Unstable RCU readers are allowed to read this. */ - struct vma_lock *vm_lock; + unsigned int vm_lock_seq; #endif - - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree. - * - */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma @@ -761,14 +751,6 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ -#ifdef CONFIG_ANON_VMA_NAME - /* - * For private and shared anonymous mappings, a pointer to a null - * terminated string containing the name given to the vma, or NULL if - * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. - */ - struct anon_vma_name *anon_name; -#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif @@ -780,6 +762,30 @@ struct vm_area_struct { #endif #ifdef CONFIG_NUMA_BALANCING struct vma_numab_state *numab_state; /* NUMA Balancing state */ +#endif +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. */ + refcount_t vm_refcnt ____cacheline_aligned_in_smp; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map vmlock_dep_map; +#endif +#endif + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. 
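The vm_freeptr member above exists so the VMA slab cache can become SLAB_TYPESAFE_BY_RCU with an explicit freelist-pointer offset instead of deferring frees through vm_rcu. For illustration only, a hedged sketch of what the cache creation might look like; the real call site is in kernel/fork.c and its flags may differ.

#include <linux/slab.h>
#include <linux/mm_types.h>

static struct kmem_cache *example_vma_cachep;

static void example_vma_cache_init(void)
{
        struct kmem_cache_args args = {
                .use_freeptr_offset = true,
                .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
        };

        example_vma_cachep = kmem_cache_create("vm_area_struct",
                                               sizeof(struct vm_area_struct),
                                               &args,
                                               SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                               SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT);
}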
+ * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; @@ -915,12 +921,16 @@ struct mm_struct { * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK + struct rcuwait vma_writer_wait; /* * This field has lock-like semantics, meaning it is sometimes * accessed with ACQUIRE/RELEASE semantics. * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. + * Incremented every time mmap_lock is write-locked/unlocked. + * Initialized to 0, therefore odd values indicate mmap_lock + * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. @@ -929,7 +939,7 @@ struct mm_struct { * Can be read with ACQUIRE semantics if not holding write * mmap_lock. */ - int mm_lock_seq; + seqcount_t mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index de9dc20b01ba..4706c6769902 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,6 +71,85 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK + +static inline void mm_lock_seqcount_init(struct mm_struct *mm) +{ + seqcount_init(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) +{ + do_raw_write_seqcount_begin(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_end(struct mm_struct *mm) +{ + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); + do_raw_write_seqcount_end(&mm->mm_lock_seq); +} + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + /* + * Since mmap_lock is a sleeping lock, and waiting for it to become + * unlocked is more or less equivalent with taking it ourselves, don't + * bother with the speculative path if mmap_lock is already write-locked + * and take the slow path, which takes the lock. 
+ */ + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return read_seqcount_retry(&mm->mm_lock_seq, seq); +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + return false; +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return true; +} + +#endif /* CONFIG_PER_VMA_LOCK */ + +static inline void mmap_write_lock(struct mm_struct *mm) +{ + __mmap_lock_trace_start_locking(mm, true); + down_write(&mm->mmap_lock); + mm_lock_seqcount_begin(mm); + __mmap_lock_trace_acquire_returned(mm, true, true); +} + +static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) +{ + __mmap_lock_trace_start_locking(mm, true); + down_write_nested(&mm->mmap_lock, subclass); + mm_lock_seqcount_begin(mm); + __mmap_lock_trace_acquire_returned(mm, true, true); +} + +static inline int mmap_write_lock_killable(struct mm_struct *mm) +{ + int ret; + + __mmap_lock_trace_start_locking(mm, true); + ret = down_write_killable(&mm->mmap_lock); + if (!ret) + mm_lock_seqcount_begin(mm); + __mmap_lock_trace_acquire_returned(mm, true, ret == 0); + return ret; +} + /* * Drop all currently-held per-VMA locks. * This is called from the mmap_lock implementation directly before releasing @@ -82,46 +161,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); - /* - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive - * mmap_lock being held. - * We need RELEASE semantics here to ensure that preceding stores into - * the VMA take effect before we unlock it with this store. - * Pairs with ACQUIRE semantics in vma_start_read(). 
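The helpers above enable a speculate-then-validate pattern against mm_lock_seq instead of taking mmap_lock; the in-tree users are outside this hunk. For illustration only, a hedged sketch of the pattern with the speculative reads elided.

#include <linux/mmap_lock.h>

static void example_read_mm_state(struct mm_struct *mm)
{
        unsigned int seq;

        if (mmap_lock_speculate_try_begin(mm, &seq)) {
                /* ... read mm state without holding mmap_lock ... */
                if (!mmap_lock_speculate_retry(mm, seq))
                        return;         /* no writer raced with the reads */
        }

        /* slow path: a writer was active or invalidated the speculation */
        mmap_read_lock(mm);
        /* ... redo the reads under the lock ... */
        mmap_read_unlock(mm);
}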
- */ - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); -} -#else -static inline void vma_end_write_all(struct mm_struct *mm) {} -#endif - -static inline void mmap_init_lock(struct mm_struct *mm) -{ - init_rwsem(&mm->mmap_lock); -} - -static inline void mmap_write_lock(struct mm_struct *mm) -{ - __mmap_lock_trace_start_locking(mm, true); - down_write(&mm->mmap_lock); - __mmap_lock_trace_acquire_returned(mm, true, true); -} - -static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) -{ - __mmap_lock_trace_start_locking(mm, true); - down_write_nested(&mm->mmap_lock, subclass); - __mmap_lock_trace_acquire_returned(mm, true, true); -} - -static inline int mmap_write_lock_killable(struct mm_struct *mm) -{ - int ret; - - __mmap_lock_trace_start_locking(mm, true); - ret = down_write_killable(&mm->mmap_lock); - __mmap_lock_trace_acquire_returned(mm, true, ret == 0); - return ret; + mm_lock_seqcount_end(mm); } static inline void mmap_write_unlock(struct mm_struct *mm) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index d7cb1e5ecbda..a0a3894900ed 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -9,10 +9,12 @@ struct page; struct vm_area_struct; struct mm_struct; struct vma_iterator; +struct vma_merge_struct; void dump_page(const struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); +void dump_vmg(const struct vma_merge_struct *vmg, const char *reason); void vma_iter_dump_tree(const struct vma_iterator *vmi); #ifdef CONFIG_DEBUG_VM @@ -87,6 +89,15 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_VMG(cond, vmg) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_vmg(vmg, "VM_WARN_ON_VMG(" __stringify(cond)")"); \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -104,9 +115,10 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_MM(cond, mm) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_VMG(cond, vmg) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) -#endif +#endif /* CONFIG_DEBUG_VM */ #ifdef CONFIG_DEBUG_VM_IRQSOFF #define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled()) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b36124145a16..9540b41894da 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -332,66 +332,88 @@ enum lruvec_flags { #endif /* !__GENERATING_BOUNDS_H */ /* - * Evictable pages are divided into multiple generations. The youngest and the + * Evictable folios are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->folios[]. Otherwise it stores 0. + * a folio is on one of lrugen->folios[]. Otherwise it stores 0. * - * A page is added to the youngest generation on faulting. 
The aging needs to - * check the accessed bit at least twice before handing this page over to the - * eviction. The first check takes care of the accessed bit set on the initial - * fault; the second check makes sure this page hasn't been used since then. - * This process, AKA second chance, requires a minimum of two generations, - * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive - * LRU, e.g., /proc/vmstat, these two generations are considered active; the - * rest of generations, if they exist, are considered inactive. See - * lru_gen_is_active(). + * After a folio is faulted in, the aging needs to check the accessed bit at + * least twice before handing this folio over to the eviction. The first check + * clears the accessed bit from the initial fault; the second check makes sure + * this folio hasn't been used since then. This process, AKA second chance, + * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI + * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two + * generations are considered active; the rest of generations, if they exist, + * are considered inactive. See lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->folios[] so - * that the aging needs not to worry about it. And it's set again when a page - * considered active is isolated for non-reclaiming purposes, e.g., migration. - * See lru_gen_add_folio() and lru_gen_del_folio(). + * PG_active is always cleared while a folio is on one of lrugen->folios[] so + * that the sliding window needs not to worry about it. And it's set again when + * a folio considered active is isolated for non-reclaiming purposes, e.g., + * migration. See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits - * in folio->flags. + * in folio->flags, masked by LRU_GEN_MASK. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* - * Each generation is divided into multiple tiers. A page accessed N times - * through file descriptors is in tier order_base_2(N). A page in the first tier - * (N=0,1) is marked by PG_referenced unless it was faulted in through page - * tables or read ahead. A page in any other tier (N>1) is marked by - * PG_referenced and PG_workingset. This implies a minimum of two tiers is - * supported without using additional bits in folio->flags. + * Each generation is divided into multiple tiers. A folio accessed N times + * through file descriptors is in tier order_base_2(N). A folio in the first + * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page + * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by + * PG_workingset. A folio in any other tier (1<N<MAX_NR_TIERS-1) is marked by + * additional bits of LRU_REFS_WIDTH in folio->flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, - * comparisons of refaulted/(evicted+protected) from the first tier and the - * rest infer whether pages accessed multiple times through file descriptors - * are statistically hot and thus worth protecting.
+ * comparisons of refaulted/(evicted+protected) from the first tier and the rest + * infer whether folios accessed multiple times through file descriptors are + * statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in - * folio->flags. + * folio->flags, masked by LRU_REFS_MASK. */ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H -struct lruvec; -struct page_vma_mapped_walk; - #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +/* + * For folios accessed multiple times through file descriptors, + * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags + * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its + * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily + * promoted into the second oldest generation in the eviction path. And when + * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that + * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is + * only valid when PG_referenced is set. + * + * For folios accessed multiple times through page tables, folio_update_gen() + * from a page table walk or lru_gen_set_refs() from a rmap walk sets + * PG_referenced after the accessed bit is cleared for the first time. + * Thereafter, those two paths set PG_workingset and promote folios to the + * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears + * PG_referenced. Note that for this case, LRU_REFS_MASK is not used. + * + * For both cases above, after PG_workingset is set on a folio, it remains until + * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It + * can be set again if lru_gen_test_recent() returns true upon a refault. + */ +#define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced)) + +struct lruvec; +struct page_vma_mapped_walk; + #ifdef CONFIG_LRU_GEN enum { @@ -406,8 +428,6 @@ enum { NR_LRU_GEN_CAPS }; -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) @@ -421,12 +441,11 @@ enum { /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are - * stored in min_seq[] separately for anon and file types as clean file pages - * can be evicted regardless of swap constraints. - * - * Normally anon and file min_seq are in sync. But if swapping is constrained, - * e.g., out of swap space, file min_seq is allowed to advance and leave anon - * min_seq behind. + * stored in min_seq[] separately for anon and file types so that they can be + * incremented independently. Ideally min_seq[] are kept in sync when both anon + * and file types are evictable. However, to adapt to situations like extreme + * swappiness, they are allowed to be out of sync by at most + * MAX_NR_GENS-MIN_NR_GENS-1. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. 
@@ -446,8 +465,8 @@ struct lru_gen_folio { unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; - /* the first tier doesn't need protection, hence the minus one */ - unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; + /* can only be modified under the LRU lock */ + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; @@ -498,7 +517,7 @@ struct lru_gen_mm_walk { int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; - bool can_swap; + int swappiness; bool force_scan; }; diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index cfad6ce7e1bd..dd85613cdd86 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -29,7 +29,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi); int __init numa_memblks_init(int (*init_func)(void), bool memblock_force_top_down); +extern int numa_distance_cnt; + #ifdef CONFIG_NUMA_EMU +extern int emu_nid_to_phys[MAX_NUMNODES]; int numa_emu_cmdline(char *str); void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, unsigned int nr_emu_nids); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 16fa8f0cea02..3f6a64ff968a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -110,6 +110,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ + PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif @@ -599,6 +600,10 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) +FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) + __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) + #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not @@ -931,21 +936,9 @@ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } - -/* - * PageTransTail returns true for both transparent huge pages - * and hugetlbfs pages, so it should only be called when it's known - * that hugetlbfs pages aren't involved. - */ -static inline int PageTransTail(const struct page *page) -{ - return PageTail(page); -} #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) -TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) -TESTPAGEFLAG_FALSE(TransTail, transtail) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) @@ -955,11 +948,9 @@ TESTPAGEFLAG_FALSE(TransTail, transtail) * * This flag is set by hwpoison handler. Cleared by THP split or free page. 
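For illustration only: a hedged sketch of the accessors generated by the new FOLIO_FLAG(dropbehind, ...) lines above; the completion helper named here is hypothetical.

#include <linux/page-flags.h>

static bool example_complete_dropbehind(struct folio *folio)
{
        /* test-and-clear so only one completion path drops the folio */
        if (!folio_test_clear_dropbehind(folio))
                return false;

        /* ... invalidate/release the folio instead of keeping it cached ... */
        return true;
}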
*/ -PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) - TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) +FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else -PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) - TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) +FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 73dc2c1841ec..898bb788243b 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -31,7 +31,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, int migratetype); int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags, gfp_t gfp_flags); + int migratetype, int flags); void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int migratetype); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bcf0865a38ae..d0be5f36082a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -210,6 +210,7 @@ enum mapping_flags { AS_STABLE_WRITES = 7, /* must wait for writeback before modifying folio contents */ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ + AS_WRITEBACK_INDETERMINATE = 9, /* Use caution when waiting on writeback */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, @@ -335,6 +336,16 @@ static inline bool mapping_inaccessible(struct address_space *mapping) return test_bit(AS_INACCESSIBLE, &mapping->flags); } +static inline void mapping_set_writeback_indeterminate(struct address_space *mapping) +{ + set_bit(AS_WRITEBACK_INDETERMINATE, &mapping->flags); +} + +static inline bool mapping_writeback_indeterminate(struct address_space *mapping) +{ + return test_bit(AS_WRITEBACK_INDETERMINATE, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; @@ -710,6 +721,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, * * %FGP_NOFS - __GFP_FS will get cleared in gfp. * * %FGP_NOWAIT - Don't block on the folio lock. * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) + * * %FGP_DONTCACHE - Uncached buffered IO * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin() * implementation. */ @@ -723,10 +735,21 @@ typedef unsigned int __bitwise fgf_t; #define FGP_NOWAIT ((__force fgf_t)0x00000020) #define FGP_FOR_MMAP ((__force fgf_t)0x00000040) #define FGP_STABLE ((__force fgf_t)0x00000080) +#define FGP_DONTCACHE ((__force fgf_t)0x00000100) #define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */ #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) +static inline unsigned int filemap_get_order(size_t size) +{ + unsigned int shift = ilog2(size); + + if (shift <= PAGE_SHIFT) + return 0; + + return shift - PAGE_SHIFT; +} + /** * fgf_set_order - Encode a length in the fgf_t flags. * @size: The suggested size of the folio to create. 
@@ -740,11 +763,11 @@ typedef unsigned int __bitwise fgf_t; */ static inline fgf_t fgf_set_order(size_t size) { - unsigned int shift = ilog2(size); + unsigned int order = filemap_get_order(size); - if (shift <= PAGE_SHIFT) + if (!order) return 0; - return (__force fgf_t)((shift - PAGE_SHIFT) << 26); + return (__force fgf_t)(order << 26); } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); @@ -1270,11 +1293,6 @@ void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); -/* - * Add an arbitrary waiter to a page's wait queue - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter); - /* * Fault in userspace address range. */ @@ -1353,6 +1371,7 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool dropbehind; bool _workingset; unsigned long _pflags; }; diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index ac8c44dd8237..c5e9cac0575e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -33,7 +33,7 @@ struct disk_stats { #define part_stat_read(part, field) \ ({ \ - typeof((part)->bd_stats->field) res = 0; \ + TYPEOF_UNQUAL((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5b520fe86b60..b4859e87846c 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,7 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ - (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) + ((TYPEOF_UNQUAL(*(__p)) __force __kernel *)(__force unsigned long)(__p)) #ifdef CONFIG_SMP @@ -317,7 +317,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return(stem, variable) \ ({ \ - typeof(variable) pscr_ret__; \ + TYPEOF_UNQUAL(variable) pscr_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr_ret__ = stem##1(variable); break; \ @@ -332,7 +332,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return2(stem, variable, ...) \ ({ \ - typeof(variable) pscr2_ret__; \ + TYPEOF_UNQUAL(variable) pscr2_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index adef9d6e9b1b..94d267d02372 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -533,7 +533,14 @@ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - ptep_get_and_clear(mm, addr, ptep); + pte_t pte = ptep_get(ptep); + + pte_clear(mm, addr, ptep); + /* + * No need for ptep_get_and_clear(): page table check doesn't care about + * any bits that could have been set by HW concurrently. + */ + page_table_check_pte_clear(mm, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 27343424225c..9ad134a04b41 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -4,18 +4,7 @@ #include #include - -/* - * rcuwait provides a way of blocking and waking up a single - * task in an rcu-safe manner. 
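A short usage sketch for the filemap_get_order()/fgf_set_order() split above; the wrapper is hypothetical. With 4KiB pages, len = 64KiB gives filemap_get_order() == 4, and fgf_set_order() carries that order in the top six bits decoded by FGF_GET_ORDER().

#include <linux/pagemap.h>

static struct folio *example_grab_folio(struct address_space *mapping,
                                        pgoff_t index, size_t len)
{
        return __filemap_get_folio(mapping, index,
                                   FGP_WRITEBEGIN | fgf_set_order(len),
                                   mapping_gfp_mask(mapping));
}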
- * - * The only time @task is non-nil is when a user is blocked (or - * checking if it needs to) on a condition, and reset as soon as we - * know that the condition has succeeded and are awoken. - */ -struct rcuwait { - struct task_struct __rcu *task; -}; +#include #define __RCUWAIT_INITIALIZER(name) \ { .task = NULL, } diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 35f039ecb272..5072ba99f05e 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -137,13 +137,23 @@ static inline unsigned int refcount_read(const refcount_t *r) } static inline __must_check __signed_wrap -bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) +bool __refcount_add_not_zero_limited(int i, refcount_t *r, int *oldp, + int limit) { int old = refcount_read(r); do { if (!old) break; + + if (statically_true(limit == INT_MAX)) + continue; + + if (i > limit - old) { + if (oldp) + *oldp = old; + return false; + } } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); if (oldp) @@ -155,6 +165,12 @@ bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) return old; } +static inline __must_check __signed_wrap +bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_limited(i, r, oldp, INT_MAX); +} + /** * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount @@ -213,6 +229,12 @@ static inline void refcount_add(int i, refcount_t *r) __refcount_add(i, r, NULL); } +static inline __must_check bool __refcount_inc_not_zero_limited(refcount_t *r, + int *oldp, int limit) +{ + return __refcount_add_not_zero_limited(1, r, oldp, limit); +} + static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) { return __refcount_add_not_zero(1, r, oldp); diff --git a/include/linux/sched.h b/include/linux/sched.h index 64934e0830af..155012467b21 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -398,6 +398,12 @@ struct sched_info { /* Time spent waiting on a runqueue: */ unsigned long long run_delay; + /* Max time spent waiting on a runqueue: */ + unsigned long long max_run_delay; + + /* Min time spent waiting on a runqueue: */ + unsigned long long min_run_delay; + /* Timestamps: */ /* When did we last run on a CPU? */ diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h index 412cdaba33eb..17e04859b9a4 100644 --- a/include/linux/sched/hotplug.h +++ b/include/linux/sched/hotplug.h @@ -18,10 +18,6 @@ extern int sched_cpu_dying(unsigned int cpu); # define sched_cpu_dying NULL #endif -#ifdef CONFIG_HOTPLUG_CPU -extern void idle_task_exit(void); -#else static inline void idle_task_exit(void) {} -#endif #endif /* _LINUX_SCHED_HOTPLUG_H */ diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..22c2c48b4265 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) __seq; \ }) +/** + * raw_seqcount_try_begin() - begin a seqcount_t read critical section + * w/o lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Similar to raw_seqcount_begin(), except it enables eliding the critical + * section entirely if odd, instead of doing the speculation knowing it will + * fail. + * + * Useful when counter stabilization is more or less equivalent to taking + * the lock and there is a slowpath that does that. 
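For illustration only: a hedged sketch of how the limited increment above is used by the per-VMA lock code. Readers succeed only while the count stays at or below VMA_REF_LIMIT, so a writer that has added VMA_LOCK_OFFSET automatically turns new readers away. The helper name is hypothetical.

#include <linux/refcount.h>
#include <linux/mm_types.h>     /* VMA_REF_LIMIT, VMA_LOCK_OFFSET */

static bool example_tryget_reader(refcount_t *r)
{
        int old;

        /*
         * Fails both when the count is 0 (object detached) and when adding
         * 1 would exceed VMA_REF_LIMIT, i.e. once VMA_LOCK_OFFSET is set.
         */
        return __refcount_inc_not_zero_limited(r, &old, VMA_REF_LIMIT);
}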
+ * + * If true, start will be set to the (even) sequence count read. + * + * Return: true when a read critical section is started. + */ +#define raw_seqcount_try_begin(s, start) \ +({ \ + start = raw_read_seqcount(s); \ + !(start & 1); \ +}) + /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization diff --git a/include/linux/slab.h b/include/linux/slab.h index 10a971c2bde3..681b685b6c4e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -234,12 +234,6 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif -/* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * diff --git a/include/linux/swap.h b/include/linux/swap.h index 187715eec3cb..b13b72645db3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -219,7 +219,6 @@ enum { SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -258,18 +257,27 @@ struct swap_cluster_info { u8 order; struct list_head list; }; -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ -#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ -#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ + +/* All on-list cluster must have a non-zero flag. */ +enum swap_cluster_flags { + CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ + CLUSTER_FLAG_FREE, + CLUSTER_FLAG_NONFULL, + CLUSTER_FLAG_FRAG, + /* Clusters with flags above are allocatable */ + CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, + CLUSTER_FLAG_FULL, + CLUSTER_FLAG_DISCARD, + CLUSTER_FLAG_MAX, +}; /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the * cluster to which it belongs being marked free. Therefore 0 is safe to use as - * a sentinel to indicate next is not valid in percpu_cluster. + * a sentinel to indicate an entry is not valid. */ -#define SWAP_NEXT_INVALID 0 +#define SWAP_ENTRY_INVALID 0 #ifdef CONFIG_THP_SWAP #define SWAP_NR_ORDERS (PMD_ORDER + 1) @@ -283,6 +291,7 @@ struct swap_cluster_info { * throughput. 
*/ struct percpu_cluster { + local_lock_t lock; /* Protect the percpu_cluster above */ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; @@ -305,15 +314,12 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ - unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; - unsigned int lowest_bit; /* index of first free in swap_map */ - unsigned int highest_bit; /* index of last free in swap_map */ + atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ - unsigned int inuse_pages; /* number of those currently in use */ - unsigned int cluster_next; /* likely index for next allocation */ - unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ + atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ + struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ + spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index ae73a87775b3..b5ec038069da 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -6,10 +6,8 @@ #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new); -extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents); +extern void swap_cgroup_record(struct folio *folio, swp_entry_t ent); +extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); @@ -17,8 +15,12 @@ extern void swap_cgroup_swapoff(int type); #else static inline -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents) +void swap_cgroup_record(struct folio *folio, swp_entry_t ent) +{ +} + +static inline +unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) { return 0; } diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index 15adfb8c813a..840aec3523b2 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -16,15 +16,12 @@ struct swap_slots_cache { swp_entry_t *slots; int nr; int cur; - spinlock_t free_lock; /* protects slots_ret, n_ret */ - swp_entry_t *slots_ret; int n_ret; }; void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); void enable_swap_slots_cache(void); -void free_swap_slot(swp_entry_t entry); extern bool swap_slot_cache_enabled; diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 2964171856e0..0646804860ff 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -19,9 +19,6 @@ enum task_work_notify_mode { TWA_SIGNAL, TWA_SIGNAL_NO_IPI, TWA_NMI_CURRENT, - - TWA_FLAGS = 0xff00, - TWAF_NO_ALLOC = 0x0100, }; static inline bool task_work_pending(struct task_struct *task) diff --git a/include/linux/types.h b/include/linux/types.h 
index 2d7b9ae8714c..a3d2182c2686 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -43,7 +43,7 @@ typedef unsigned long uintptr_t; typedef long intptr_t; #ifdef CONFIG_HAVE_UID16 -/* This is defined by include/asm-{arch}/posix_types.h */ +/* This is defined by arch/{arch}/include/asm/posix_types.h */ typedef __kernel_old_uid_t old_uid_t; typedef __kernel_old_gid_t old_gid_t; #endif /* CONFIG_UID16 */ @@ -248,5 +248,17 @@ typedef void (*swap_func_t)(void *a, void *b, int size); typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv); typedef int (*cmp_func_t)(const void *a, const void *b); +/* + * rcuwait provides a way of blocking and waking up a single + * task in an rcu-safe manner. + * + * The only time @task is non-nil is when a user is blocked (or + * checking if it needs to) on a condition, and reset as soon as we + * know that the condition has succeeded and are awoken. + */ +struct rcuwait { + struct task_struct __rcu *task; +}; + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ diff --git a/include/net/snmp.h b/include/net/snmp.h index 468a67836e2f..4cb4326dfebe 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -159,7 +159,7 @@ struct linux_tls_mib { #define __SNMP_ADD_STATS64(mib, field, addend) \ do { \ - __typeof__(*mib) *ptr = raw_cpu_ptr(mib); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[field] += addend; \ u64_stats_update_end(&ptr->syncp); \ @@ -176,8 +176,7 @@ struct linux_tls_mib { #define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1) #define __SNMP_UPD_PO_STATS64(mib, basefield, addend) \ do { \ - __typeof__(*mib) *ptr; \ - ptr = raw_cpu_ptr((mib)); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend; \ diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index bc2e3ad787b3..cf9f9faf8914 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -5,6 +5,7 @@ #if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MMAP_LOCK_H +#include #include #include @@ -12,64 +13,61 @@ struct mm_struct; DECLARE_EVENT_CLASS(mmap_lock, - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), + TP_PROTO(struct mm_struct *mm, bool write), - TP_ARGS(mm, memcg_path, write), + TP_ARGS(mm, write), TP_STRUCT__entry( __field(struct mm_struct *, mm) - __string(memcg_path, memcg_path) + __field(u64, memcg_id) __field(bool, write) ), TP_fast_assign( __entry->mm = mm; - __assign_str(memcg_path); + __entry->memcg_id = cgroup_id_from_mm(mm); __entry->write = write; ), TP_printk( - "mm=%p memcg_path=%s write=%s", - __entry->mm, - __get_str(memcg_path), + "mm=%p memcg_id=%llu write=%s", + __entry->mm, __entry->memcg_id, __entry->write ? 
"true" : "false" ) ); #define DEFINE_MMAP_LOCK_EVENT(name) \ DEFINE_EVENT(mmap_lock, name, \ - TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ - bool write), \ - TP_ARGS(mm, memcg_path, write)) + TP_PROTO(struct mm_struct *mm, bool write), \ + TP_ARGS(mm, write)) DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); TRACE_EVENT(mmap_lock_acquire_returned, - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, - bool success), + TP_PROTO(struct mm_struct *mm, bool write, bool success), - TP_ARGS(mm, memcg_path, write, success), + TP_ARGS(mm, write, success), TP_STRUCT__entry( __field(struct mm_struct *, mm) - __string(memcg_path, memcg_path) + __field(u64, memcg_id) __field(bool, write) __field(bool, success) ), TP_fast_assign( __entry->mm = mm; - __assign_str(memcg_path); + __entry->memcg_id = cgroup_id_from_mm(mm); __entry->write = write; __entry->success = success; ), TP_printk( - "mm=%p memcg_path=%s write=%s success=%s", + "mm=%p memcg_id=%llu write=%s success=%s", __entry->mm, - __get_str(memcg_path), + __entry->memcg_id, __entry->write ? "true" : "false", __entry->success ? "true" : "false" ) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index bb8a59c6caa2..3bc8656c8359 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -116,7 +116,8 @@ DEF_PAGEFLAG_NAME(head), \ DEF_PAGEFLAG_NAME(reclaim), \ DEF_PAGEFLAG_NAME(swapbacked), \ - DEF_PAGEFLAG_NAME(unevictable) \ + DEF_PAGEFLAG_NAME(unevictable), \ + DEF_PAGEFLAG_NAME(dropbehind) \ IF_HAVE_PG_MLOCK(mlocked) \ IF_HAVE_PG_HWPOISON(hwpoison) \ IF_HAVE_PG_IDLE(idle) \ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..56a4f93a08f4 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -332,9 +332,13 @@ typedef int __bitwise __kernel_rwf_t; /* Atomic Write */ #define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) +/* buffered IO that drops the cache after reading or writing data */ +#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ + RWF_DONTCACHE) #define PROCFS_IOCTL_MAGIC 'f' diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index b50b2eb257a0..934e20ef7f79 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -72,6 +72,8 @@ struct taskstats { */ __u64 cpu_count __attribute__((aligned(8))); __u64 cpu_delay_total; + __u64 cpu_delay_max; + __u64 cpu_delay_min; /* Following four fields atomically updated using task->delays->lock */ @@ -80,10 +82,14 @@ struct taskstats { */ __u64 blkio_count; __u64 blkio_delay_total; + __u64 blkio_delay_max; + __u64 blkio_delay_min; /* Delay waiting for page fault I/O (swap in only) */ __u64 swapin_count; __u64 swapin_delay_total; + __u64 swapin_delay_max; + __u64 swapin_delay_min; /* cpu "wall-clock" running time * On some architectures, value will adjust for cpu time stolen @@ -166,10 +172,14 @@ struct taskstats { /* Delay waiting for memory reclaim */ __u64 freepages_count; __u64 freepages_delay_total; + __u64 freepages_delay_max; + __u64 freepages_delay_min; /* Delay waiting for thrashing page */ __u64 thrashing_count; __u64 thrashing_delay_total; + __u64 thrashing_delay_max; + __u64 thrashing_delay_min; /* v10: 64-bit btime to avoid overflow */ __u64 ac_btime64; /* 
64-bit begin time */ @@ -177,6 +187,8 @@ struct taskstats { /* v11: Delay waiting for memory compact */ __u64 compact_count; __u64 compact_delay_total; + __u64 compact_delay_max; + __u64 compact_delay_min; /* v12 begin */ __u32 ac_tgid; /* thread group ID */ @@ -198,10 +210,15 @@ struct taskstats { /* v13: Delay waiting for write-protect copy */ __u64 wpcopy_count; __u64 wpcopy_delay_total; + __u64 wpcopy_delay_max; + __u64 wpcopy_delay_min; /* v14: Delay waiting for IRQ/SOFTIRQ */ __u64 irq_count; __u64 irq_delay_total; + __u64 irq_delay_max; + __u64 irq_delay_min; + /* v15: add Delay max */ }; diff --git a/init/Kconfig b/init/Kconfig index a20e6efd3f0f..c1f9eb3d5f2e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -894,6 +894,9 @@ config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH config CC_HAS_INT128 def_bool !$(cc-option,$(m64-flag) -D__SIZEOF_INT128__=0) && 64BIT +config CC_HAS_TYPEOF_UNQUAL + def_bool $(success,echo 'int foo (int a) { __typeof_unqual__(a) b = a; return b; }' | $(CC) -x c - -S -o /dev/null) + config CC_IMPLICIT_FALLTHROUGH string default "-Wimplicit-fallthrough=5" if CC_IS_GCC && $(cc-option,-Wimplicit-fallthrough=5) diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 22c7f41ff642..f86ef92a6c46 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -89,7 +89,7 @@ static void __init handle_initrd(char *root_device_name) extern char *envp_init[]; int error; - pr_warn("using deprecated initrd support, will be removed in 2021.\n"); + pr_warn("using deprecated initrd support, will be removed soon.\n"); real_root_dev = new_encode_dev(ROOT_DEV); create_dev("/dev/root.old", Root_RAM0); diff --git a/init/main.c b/init/main.c index 00fac1170294..4bae539ebc05 100644 --- a/init/main.c +++ b/init/main.c @@ -640,15 +640,11 @@ static void __init setup_command_line(char *command_line) len = xlen + strlen(boot_command_line) + ilen + 1; - saved_command_line = memblock_alloc(len, SMP_CACHE_BYTES); - if (!saved_command_line) - panic("%s: Failed to allocate %zu bytes\n", __func__, len); + saved_command_line = memblock_alloc_or_panic(len, SMP_CACHE_BYTES); len = xlen + strlen(command_line) + 1; - static_command_line = memblock_alloc(len, SMP_CACHE_BYTES); - if (!static_command_line) - panic("%s: Failed to allocate %zu bytes\n", __func__, len); + static_command_line = memblock_alloc_or_panic(len, SMP_CACHE_BYTES); if (xlen) { /* @@ -1145,16 +1141,10 @@ static int __init initcall_blacklist(char *str) str_entry = strsep(&str, ","); if (str_entry) { pr_debug("blacklisting initcall %s\n", str_entry); - entry = memblock_alloc(sizeof(*entry), + entry = memblock_alloc_or_panic(sizeof(*entry), SMP_CACHE_BYTES); - if (!entry) - panic("%s: Failed to allocate %zu bytes\n", - __func__, sizeof(*entry)); - entry->buf = memblock_alloc(strlen(str_entry) + 1, + entry->buf = memblock_alloc_or_panic(strlen(str_entry) + 1, SMP_CACHE_BYTES); - if (!entry->buf) - panic("%s: Failed to allocate %zu bytes\n", - __func__, strlen(str_entry) + 1); strcpy(entry->buf, str_entry); list_add(&entry->next, &blacklisted_initcalls); } diff --git a/ipc/util.c b/ipc/util.c index 05cb9de66735..cae60f11d9c2 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -615,12 +615,11 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) } /** - * ipc_obtain_object_idr + * ipc_obtain_object_idr - Look for an id in the ipc ids idr and + * return associated ipc object. * @ids: ipc identifier set * @id: ipc id to look for * - * Look for an id in the ipc ids idr and return associated ipc object. 
- * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ @@ -637,13 +636,11 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) } /** - * ipc_obtain_object_check + * ipc_obtain_object_check - Similar to ipc_obtain_object_idr() but + * also checks the ipc object sequence number. * @ids: ipc identifier set * @id: ipc id to look for * - * Similar to ipc_obtain_object_idr() but also checks the ipc object - * sequence number. - * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 945a5680f6a5..9927cd4c9e0e 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -443,7 +443,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } - /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */ + /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */ pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); if (!pages) return 0; diff --git a/kernel/capability.c b/kernel/capability.c index dac4df77e376..e089d2628c29 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -38,10 +38,8 @@ __setup("no_file_caps", file_caps_disable); static void warn_legacy_capability_use(void) { - char name[sizeof(current->comm)]; - pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", - get_task_comm(name, current)); + current->comm); } /* @@ -62,10 +60,8 @@ static void warn_legacy_capability_use(void) static void warn_deprecated_v2(void) { - char name[sizeof(current->comm)]; - pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", - get_task_comm(name, current)); + current->comm); } /* diff --git a/kernel/cpu.c b/kernel/cpu.c index b605334f8ee6..7f3bf759cbdf 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -905,12 +905,13 @@ static int finish_cpu(unsigned int cpu) struct mm_struct *mm = idle->active_mm; /* - * idle_task_exit() will have switched to &init_mm, now - * clean up any remaining active_mm state. + * sched_force_init_mm() ensured the use of &init_mm, + * drop that refcount now that the CPU has stopped. 
*/ - if (mm != &init_mm) - idle->active_mm = &init_mm; + WARN_ON(mm != &init_mm); + idle->active_mm = NULL; mmdrop_lazy_tlb(mm); + return 0; } diff --git a/kernel/delayacct.c b/kernel/delayacct.c index dead51de8eb5..b238eb8c6573 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -93,9 +93,9 @@ void __delayacct_tsk_init(struct task_struct *tsk) /* * Finish delay accounting for a statistic using its timestamps (@start), - * accumalator (@total) and @count + * accumulator (@total) and @count */ -static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count) +static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min) { s64 ns = local_clock() - *start; unsigned long flags; @@ -104,6 +104,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou raw_spin_lock_irqsave(lock, flags); *total += ns; (*count)++; + if (ns > *max) + *max = ns; + if (*min == 0 || ns < *min) + *min = ns; raw_spin_unlock_irqrestore(lock, flags); } } @@ -122,7 +126,9 @@ void __delayacct_blkio_end(struct task_struct *p) delayacct_end(&p->delays->lock, &p->delays->blkio_start, &p->delays->blkio_delay, - &p->delays->blkio_count); + &p->delays->blkio_count, + &p->delays->blkio_delay_max, + &p->delays->blkio_delay_min); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -153,10 +159,12 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_count += t1; + d->cpu_delay_max = tsk->sched_info.max_run_delay; + d->cpu_delay_min = tsk->sched_info.min_run_delay; tmp = (s64)d->cpu_delay_total + t2; d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; - tmp = (s64)d->cpu_run_virtual_total + t3; + d->cpu_run_virtual_total = (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; @@ -164,20 +172,33 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) return 0; /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ - raw_spin_lock_irqsave(&tsk->delays->lock, flags); + d->blkio_delay_max = tsk->delays->blkio_delay_max; + d->blkio_delay_min = tsk->delays->blkio_delay_min; tmp = d->blkio_delay_total + tsk->delays->blkio_delay; d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; + d->swapin_delay_max = tsk->delays->swapin_delay_max; + d->swapin_delay_min = tsk->delays->swapin_delay_min; tmp = d->swapin_delay_total + tsk->delays->swapin_delay; d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; + d->freepages_delay_max = tsk->delays->freepages_delay_max; + d->freepages_delay_min = tsk->delays->freepages_delay_min; tmp = d->freepages_delay_total + tsk->delays->freepages_delay; d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; + d->thrashing_delay_max = tsk->delays->thrashing_delay_max; + d->thrashing_delay_min = tsk->delays->thrashing_delay_min; tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; + d->compact_delay_max = tsk->delays->compact_delay_max; + d->compact_delay_min = tsk->delays->compact_delay_min; tmp = d->compact_delay_total + tsk->delays->compact_delay; d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; + d->wpcopy_delay_max = tsk->delays->wpcopy_delay_max; + d->wpcopy_delay_min = tsk->delays->wpcopy_delay_min; tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 
0 : tmp; + d->irq_delay_max = tsk->delays->irq_delay_max; + d->irq_delay_min = tsk->delays->irq_delay_min; tmp = d->irq_delay_total + tsk->delays->irq_delay; d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; @@ -213,7 +234,9 @@ void __delayacct_freepages_end(void) delayacct_end(¤t->delays->lock, ¤t->delays->freepages_start, ¤t->delays->freepages_delay, - ¤t->delays->freepages_count); + ¤t->delays->freepages_count, + ¤t->delays->freepages_delay_max, + ¤t->delays->freepages_delay_min); } void __delayacct_thrashing_start(bool *in_thrashing) @@ -235,7 +258,9 @@ void __delayacct_thrashing_end(bool *in_thrashing) delayacct_end(¤t->delays->lock, ¤t->delays->thrashing_start, ¤t->delays->thrashing_delay, - ¤t->delays->thrashing_count); + ¤t->delays->thrashing_count, + ¤t->delays->thrashing_delay_max, + ¤t->delays->thrashing_delay_min); } void __delayacct_swapin_start(void) @@ -248,7 +273,9 @@ void __delayacct_swapin_end(void) delayacct_end(¤t->delays->lock, ¤t->delays->swapin_start, ¤t->delays->swapin_delay, - ¤t->delays->swapin_count); + ¤t->delays->swapin_count, + ¤t->delays->swapin_delay_max, + ¤t->delays->swapin_delay_min); } void __delayacct_compact_start(void) @@ -261,7 +288,9 @@ void __delayacct_compact_end(void) delayacct_end(¤t->delays->lock, ¤t->delays->compact_start, ¤t->delays->compact_delay, - ¤t->delays->compact_count); + ¤t->delays->compact_count, + ¤t->delays->compact_delay_max, + ¤t->delays->compact_delay_min); } void __delayacct_wpcopy_start(void) @@ -274,7 +303,9 @@ void __delayacct_wpcopy_end(void) delayacct_end(¤t->delays->lock, ¤t->delays->wpcopy_start, ¤t->delays->wpcopy_delay, - ¤t->delays->wpcopy_count); + ¤t->delays->wpcopy_count, + ¤t->delays->wpcopy_delay_max, + ¤t->delays->wpcopy_delay_min); } void __delayacct_irq(struct task_struct *task, u32 delta) @@ -284,6 +315,10 @@ void __delayacct_irq(struct task_struct *task, u32 delta) raw_spin_lock_irqsave(&task->delays->lock, flags); task->delays->irq_delay += delta; task->delays->irq_count++; + if (delta > task->delays->irq_delay_max) + task->delays->irq_delay_max = delta; + if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min)) + task->delays->irq_delay_min = delta; raw_spin_unlock_irqrestore(&task->delays->lock, flags); } diff --git a/kernel/fork.c b/kernel/fork.c index 9b301180fd41..5a3dab459d8a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -#ifdef CONFIG_PER_VMA_LOCK - -/* SLAB cache for vm_area_struct.lock */ -static struct kmem_cache *vma_lock_cachep; - -static bool vma_lock_alloc(struct vm_area_struct *vma) -{ - vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); - if (!vma->vm_lock) - return false; - - init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; - - return true; -} - -static inline void vma_lock_free(struct vm_area_struct *vma) -{ - kmem_cache_free(vma_lock_cachep, vma->vm_lock); -} - -#else /* CONFIG_PER_VMA_LOCK */ - -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } -static inline void vma_lock_free(struct vm_area_struct *vma) {} - -#endif /* CONFIG_PER_VMA_LOCK */ - struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -474,14 +445,46 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) return NULL; vma_init(vma, mm); - if (!vma_lock_alloc(vma)) { - kmem_cache_free(vm_area_cachep, 
vma); - return NULL; - } return vma; } +static void vm_area_init_from(const struct vm_area_struct *src, + struct vm_area_struct *dest) +{ + dest->vm_mm = src->vm_mm; + dest->vm_ops = src->vm_ops; + dest->vm_start = src->vm_start; + dest->vm_end = src->vm_end; + dest->anon_vma = src->anon_vma; + dest->vm_pgoff = src->vm_pgoff; + dest->vm_file = src->vm_file; + dest->vm_private_data = src->vm_private_data; + vm_flags_init(dest, src->vm_flags); + memcpy(&dest->vm_page_prot, &src->vm_page_prot, + sizeof(dest->vm_page_prot)); + /* + * src->shared.rb may be modified concurrently when called from + * dup_mmap(), but the clone will reinitialize it. + */ + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, + sizeof(dest->vm_userfaultfd_ctx)); +#ifdef CONFIG_ANON_VMA_NAME + dest->anon_name = src->anon_name; +#endif +#ifdef CONFIG_SWAP + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, + sizeof(dest->swap_readahead_info)); +#endif +#ifndef CONFIG_MMU + dest->vm_region = src->vm_region; +#endif +#ifdef CONFIG_NUMA + dest->vm_policy = src->vm_policy; +#endif +} + struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); @@ -491,15 +494,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. - */ - data_race(memcpy(new, orig, sizeof(*new))); - if (!vma_lock_alloc(new)) { - kmem_cache_free(vm_area_cachep, new); - return NULL; - } + vm_area_init_from(orig, new); + vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); dup_anon_vma_name(orig, new); @@ -507,33 +503,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return new; } -void __vm_area_free(struct vm_area_struct *vma) -{ - vma_numab_state_free(vma); - free_anon_vma_name(vma); - vma_lock_free(vma); - kmem_cache_free(vm_area_cachep, vma); -} - -#ifdef CONFIG_PER_VMA_LOCK -static void vm_area_free_rcu_cb(struct rcu_head *head) -{ - struct vm_area_struct *vma = container_of(head, struct vm_area_struct, - vm_rcu); - - /* The vma should not be locked while being destroyed. */ - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); - __vm_area_free(vma); -} -#endif - void vm_area_free(struct vm_area_struct *vma) { -#ifdef CONFIG_PER_VMA_LOCK - call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); -#else - __vm_area_free(vma); -#endif + /* The vma should be detached while being destroyed. 
*/ + vma_assert_detached(vma); + vma_numab_state_free(vma); + free_anon_vma_name(vma); + kmem_cache_free(vm_area_cachep, vma); } static void account_kernel_stack(struct task_struct *tsk, int account) @@ -1252,6 +1228,15 @@ static void mm_init_uprobes_state(struct mm_struct *mm) #endif } +static inline void mmap_init_lock(struct mm_struct *mm) +{ + init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); +#ifdef CONFIG_PER_VMA_LOCK + rcuwait_init(&mm->vma_writer_wait); +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -1262,9 +1247,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); -#ifdef CONFIG_PER_VMA_LOCK - mm->mm_lock_seq = 0; -#endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; @@ -1514,12 +1496,13 @@ struct file *get_task_exe_file(struct task_struct *task) struct file *exe_file = NULL; struct mm_struct *mm; + if (task->flags & PF_KTHREAD) + return NULL; + task_lock(task); mm = task->mm; - if (mm) { - if (!(task->flags & PF_KTHREAD)) - exe_file = get_mm_exe_file(mm); - } + if (mm) + exe_file = get_mm_exe_file(mm); task_unlock(task); return exe_file; } @@ -3175,6 +3158,11 @@ void __init mm_cache_init(void) void __init proc_caches_init(void) { + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + }; + sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3191,11 +3179,10 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); -#ifdef CONFIG_PER_VMA_LOCK - vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); -#endif + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), &args, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| + SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 3a10375d9521..eb86a7ade06a 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -210,13 +210,12 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { if (oparg < 0 || oparg > 31) { - char comm[sizeof(current->comm)]; /* * kill this print and return -EINVAL when userspace * is sane again */ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", - get_task_comm(comm, current), oparg); + current->comm, oparg); oparg &= 31; } oparg = 1 << oparg; diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index 7670a811a565..8b888a6193cc 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -264,10 +264,10 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) /** * gcov_info_add - add up profiling data - * @dest: profiling data set to which data is added - * @source: profiling data set which is added + * @dst: profiling data set to which data is added + * @src: profiling data set which is added * - * Adds profiling counts of @source to @dest. + * Adds profiling counts of @src to @dst. 
*/ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) { diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c18717189f32..953169893a95 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -147,6 +147,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); + if (t->flags & PF_POSTCOREDUMP) + pr_err(" Blocked by coredump.\n"); pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 2f4fb336dda1..73f7e1fd4ab4 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -147,7 +147,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (!irq_work_claim(work)) return false; - kasan_record_aux_stack_noalloc(work); + kasan_record_aux_stack(work); preempt_disable(); if (cpu != smp_processor_id()) { diff --git a/kernel/kthread.c b/kernel/kthread.c index a5ac612b1609..f847be88bf07 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -900,7 +900,7 @@ fail_task: } /** - * kthread_create_worker - create a kthread worker + * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * @@ -1015,7 +1015,7 @@ static void kthread_insert_work(struct kthread_worker *worker, * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task - * must have been created with kthread_worker_create(). Returns %true + * must have been created with kthread_create_worker(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 7a75eab9c179..77ee3ea8a573 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -158,9 +158,9 @@ account_global_scheduler_latency(struct task_struct *tsk, /** * __account_scheduler_latency - record an occurred latency - * @tsk - the task struct of the task hitting the latency - * @usecs - the duration of the latency in microseconds - * @inter - 1 if the sleep was interruptible, 0 if uninterruptible + * @tsk: the task struct of the task hitting the latency + * @usecs: the duration of the latency in microseconds + * @inter: 1 if the sleep was interruptible, 0 if uninterruptible * * This function is the main entry point for recording latency entries * as called by the scheduler. 
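The kthread_create_worker_on_node(), gcov_info_add() and __account_scheduler_latency() hunks above are all repairing kernel-doc headers so the documented names match the real parameters and use the "@name: description" form that scripts/kernel-doc parses. A minimal, purely illustrative example of the expected shape (sketch_area() is hypothetical, not kernel code):

/**
 * sketch_area - compute the area of a rectangle (illustration only)
 * @width: horizontal size
 * @height: vertical size
 *
 * Return: the product of @width and @height.
 */
static inline unsigned long sketch_area(unsigned long width, unsigned long height)
{
	return width * height;
}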
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 6083883c4fe0..d6964fc29f51 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ - typeof(var) __sum = 0; \ + TYPEOF_UNQUAL(var) __sum = 0; \ int cpu; \ compiletime_assert_atomic_type(__sum); \ for_each_possible_cpu(cpu) \ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 30894d8f0a78..c9fb559a6399 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1011,11 +1011,8 @@ void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pf } } /* This allocation cannot fail */ - region = memblock_alloc(sizeof(struct nosave_region), + region = memblock_alloc_or_panic(sizeof(struct nosave_region), SMP_CACHE_BYTES); - if (!region) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index b3b3ce34df63..4b3f31911465 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -250,7 +250,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); void kvfree_call_rcu(struct rcu_head *head, void *ptr) { if (head) - kasan_record_aux_stack_noalloc(ptr); + kasan_record_aux_stack(ptr); __kvfree_call_rcu(head, ptr); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ff98233d4aa5..3885aae5f9cb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3083,7 +3083,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) } head->func = func; head->next = NULL; - kasan_record_aux_stack_noalloc(head); + kasan_record_aux_stack(head); local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); lazy = lazy_in && !rcu_async_should_hurry(); @@ -3817,7 +3817,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) return; } - kasan_record_aux_stack_noalloc(ptr); + kasan_record_aux_stack(ptr); success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); if (!success) { run_page_cache_worker(krcp); diff --git a/kernel/resource.c b/kernel/resource.c index b7c0e24d9398..12004452d999 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1683,8 +1683,7 @@ void __devm_release_region(struct device *dev, struct resource *parent, { struct region_devres match_data = { parent, start, n }; - __release_region(parent, start, n); - WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, + WARN_ON(devres_release(dev, devm_region_release, devm_region_match, &match_data)); } EXPORT_SYMBOL(__devm_release_region); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e5a6bf587f9..aeeeb68e6809 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7701,9 +7701,9 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n", + pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d task_flags:0x%04x flags:0x%08lx\n", free, task_pid_nr(p), task_tgid_nr(p), - ppid, read_task_thread_flags(p)); + ppid, p->flags, read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); @@ -7930,19 +7930,26 @@ void sched_setnuma(struct task_struct *p, int nid) #ifdef CONFIG_HOTPLUG_CPU /* - * Ensure that the idle task is using init_mm right before its CPU goes - * offline. 
+ * Invoked on the outgoing CPU in context of the CPU hotplug thread + * after ensuring that there are no user space tasks left on the CPU. + * + * If there is a lazy mm in use on the hotplug thread, drop it and + * switch to init_mm. + * + * The reference count on init_mm is dropped in finish_cpu(). */ -void idle_task_exit(void) +static void sched_force_init_mm(void) { struct mm_struct *mm = current->active_mm; - BUG_ON(cpu_online(smp_processor_id())); - BUG_ON(current != this_rq()->idle); - if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + mmgrab_lazy_tlb(&init_mm); + local_irq_disable(); + current->active_mm = &init_mm; + switch_mm_irqs_off(mm, &init_mm, current); + local_irq_enable(); finish_arch_post_lock_switch(); + mmdrop_lazy_tlb(mm); } /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ @@ -8344,6 +8351,7 @@ int sched_cpu_starting(unsigned int cpu) int sched_cpu_wait_empty(unsigned int cpu) { balance_hotplug_wait(); + sched_force_init_mm(); return 0; } @@ -10590,7 +10598,7 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; /* No page allocation under rq lock */ - task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); + task_work_add(curr, work, TWA_RESUME); } void sched_mm_cid_exit_signals(struct task_struct *t) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8ee0add5a48a..693537b908a1 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -244,7 +244,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) delta = rq_clock(rq) - t->sched_info.last_queued; t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; - + if (delta > t->sched_info.max_run_delay) + t->sched_info.max_run_delay = delta; + if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) + t->sched_info.min_run_delay = delta; rq_sched_info_dequeue(rq, delta); } @@ -266,6 +269,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; + if (delta > t->sched_info.max_run_delay) + t->sched_info.max_run_delay = delta; + if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) + t->sched_info.min_run_delay = delta; rq_sched_info_arrive(rq, delta); } diff --git a/kernel/task_work.c b/kernel/task_work.c index c969f1f26be5..d1efec571a4a 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -55,26 +55,14 @@ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; - int flags = notify & TWA_FLAGS; - notify &= ~TWA_FLAGS; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { - /* - * Record the work call stack in order to print it in KASAN - * reports. - * - * Note that stack allocation can fail if TWAF_NO_ALLOC flag - * is set and new page is needed to expand the stack buffer. 
- */ - if (flags & TWAF_NO_ALLOC) - kasan_record_aux_stack_noalloc(work); - else - kasan_record_aux_stack(work); + kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); diff --git a/kernel/ucount.c b/kernel/ucount.c index f950b5e59d63..86c5f1c0bad9 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -164,8 +164,8 @@ struct ucounts *get_ucounts(struct ucounts *ucounts) struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); - struct ucounts *ucounts, *new; bool wrapped; + struct ucounts *ucounts, *new = NULL; spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -182,17 +182,17 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); - if (ucounts) { - kfree(new); - } else { + if (!ucounts) { hlist_add_head(&new->node, hashent); get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); return new; } } + wrapped = !get_ucounts_or_wrap(ucounts); spin_unlock_irq(&ucounts_lock); + kfree(new); if (wrapped) { put_ucounts(ucounts); return NULL; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 41e0f7e9fa35..177abb7d0d4e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -190,7 +190,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) * with printk_cpu_sync_get_irqsave() that we can still at least * get the message about the lockup out. */ - pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu); + pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu); printk_cpu_sync_get_irqsave(flags); print_modules(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9362484a653c..56bd89a7fde4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2180,7 +2180,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack_noalloc(work); + kasan_record_aux_stack(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f3d723705879..d597930d381f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2269,7 +2269,6 @@ config TEST_LIST_SORT config TEST_MIN_HEAP tristate "Min heap test" depends on DEBUG_KERNEL || m - select MIN_HEAP help Enable this to turn on min heap function tests. This test is executed only once during system boot (so affects only boot time), @@ -2457,8 +2456,22 @@ config TEST_BITMAP config TEST_UUID tristate "Test functions located in the uuid module at runtime" -config TEST_XARRAY - tristate "Test the XArray code at runtime" +config XARRAY_KUNIT + tristate "KUnit test XArray code at runtime" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to test the Xarray code at boot. + + KUnit tests run during boot and output the results to the debug log + in TAP format (http://testanything.org/). Only useful for kernel devs + running the KUnit test harness, and not intended for inclusion into a + production build. + + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. 
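For orientation, switching TEST_XARRAY to XARRAY_KUNIT means lib/test_xarray.c is now driven by the KUnit framework described in the help text above, so its results appear in the boot log as TAP output rather than ad-hoc prints. A minimal suite of that shape looks roughly like the sketch below; the names are illustrative and deliberately not the real XArray cases:

#include <kunit/test.h>

static void sketch_add_test(struct kunit *test)
{
	/* KUNIT_EXPECT_* records failures in the TAP report mentioned above. */
	KUNIT_EXPECT_EQ(test, 2 + 2, 4);
}

static struct kunit_case sketch_test_cases[] = {
	KUNIT_CASE(sketch_add_test),
	{}
};

static struct kunit_suite sketch_test_suite = {
	.name = "sketch-example",
	.test_cases = sketch_test_cases,
};

kunit_test_suite(sketch_test_suite);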
config TEST_MAPLE_TREE tristate "Test the Maple Tree code at runtime or module load" @@ -3161,6 +3174,21 @@ config INT_POW_TEST If unsure, say N +config INT_SQRT_KUNIT_TEST + tristate "Integer square root test" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the int_sqrt() function, + which performs square root calculation. The test suite checks + various scenarios, including edge cases, to ensure correctness. + + Enabling this option will include tests that check various scenarios + and edge cases to ensure the accuracy and reliability of the square root + function. + + If unsure, say N + endif # RUNTIME_TESTING_MENU config ARCH_USE_MEMTEST diff --git a/lib/Makefile b/lib/Makefile index a8155c972f02..c0458ff841fe 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -94,7 +94,6 @@ GCOV_PROFILE_test_bitmap.o := n endif obj-$(CONFIG_TEST_UUID) += test_uuid.o -obj-$(CONFIG_TEST_XARRAY) += test_xarray.o obj-$(CONFIG_TEST_MAPLE_TREE) += test_maple_tree.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o obj-$(CONFIG_TEST_KMOD) += test_kmod.o @@ -375,6 +374,7 @@ CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o obj-$(CONFIG_UTIL_MACROS_KUNIT) += util_macros_kunit.o +obj-$(CONFIG_XARRAY_KUNIT) += test_xarray.o obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o obj-$(CONFIG_HASHTABLE_KUNIT_TEST) += hashtable_test.o obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 65e706e1bc19..19b45617bdcf 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -29,6 +29,8 @@ EXPORT_SYMBOL(_shared_alloc_tag); DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, mem_alloc_profiling_key); +EXPORT_SYMBOL(mem_alloc_profiling_key); + DEFINE_STATIC_KEY_FALSE(mem_profiling_compressed); struct alloc_tag_kernel_section kernel_tags = { NULL, 0 }; @@ -423,8 +425,8 @@ static int vm_module_tags_populate(void) unsigned long nr; more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT; - nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN, - NUMA_NO_NODE, more_pages, next_page); + nr = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN, + NUMA_NO_NODE, more_pages, next_page); if (nr < more_pages || vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL, next_page, PAGE_SHIFT) < 0) { diff --git a/lib/cpumask.c b/lib/cpumask.c index e77ee9d46f71..57274ba8b6d9 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -83,10 +83,7 @@ EXPORT_SYMBOL(alloc_cpumask_var_node); */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { - *mask = memblock_alloc(cpumask_size(), SMP_CACHE_BYTES); - if (!*mask) - panic("%s: Failed to allocate %u bytes\n", __func__, - cpumask_size()); + *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES); } /** diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 52eb6ba29698..999053fa133e 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include #include -#include +#include #include #include #include @@ -12,6 +12,24 @@ #include #include +/* + * The should_fail() functions use prandom instead of the normal Linux RNG + * since they don't need cryptographically secure random numbers. 
+ */ +static DEFINE_PER_CPU(struct rnd_state, fault_rnd_state); + +static u32 fault_prandom_u32_below_100(void) +{ + struct rnd_state *state; + u32 res; + + state = &get_cpu_var(fault_rnd_state); + res = prandom_u32_state(state); + put_cpu_var(fault_rnd_state); + + return res % 100; +} + /* * setup_fault_attr() is a helper function for various __setup handlers, so it * returns 0 on error, because that is what __setup handlers do. @@ -31,6 +49,8 @@ int setup_fault_attr(struct fault_attr *attr, char *str) return 0; } + prandom_init_once(&fault_rnd_state); + attr->probability = probability; attr->interval = interval; atomic_set(&attr->times, times); @@ -146,7 +166,7 @@ bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags) return false; } - if (attr->probability <= get_random_u32_below(100)) + if (attr->probability <= fault_prandom_u32_below_100()) return false; fail: @@ -219,6 +239,8 @@ struct dentry *fault_create_debugfs_attr(const char *name, if (IS_ERR(dir)) return dir; + prandom_init_once(&fault_rnd_state); + debugfs_create_ul("probability", mode, dir, &attr->probability); debugfs_create_ul("interval", mode, dir, &attr->interval); debugfs_create_atomic_t("times", mode, dir, &attr->times); @@ -431,6 +453,8 @@ static const struct config_item_type fault_config_type = { void fault_config_init(struct fault_config *config, const char *name) { + prandom_init_once(&fault_rnd_state); + config_group_init_type_name(&config->group, name, &fault_config_type); } EXPORT_SYMBOL_GPL(fault_config_init); diff --git a/lib/inflate.c b/lib/inflate.c index fbaf03c1748d..eab886baa1b4 100644 --- a/lib/inflate.c +++ b/lib/inflate.c @@ -1257,8 +1257,6 @@ static int INIT gunzip(void) /* Decompress */ if ((res = inflate())) { switch (res) { - case 0: - break; case 1: error("invalid compressed format (err=1)"); break; diff --git a/lib/kunit_iov_iter.c b/lib/kunit_iov_iter.c index 13e15687675a..27d3c7f00ede 100644 --- a/lib/kunit_iov_iter.c +++ b/lib/kunit_iov_iter.c @@ -57,15 +57,12 @@ static void *__init iov_kunit_create_buffer(struct kunit *test, KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pages); *ppages = pages; - got = alloc_pages_bulk_array(GFP_KERNEL, npages, pages); + got = alloc_pages_bulk(GFP_KERNEL, npages, pages); if (got != npages) { release_pages(pages, got); KUNIT_ASSERT_EQ(test, got, npages); } - for (int i = 0; i < npages; i++) - pages[i]->index = i; - buffer = vmap(pages, npages, VM_MAP | VM_MAP_PUT_PAGES, PAGE_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buffer); diff --git a/lib/list_debug.c b/lib/list_debug.c index db602417febf..ee7eeeb8f92c 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -22,17 +22,17 @@ __list_valid_slowpath bool __list_add_valid_or_report(struct list_head *new, struct list_head *prev, struct list_head *next) { - if (CHECK_DATA_CORRUPTION(prev == NULL, + if (CHECK_DATA_CORRUPTION(prev == NULL, NULL, "list_add corruption. prev is NULL.\n") || - CHECK_DATA_CORRUPTION(next == NULL, + CHECK_DATA_CORRUPTION(next == NULL, NULL, "list_add corruption. next is NULL.\n") || - CHECK_DATA_CORRUPTION(next->prev != prev, + CHECK_DATA_CORRUPTION(next->prev != prev, next, "list_add corruption. next->prev should be prev (%px), but was %px. (next=%px).\n", prev, next->prev, next) || - CHECK_DATA_CORRUPTION(prev->next != next, + CHECK_DATA_CORRUPTION(prev->next != next, prev, "list_add corruption. prev->next should be next (%px), but was %px. 
(prev=%px).\n", next, prev->next, prev) || - CHECK_DATA_CORRUPTION(new == prev || new == next, + CHECK_DATA_CORRUPTION(new == prev || new == next, NULL, "list_add double add: new=%px, prev=%px, next=%px.\n", new, prev, next)) return false; @@ -49,20 +49,20 @@ bool __list_del_entry_valid_or_report(struct list_head *entry) prev = entry->prev; next = entry->next; - if (CHECK_DATA_CORRUPTION(next == NULL, + if (CHECK_DATA_CORRUPTION(next == NULL, NULL, "list_del corruption, %px->next is NULL\n", entry) || - CHECK_DATA_CORRUPTION(prev == NULL, + CHECK_DATA_CORRUPTION(prev == NULL, NULL, "list_del corruption, %px->prev is NULL\n", entry) || - CHECK_DATA_CORRUPTION(next == LIST_POISON1, + CHECK_DATA_CORRUPTION(next == LIST_POISON1, next, "list_del corruption, %px->next is LIST_POISON1 (%px)\n", entry, LIST_POISON1) || - CHECK_DATA_CORRUPTION(prev == LIST_POISON2, + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, prev, "list_del corruption, %px->prev is LIST_POISON2 (%px)\n", entry, LIST_POISON2) || - CHECK_DATA_CORRUPTION(prev->next != entry, + CHECK_DATA_CORRUPTION(prev->next != entry, prev, "list_del corruption. prev->next should be %px, but was %px. (prev=%px)\n", entry, prev->next, prev) || - CHECK_DATA_CORRUPTION(next->prev != entry, + CHECK_DATA_CORRUPTION(next->prev != entry, next, "list_del corruption. next->prev should be %px, but was %px. (next=%px)\n", entry, next->prev, next)) return false; diff --git a/lib/list_sort.c b/lib/list_sort.c index 8d3f623536fe..a310ecb7ccc0 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -108,6 +108,13 @@ static void merge_final(void *priv, list_cmp_func_t cmp, struct list_head *head, * and list_sort is a stable sort, so it is not necessary to distinguish * the @a < @b and @a == @b cases. * + * The comparison function must adhere to specific mathematical properties + * to ensure correct and stable sorting: + * - Antisymmetry: cmp(@a, @b) must return the opposite sign of + * cmp(@b, @a). + * - Transitivity: if cmp(@a, @b) <= 0 and cmp(@b, @c) <= 0, then + * cmp(@a, @c) <= 0. + * * This is compatible with two styles of @cmp function: * - The traditional style which returns <0 / =0 / >0, or * - Returning a boolean 0/1. diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 047397136f15..f7153ade1be5 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1863,11 +1863,11 @@ static inline int mab_no_null_split(struct maple_big_node *b_node, * Return: The first split location. The middle split is set in @mid_split. */ static inline int mab_calc_split(struct ma_state *mas, - struct maple_big_node *bn, unsigned char *mid_split, unsigned long min) + struct maple_big_node *bn, unsigned char *mid_split) { unsigned char b_end = bn->b_end; int split = b_end / 2; /* Assume equal split. */ - unsigned char slot_min, slot_count = mt_slots[bn->type]; + unsigned char slot_count = mt_slots[bn->type]; /* * To support gap tracking, all NULL entries are kept together and a node cannot @@ -1900,18 +1900,7 @@ static inline int mab_calc_split(struct ma_state *mas, split = b_end / 3; *mid_split = split * 2; } else { - slot_min = mt_min_slots[bn->type]; - *mid_split = 0; - /* - * Avoid having a range less than the slot count unless it - * causes one node to be deficient. - * NOTE: mt_min_slots is 1 based, b_end and split are zero. 
- */ - while ((split < slot_count - 1) && - ((bn->pivot[split] - min) < slot_count - 1) && - (b_end - split > slot_min)) - split++; } /* Avoid ending a node on a NULL entry */ @@ -2377,7 +2366,7 @@ static inline struct maple_enode static inline unsigned char mas_mab_to_node(struct ma_state *mas, struct maple_big_node *b_node, struct maple_enode **left, struct maple_enode **right, struct maple_enode **middle, - unsigned char *mid_split, unsigned long min) + unsigned char *mid_split) { unsigned char split = 0; unsigned char slot_count = mt_slots[b_node->type]; @@ -2390,7 +2379,7 @@ static inline unsigned char mas_mab_to_node(struct ma_state *mas, if (b_node->b_end < slot_count) { split = b_node->b_end; } else { - split = mab_calc_split(mas, b_node, mid_split, min); + split = mab_calc_split(mas, b_node, mid_split); *right = mas_new_ma_node(mas, b_node); } @@ -2877,7 +2866,7 @@ static void mas_spanning_rebalance(struct ma_state *mas, mast->bn->b_end--; mast->bn->type = mte_node_type(mast->orig_l->node); split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle, - &mid_split, mast->orig_l->min); + &mid_split); mast_set_split_parents(mast, left, middle, right, split, mid_split); mast_cp_to_nodes(mast, left, middle, right, split, mid_split); @@ -3365,7 +3354,7 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) if (mas_push_data(mas, height, &mast, false)) break; - split = mab_calc_split(mas, b_node, &mid_split, prev_l_mas.min); + split = mab_calc_split(mas, b_node, &mid_split); mast_split_data(&mast, mas, split); /* * Usually correct, mab_mas_cp in the above call overwrites @@ -4745,29 +4734,6 @@ again: return entry; } -/* - * mas_next_entry() - Internal function to get the next entry. - * @mas: The maple state - * @limit: The maximum range start. - * - * Set the @mas->node to the next entry and the range_start to - * the beginning value for the entry. Does not check beyond @limit. - * Sets @mas->index and @mas->last to the range, Does not update @mas->index and - * @mas->last on overflow. - * Restarts on dead nodes. - * - * Return: the next entry or %NULL. - */ -static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) -{ - if (mas->last >= limit) { - mas->status = ma_overflow; - return NULL; - } - - return mas_next_slot(mas, limit, false); -} - /* * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the * highest gap address of a given size in a given node and descend. @@ -4903,15 +4869,14 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) if (gap >= size) { if (ma_is_leaf(type)) { found = true; - goto done; - } - if (mas->index <= pivot) { - mas->node = mas_slot(mas, slots, offset); - mas->min = min; - mas->max = pivot; - offset = 0; break; } + + mas->node = mas_slot(mas, slots, offset); + mas->min = min; + mas->max = pivot; + offset = 0; + break; } next_slot: min = pivot + 1; @@ -4921,9 +4886,6 @@ next_slot: } } - if (mte_is_root(mas->node)) - found = true; -done: mas->offset = offset; return found; } @@ -5027,8 +4989,8 @@ static inline void mas_awalk(struct ma_state *mas, unsigned long size) * There are 4 options: * go to child (descend) * go back to parent (ascend) - * no gap found. (return, slot == MAPLE_NODE_SLOTS) - * found the gap. (return, slot != MAPLE_NODE_SLOTS) + * no gap found. (return, error == -EBUSY) + * found the gap. 
(return) */ while (!mas_is_err(mas) && !mas_anode_descend(mas, size)) { if (last == mas->node) @@ -5113,9 +5075,6 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, return xa_err(mas->node); offset = mas->offset; - if (unlikely(offset == MAPLE_NODE_SLOTS)) - return -EBUSY; - node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); @@ -6938,7 +6897,7 @@ retry: goto unlock; while (mas_is_active(&mas) && (mas.last < max)) { - entry = mas_next_entry(&mas, max); + entry = mas_next_slot(&mas, max, false); if (likely(entry && !xa_is_zero(entry))) break; } @@ -7597,7 +7556,7 @@ void mt_validate(struct maple_tree *mt) MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && - (mas.max != ULONG_MAX))) { + (!mte_is_root(mas.node)))) { pr_err("Invalid size %u of " PTR_FMT "\n", end, mas_mn(&mas)); } diff --git a/lib/math/Makefile b/lib/math/Makefile index 3ef11305f8d2..853f023ae537 100644 --- a/lib/math/Makefile +++ b/lib/math/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_INT_POW_TEST) += tests/int_pow_kunit.o obj-$(CONFIG_TEST_DIV64) += test_div64.o obj-$(CONFIG_TEST_MULDIV64) += test_mul_u64_u64_div_u64.o obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o +obj-$(CONFIG_INT_SQRT_KUNIT_TEST) += tests/int_sqrt_kunit.o \ No newline at end of file diff --git a/lib/math/tests/Makefile b/lib/math/tests/Makefile index 6a169123320a..e1a79f093b2d 100644 --- a/lib/math/tests/Makefile +++ b/lib/math/tests/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_INT_POW_TEST) += int_pow_kunit.o +obj-$(CONFIG_INT_SQRT_KUNIT_TEST) += int_sqrt_kunit.o diff --git a/lib/math/tests/int_sqrt_kunit.c b/lib/math/tests/int_sqrt_kunit.c new file mode 100644 index 000000000000..1798e1312eb7 --- /dev/null +++ b/lib/math/tests/int_sqrt_kunit.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +struct test_case_params { + unsigned long x; + unsigned long expected_result; + const char *name; +}; + +static const struct test_case_params params[] = { + { 0, 0, "edge case: square root of 0" }, + { 1, 1, "perfect square: square root of 1" }, + { 2, 1, "non-perfect square: square root of 2" }, + { 3, 1, "non-perfect square: square root of 3" }, + { 4, 2, "perfect square: square root of 4" }, + { 5, 2, "non-perfect square: square root of 5" }, + { 6, 2, "non-perfect square: square root of 6" }, + { 7, 2, "non-perfect square: square root of 7" }, + { 8, 2, "non-perfect square: square root of 8" }, + { 9, 3, "perfect square: square root of 9" }, + { 15, 3, "non-perfect square: square root of 15 (N-1 from 16)" }, + { 16, 4, "perfect square: square root of 16" }, + { 17, 4, "non-perfect square: square root of 17 (N+1 from 16)" }, + { 80, 8, "non-perfect square: square root of 80 (N-1 from 81)" }, + { 81, 9, "perfect square: square root of 81" }, + { 82, 9, "non-perfect square: square root of 82 (N+1 from 81)" }, + { 255, 15, "non-perfect square: square root of 255 (N-1 from 256)" }, + { 256, 16, "perfect square: square root of 256" }, + { 257, 16, "non-perfect square: square root of 257 (N+1 from 256)" }, + { 2147483648, 46340, "large input: square root of 2147483648" }, + { 4294967295, 65535, "edge case: ULONG_MAX for 32-bit" }, +}; + +static void get_desc(const struct test_case_params *tc, char *desc) +{ + strscpy(desc, tc->name, KUNIT_PARAM_DESC_SIZE); +} + +KUNIT_ARRAY_PARAM(int_sqrt, params, get_desc); + +static void int_sqrt_test(struct kunit 
*test) +{ + const struct test_case_params *tc = (const struct test_case_params *)test->param_value; + + KUNIT_EXPECT_EQ(test, tc->expected_result, int_sqrt(tc->x)); +} + +static struct kunit_case math_int_sqrt_test_cases[] = { + KUNIT_CASE_PARAM(int_sqrt_test, int_sqrt_gen_params), + {} +}; + +static struct kunit_suite int_sqrt_test_suite = { + .name = "math-int_sqrt", + .test_cases = math_int_sqrt_test_cases, +}; + +kunit_test_suites(&int_sqrt_test_suite); + +MODULE_DESCRIPTION("math.int_sqrt KUnit test suite"); +MODULE_LICENSE("GPL"); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 6c902639728b..0682c9a8de82 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -665,7 +665,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow); * structure outside the hash table. * * This function may be called from any process context, including - * non-preemptable context, but cannot be called from softirq or + * non-preemptible context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. diff --git a/lib/sort.c b/lib/sort.c index 048b7a6ef967..8e73dc55476b 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -200,6 +200,13 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size) * copy (e.g. fix up pointers or auxiliary data), but the built-in swap * avoids a slow retpoline and so is significantly faster. * + * The comparison function must adhere to specific mathematical + * properties to ensure correct and stable sorting: + * - Antisymmetry: cmp_func(a, b) must return the opposite sign of + * cmp_func(b, a). + * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then + * cmp_func(a, c) <= 0. + * * Sorting time is O(n log n) both on average and worst-case. While * quicksort is slightly faster on average, it suffers from exploitable * O(n*n) worst-case behavior and extra memory requirements that make diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 704cb1093ae8..13e2a10d7554 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1563,6 +1563,30 @@ static noinline void __init check_root_expand(struct maple_tree *mt) mas_unlock(&mas); } +static noinline void __init check_deficient_node(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 0); + int count; + + mas_lock(&mas); + for (count = 0; count < 10; count++) { + mas_set(&mas, count); + mas_store_gfp(&mas, xa_mk_value(count), GFP_KERNEL); + } + + for (count = 20; count < 39; count++) { + mas_set(&mas, count); + mas_store_gfp(&mas, xa_mk_value(count), GFP_KERNEL); + } + + for (count = 10; count < 12; count++) { + mas_set(&mas, count); + mas_store_gfp(&mas, xa_mk_value(count), GFP_KERNEL); + } + mas_unlock(&mas); + mt_validate(mt); +} + static noinline void __init check_gap_combining(struct maple_tree *mt) { struct maple_enode *mn1, *mn2; @@ -3714,6 +3738,34 @@ static noinline void __init alloc_cyclic_testing(struct maple_tree *mt) } mtree_destroy(mt); + + /* + * Issue with reverse search was discovered + * https://lore.kernel.org/all/20241216060600.287B4C4CED0@smtp.kernel.org/ + * Exhausting the allocation area and forcing the search to wrap needs a + * mas_reset() in mas_alloc_cyclic(). 
+ */ + next = 0; + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (int i = 0; i < 1023; i++) { + mtree_alloc_cyclic(mt, &location, mt, 2, 1024, &next, GFP_KERNEL); + MT_BUG_ON(mt, i != location - 2); + MT_BUG_ON(mt, i != next - 3); + MT_BUG_ON(mt, mtree_load(mt, location) != mt); + } + mtree_erase(mt, 123); + MT_BUG_ON(mt, mtree_load(mt, 123) != NULL); + mtree_alloc_cyclic(mt, &location, mt, 2, 1024, &next, GFP_KERNEL); + MT_BUG_ON(mt, 123 != location); + MT_BUG_ON(mt, 124 != next); + MT_BUG_ON(mt, mtree_load(mt, location) != mt); + mtree_erase(mt, 100); + mtree_alloc_cyclic(mt, &location, mt, 2, 1024, &next, GFP_KERNEL); + MT_BUG_ON(mt, 100 != location); + MT_BUG_ON(mt, 101 != next); + MT_BUG_ON(mt, mtree_load(mt, location) != mt); + mtree_destroy(mt); + /* Overflow test */ next = ULONG_MAX - 1; ret = mtree_alloc_cyclic(mt, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); @@ -3796,6 +3848,10 @@ static int __init maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_deficient_node(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_store_null(&tree); mtree_destroy(&tree); diff --git a/lib/test_min_heap.c b/lib/test_min_heap.c index e6fbb798558b..a9c4a74d3898 100644 --- a/lib/test_min_heap.c +++ b/lib/test_min_heap.c @@ -32,7 +32,7 @@ static __init int pop_verify_heap(bool min_heap, int last; last = values[0]; - min_heap_pop(heap, funcs, NULL); + min_heap_pop_inline(heap, funcs, NULL); while (heap->nr > 0) { if (min_heap) { if (last > values[0]) { @@ -48,7 +48,7 @@ static __init int pop_verify_heap(bool min_heap, } } last = values[0]; - min_heap_pop(heap, funcs, NULL); + min_heap_pop_inline(heap, funcs, NULL); } return err; } @@ -69,7 +69,7 @@ static __init int test_heapify_all(bool min_heap) int i, err; /* Test with known set of values. */ - min_heapify_all(&heap, &funcs, NULL); + min_heapify_all_inline(&heap, &funcs, NULL); err = pop_verify_heap(min_heap, &heap, &funcs); @@ -78,7 +78,7 @@ static __init int test_heapify_all(bool min_heap) for (i = 0; i < heap.nr; i++) values[i] = get_random_u32(); - min_heapify_all(&heap, &funcs, NULL); + min_heapify_all_inline(&heap, &funcs, NULL); err += pop_verify_heap(min_heap, &heap, &funcs); return err; @@ -102,14 +102,14 @@ static __init int test_heap_push(bool min_heap) /* Test with known set of values copied from data. */ for (i = 0; i < ARRAY_SIZE(data); i++) - min_heap_push(&heap, &data[i], &funcs, NULL); + min_heap_push_inline(&heap, &data[i], &funcs, NULL); err = pop_verify_heap(min_heap, &heap, &funcs); /* Test with randomly generated values. */ while (heap.nr < heap.size) { temp = get_random_u32(); - min_heap_push(&heap, &temp, &funcs, NULL); + min_heap_push_inline(&heap, &temp, &funcs, NULL); } err += pop_verify_heap(min_heap, &heap, &funcs); @@ -135,22 +135,22 @@ static __init int test_heap_pop_push(bool min_heap) /* Fill values with data to pop and replace. */ temp = min_heap ? 0x80000000 : 0x7FFFFFFF; for (i = 0; i < ARRAY_SIZE(data); i++) - min_heap_push(&heap, &temp, &funcs, NULL); + min_heap_push_inline(&heap, &temp, &funcs, NULL); /* Test with known set of values copied from data. */ for (i = 0; i < ARRAY_SIZE(data); i++) - min_heap_pop_push(&heap, &data[i], &funcs, NULL); + min_heap_pop_push_inline(&heap, &data[i], &funcs, NULL); err = pop_verify_heap(min_heap, &heap, &funcs); heap.nr = 0; for (i = 0; i < ARRAY_SIZE(data); i++) - min_heap_push(&heap, &temp, &funcs, NULL); + min_heap_push_inline(&heap, &temp, &funcs, NULL); /* Test with randomly generated values. 
*/ for (i = 0; i < ARRAY_SIZE(data); i++) { temp = get_random_u32(); - min_heap_pop_push(&heap, &temp, &funcs, NULL); + min_heap_pop_push_inline(&heap, &temp, &funcs, NULL); } err += pop_verify_heap(min_heap, &heap, &funcs); @@ -163,7 +163,7 @@ static __init int test_heap_del(bool min_heap) -3, -1, -2, -4, 0x8000000, 0x7FFFFFF }; struct min_heap_test heap; - min_heap_init(&heap, values, ARRAY_SIZE(values)); + min_heap_init_inline(&heap, values, ARRAY_SIZE(values)); heap.nr = ARRAY_SIZE(values); struct min_heap_callbacks funcs = { .less = min_heap ? less_than : greater_than, @@ -172,9 +172,9 @@ static __init int test_heap_del(bool min_heap) int i, err; /* Test with known set of values. */ - min_heapify_all(&heap, &funcs, NULL); + min_heapify_all_inline(&heap, &funcs, NULL); for (i = 0; i < ARRAY_SIZE(values) / 2; i++) - min_heap_del(&heap, get_random_u32() % heap.nr, &funcs, NULL); + min_heap_del_inline(&heap, get_random_u32() % heap.nr, &funcs, NULL); err = pop_verify_heap(min_heap, &heap, &funcs); @@ -182,10 +182,10 @@ static __init int test_heap_del(bool min_heap) heap.nr = ARRAY_SIZE(values); for (i = 0; i < heap.nr; i++) values[i] = get_random_u32(); - min_heapify_all(&heap, &funcs, NULL); + min_heapify_all_inline(&heap, &funcs, NULL); for (i = 0; i < ARRAY_SIZE(values) / 2; i++) - min_heap_del(&heap, get_random_u32() % heap.nr, &funcs, NULL); + min_heap_del_inline(&heap, get_random_u32() % heap.nr, &funcs, NULL); err += pop_verify_heap(min_heap, &heap, &funcs); return err; diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 4ddf769861ff..f585949ff696 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -373,7 +373,7 @@ vm_map_ram_test(void) if (!pages) return -1; - nr_allocated = alloc_pages_bulk_array(GFP_KERNEL, map_nr_pages, pages); + nr_allocated = alloc_pages_bulk(GFP_KERNEL, map_nr_pages, pages); if (nr_allocated != map_nr_pages) goto cleanup; diff --git a/lib/test_xarray.c b/lib/test_xarray.c index d5c5cbba33ed..eab5971d0a48 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -6,11 +6,10 @@ * Author: Matthew Wilcox */ -#include -#include +#include -static unsigned int tests_run; -static unsigned int tests_passed; +#include +#include static const unsigned int order_limit = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 
BITS_PER_LONG : 1; @@ -20,15 +19,12 @@ static const unsigned int order_limit = void xa_dump(const struct xarray *xa) { } # endif #undef XA_BUG_ON -#define XA_BUG_ON(xa, x) do { \ - tests_run++; \ - if (x) { \ - printk("BUG at %s:%d\n", __func__, __LINE__); \ - xa_dump(xa); \ - dump_stack(); \ - } else { \ - tests_passed++; \ - } \ +#define XA_BUG_ON(xa, x) do { \ + if (x) { \ + KUNIT_FAIL(test, #x); \ + xa_dump(xa); \ + dump_stack(); \ + } \ } while (0) #endif @@ -42,13 +38,13 @@ static void *xa_store_index(struct xarray *xa, unsigned long index, gfp_t gfp) return xa_store(xa, index, xa_mk_index(index), gfp); } -static void xa_insert_index(struct xarray *xa, unsigned long index) +static void xa_insert_index(struct kunit *test, struct xarray *xa, unsigned long index) { XA_BUG_ON(xa, xa_insert(xa, index, xa_mk_index(index), GFP_KERNEL) != 0); } -static void xa_alloc_index(struct xarray *xa, unsigned long index, gfp_t gfp) +static void xa_alloc_index(struct kunit *test, struct xarray *xa, unsigned long index, gfp_t gfp) { u32 id; @@ -57,7 +53,7 @@ static void xa_alloc_index(struct xarray *xa, unsigned long index, gfp_t gfp) XA_BUG_ON(xa, id != index); } -static void xa_erase_index(struct xarray *xa, unsigned long index) +static void xa_erase_index(struct kunit *test, struct xarray *xa, unsigned long index) { XA_BUG_ON(xa, xa_erase(xa, index) != xa_mk_index(index)); XA_BUG_ON(xa, xa_load(xa, index) != NULL); @@ -83,8 +79,15 @@ static void *xa_store_order(struct xarray *xa, unsigned long index, return curr; } -static noinline void check_xa_err(struct xarray *xa) +static inline struct xarray *xa_param(struct kunit *test) { + return *(struct xarray **)test->param_value; +} + +static noinline void check_xa_err(struct kunit *test) +{ + struct xarray *xa = xa_param(test); + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 0, GFP_NOWAIT)) != 0); XA_BUG_ON(xa, xa_err(xa_erase(xa, 0)) != 0); #ifndef __KERNEL__ @@ -99,8 +102,10 @@ static noinline void check_xa_err(struct xarray *xa) // XA_BUG_ON(xa, xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) != -EINVAL); } -static noinline void check_xas_retry(struct xarray *xa) +static noinline void check_xas_retry(struct kunit *test) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0); void *entry; @@ -109,7 +114,7 @@ static noinline void check_xas_retry(struct xarray *xa) rcu_read_lock(); XA_BUG_ON(xa, xas_find(&xas, ULONG_MAX) != xa_mk_value(0)); - xa_erase_index(xa, 1); + xa_erase_index(test, xa, 1); XA_BUG_ON(xa, !xa_is_retry(xas_reload(&xas))); XA_BUG_ON(xa, xas_retry(&xas, NULL)); XA_BUG_ON(xa, xas_retry(&xas, xa_mk_value(0))); @@ -140,12 +145,14 @@ static noinline void check_xas_retry(struct xarray *xa) } xas_unlock(&xas); - xa_erase_index(xa, 0); - xa_erase_index(xa, 1); + xa_erase_index(test, xa, 0); + xa_erase_index(test, xa, 1); } -static noinline void check_xa_load(struct xarray *xa) +static noinline void check_xa_load(struct kunit *test) { + struct xarray *xa = xa_param(test); + unsigned long i, j; for (i = 0; i < 1024; i++) { @@ -167,13 +174,15 @@ static noinline void check_xa_load(struct xarray *xa) else XA_BUG_ON(xa, entry); } - xa_erase_index(xa, i); + xa_erase_index(test, xa, i); } XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) +static noinline void check_xa_mark_1(struct kunit *test, unsigned long index) { + struct xarray *xa = xa_param(test); + unsigned int order; unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 
8 : 1; @@ -193,7 +202,7 @@ static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_1)); /* Storing NULL clears marks, and they can't be set again */ - xa_erase_index(xa, index); + xa_erase_index(test, xa, index); XA_BUG_ON(xa, !xa_empty(xa)); XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); xa_set_mark(xa, index, XA_MARK_0); @@ -244,15 +253,17 @@ static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_0)); XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_1)); XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_2)); - xa_erase_index(xa, index); - xa_erase_index(xa, next); + xa_erase_index(test, xa, index); + xa_erase_index(test, xa, next); XA_BUG_ON(xa, !xa_empty(xa)); } XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_xa_mark_2(struct xarray *xa) +static noinline void check_xa_mark_2(struct kunit *test) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0); unsigned long index; unsigned int count = 0; @@ -289,9 +300,11 @@ static noinline void check_xa_mark_2(struct xarray *xa) xa_destroy(xa); } -static noinline void check_xa_mark_3(struct xarray *xa) +static noinline void check_xa_mark_3(struct kunit *test) { #ifdef CONFIG_XARRAY_MULTI + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0x41); void *entry; int count = 0; @@ -310,19 +323,21 @@ static noinline void check_xa_mark_3(struct xarray *xa) #endif } -static noinline void check_xa_mark(struct xarray *xa) +static noinline void check_xa_mark(struct kunit *test) { unsigned long index; for (index = 0; index < 16384; index += 4) - check_xa_mark_1(xa, index); + check_xa_mark_1(test, index); - check_xa_mark_2(xa); - check_xa_mark_3(xa); + check_xa_mark_2(test); + check_xa_mark_3(test); } -static noinline void check_xa_shrink(struct xarray *xa) +static noinline void check_xa_shrink(struct kunit *test) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 1); struct xa_node *node; unsigned int order; @@ -347,7 +362,7 @@ static noinline void check_xa_shrink(struct xarray *xa) XA_BUG_ON(xa, xas_load(&xas) != NULL); xas_unlock(&xas); XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0)); - xa_erase_index(xa, 0); + xa_erase_index(test, xa, 0); XA_BUG_ON(xa, !xa_empty(xa)); for (order = 0; order < max_order; order++) { @@ -364,45 +379,49 @@ static noinline void check_xa_shrink(struct xarray *xa) XA_BUG_ON(xa, xa_head(xa) == node); rcu_read_unlock(); XA_BUG_ON(xa, xa_load(xa, max + 1) != NULL); - xa_erase_index(xa, ULONG_MAX); + xa_erase_index(test, xa, ULONG_MAX); XA_BUG_ON(xa, xa->xa_head != node); - xa_erase_index(xa, 0); + xa_erase_index(test, xa, 0); } } -static noinline void check_insert(struct xarray *xa) +static noinline void check_insert(struct kunit *test) { + struct xarray *xa = xa_param(test); + unsigned long i; for (i = 0; i < 1024; i++) { - xa_insert_index(xa, i); + xa_insert_index(test, xa, i); XA_BUG_ON(xa, xa_load(xa, i - 1) != NULL); XA_BUG_ON(xa, xa_load(xa, i + 1) != NULL); - xa_erase_index(xa, i); + xa_erase_index(test, xa, i); } for (i = 10; i < BITS_PER_LONG; i++) { - xa_insert_index(xa, 1UL << i); + xa_insert_index(test, xa, 1UL << i); XA_BUG_ON(xa, xa_load(xa, (1UL << i) - 1) != NULL); XA_BUG_ON(xa, xa_load(xa, (1UL << i) + 1) != NULL); - xa_erase_index(xa, 1UL << i); + xa_erase_index(test, xa, 1UL << i); - xa_insert_index(xa, (1UL << i) - 1); + xa_insert_index(test, xa, (1UL << i) - 1); XA_BUG_ON(xa, xa_load(xa, (1UL << i) - 2) != NULL); XA_BUG_ON(xa, xa_load(xa, 1UL << i) != 
NULL); - xa_erase_index(xa, (1UL << i) - 1); + xa_erase_index(test, xa, (1UL << i) - 1); } - xa_insert_index(xa, ~0UL); + xa_insert_index(test, xa, ~0UL); XA_BUG_ON(xa, xa_load(xa, 0UL) != NULL); XA_BUG_ON(xa, xa_load(xa, ~1UL) != NULL); - xa_erase_index(xa, ~0UL); + xa_erase_index(test, xa, ~0UL); XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_cmpxchg(struct xarray *xa) +static noinline void check_cmpxchg(struct kunit *test) { + struct xarray *xa = xa_param(test); + void *FIVE = xa_mk_value(5); void *SIX = xa_mk_value(6); void *LOTS = xa_mk_value(12345678); @@ -418,14 +437,16 @@ static noinline void check_cmpxchg(struct xarray *xa) XA_BUG_ON(xa, xa_insert(xa, 5, FIVE, GFP_KERNEL) != -EBUSY); XA_BUG_ON(xa, xa_cmpxchg(xa, 5, FIVE, NULL, GFP_KERNEL) != FIVE); XA_BUG_ON(xa, xa_insert(xa, 5, FIVE, GFP_KERNEL) == -EBUSY); - xa_erase_index(xa, 12345678); - xa_erase_index(xa, 5); + xa_erase_index(test, xa, 12345678); + xa_erase_index(test, xa, 5); XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_cmpxchg_order(struct xarray *xa) +static noinline void check_cmpxchg_order(struct kunit *test) { #ifdef CONFIG_XARRAY_MULTI + struct xarray *xa = xa_param(test); + void *FIVE = xa_mk_value(5); unsigned int i, order = 3; @@ -476,8 +497,10 @@ static noinline void check_cmpxchg_order(struct xarray *xa) #endif } -static noinline void check_reserve(struct xarray *xa) +static noinline void check_reserve(struct kunit *test) { + struct xarray *xa = xa_param(test); + void *entry; unsigned long index; int count; @@ -494,7 +517,7 @@ static noinline void check_reserve(struct xarray *xa) XA_BUG_ON(xa, xa_reserve(xa, 12345678, GFP_KERNEL) != 0); XA_BUG_ON(xa, xa_store_index(xa, 12345678, GFP_NOWAIT) != NULL); xa_release(xa, 12345678); - xa_erase_index(xa, 12345678); + xa_erase_index(test, xa, 12345678); XA_BUG_ON(xa, !xa_empty(xa)); /* cmpxchg sees a reserved entry as ZERO */ @@ -502,7 +525,7 @@ static noinline void check_reserve(struct xarray *xa) XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, XA_ZERO_ENTRY, xa_mk_value(12345678), GFP_NOWAIT) != NULL); xa_release(xa, 12345678); - xa_erase_index(xa, 12345678); + xa_erase_index(test, xa, 12345678); XA_BUG_ON(xa, !xa_empty(xa)); /* xa_insert treats it as busy */ @@ -542,8 +565,10 @@ static noinline void check_reserve(struct xarray *xa) xa_destroy(xa); } -static noinline void check_xas_erase(struct xarray *xa) +static noinline void check_xas_erase(struct kunit *test) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0); void *entry; unsigned long i, j; @@ -581,9 +606,11 @@ static noinline void check_xas_erase(struct xarray *xa) } #ifdef CONFIG_XARRAY_MULTI -static noinline void check_multi_store_1(struct xarray *xa, unsigned long index, +static noinline void check_multi_store_1(struct kunit *test, unsigned long index, unsigned int order) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, index); unsigned long min = index & ~((1UL << order) - 1); unsigned long max = min + (1UL << order); @@ -602,13 +629,15 @@ static noinline void check_multi_store_1(struct xarray *xa, unsigned long index, XA_BUG_ON(xa, xa_load(xa, max) != NULL); XA_BUG_ON(xa, xa_load(xa, min - 1) != NULL); - xa_erase_index(xa, min); + xa_erase_index(test, xa, min); XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_multi_store_2(struct xarray *xa, unsigned long index, +static noinline void check_multi_store_2(struct kunit *test, unsigned long index, unsigned int order) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, index); xa_store_order(xa, index, 
order, xa_mk_value(0), GFP_KERNEL); @@ -620,9 +649,11 @@ static noinline void check_multi_store_2(struct xarray *xa, unsigned long index, XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_multi_store_3(struct xarray *xa, unsigned long index, +static noinline void check_multi_store_3(struct kunit *test, unsigned long index, unsigned int order) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0); void *entry; int n = 0; @@ -647,9 +678,11 @@ static noinline void check_multi_store_3(struct xarray *xa, unsigned long index, } #endif -static noinline void check_multi_store(struct xarray *xa) +static noinline void check_multi_store(struct kunit *test) { #ifdef CONFIG_XARRAY_MULTI + struct xarray *xa = xa_param(test); + unsigned long i, j, k; unsigned int max_order = (sizeof(long) == 4) ? 30 : 60; @@ -714,26 +747,28 @@ static noinline void check_multi_store(struct xarray *xa) } for (i = 0; i < 20; i++) { - check_multi_store_1(xa, 200, i); - check_multi_store_1(xa, 0, i); - check_multi_store_1(xa, (1UL << i) + 1, i); + check_multi_store_1(test, 200, i); + check_multi_store_1(test, 0, i); + check_multi_store_1(test, (1UL << i) + 1, i); } - check_multi_store_2(xa, 4095, 9); + check_multi_store_2(test, 4095, 9); for (i = 1; i < 20; i++) { - check_multi_store_3(xa, 0, i); - check_multi_store_3(xa, 1UL << i, i); + check_multi_store_3(test, 0, i); + check_multi_store_3(test, 1UL << i, i); } #endif } #ifdef CONFIG_XARRAY_MULTI /* mimics page cache __filemap_add_folio() */ -static noinline void check_xa_multi_store_adv_add(struct xarray *xa, +static noinline void check_xa_multi_store_adv_add(struct kunit *test, unsigned long index, unsigned int order, void *p) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, index); unsigned int nrpages = 1UL << order; @@ -761,10 +796,12 @@ static noinline void check_xa_multi_store_adv_add(struct xarray *xa, } /* mimics page_cache_delete() */ -static noinline void check_xa_multi_store_adv_del_entry(struct xarray *xa, +static noinline void check_xa_multi_store_adv_del_entry(struct kunit *test, unsigned long index, unsigned int order) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, index); xas_set_order(&xas, index, order); @@ -772,12 +809,14 @@ static noinline void check_xa_multi_store_adv_del_entry(struct xarray *xa, xas_init_marks(&xas); } -static noinline void check_xa_multi_store_adv_delete(struct xarray *xa, +static noinline void check_xa_multi_store_adv_delete(struct kunit *test, unsigned long index, unsigned int order) { + struct xarray *xa = xa_param(test); + xa_lock_irq(xa); - check_xa_multi_store_adv_del_entry(xa, index, order); + check_xa_multi_store_adv_del_entry(test, index, order); xa_unlock_irq(xa); } @@ -814,10 +853,12 @@ static unsigned long some_val = 0xdeadbeef; static unsigned long some_val_2 = 0xdeaddead; /* mimics the page cache usage */ -static noinline void check_xa_multi_store_adv(struct xarray *xa, +static noinline void check_xa_multi_store_adv(struct kunit *test, unsigned long pos, unsigned int order) { + struct xarray *xa = xa_param(test); + unsigned int nrpages = 1UL << order; unsigned long index, base, next_index, next_next_index; unsigned int i; @@ -827,7 +868,7 @@ static noinline void check_xa_multi_store_adv(struct xarray *xa, next_index = round_down(base + nrpages, nrpages); next_next_index = round_down(next_index + nrpages, nrpages); - check_xa_multi_store_adv_add(xa, base, order, &some_val); + check_xa_multi_store_adv_add(test, base, order, &some_val); for (i = 0; i < nrpages; i++) XA_BUG_ON(xa, 
test_get_entry(xa, base + i) != &some_val); @@ -835,20 +876,20 @@ static noinline void check_xa_multi_store_adv(struct xarray *xa, XA_BUG_ON(xa, test_get_entry(xa, next_index) != NULL); /* Use order 0 for the next item */ - check_xa_multi_store_adv_add(xa, next_index, 0, &some_val_2); + check_xa_multi_store_adv_add(test, next_index, 0, &some_val_2); XA_BUG_ON(xa, test_get_entry(xa, next_index) != &some_val_2); /* Remove the next item */ - check_xa_multi_store_adv_delete(xa, next_index, 0); + check_xa_multi_store_adv_delete(test, next_index, 0); /* Now use order for a new pointer */ - check_xa_multi_store_adv_add(xa, next_index, order, &some_val_2); + check_xa_multi_store_adv_add(test, next_index, order, &some_val_2); for (i = 0; i < nrpages; i++) XA_BUG_ON(xa, test_get_entry(xa, next_index + i) != &some_val_2); - check_xa_multi_store_adv_delete(xa, next_index, order); - check_xa_multi_store_adv_delete(xa, base, order); + check_xa_multi_store_adv_delete(test, next_index, order); + check_xa_multi_store_adv_delete(test, base, order); XA_BUG_ON(xa, !xa_empty(xa)); /* starting fresh again */ @@ -856,7 +897,7 @@ static noinline void check_xa_multi_store_adv(struct xarray *xa, /* let's test some holes now */ /* hole at base and next_next */ - check_xa_multi_store_adv_add(xa, next_index, order, &some_val_2); + check_xa_multi_store_adv_add(test, next_index, order, &some_val_2); for (i = 0; i < nrpages; i++) XA_BUG_ON(xa, test_get_entry(xa, base + i) != NULL); @@ -867,12 +908,12 @@ static noinline void check_xa_multi_store_adv(struct xarray *xa, for (i = 0; i < nrpages; i++) XA_BUG_ON(xa, test_get_entry(xa, next_next_index + i) != NULL); - check_xa_multi_store_adv_delete(xa, next_index, order); + check_xa_multi_store_adv_delete(test, next_index, order); XA_BUG_ON(xa, !xa_empty(xa)); /* hole at base and next */ - check_xa_multi_store_adv_add(xa, next_next_index, order, &some_val_2); + check_xa_multi_store_adv_add(test, next_next_index, order, &some_val_2); for (i = 0; i < nrpages; i++) XA_BUG_ON(xa, test_get_entry(xa, base + i) != NULL); @@ -883,12 +924,12 @@ static noinline void check_xa_multi_store_adv(struct xarray *xa, for (i = 0; i < nrpages; i++) XA_BUG_ON(xa, test_get_entry(xa, next_next_index + i) != &some_val_2); - check_xa_multi_store_adv_delete(xa, next_next_index, order); + check_xa_multi_store_adv_delete(test, next_next_index, order); XA_BUG_ON(xa, !xa_empty(xa)); } #endif -static noinline void check_multi_store_advanced(struct xarray *xa) +static noinline void check_multi_store_advanced(struct kunit *test) { #ifdef CONFIG_XARRAY_MULTI unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 
20 : 1; @@ -900,59 +941,59 @@ static noinline void check_multi_store_advanced(struct xarray *xa) */ for (pos = 7; pos < end; pos = (pos * pos) + 564) { for (i = 0; i < max_order; i++) { - check_xa_multi_store_adv(xa, pos, i); - check_xa_multi_store_adv(xa, pos + 157, i); + check_xa_multi_store_adv(test, pos, i); + check_xa_multi_store_adv(test, pos + 157, i); } } #endif } -static noinline void check_xa_alloc_1(struct xarray *xa, unsigned int base) +static noinline void check_xa_alloc_1(struct kunit *test, struct xarray *xa, unsigned int base) { int i; u32 id; XA_BUG_ON(xa, !xa_empty(xa)); /* An empty array should assign %base to the first alloc */ - xa_alloc_index(xa, base, GFP_KERNEL); + xa_alloc_index(test, xa, base, GFP_KERNEL); /* Erasing it should make the array empty again */ - xa_erase_index(xa, base); + xa_erase_index(test, xa, base); XA_BUG_ON(xa, !xa_empty(xa)); /* And it should assign %base again */ - xa_alloc_index(xa, base, GFP_KERNEL); + xa_alloc_index(test, xa, base, GFP_KERNEL); /* Allocating and then erasing a lot should not lose base */ for (i = base + 1; i < 2 * XA_CHUNK_SIZE; i++) - xa_alloc_index(xa, i, GFP_KERNEL); + xa_alloc_index(test, xa, i, GFP_KERNEL); for (i = base; i < 2 * XA_CHUNK_SIZE; i++) - xa_erase_index(xa, i); - xa_alloc_index(xa, base, GFP_KERNEL); + xa_erase_index(test, xa, i); + xa_alloc_index(test, xa, base, GFP_KERNEL); /* Destroying the array should do the same as erasing */ xa_destroy(xa); /* And it should assign %base again */ - xa_alloc_index(xa, base, GFP_KERNEL); + xa_alloc_index(test, xa, base, GFP_KERNEL); /* The next assigned ID should be base+1 */ - xa_alloc_index(xa, base + 1, GFP_KERNEL); - xa_erase_index(xa, base + 1); + xa_alloc_index(test, xa, base + 1, GFP_KERNEL); + xa_erase_index(test, xa, base + 1); /* Storing a value should mark it used */ xa_store_index(xa, base + 1, GFP_KERNEL); - xa_alloc_index(xa, base + 2, GFP_KERNEL); + xa_alloc_index(test, xa, base + 2, GFP_KERNEL); /* If we then erase base, it should be free */ - xa_erase_index(xa, base); - xa_alloc_index(xa, base, GFP_KERNEL); + xa_erase_index(test, xa, base); + xa_alloc_index(test, xa, base, GFP_KERNEL); - xa_erase_index(xa, base + 1); - xa_erase_index(xa, base + 2); + xa_erase_index(test, xa, base + 1); + xa_erase_index(test, xa, base + 2); for (i = 1; i < 5000; i++) { - xa_alloc_index(xa, base + i, GFP_KERNEL); + xa_alloc_index(test, xa, base + i, GFP_KERNEL); } xa_destroy(xa); @@ -975,14 +1016,14 @@ static noinline void check_xa_alloc_1(struct xarray *xa, unsigned int base) XA_BUG_ON(xa, xa_alloc(xa, &id, xa_mk_index(10), XA_LIMIT(10, 5), GFP_KERNEL) != -EBUSY); - XA_BUG_ON(xa, xa_store_index(xa, 3, GFP_KERNEL) != 0); + XA_BUG_ON(xa, xa_store_index(xa, 3, GFP_KERNEL) != NULL); XA_BUG_ON(xa, xa_alloc(xa, &id, xa_mk_index(10), XA_LIMIT(10, 5), GFP_KERNEL) != -EBUSY); - xa_erase_index(xa, 3); + xa_erase_index(test, xa, 3); XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_xa_alloc_2(struct xarray *xa, unsigned int base) +static noinline void check_xa_alloc_2(struct kunit *test, struct xarray *xa, unsigned int base) { unsigned int i, id; unsigned long index; @@ -1018,7 +1059,7 @@ static noinline void check_xa_alloc_2(struct xarray *xa, unsigned int base) XA_BUG_ON(xa, id != 5); xa_for_each(xa, index, entry) { - xa_erase_index(xa, index); + xa_erase_index(test, xa, index); } for (i = base; i < base + 9; i++) { @@ -1033,7 +1074,7 @@ static noinline void check_xa_alloc_2(struct xarray *xa, unsigned int base) xa_destroy(xa); } -static noinline void 
check_xa_alloc_3(struct xarray *xa, unsigned int base) +static noinline void check_xa_alloc_3(struct kunit *test, struct xarray *xa, unsigned int base) { struct xa_limit limit = XA_LIMIT(1, 0x3fff); u32 next = 0; @@ -1049,8 +1090,8 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(0x3ffd), limit, &next, GFP_KERNEL) != 0); XA_BUG_ON(xa, id != 0x3ffd); - xa_erase_index(xa, 0x3ffd); - xa_erase_index(xa, 1); + xa_erase_index(test, xa, 0x3ffd); + xa_erase_index(test, xa, 1); XA_BUG_ON(xa, !xa_empty(xa)); for (i = 0x3ffe; i < 0x4003; i++) { @@ -1065,8 +1106,8 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) /* Check wrap-around is handled correctly */ if (base != 0) - xa_erase_index(xa, base); - xa_erase_index(xa, base + 1); + xa_erase_index(test, xa, base); + xa_erase_index(test, xa, base + 1); next = UINT_MAX; XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(UINT_MAX), xa_limit_32b, &next, GFP_KERNEL) != 0); @@ -1079,7 +1120,7 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) XA_BUG_ON(xa, id != base + 1); xa_for_each(xa, index, entry) - xa_erase_index(xa, index); + xa_erase_index(test, xa, index); XA_BUG_ON(xa, !xa_empty(xa)); } @@ -1087,19 +1128,21 @@ static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base) static DEFINE_XARRAY_ALLOC(xa0); static DEFINE_XARRAY_ALLOC1(xa1); -static noinline void check_xa_alloc(void) +static noinline void check_xa_alloc(struct kunit *test) { - check_xa_alloc_1(&xa0, 0); - check_xa_alloc_1(&xa1, 1); - check_xa_alloc_2(&xa0, 0); - check_xa_alloc_2(&xa1, 1); - check_xa_alloc_3(&xa0, 0); - check_xa_alloc_3(&xa1, 1); + check_xa_alloc_1(test, &xa0, 0); + check_xa_alloc_1(test, &xa1, 1); + check_xa_alloc_2(test, &xa0, 0); + check_xa_alloc_2(test, &xa1, 1); + check_xa_alloc_3(test, &xa0, 0); + check_xa_alloc_3(test, &xa1, 1); } -static noinline void __check_store_iter(struct xarray *xa, unsigned long start, +static noinline void __check_store_iter(struct kunit *test, unsigned long start, unsigned int order, unsigned int present) { + struct xarray *xa = xa_param(test); + XA_STATE_ORDER(xas, xa, start, order); void *entry; unsigned int count = 0; @@ -1123,50 +1166,54 @@ retry: XA_BUG_ON(xa, xa_load(xa, start) != xa_mk_index(start)); XA_BUG_ON(xa, xa_load(xa, start + (1UL << order) - 1) != xa_mk_index(start)); - xa_erase_index(xa, start); + xa_erase_index(test, xa, start); } -static noinline void check_store_iter(struct xarray *xa) +static noinline void check_store_iter(struct kunit *test) { + struct xarray *xa = xa_param(test); + unsigned int i, j; unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 
20 : 1; for (i = 0; i < max_order; i++) { unsigned int min = 1 << i; unsigned int max = (2 << i) - 1; - __check_store_iter(xa, 0, i, 0); + __check_store_iter(test, 0, i, 0); XA_BUG_ON(xa, !xa_empty(xa)); - __check_store_iter(xa, min, i, 0); + __check_store_iter(test, min, i, 0); XA_BUG_ON(xa, !xa_empty(xa)); xa_store_index(xa, min, GFP_KERNEL); - __check_store_iter(xa, min, i, 1); + __check_store_iter(test, min, i, 1); XA_BUG_ON(xa, !xa_empty(xa)); xa_store_index(xa, max, GFP_KERNEL); - __check_store_iter(xa, min, i, 1); + __check_store_iter(test, min, i, 1); XA_BUG_ON(xa, !xa_empty(xa)); for (j = 0; j < min; j++) xa_store_index(xa, j, GFP_KERNEL); - __check_store_iter(xa, 0, i, min); + __check_store_iter(test, 0, i, min); XA_BUG_ON(xa, !xa_empty(xa)); for (j = 0; j < min; j++) xa_store_index(xa, min + j, GFP_KERNEL); - __check_store_iter(xa, min, i, min); + __check_store_iter(test, min, i, min); XA_BUG_ON(xa, !xa_empty(xa)); } #ifdef CONFIG_XARRAY_MULTI xa_store_index(xa, 63, GFP_KERNEL); xa_store_index(xa, 65, GFP_KERNEL); - __check_store_iter(xa, 64, 2, 1); - xa_erase_index(xa, 63); + __check_store_iter(test, 64, 2, 1); + xa_erase_index(test, xa, 63); #endif XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_multi_find_1(struct xarray *xa, unsigned order) +static noinline void check_multi_find_1(struct kunit *test, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI + struct xarray *xa = xa_param(test); + unsigned long multi = 3 << order; unsigned long next = 4 << order; unsigned long index; @@ -1189,15 +1236,17 @@ static noinline void check_multi_find_1(struct xarray *xa, unsigned order) XA_BUG_ON(xa, xa_find_after(xa, &index, next, XA_PRESENT) != NULL); XA_BUG_ON(xa, index != next); - xa_erase_index(xa, multi); - xa_erase_index(xa, next); - xa_erase_index(xa, next + 1); + xa_erase_index(test, xa, multi); + xa_erase_index(test, xa, next); + xa_erase_index(test, xa, next + 1); XA_BUG_ON(xa, !xa_empty(xa)); #endif } -static noinline void check_multi_find_2(struct xarray *xa) +static noinline void check_multi_find_2(struct kunit *test) { + struct xarray *xa = xa_param(test); + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 
10 : 1; unsigned int i, j; void *entry; @@ -1211,17 +1260,19 @@ static noinline void check_multi_find_2(struct xarray *xa) GFP_KERNEL); rcu_read_lock(); xas_for_each(&xas, entry, ULONG_MAX) { - xa_erase_index(xa, index); + xa_erase_index(test, xa, index); } rcu_read_unlock(); - xa_erase_index(xa, index - 1); + xa_erase_index(test, xa, index - 1); XA_BUG_ON(xa, !xa_empty(xa)); } } } -static noinline void check_multi_find_3(struct xarray *xa) +static noinline void check_multi_find_3(struct kunit *test) { + struct xarray *xa = xa_param(test); + unsigned int order; for (order = 5; order < order_limit; order++) { @@ -1230,12 +1281,14 @@ static noinline void check_multi_find_3(struct xarray *xa) XA_BUG_ON(xa, !xa_empty(xa)); xa_store_order(xa, 0, order - 4, xa_mk_index(0), GFP_KERNEL); XA_BUG_ON(xa, xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT)); - xa_erase_index(xa, 0); + xa_erase_index(test, xa, 0); } } -static noinline void check_find_1(struct xarray *xa) +static noinline void check_find_1(struct kunit *test) { + struct xarray *xa = xa_param(test); + unsigned long i, j, k; XA_BUG_ON(xa, !xa_empty(xa)); @@ -1272,18 +1325,20 @@ static noinline void check_find_1(struct xarray *xa) else XA_BUG_ON(xa, entry != NULL); } - xa_erase_index(xa, j); + xa_erase_index(test, xa, j); XA_BUG_ON(xa, xa_get_mark(xa, j, XA_MARK_0)); XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_0)); } - xa_erase_index(xa, i); + xa_erase_index(test, xa, i); XA_BUG_ON(xa, xa_get_mark(xa, i, XA_MARK_0)); } XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_find_2(struct xarray *xa) +static noinline void check_find_2(struct kunit *test) { + struct xarray *xa = xa_param(test); + void *entry; unsigned long i, j, index; @@ -1303,8 +1358,10 @@ static noinline void check_find_2(struct xarray *xa) xa_destroy(xa); } -static noinline void check_find_3(struct xarray *xa) +static noinline void check_find_3(struct kunit *test) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0); unsigned long i, j, k; void *entry; @@ -1328,8 +1385,10 @@ static noinline void check_find_3(struct xarray *xa) xa_destroy(xa); } -static noinline void check_find_4(struct xarray *xa) +static noinline void check_find_4(struct kunit *test) { + struct xarray *xa = xa_param(test); + unsigned long index = 0; void *entry; @@ -1341,22 +1400,22 @@ static noinline void check_find_4(struct xarray *xa) entry = xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT); XA_BUG_ON(xa, entry); - xa_erase_index(xa, ULONG_MAX); + xa_erase_index(test, xa, ULONG_MAX); } -static noinline void check_find(struct xarray *xa) +static noinline void check_find(struct kunit *test) { unsigned i; - check_find_1(xa); - check_find_2(xa); - check_find_3(xa); - check_find_4(xa); + check_find_1(test); + check_find_2(test); + check_find_3(test); + check_find_4(test); for (i = 2; i < 10; i++) - check_multi_find_1(xa, i); - check_multi_find_2(xa); - check_multi_find_3(xa); + check_multi_find_1(test, i); + check_multi_find_2(test); + check_multi_find_3(test); } /* See find_swap_entry() in mm/shmem.c */ @@ -1382,8 +1441,10 @@ static noinline unsigned long xa_find_entry(struct xarray *xa, void *item) return entry ? 
xas.xa_index : -1; } -static noinline void check_find_entry(struct xarray *xa) +static noinline void check_find_entry(struct kunit *test) { + struct xarray *xa = xa_param(test); + #ifdef CONFIG_XARRAY_MULTI unsigned int order; unsigned long offset, index; @@ -1410,12 +1471,14 @@ static noinline void check_find_entry(struct xarray *xa) xa_store_index(xa, ULONG_MAX, GFP_KERNEL); XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1); XA_BUG_ON(xa, xa_find_entry(xa, xa_mk_index(ULONG_MAX)) != -1); - xa_erase_index(xa, ULONG_MAX); + xa_erase_index(test, xa, ULONG_MAX); XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_pause(struct xarray *xa) +static noinline void check_pause(struct kunit *test) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0); void *entry; unsigned int order; @@ -1448,10 +1511,47 @@ static noinline void check_pause(struct xarray *xa) XA_BUG_ON(xa, count != order_limit); xa_destroy(xa); + + index = 0; + for (order = XA_CHUNK_SHIFT; order > 0; order--) { + XA_BUG_ON(xa, xa_store_order(xa, index, order, + xa_mk_index(index), GFP_KERNEL)); + index += 1UL << order; + } + + index = 0; + count = 0; + xas_set(&xas, 0); + rcu_read_lock(); + xas_for_each(&xas, entry, ULONG_MAX) { + XA_BUG_ON(xa, entry != xa_mk_index(index)); + index += 1UL << (XA_CHUNK_SHIFT - count); + count++; + } + rcu_read_unlock(); + XA_BUG_ON(xa, count != XA_CHUNK_SHIFT); + + index = 0; + count = 0; + xas_set(&xas, XA_CHUNK_SIZE / 2 + 1); + rcu_read_lock(); + xas_for_each(&xas, entry, ULONG_MAX) { + XA_BUG_ON(xa, entry != xa_mk_index(index)); + index += 1UL << (XA_CHUNK_SHIFT - count); + count++; + xas_pause(&xas); + } + rcu_read_unlock(); + XA_BUG_ON(xa, count != XA_CHUNK_SHIFT); + + xa_destroy(xa); + } -static noinline void check_move_tiny(struct xarray *xa) +static noinline void check_move_tiny(struct kunit *test) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0); XA_BUG_ON(xa, !xa_empty(xa)); @@ -1468,12 +1568,14 @@ static noinline void check_move_tiny(struct xarray *xa) XA_BUG_ON(xa, xas_prev(&xas) != xa_mk_index(0)); XA_BUG_ON(xa, xas_prev(&xas) != NULL); rcu_read_unlock(); - xa_erase_index(xa, 0); + xa_erase_index(test, xa, 0); XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_move_max(struct xarray *xa) +static noinline void check_move_max(struct kunit *test) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0); xa_store_index(xa, ULONG_MAX, GFP_KERNEL); @@ -1489,12 +1591,14 @@ static noinline void check_move_max(struct xarray *xa) XA_BUG_ON(xa, xas_find(&xas, ULONG_MAX) != NULL); rcu_read_unlock(); - xa_erase_index(xa, ULONG_MAX); + xa_erase_index(test, xa, ULONG_MAX); XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_move_small(struct xarray *xa, unsigned long idx) +static noinline void check_move_small(struct kunit *test, unsigned long idx) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0); unsigned long i; @@ -1536,13 +1640,15 @@ static noinline void check_move_small(struct xarray *xa, unsigned long idx) XA_BUG_ON(xa, xas.xa_index != ULONG_MAX); rcu_read_unlock(); - xa_erase_index(xa, 0); - xa_erase_index(xa, idx); + xa_erase_index(test, xa, 0); + xa_erase_index(test, xa, idx); XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_move(struct xarray *xa) +static noinline void check_move(struct kunit *test) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, (1 << 16) - 1); unsigned long i; @@ -1569,7 +1675,7 @@ static noinline void check_move(struct xarray *xa) rcu_read_unlock(); for (i = (1 << 8); i < (1 << 15); i++) - 
xa_erase_index(xa, i); + xa_erase_index(test, xa, i); i = xas.xa_index; @@ -1600,17 +1706,17 @@ static noinline void check_move(struct xarray *xa) xa_destroy(xa); - check_move_tiny(xa); - check_move_max(xa); + check_move_tiny(test); + check_move_max(test); for (i = 0; i < 16; i++) - check_move_small(xa, 1UL << i); + check_move_small(test, 1UL << i); for (i = 2; i < 16; i++) - check_move_small(xa, (1UL << i) - 1); + check_move_small(test, (1UL << i) - 1); } -static noinline void xa_store_many_order(struct xarray *xa, +static noinline void xa_store_many_order(struct kunit *test, struct xarray *xa, unsigned long index, unsigned order) { XA_STATE_ORDER(xas, xa, index, order); @@ -1633,30 +1739,34 @@ unlock: XA_BUG_ON(xa, xas_error(&xas)); } -static noinline void check_create_range_1(struct xarray *xa, +static noinline void check_create_range_1(struct kunit *test, unsigned long index, unsigned order) { + struct xarray *xa = xa_param(test); + unsigned long i; - xa_store_many_order(xa, index, order); + xa_store_many_order(test, xa, index, order); for (i = index; i < index + (1UL << order); i++) - xa_erase_index(xa, i); + xa_erase_index(test, xa, i); XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_create_range_2(struct xarray *xa, unsigned order) +static noinline void check_create_range_2(struct kunit *test, unsigned int order) { + struct xarray *xa = xa_param(test); + unsigned long i; unsigned long nr = 1UL << order; for (i = 0; i < nr * nr; i += nr) - xa_store_many_order(xa, i, order); + xa_store_many_order(test, xa, i, order); for (i = 0; i < nr * nr; i++) - xa_erase_index(xa, i); + xa_erase_index(test, xa, i); XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_create_range_3(void) +static noinline void check_create_range_3(struct kunit *test) { XA_STATE(xas, NULL, 0); xas_set_err(&xas, -EEXIST); @@ -1664,9 +1774,11 @@ static noinline void check_create_range_3(void) XA_BUG_ON(NULL, xas_error(&xas) != -EEXIST); } -static noinline void check_create_range_4(struct xarray *xa, +static noinline void check_create_range_4(struct kunit *test, unsigned long index, unsigned order) { + struct xarray *xa = xa_param(test); + XA_STATE_ORDER(xas, xa, index, order); unsigned long base = xas.xa_index; unsigned long i = 0; @@ -1692,13 +1804,15 @@ unlock: XA_BUG_ON(xa, xas_error(&xas)); for (i = base; i < base + (1UL << order); i++) - xa_erase_index(xa, i); + xa_erase_index(test, xa, i); XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_create_range_5(struct xarray *xa, +static noinline void check_create_range_5(struct kunit *test, unsigned long index, unsigned int order) { + struct xarray *xa = xa_param(test); + XA_STATE_ORDER(xas, xa, index, order); unsigned int i; @@ -1715,44 +1829,46 @@ static noinline void check_create_range_5(struct xarray *xa, xa_destroy(xa); } -static noinline void check_create_range(struct xarray *xa) +static noinline void check_create_range(struct kunit *test) { unsigned int order; unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 
12 : 1; for (order = 0; order < max_order; order++) { - check_create_range_1(xa, 0, order); - check_create_range_1(xa, 1U << order, order); - check_create_range_1(xa, 2U << order, order); - check_create_range_1(xa, 3U << order, order); - check_create_range_1(xa, 1U << 24, order); + check_create_range_1(test, 0, order); + check_create_range_1(test, 1U << order, order); + check_create_range_1(test, 2U << order, order); + check_create_range_1(test, 3U << order, order); + check_create_range_1(test, 1U << 24, order); if (order < 10) - check_create_range_2(xa, order); + check_create_range_2(test, order); - check_create_range_4(xa, 0, order); - check_create_range_4(xa, 1U << order, order); - check_create_range_4(xa, 2U << order, order); - check_create_range_4(xa, 3U << order, order); - check_create_range_4(xa, 1U << 24, order); + check_create_range_4(test, 0, order); + check_create_range_4(test, 1U << order, order); + check_create_range_4(test, 2U << order, order); + check_create_range_4(test, 3U << order, order); + check_create_range_4(test, 1U << 24, order); - check_create_range_4(xa, 1, order); - check_create_range_4(xa, (1U << order) + 1, order); - check_create_range_4(xa, (2U << order) + 1, order); - check_create_range_4(xa, (2U << order) - 1, order); - check_create_range_4(xa, (3U << order) + 1, order); - check_create_range_4(xa, (3U << order) - 1, order); - check_create_range_4(xa, (1U << 24) + 1, order); + check_create_range_4(test, 1, order); + check_create_range_4(test, (1U << order) + 1, order); + check_create_range_4(test, (2U << order) + 1, order); + check_create_range_4(test, (2U << order) - 1, order); + check_create_range_4(test, (3U << order) + 1, order); + check_create_range_4(test, (3U << order) - 1, order); + check_create_range_4(test, (1U << 24) + 1, order); - check_create_range_5(xa, 0, order); - check_create_range_5(xa, (1U << order), order); + check_create_range_5(test, 0, order); + check_create_range_5(test, (1U << order), order); } - check_create_range_3(); + check_create_range_3(test); } -static noinline void __check_store_range(struct xarray *xa, unsigned long first, +static noinline void __check_store_range(struct kunit *test, unsigned long first, unsigned long last) { + struct xarray *xa = xa_param(test); + #ifdef CONFIG_XARRAY_MULTI xa_store_range(xa, first, last, xa_mk_index(first), GFP_KERNEL); @@ -1767,26 +1883,28 @@ static noinline void __check_store_range(struct xarray *xa, unsigned long first, XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_store_range(struct xarray *xa) +static noinline void check_store_range(struct kunit *test) { unsigned long i, j; for (i = 0; i < 128; i++) { for (j = i; j < 128; j++) { - __check_store_range(xa, i, j); - __check_store_range(xa, 128 + i, 128 + j); - __check_store_range(xa, 4095 + i, 4095 + j); - __check_store_range(xa, 4096 + i, 4096 + j); - __check_store_range(xa, 123456 + i, 123456 + j); - __check_store_range(xa, (1 << 24) + i, (1 << 24) + j); + __check_store_range(test, i, j); + __check_store_range(test, 128 + i, 128 + j); + __check_store_range(test, 4095 + i, 4095 + j); + __check_store_range(test, 4096 + i, 4096 + j); + __check_store_range(test, 123456 + i, 123456 + j); + __check_store_range(test, (1 << 24) + i, (1 << 24) + j); } } } #ifdef CONFIG_XARRAY_MULTI -static void check_split_1(struct xarray *xa, unsigned long index, +static void check_split_1(struct kunit *test, unsigned long index, unsigned int order, unsigned int new_order) { + struct xarray *xa = xa_param(test); + XA_STATE_ORDER(xas, xa, index, 
new_order); unsigned int i, found; void *entry; @@ -1822,26 +1940,30 @@ static void check_split_1(struct xarray *xa, unsigned long index, xa_destroy(xa); } -static noinline void check_split(struct xarray *xa) +static noinline void check_split(struct kunit *test) { + struct xarray *xa = xa_param(test); + unsigned int order, new_order; XA_BUG_ON(xa, !xa_empty(xa)); for (order = 1; order < 2 * XA_CHUNK_SHIFT; order++) { for (new_order = 0; new_order < order; new_order++) { - check_split_1(xa, 0, order, new_order); - check_split_1(xa, 1UL << order, order, new_order); - check_split_1(xa, 3UL << order, order, new_order); + check_split_1(test, 0, order, new_order); + check_split_1(test, 1UL << order, order, new_order); + check_split_1(test, 3UL << order, order, new_order); } } } #else -static void check_split(struct xarray *xa) { } +static void check_split(struct kunit *test) { } #endif -static void check_align_1(struct xarray *xa, char *name) +static void check_align_1(struct kunit *test, char *name) { + struct xarray *xa = xa_param(test); + int i; unsigned int id; unsigned long index; @@ -1861,8 +1983,10 @@ static void check_align_1(struct xarray *xa, char *name) * We should always be able to store without allocating memory after * reserving a slot. */ -static void check_align_2(struct xarray *xa, char *name) +static void check_align_2(struct kunit *test, char *name) { + struct xarray *xa = xa_param(test); + int i; XA_BUG_ON(xa, !xa_empty(xa)); @@ -1881,15 +2005,15 @@ static void check_align_2(struct xarray *xa, char *name) XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_align(struct xarray *xa) +static noinline void check_align(struct kunit *test) { char name[] = "Motorola 68000"; - check_align_1(xa, name); - check_align_1(xa, name + 1); - check_align_1(xa, name + 2); - check_align_1(xa, name + 3); - check_align_2(xa, name); + check_align_1(test, name); + check_align_1(test, name + 1); + check_align_1(test, name + 2); + check_align_1(test, name + 3); + check_align_2(test, name); } static LIST_HEAD(shadow_nodes); @@ -1905,7 +2029,7 @@ static void test_update_node(struct xa_node *node) } } -static noinline void shadow_remove(struct xarray *xa) +static noinline void shadow_remove(struct kunit *test, struct xarray *xa) { struct xa_node *node; @@ -1919,8 +2043,17 @@ static noinline void shadow_remove(struct xarray *xa) xa_unlock(xa); } -static noinline void check_workingset(struct xarray *xa, unsigned long index) +struct workingset_testcase { + struct xarray *xa; + unsigned long index; +}; + +static noinline void check_workingset(struct kunit *test) { + struct workingset_testcase tc = *(struct workingset_testcase *)test->param_value; + struct xarray *xa = tc.xa; + unsigned long index = tc.index; + XA_STATE(xas, xa, index); xas_set_update(&xas, test_update_node); @@ -1943,7 +2076,7 @@ static noinline void check_workingset(struct xarray *xa, unsigned long index) xas_unlock(&xas); XA_BUG_ON(xa, list_empty(&shadow_nodes)); - shadow_remove(xa); + shadow_remove(test, xa); XA_BUG_ON(xa, !list_empty(&shadow_nodes)); XA_BUG_ON(xa, !xa_empty(xa)); } @@ -1952,9 +2085,11 @@ static noinline void check_workingset(struct xarray *xa, unsigned long index) * Check that the pointer / value / sibling entries are accounted the * way we expect them to be. 
*/ -static noinline void check_account(struct xarray *xa) +static noinline void check_account(struct kunit *test) { #ifdef CONFIG_XARRAY_MULTI + struct xarray *xa = xa_param(test); + unsigned int order; for (order = 1; order < 12; order++) { @@ -1981,8 +2116,10 @@ static noinline void check_account(struct xarray *xa) #endif } -static noinline void check_get_order(struct xarray *xa) +static noinline void check_get_order(struct kunit *test) { + struct xarray *xa = xa_param(test); + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1; unsigned int order; unsigned long i, j; @@ -2001,8 +2138,10 @@ static noinline void check_get_order(struct xarray *xa) } } -static noinline void check_xas_get_order(struct xarray *xa) +static noinline void check_xas_get_order(struct kunit *test) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0); unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1; @@ -2034,8 +2173,10 @@ static noinline void check_xas_get_order(struct xarray *xa) } } -static noinline void check_xas_conflict_get_order(struct xarray *xa) +static noinline void check_xas_conflict_get_order(struct kunit *test) { + struct xarray *xa = xa_param(test); + XA_STATE(xas, xa, 0); void *entry; @@ -2092,8 +2233,10 @@ static noinline void check_xas_conflict_get_order(struct xarray *xa) } -static noinline void check_destroy(struct xarray *xa) +static noinline void check_destroy(struct kunit *test) { + struct xarray *xa = xa_param(test); + unsigned long index; XA_BUG_ON(xa, !xa_empty(xa)); @@ -2126,52 +2269,59 @@ static noinline void check_destroy(struct xarray *xa) } static DEFINE_XARRAY(array); +static struct xarray *arrays[] = { &array }; +KUNIT_ARRAY_PARAM(array, arrays, NULL); -static int xarray_checks(void) -{ - check_xa_err(&array); - check_xas_retry(&array); - check_xa_load(&array); - check_xa_mark(&array); - check_xa_shrink(&array); - check_xas_erase(&array); - check_insert(&array); - check_cmpxchg(&array); - check_cmpxchg_order(&array); - check_reserve(&array); - check_reserve(&xa0); - check_multi_store(&array); - check_multi_store_advanced(&array); - check_get_order(&array); - check_xas_get_order(&array); - check_xas_conflict_get_order(&array); - check_xa_alloc(); - check_find(&array); - check_find_entry(&array); - check_pause(&array); - check_account(&array); - check_destroy(&array); - check_move(&array); - check_create_range(&array); - check_store_range(&array); - check_store_iter(&array); - check_align(&xa0); - check_split(&array); +static struct xarray *xa0s[] = { &xa0 }; +KUNIT_ARRAY_PARAM(xa0, xa0s, NULL); - check_workingset(&array, 0); - check_workingset(&array, 64); - check_workingset(&array, 4096); +static struct workingset_testcase workingset_testcases[] = { + { &array, 0 }, + { &array, 64 }, + { &array, 4096 }, +}; +KUNIT_ARRAY_PARAM(workingset, workingset_testcases, NULL); - printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); - return (tests_run == tests_passed) ? 
0 : -EINVAL; -} +static struct kunit_case xarray_cases[] = { + KUNIT_CASE_PARAM(check_xa_err, array_gen_params), + KUNIT_CASE_PARAM(check_xas_retry, array_gen_params), + KUNIT_CASE_PARAM(check_xa_load, array_gen_params), + KUNIT_CASE_PARAM(check_xa_mark, array_gen_params), + KUNIT_CASE_PARAM(check_xa_shrink, array_gen_params), + KUNIT_CASE_PARAM(check_xas_erase, array_gen_params), + KUNIT_CASE_PARAM(check_insert, array_gen_params), + KUNIT_CASE_PARAM(check_cmpxchg, array_gen_params), + KUNIT_CASE_PARAM(check_cmpxchg_order, array_gen_params), + KUNIT_CASE_PARAM(check_reserve, array_gen_params), + KUNIT_CASE_PARAM(check_reserve, xa0_gen_params), + KUNIT_CASE_PARAM(check_multi_store, array_gen_params), + KUNIT_CASE_PARAM(check_multi_store_advanced, array_gen_params), + KUNIT_CASE_PARAM(check_get_order, array_gen_params), + KUNIT_CASE_PARAM(check_xas_get_order, array_gen_params), + KUNIT_CASE_PARAM(check_xas_conflict_get_order, array_gen_params), + KUNIT_CASE(check_xa_alloc), + KUNIT_CASE_PARAM(check_find, array_gen_params), + KUNIT_CASE_PARAM(check_find_entry, array_gen_params), + KUNIT_CASE_PARAM(check_pause, array_gen_params), + KUNIT_CASE_PARAM(check_account, array_gen_params), + KUNIT_CASE_PARAM(check_destroy, array_gen_params), + KUNIT_CASE_PARAM(check_move, array_gen_params), + KUNIT_CASE_PARAM(check_create_range, array_gen_params), + KUNIT_CASE_PARAM(check_store_range, array_gen_params), + KUNIT_CASE_PARAM(check_store_iter, array_gen_params), + KUNIT_CASE_PARAM(check_align, xa0_gen_params), + KUNIT_CASE_PARAM(check_split, array_gen_params), + KUNIT_CASE_PARAM(check_workingset, workingset_gen_params), + {}, +}; -static void xarray_exit(void) -{ -} +static struct kunit_suite xarray_suite = { + .name = "xarray", + .test_cases = xarray_cases, +}; + +kunit_test_suite(xarray_suite); -module_init(xarray_checks); -module_exit(xarray_exit); MODULE_AUTHOR("Matthew Wilcox "); MODULE_DESCRIPTION("XArray API test module"); MODULE_LICENSE("GPL"); diff --git a/lib/xarray.c b/lib/xarray.c index 32d4bac8c94c..116e9286c64e 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -125,19 +125,20 @@ static inline void node_mark_all(struct xa_node *node, xa_mark_t mark) */ static void xas_squash_marks(const struct xa_state *xas) { - unsigned int mark = 0; + xa_mark_t mark = 0; unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; - if (!xas->xa_sibs) - return; + for (;;) { + unsigned long *marks = node_marks(xas->xa_node, mark); - do { - unsigned long *marks = xas->xa_node->marks[mark]; - if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit) - continue; - __set_bit(xas->xa_offset, marks); - bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); - } while (mark++ != (__force unsigned)XA_MARK_MAX); + if (find_next_bit(marks, limit, xas->xa_offset + 1) != limit) { + __set_bit(xas->xa_offset, marks); + bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); + } + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } } /* extracts the offset within this node from the index */ @@ -435,6 +436,11 @@ static unsigned long max_index(void *entry) return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1; } +static inline void *xa_zero_to_null(void *entry) +{ + return xa_is_zero(entry) ? 
NULL : entry; +} + static void xas_shrink(struct xa_state *xas) { struct xarray *xa = xas->xa; @@ -451,8 +457,8 @@ static void xas_shrink(struct xa_state *xas) break; if (!xa_is_node(entry) && node->shift) break; - if (xa_is_zero(entry) && xa_zero_busy(xa)) - entry = NULL; + if (xa_zero_busy(xa)) + entry = xa_zero_to_null(entry); xas->xa_node = XAS_BOUNDS; RCU_INIT_POINTER(xa->xa_head, entry); @@ -1022,7 +1028,7 @@ void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, unsigned int mask = xas->xa_sibs; /* XXX: no support for splitting really large entries yet */ - if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT < order)) + if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT <= order)) goto nomem; if (xas->xa_shift + XA_CHUNK_SHIFT > order) return; @@ -1147,6 +1153,7 @@ void xas_pause(struct xa_state *xas) if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) break; } + xas->xa_index &= ~0UL << node->shift; xas->xa_index += (offset - xas->xa_offset) << node->shift; if (xas->xa_index == 0) xas->xa_node = XAS_BOUNDS; @@ -1382,6 +1389,8 @@ void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK)) continue; + if (xa_is_sibling(entry)) + continue; if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); @@ -1474,9 +1483,7 @@ void *xa_load(struct xarray *xa, unsigned long index) rcu_read_lock(); do { - entry = xas_load(&xas); - if (xa_is_zero(entry)) - entry = NULL; + entry = xa_zero_to_null(xas_load(&xas)); } while (xas_retry(&xas, entry)); rcu_read_unlock(); @@ -1486,8 +1493,6 @@ EXPORT_SYMBOL(xa_load); static void *xas_result(struct xa_state *xas, void *curr) { - if (xa_is_zero(curr)) - return NULL; if (xas_error(xas)) curr = xas->xa_node; return curr; @@ -1508,7 +1513,7 @@ static void *xas_result(struct xa_state *xas, void *curr) void *__xa_erase(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); - return xas_result(&xas, xas_store(&xas, NULL)); + return xas_result(&xas, xa_zero_to_null(xas_store(&xas, NULL))); } EXPORT_SYMBOL(__xa_erase); @@ -1567,7 +1572,7 @@ void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) xas_clear_mark(&xas, XA_FREE_MARK); } while (__xas_nomem(&xas, gfp)); - return xas_result(&xas, curr); + return xas_result(&xas, xa_zero_to_null(curr)); } EXPORT_SYMBOL(__xa_store); @@ -1600,6 +1605,9 @@ void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) } EXPORT_SYMBOL(xa_store); +static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp); + /** * __xa_cmpxchg() - Store this entry in the XArray. * @xa: XArray. @@ -1618,6 +1626,13 @@ EXPORT_SYMBOL(xa_store); */ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) +{ + return xa_zero_to_null(__xa_cmpxchg_raw(xa, index, old, entry, gfp)); +} +EXPORT_SYMBOL(__xa_cmpxchg); + +static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; @@ -1636,7 +1651,6 @@ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, return xas_result(&xas, curr); } -EXPORT_SYMBOL(__xa_cmpxchg); /** * __xa_insert() - Store this entry in the XArray if no entry is present. 
@@ -1656,26 +1670,16 @@ EXPORT_SYMBOL(__xa_cmpxchg); */ int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { - XA_STATE(xas, xa, index); void *curr; + int errno; - if (WARN_ON_ONCE(xa_is_advanced(entry))) - return -EINVAL; if (!entry) entry = XA_ZERO_ENTRY; - - do { - curr = xas_load(&xas); - if (!curr) { - xas_store(&xas, entry); - if (xa_track_free(xa)) - xas_clear_mark(&xas, XA_FREE_MARK); - } else { - xas_set_err(&xas, -EBUSY); - } - } while (__xas_nomem(&xas, gfp)); - - return xas_error(&xas); + curr = __xa_cmpxchg_raw(xa, index, NULL, entry, gfp); + errno = xa_err(curr); + if (errno) + return errno; + return (curr != NULL) ? -EBUSY : 0; } EXPORT_SYMBOL(__xa_insert); diff --git a/mm/Kconfig b/mm/Kconfig index 84000b016808..1b501db06417 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -550,20 +550,63 @@ menuconfig MEMORY_HOTPLUG if MEMORY_HOTPLUG -config MEMORY_HOTPLUG_DEFAULT_ONLINE - bool "Online the newly added memory blocks by default" - depends on MEMORY_HOTPLUG +choice + prompt "Memory Hotplug Default Online Type" + default MHP_DEFAULT_ONLINE_TYPE_OFFLINE help + Default memory type for hotplugged memory. + This option sets the default policy setting for memory hotplug onlining policy (/sys/devices/system/memory/auto_online_blocks) which determines what happens to newly added memory regions. Policy setting can always be changed at runtime. + + The default is 'offline'. + + Select offline to defer onlining to drivers and user policy. + Select auto to let the kernel choose what zones to utilize. + Select online_kernel to generally allow kernel usage of this memory. + Select online_movable to generally disallow kernel usage of this memory. + + Example kernel usage would be page structs and page tables. + See Documentation/admin-guide/mm/memory-hotplug.rst for more information. - Say Y here if you want all hot-plugged memory blocks to appear in - 'online' state by default. - Say N here if you want the default policy to keep all hot-plugged - memory blocks in 'offline' state. +config MHP_DEFAULT_ONLINE_TYPE_OFFLINE + bool "offline" + help + Hotplugged memory will not be onlined by default. + Choose this for systems with drivers and user policy that + handle onlining of hotplug memory policy. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO + bool "auto" + help + Select this if you want the kernel to automatically online + hotplugged memory into the zone it thinks is reasonable. + This memory may be utilized for kernel data. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL + bool "kernel" + help + Select this if you want the kernel to automatically online + hotplugged memory into a zone capable of being used for kernel + data. This typically means ZONE_NORMAL. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE + bool "movable" + help + Select this if you want the kernel to automatically online + hotplug memory into ZONE_MOVABLE. This memory will generally + not be utilized for kernel data. + + This should only be used when the admin knows sufficient + ZONE_NORMAL memory is available to describe hotplug memory, + otherwise hotplug memory may fail to online. For example, + sufficient kernel-capable memory (ZONE_NORMAL) must be + available to allocate page structs to describe ZONE_MOVABLE. + +endchoice config MEMORY_HOTREMOVE bool "Allow for memory hot remove" @@ -1301,6 +1344,21 @@ config ARCH_HAS_USER_SHADOW_STACK The architecture has hardware support for userspace shadow call stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). 
+config ARCH_SUPPORTS_PT_RECLAIM + def_bool n + +config PT_RECLAIM + bool "reclaim empty user page table pages" + default y + depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP + select MMU_GATHER_RCU_TABLE_FREE + help + Try to reclaim empty user page table pages in paths other than munmap + and exit_mmap path. + + Note: now only empty user PTE page table pages will be reclaimed. + + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index dba52bb0da8a..850386a67b3e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -146,3 +146,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o +obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o diff --git a/mm/cma.h b/mm/cma.h index ad61cc6dd439..8485ef893e99 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -36,7 +36,7 @@ struct cma { }; extern struct cma cma_areas[MAX_CMA_AREAS]; -extern unsigned cma_area_count; +extern unsigned int cma_area_count; static inline unsigned long cma_bitmap_maxno(struct cma *cma) { diff --git a/mm/compaction.c b/mm/compaction.c index a2b16b08cbbf..a9f1261972c8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -83,6 +83,7 @@ static inline bool is_via_compact_memory(int order) { return false; } static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags) { post_alloc_hook(page, order, __GFP_MOVABLE); + set_page_refcounted(page); return page; } #define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__)) @@ -1868,6 +1869,7 @@ again: dst = (struct folio *)freepage; post_alloc_hook(&dst->page, order, __GFP_MOVABLE); + set_page_refcounted(&dst->page); if (order) prep_compound_page(&dst->page, order); cc->nr_freepages -= 1 << order; @@ -2381,7 +2383,27 @@ static bool __compaction_suitable(struct zone *zone, int order, int highest_zoneidx, unsigned long wmark_target) { + pg_data_t __maybe_unused *pgdat = zone->zone_pgdat; + unsigned long sum, nr_pinned; unsigned long watermark; + + sum = node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_ANON) + + node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_ACTIVE_ANON) + + node_page_state(pgdat, NR_UNEVICTABLE); + + nr_pinned = node_page_state(pgdat, NR_FOLL_PIN_ACQUIRED) - + node_page_state(pgdat, NR_FOLL_PIN_RELEASED); + + /* + * Gup-pinned pages are non-migratable. After subtracting these pages, + * we need to check if the remaining pages are sufficient for memory + * compaction. + */ + if ((sum - nr_pinned) < (1 << order)) + return false; + /* * Watermarks for order-0 must be met for compaction to be able to * isolate free pages for migration targets. This means that the diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index d0357f3e9372..c213cf8b5638 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -71,36 +71,6 @@ config DAMON_SYSFS_KUNIT_TEST If unsure, say N. -config DAMON_DBGFS_DEPRECATED - bool "DAMON debugfs interface (DEPRECATED!)" - depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS - help - This builds the debugfs interface for DAMON. The user space admins - can use the interface for arbitrary data access monitoring. - - If unsure, say N. - - This is deprecated, so users should move to the sysfs interface - (DAMON_SYSFS). If you depend on this and cannot move, please report - your usecase to damon@lists.linux.dev and linux-mm@kvack.org. 
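To put made-up numbers on the pinned-page check added to __compaction_suitable() above: if a node's inactive, active and unevictable LRU counters sum to 4000 pages while 3990 pages are currently long-term pinned (NR_FOLL_PIN_ACQUIRED minus NR_FOLL_PIN_RELEASED), only 4000 - 3990 = 10 migratable candidates remain, so an order-4 request needing 16 pages is reported as unsuitable before any scanning starts. The figures are purely illustrative.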
- -config DAMON_DBGFS - bool - default y - depends on DAMON_DBGFS_DEPRECATED - -config DAMON_DBGFS_KUNIT_TEST - bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS - depends on DAMON_DBGFS && KUNIT=y - default KUNIT_ALL_TESTS - help - This builds the DAMON debugfs interface Kunit test suite. - - For more information on KUnit and unit tests in general, please refer - to the KUnit documentation. - - If unsure, say N. - config DAMON_RECLAIM bool "Build DAMON-based reclaim (DAMON_RECLAIM)" depends on DAMON_PADDR diff --git a/mm/damon/Makefile b/mm/damon/Makefile index f7add3f4aa79..8b49012ba8c3 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -4,6 +4,5 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o -obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o diff --git a/mm/damon/core.c b/mm/damon/core.c index 0776452a1abb..55a435bdd89d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -266,7 +266,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, } struct damos_filter *damos_new_filter(enum damos_filter_type type, - bool matching) + bool matching, bool allow) { struct damos_filter *filter; @@ -275,6 +275,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type, return NULL; filter->type = type; filter->matching = matching; + filter->allow = allow; INIT_LIST_HEAD(&filter->list); return filter; } @@ -504,6 +505,8 @@ struct damon_ctx *damon_new_ctx(void) ctx->next_ops_update_sis = 0; mutex_init(&ctx->kdamond_lock); + mutex_init(&ctx->call_control_lock); + mutex_init(&ctx->walk_control_lock); ctx->attrs.min_nr_regions = 10; ctx->attrs.max_nr_regions = 1000; @@ -803,7 +806,8 @@ static int damos_commit_filters(struct damos *dst, struct damos *src) continue; new_filter = damos_new_filter( - src_filter->type, src_filter->matching); + src_filter->type, src_filter->matching, + src_filter->allow); if (!new_filter) return -ENOMEM; damos_commit_filter_arg(new_filter, src_filter); @@ -1162,6 +1166,94 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) return err; } +static bool damon_is_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + return running; +} + +/** + * damon_call() - Invoke a given function on DAMON worker thread (kdamond). + * @ctx: DAMON context to call the function for. + * @control: Control variable of the call request. + * + * Ask DAMON worker thread (kdamond) of @ctx to call a function with an + * argument data that respectively passed via &damon_call_control->fn and + * &damon_call_control->data of @control, and wait until the kdamond finishes + * handling of the request. + * + * The kdamond executes the function with the argument in the main loop, just + * after a sampling of the iteration is finished. The function can hence + * safely access the internal data of the &struct damon_ctx without additional + * synchronization. The return value of the function will be saved in + * &damon_call_control->return_code. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) +{ + init_completion(&control->completion); + control->canceled = false; + + mutex_lock(&ctx->call_control_lock); + if (ctx->call_control) { + mutex_unlock(&ctx->call_control_lock); + return -EBUSY; + } + ctx->call_control = control; + mutex_unlock(&ctx->call_control_lock); + if (!damon_is_running(ctx)) + return -EINVAL; + wait_for_completion(&control->completion); + if (control->canceled) + return -ECANCELED; + return 0; +} + +/** + * damos_walk() - Invoke a given function for regions that DAMOS walks. + * @ctx: DAMON context to call the function for. + * @control: Control variable of the walk request. + * + * Ask DAMON worker thread (kdamond) of @ctx to call a function for each region + * that the kdamond will apply a DAMOS action to, and wait until the kdamond + * finishes handling of the request. + * + * The kdamond executes the given function in the main loop, for each region + * just after it applied any DAMOS actions of @ctx to it. The invocation is + * made only within one &damos->apply_interval_us since damos_walk() + * invocation, for each scheme. The given callback function can hence safely + * access the internal data of &struct damon_ctx and &struct damon_region that + * each of the schemes will apply the action to in the next interval, without + * additional synchronization against the kdamond. If every scheme of @ctx + * passed at least one &damos->apply_interval_us, kdamond marks the request as + * completed so that damos_walk() can wake up and return. + * + * Return: 0 on success, negative error code otherwise. + */ +int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) +{ + init_completion(&control->completion); + control->canceled = false; + mutex_lock(&ctx->walk_control_lock); + if (ctx->walk_control) { + mutex_unlock(&ctx->walk_control_lock); + return -EBUSY; + } + ctx->walk_control = control; + mutex_unlock(&ctx->walk_control_lock); + if (!damon_is_running(ctx)) + return -EINVAL; + wait_for_completion(&control->completion); + if (control->canceled) + return -ECANCELED; + return 0; +} + /* * Reset the aggregated monitoring results ('nr_accesses' of each region). */ @@ -1272,16 +1364,18 @@ static bool damos_skip_charged_region(struct damon_target *t, } static void damos_update_stat(struct damos *s, - unsigned long sz_tried, unsigned long sz_applied) + unsigned long sz_tried, unsigned long sz_applied, + unsigned long sz_ops_filter_passed) { s->stat.nr_tried++; s->stat.sz_tried += sz_tried; if (sz_applied) s->stat.nr_applied++; s->stat.sz_applied += sz_applied; + s->stat.sz_ops_filter_passed += sz_ops_filter_passed; } -static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, +static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos_filter *filter) { bool matched = false; @@ -1335,12 +1429,98 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, struct damos_filter *filter; damos_for_each_filter(filter, s) { - if (__damos_filter_out(ctx, t, r, filter)) - return true; + if (damos_filter_match(ctx, t, r, filter)) + return !filter->allow; } return false; } +/* + * damos_walk_call_walk() - Call &damos_walk_control->walk_fn. + * @ctx: The context of &damon_ctx->walk_control. + * @t: The monitoring target of @r that @s will be applied to. + * @r: The region of @t that @s will be applied to. + * @s: The scheme of @ctx that will be applied to @r.
+ * + * This function is called from kdamond whenever it asks the operation set to + * apply a DAMOS scheme action to a region. If a DAMOS walk request is + * installed by damos_walk() and not yet uninstalled, invoke it. + */ +static void damos_walk_call_walk(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, + unsigned long sz_filter_passed) +{ + struct damos_walk_control *control; + + mutex_lock(&ctx->walk_control_lock); + control = ctx->walk_control; + mutex_unlock(&ctx->walk_control_lock); + if (!control) + return; + control->walk_fn(control->data, ctx, t, r, s, sz_filter_passed); +} + +/* + * damos_walk_complete() - Complete DAMOS walk request if all walks are done. + * @ctx: The context of &damon_ctx->walk_control. + * @s: A scheme of @ctx for which all walks are now done. + * + * This function is called when kdamond finished applying the action of a DAMOS + * scheme to all regions that are eligible for the given &damos->apply_interval_us. + * If every scheme of @ctx including @s now finished walking for at least one + * &damos->apply_interval_us, this function marks the handling of the given + * DAMOS walk request as done, so that damos_walk() can wake up and return. + */ +static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s) +{ + struct damos *siter; + struct damos_walk_control *control; + + mutex_lock(&ctx->walk_control_lock); + control = ctx->walk_control; + mutex_unlock(&ctx->walk_control_lock); + if (!control) + return; + + s->walk_completed = true; + /* if all schemes completed, signal completion to walker */ + damon_for_each_scheme(siter, ctx) { + if (!siter->walk_completed) + return; + } + complete(&control->completion); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control = NULL; + mutex_unlock(&ctx->walk_control_lock); +} + +/* + * damos_walk_cancel() - Cancel the current DAMOS walk request. + * @ctx: The context of &damon_ctx->walk_control. + * + * This function is called when @ctx is deactivated by DAMOS watermarks, DAMOS + * walk is requested but there is no DAMOS scheme to walk for, or the kdamond + * is already out of the main loop and therefore going to be terminated, and hence + * cannot continue the walks. This function therefore marks the walk request + * as canceled, so that damos_walk() can wake up and return.
+ */ +static void damos_walk_cancel(struct damon_ctx *ctx) +{ + struct damos_walk_control *control; + + mutex_lock(&ctx->walk_control_lock); + control = ctx->walk_control; + mutex_unlock(&ctx->walk_control_lock); + + if (!control) + return; + control->canceled = true; + complete(&control->completion); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control = NULL; + mutex_unlock(&ctx->walk_control_lock); +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -1348,6 +1528,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; + unsigned long sz_ops_filter_passed = 0; int err = 0; /* * We plan to support multiple context per kdamond, as DAMON sysfs @@ -1393,8 +1574,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (!err) { trace_damos_before_apply(cidx, sidx, tidx, r, damon_nr_regions(t), do_trace); - sz_applied = c->ops.apply_scheme(c, t, r, s); + sz_applied = c->ops.apply_scheme(c, t, r, s, + &sz_ops_filter_passed); } + damos_walk_call_walk(c, t, r, s, sz_ops_filter_passed); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -1408,7 +1591,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, r->age = 0; update_stat: - damos_update_stat(s, sz, sz_applied); + damos_update_stat(s, sz, sz_applied, sz_ops_filter_passed); } static void damon_do_apply_schemes(struct damon_ctx *c, @@ -1550,7 +1733,7 @@ static unsigned long damos_quota_score(struct damos_quota *quota) static void damos_set_effective_quota(struct damos_quota *quota) { unsigned long throughput; - unsigned long esz; + unsigned long esz = ULONG_MAX; if (!quota->ms && list_empty("a->goals)) { quota->esz = quota->sz; @@ -1572,10 +1755,7 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->total_charged_ns; else throughput = PAGE_SIZE * 1024; - if (!list_empty("a->goals)) - esz = min(throughput * quota->ms, esz); - else - esz = throughput * quota->ms; + esz = min(throughput * quota->ms, esz); } if (quota->sz && quota->sz < esz) @@ -1666,6 +1846,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_scheme(s, c) { if (c->passed_sample_intervals < s->next_apply_sis) continue; + damos_walk_complete(c, s); s->next_apply_sis = c->passed_sample_intervals + (s->apply_interval_us ? s->apply_interval_us : c->attrs.aggr_interval) / sample_interval; @@ -1920,6 +2101,39 @@ static void kdamond_usleep(unsigned long usecs) usleep_range_idle(usecs, usecs + 1); } +/* + * kdamond_call() - handle damon_call_control. + * @ctx: The &struct damon_ctx of the kdamond. + * @cancel: Whether to cancel the invocation of the function. + * + * If there is a &struct damon_call_control request that registered via + * &damon_call() on @ctx, do or cancel the invocation of the function depending + * on @cancel. @cancel is set when the kdamond is deactivated by DAMOS + * watermarks, or the kdamond is already out of the main loop and therefore + * will be terminated. 
+ */ +static void kdamond_call(struct damon_ctx *ctx, bool cancel) +{ + struct damon_call_control *control; + int ret = 0; + + mutex_lock(&ctx->call_control_lock); + control = ctx->call_control; + mutex_unlock(&ctx->call_control_lock); + if (!control) + return; + if (cancel) { + control->canceled = true; + } else { + ret = control->fn(control->data); + control->return_code = ret; + } + complete(&control->completion); + mutex_lock(&ctx->call_control_lock); + ctx->call_control = NULL; + mutex_unlock(&ctx->call_control_lock); +} + /* Returns negative error code if it's not activated but should return */ static int kdamond_wait_activation(struct damon_ctx *ctx) { @@ -1944,6 +2158,8 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) if (ctx->callback.after_wmarks_check && ctx->callback.after_wmarks_check(ctx)) break; + kdamond_call(ctx, true); + damos_walk_cancel(ctx); } return -EBUSY; } @@ -2014,6 +2230,7 @@ static int kdamond_fn(void *data) if (ctx->callback.after_sampling && ctx->callback.after_sampling(ctx)) break; + kdamond_call(ctx, false); kdamond_usleep(sample_interval); ctx->passed_sample_intervals++; @@ -2036,6 +2253,8 @@ static int kdamond_fn(void *data) */ if (!list_empty(&ctx->schemes)) kdamond_apply_schemes(ctx); + else + damos_walk_cancel(ctx); sample_interval = ctx->attrs.sample_interval ? ctx->attrs.sample_interval : 1; @@ -2075,6 +2294,9 @@ done: ctx->kdamond = NULL; mutex_unlock(&ctx->kdamond_lock); + kdamond_call(ctx, true); + damos_walk_cancel(ctx); + mutex_lock(&damon_lock); nr_running_ctxs--; if (!nr_running_ctxs && running_exclusive_ctxs) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c deleted file mode 100644 index b4213bc47e44..000000000000 --- a/mm/damon/dbgfs.c +++ /dev/null @@ -1,1148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DAMON Debugfs Interface - * - * Author: SeongJae Park - */ - -#define pr_fmt(fmt) "damon-dbgfs: " fmt - -#include -#include -#include -#include -#include -#include -#include - -#define DAMON_DBGFS_DEPRECATION_NOTICE \ - "DAMON debugfs interface is deprecated, so users should move " \ - "to DAMON_SYSFS. If you cannot, please report your usecase to " \ - "damon@lists.linux.dev and linux-mm@kvack.org.\n" - -static struct damon_ctx **dbgfs_ctxs; -static int dbgfs_nr_ctxs; -static struct dentry **dbgfs_dirs; -static DEFINE_MUTEX(damon_dbgfs_lock); - -static void damon_dbgfs_warn_deprecation(void) -{ - pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE); -} - -/* - * Returns non-empty string on success, negative error code otherwise. 
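Stepping back from the debugfs removal for a moment: the call plumbing added above (damon_call(), kdamond_call() and its invocations from the kdamond main loop) is easiest to see from a caller's side. The following is a hypothetical sketch, not code from the patch; it relies only on the damon_call_control fields named in the kernel-doc above (fn, data, return_code) and on the damon_for_each_scheme() iterator, while struct sketch_req, sketch_count_schemes() and sketch_nr_schemes() are invented names. damos_walk() is driven the same way, except that its walk_fn is additionally handed each target, region, scheme and the number of bytes that passed the operations-layer filters.

/* Hypothetical usage sketch (not part of the patch). */
struct sketch_req {
	struct damon_ctx *ctx;
	unsigned int nr_schemes;
};

static int sketch_count_schemes(void *data)
{
	struct sketch_req *req = data;
	struct damos *s;

	/* Runs on the kdamond of req->ctx, so no extra locking is needed. */
	damon_for_each_scheme(s, req->ctx)
		req->nr_schemes++;
	return 0;
}

static int sketch_nr_schemes(struct damon_ctx *ctx, unsigned int *nr)
{
	struct sketch_req req = { .ctx = ctx };
	struct damon_call_control control = {
		.fn = sketch_count_schemes,
		.data = &req,
	};
	int err;

	err = damon_call(ctx, &control);	/* waits for the kdamond */
	if (err)	/* -EBUSY, -EINVAL or -ECANCELED, as documented above */
		return err;
	if (control.return_code)
		return control.return_code;
	*nr = req.nr_schemes;
	return 0;
}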
- */ -static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - ssize_t ret; - - /* We do not accept continuous write */ - if (*ppos) - return ERR_PTR(-EINVAL); - - kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return ERR_PTR(-ENOMEM); - - ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count); - if (ret != count) { - kfree(kbuf); - return ERR_PTR(-EIO); - } - kbuf[ret] = '\0'; - - return kbuf; -} - -static ssize_t dbgfs_attrs_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char kbuf[128]; - int ret; - - mutex_lock(&ctx->kdamond_lock); - ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", - ctx->attrs.sample_interval, ctx->attrs.aggr_interval, - ctx->attrs.ops_update_interval, - ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions); - mutex_unlock(&ctx->kdamond_lock); - - return simple_read_from_buffer(buf, count, ppos, kbuf, ret); -} - -static ssize_t dbgfs_attrs_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - struct damon_attrs attrs; - char *kbuf; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - if (sscanf(kbuf, "%lu %lu %lu %lu %lu", - &attrs.sample_interval, &attrs.aggr_interval, - &attrs.ops_update_interval, - &attrs.min_nr_regions, - &attrs.max_nr_regions) != 5) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - ret = damon_set_attrs(ctx, &attrs); - if (!ret) - ret = count; -unlock_out: - mutex_unlock(&ctx->kdamond_lock); -out: - kfree(kbuf); - return ret; -} - -/* - * Return corresponding dbgfs' scheme action value (int) for the given - * damos_action if the given damos_action value is valid and supported by - * dbgfs, negative error code otherwise. 
- */ -static int damos_action_to_dbgfs_scheme_action(enum damos_action action) -{ - switch (action) { - case DAMOS_WILLNEED: - return 0; - case DAMOS_COLD: - return 1; - case DAMOS_PAGEOUT: - return 2; - case DAMOS_HUGEPAGE: - return 3; - case DAMOS_NOHUGEPAGE: - return 4; - case DAMOS_STAT: - return 5; - default: - return -EINVAL; - } -} - -static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) -{ - struct damos *s; - int written = 0; - int rc; - - damon_for_each_scheme(s, c) { - rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - s->pattern.min_sz_region, - s->pattern.max_sz_region, - s->pattern.min_nr_accesses, - s->pattern.max_nr_accesses, - s->pattern.min_age_region, - s->pattern.max_age_region, - damos_action_to_dbgfs_scheme_action(s->action), - s->quota.ms, s->quota.sz, - s->quota.reset_interval, - s->quota.weight_sz, - s->quota.weight_nr_accesses, - s->quota.weight_age, - s->wmarks.metric, s->wmarks.interval, - s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat.nr_tried, s->stat.sz_tried, - s->stat.nr_applied, s->stat.sz_applied, - s->stat.qt_exceeds); - if (!rc) - return -ENOMEM; - - written += rc; - } - return written; -} - -static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - len = sprint_schemes(ctx, kbuf, count); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) -{ - ssize_t i; - - for (i = 0; i < nr_schemes; i++) - kfree(schemes[i]); - kfree(schemes); -} - -/* - * Return corresponding damos_action for the given dbgfs input for a scheme - * action if the input is valid, negative error code otherwise. - */ -static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action) -{ - switch (dbgfs_action) { - case 0: - return DAMOS_WILLNEED; - case 1: - return DAMOS_COLD; - case 2: - return DAMOS_PAGEOUT; - case 3: - return DAMOS_HUGEPAGE; - case 4: - return DAMOS_NOHUGEPAGE; - case 5: - return DAMOS_STAT; - default: - return -EINVAL; - } -} - -/* - * Converts a string into an array of struct damos pointers - * - * Returns an array of struct damos pointers that converted if the conversion - * success, or NULL otherwise. 
- */ -static struct damos **str_to_schemes(const char *str, ssize_t len, - ssize_t *nr_schemes) -{ - struct damos *scheme, **schemes; - const int max_nr_schemes = 256; - int pos = 0, parsed, ret; - unsigned int action_input; - enum damos_action action; - - schemes = kmalloc_array(max_nr_schemes, sizeof(scheme), - GFP_KERNEL); - if (!schemes) - return NULL; - - *nr_schemes = 0; - while (pos < len && *nr_schemes < max_nr_schemes) { - struct damos_access_pattern pattern = {}; - struct damos_quota quota = {}; - struct damos_watermarks wmarks; - - ret = sscanf(&str[pos], - "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", - &pattern.min_sz_region, &pattern.max_sz_region, - &pattern.min_nr_accesses, - &pattern.max_nr_accesses, - &pattern.min_age_region, - &pattern.max_age_region, - &action_input, "a.ms, - "a.sz, "a.reset_interval, - "a.weight_sz, "a.weight_nr_accesses, - "a.weight_age, &wmarks.metric, - &wmarks.interval, &wmarks.high, &wmarks.mid, - &wmarks.low, &parsed); - if (ret != 18) - break; - action = dbgfs_scheme_action_to_damos_action(action_input); - if ((int)action < 0) - goto fail; - - if (pattern.min_sz_region > pattern.max_sz_region || - pattern.min_nr_accesses > pattern.max_nr_accesses || - pattern.min_age_region > pattern.max_age_region) - goto fail; - - if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || - wmarks.mid < wmarks.low) - goto fail; - - pos += parsed; - scheme = damon_new_scheme(&pattern, action, 0, "a, - &wmarks, NUMA_NO_NODE); - if (!scheme) - goto fail; - - schemes[*nr_schemes] = scheme; - *nr_schemes += 1; - } - return schemes; -fail: - free_schemes_arr(schemes, *nr_schemes); - return NULL; -} - -static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - struct damos **schemes; - ssize_t nr_schemes = 0, ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - schemes = str_to_schemes(kbuf, count, &nr_schemes); - if (!schemes) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - damon_set_schemes(ctx, schemes, nr_schemes); - ret = count; - nr_schemes = 0; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - free_schemes_arr(schemes, nr_schemes); -out: - kfree(kbuf); - return ret; -} - -static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) -{ - struct damon_target *t; - int id; - int written = 0; - int rc; - - damon_for_each_target(t, ctx) { - if (damon_target_has_pid(ctx)) - /* Show pid numbers to debugfs users */ - id = pid_vnr(t->pid); - else - /* Show 42 for physical address space, just for fun */ - id = 42; - - rc = scnprintf(&buf[written], len - written, "%d ", id); - if (!rc) - return -ENOMEM; - written += rc; - } - if (written) - written -= 1; - written += scnprintf(&buf[written], len - written, "\n"); - return written; -} - -static ssize_t dbgfs_target_ids_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - ssize_t len; - char ids_buf[320]; - - mutex_lock(&ctx->kdamond_lock); - len = sprint_target_ids(ctx, ids_buf, 320); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - return len; - - return simple_read_from_buffer(buf, count, ppos, ids_buf, len); -} - -/* - * Converts a string into an integers array - * - * Returns an array of integers array if the conversion success, or NULL - * otherwise. 
- */ -static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints) -{ - int *array; - const int max_nr_ints = 32; - int nr; - int pos = 0, parsed, ret; - - *nr_ints = 0; - array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL); - if (!array) - return NULL; - while (*nr_ints < max_nr_ints && pos < len) { - ret = sscanf(&str[pos], "%d%n", &nr, &parsed); - pos += parsed; - if (ret != 1) - break; - array[*nr_ints] = nr; - *nr_ints += 1; - } - - return array; -} - -static void dbgfs_put_pids(struct pid **pids, int nr_pids) -{ - int i; - - for (i = 0; i < nr_pids; i++) - put_pid(pids[i]); -} - -/* - * Converts a string into an struct pid pointers array - * - * Returns an array of struct pid pointers if the conversion success, or NULL - * otherwise. - */ -static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids) -{ - int *ints; - ssize_t nr_ints; - struct pid **pids; - - *nr_pids = 0; - - ints = str_to_ints(str, len, &nr_ints); - if (!ints) - return NULL; - - pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL); - if (!pids) - goto out; - - for (; *nr_pids < nr_ints; (*nr_pids)++) { - pids[*nr_pids] = find_get_pid(ints[*nr_pids]); - if (!pids[*nr_pids]) { - dbgfs_put_pids(pids, *nr_pids); - kfree(ints); - kfree(pids); - return NULL; - } - } - -out: - kfree(ints); - return pids; -} - -/* - * dbgfs_set_targets() - Set monitoring targets. - * @ctx: monitoring context - * @nr_targets: number of targets - * @pids: array of target pids (size is same to @nr_targets) - * - * This function should not be called while the kdamond is running. @pids is - * ignored if the context is not configured to have pid in each target. On - * failure, reference counts of all pids in @pids are decremented. - * - * Return: 0 on success, negative error code otherwise. 
- */ -static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, - struct pid **pids) -{ - ssize_t i; - struct damon_target *t, *next; - - damon_for_each_target_safe(t, next, ctx) { - if (damon_target_has_pid(ctx)) - put_pid(t->pid); - damon_destroy_target(t); - } - - for (i = 0; i < nr_targets; i++) { - t = damon_new_target(); - if (!t) { - damon_for_each_target_safe(t, next, ctx) - damon_destroy_target(t); - if (damon_target_has_pid(ctx)) - dbgfs_put_pids(pids, nr_targets); - return -ENOMEM; - } - if (damon_target_has_pid(ctx)) - t->pid = pids[i]; - damon_add_target(ctx, t); - } - - return 0; -} - -static ssize_t dbgfs_target_ids_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - bool id_is_pid = true; - char *kbuf; - struct pid **target_pids = NULL; - ssize_t nr_targets; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - if (!strncmp(kbuf, "paddr\n", count)) { - id_is_pid = false; - nr_targets = 1; - } - - if (id_is_pid) { - target_pids = str_to_pids(kbuf, count, &nr_targets); - if (!target_pids) { - ret = -ENOMEM; - goto out; - } - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - if (id_is_pid) - dbgfs_put_pids(target_pids, nr_targets); - ret = -EBUSY; - goto unlock_out; - } - - /* remove previously set targets */ - dbgfs_set_targets(ctx, 0, NULL); - if (!nr_targets) { - ret = count; - goto unlock_out; - } - - /* Configure the context for the address space type */ - if (id_is_pid) - ret = damon_select_ops(ctx, DAMON_OPS_VADDR); - else - ret = damon_select_ops(ctx, DAMON_OPS_PADDR); - if (ret) - goto unlock_out; - - ret = dbgfs_set_targets(ctx, nr_targets, target_pids); - if (!ret) - ret = count; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - kfree(target_pids); -out: - kfree(kbuf); - return ret; -} - -static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) -{ - struct damon_target *t; - struct damon_region *r; - int target_idx = 0; - int written = 0; - int rc; - - damon_for_each_target(t, c) { - damon_for_each_region(r, t) { - rc = scnprintf(&buf[written], len - written, - "%d %lu %lu\n", - target_idx, r->ar.start, r->ar.end); - if (!rc) - return -ENOMEM; - written += rc; - } - target_idx++; - } - return written; -} - -static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - mutex_unlock(&ctx->kdamond_lock); - len = -EBUSY; - goto out; - } - - len = sprint_init_regions(ctx, kbuf, count); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static int add_init_region(struct damon_ctx *c, int target_idx, - struct damon_addr_range *ar) -{ - struct damon_target *t; - struct damon_region *r, *prev; - unsigned long idx = 0; - int rc = -EINVAL; - - if (ar->start >= ar->end) - return -EINVAL; - - damon_for_each_target(t, c) { - if (idx++ == target_idx) { - r = damon_new_region(ar->start, ar->end); - if (!r) - return -ENOMEM; - damon_add_region(r, t); - if (damon_nr_regions(t) > 1) { - prev = damon_prev_region(r); - if (prev->ar.end > r->ar.start) { - damon_destroy_region(r, t); - return -EINVAL; - } - } - rc = 0; - } - } - 
return rc; -} - -static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) -{ - struct damon_target *t; - struct damon_region *r, *next; - int pos = 0, parsed, ret; - int target_idx; - struct damon_addr_range ar; - int err; - - damon_for_each_target(t, c) { - damon_for_each_region_safe(r, next, t) - damon_destroy_region(r, t); - } - - while (pos < len) { - ret = sscanf(&str[pos], "%d %lu %lu%n", - &target_idx, &ar.start, &ar.end, &parsed); - if (ret != 3) - break; - err = add_init_region(c, target_idx, &ar); - if (err) - goto fail; - pos += parsed; - } - - return 0; - -fail: - damon_for_each_target(t, c) { - damon_for_each_region_safe(r, next, t) - damon_destroy_region(r, t); - } - return err; -} - -static ssize_t dbgfs_init_regions_write(struct file *file, - const char __user *buf, size_t count, - loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t ret = count; - int err; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - err = set_init_regions(ctx, kbuf, ret); - if (err) - ret = err; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - kfree(kbuf); - return ret; -} - -static ssize_t dbgfs_kdamond_pid_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) - len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid); - else - len = scnprintf(kbuf, count, "none\n"); - mutex_unlock(&ctx->kdamond_lock); - if (!len) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static int damon_dbgfs_open(struct inode *inode, struct file *file) -{ - damon_dbgfs_warn_deprecation(); - - file->private_data = inode->i_private; - - return nonseekable_open(inode, file); -} - -static const struct file_operations attrs_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_attrs_read, - .write = dbgfs_attrs_write, -}; - -static const struct file_operations schemes_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_schemes_read, - .write = dbgfs_schemes_write, -}; - -static const struct file_operations target_ids_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_target_ids_read, - .write = dbgfs_target_ids_write, -}; - -static const struct file_operations init_regions_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_init_regions_read, - .write = dbgfs_init_regions_write, -}; - -static const struct file_operations kdamond_pid_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_kdamond_pid_read, -}; - -static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) -{ - const char * const file_names[] = {"attrs", "schemes", "target_ids", - "init_regions", "kdamond_pid"}; - const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, - &target_ids_fops, &init_regions_fops, &kdamond_pid_fops}; - int i; - - for (i = 0; i < ARRAY_SIZE(file_names); i++) - debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); -} - -static void dbgfs_before_terminate(struct damon_ctx *ctx) -{ - struct damon_target *t, *next; - - if (!damon_target_has_pid(ctx)) - return; - - mutex_lock(&ctx->kdamond_lock); - damon_for_each_target_safe(t, next, ctx) { - put_pid(t->pid); - damon_destroy_target(t); - } - 
mutex_unlock(&ctx->kdamond_lock); -} - -static struct damon_ctx *dbgfs_new_ctx(void) -{ - struct damon_ctx *ctx; - - ctx = damon_new_ctx(); - if (!ctx) - return NULL; - - if (damon_select_ops(ctx, DAMON_OPS_VADDR) && - damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return NULL; - } - ctx->callback.before_terminate = dbgfs_before_terminate; - return ctx; -} - -static void dbgfs_destroy_ctx(struct damon_ctx *ctx) -{ - damon_destroy_ctx(ctx); -} - -static ssize_t damon_dbgfs_deprecated_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE; - - return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf)); -} - -/* - * Make a context of @name and create a debugfs directory for it. - * - * This function should be called while holding damon_dbgfs_lock. - * - * Returns 0 on success, negative error code otherwise. - */ -static int dbgfs_mk_context(char *name) -{ - struct dentry *root, **new_dirs, *new_dir; - struct damon_ctx **new_ctxs, *new_ctx; - - if (damon_nr_running_ctxs()) - return -EBUSY; - - new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) * - (dbgfs_nr_ctxs + 1), GFP_KERNEL); - if (!new_ctxs) - return -ENOMEM; - dbgfs_ctxs = new_ctxs; - - new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) * - (dbgfs_nr_ctxs + 1), GFP_KERNEL); - if (!new_dirs) - return -ENOMEM; - dbgfs_dirs = new_dirs; - - root = dbgfs_dirs[0]; - if (!root) - return -ENOENT; - - new_dir = debugfs_create_dir(name, root); - /* Below check is required for a potential duplicated name case */ - if (IS_ERR(new_dir)) - return PTR_ERR(new_dir); - dbgfs_dirs[dbgfs_nr_ctxs] = new_dir; - - new_ctx = dbgfs_new_ctx(); - if (!new_ctx) { - debugfs_remove(new_dir); - dbgfs_dirs[dbgfs_nr_ctxs] = NULL; - return -ENOMEM; - } - - dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx; - dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs], - dbgfs_ctxs[dbgfs_nr_ctxs]); - dbgfs_nr_ctxs++; - - return 0; -} - -static ssize_t dbgfs_mk_context_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - char *ctx_name; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - ctx_name = kmalloc(count + 1, GFP_KERNEL); - if (!ctx_name) { - kfree(kbuf); - return -ENOMEM; - } - - /* Trim white space */ - if (sscanf(kbuf, "%s", ctx_name) != 1) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&damon_dbgfs_lock); - ret = dbgfs_mk_context(ctx_name); - if (!ret) - ret = count; - mutex_unlock(&damon_dbgfs_lock); - -out: - kfree(kbuf); - kfree(ctx_name); - return ret; -} - -/* - * Remove a context of @name and its debugfs directory. - * - * This function should be called while holding damon_dbgfs_lock. - * - * Return 0 on success, negative error code otherwise. 
- */ -static int dbgfs_rm_context(char *name) -{ - struct dentry *root, *dir, **new_dirs; - struct inode *inode; - struct damon_ctx **new_ctxs; - int i, j; - int ret = 0; - - if (damon_nr_running_ctxs()) - return -EBUSY; - - root = dbgfs_dirs[0]; - if (!root) - return -ENOENT; - - dir = debugfs_lookup(name, root); - if (!dir) - return -ENOENT; - - inode = d_inode(dir); - if (!S_ISDIR(inode->i_mode)) { - ret = -EINVAL; - goto out_dput; - } - - new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), - GFP_KERNEL); - if (!new_dirs) { - ret = -ENOMEM; - goto out_dput; - } - - new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), - GFP_KERNEL); - if (!new_ctxs) { - ret = -ENOMEM; - goto out_new_dirs; - } - - for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { - if (dbgfs_dirs[i] == dir) { - debugfs_remove(dbgfs_dirs[i]); - dbgfs_destroy_ctx(dbgfs_ctxs[i]); - continue; - } - new_dirs[j] = dbgfs_dirs[i]; - new_ctxs[j++] = dbgfs_ctxs[i]; - } - - kfree(dbgfs_dirs); - kfree(dbgfs_ctxs); - - dbgfs_dirs = new_dirs; - dbgfs_ctxs = new_ctxs; - dbgfs_nr_ctxs--; - - goto out_dput; - -out_new_dirs: - kfree(new_dirs); -out_dput: - dput(dir); - return ret; -} - -static ssize_t dbgfs_rm_context_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - ssize_t ret; - char *ctx_name; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - ctx_name = kmalloc(count + 1, GFP_KERNEL); - if (!ctx_name) { - kfree(kbuf); - return -ENOMEM; - } - - /* Trim white space */ - if (sscanf(kbuf, "%s", ctx_name) != 1) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&damon_dbgfs_lock); - ret = dbgfs_rm_context(ctx_name); - if (!ret) - ret = count; - mutex_unlock(&damon_dbgfs_lock); - -out: - kfree(kbuf); - kfree(ctx_name); - return ret; -} - -static ssize_t dbgfs_monitor_on_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - char monitor_on_buf[5]; - bool monitor_on = damon_nr_running_ctxs() != 0; - int len; - - len = scnprintf(monitor_on_buf, 5, monitor_on ? 
"on\n" : "off\n"); - - return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len); -} - -static ssize_t dbgfs_monitor_on_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - ssize_t ret; - char *kbuf; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - /* Remove white space */ - if (sscanf(kbuf, "%s", kbuf) != 1) { - kfree(kbuf); - return -EINVAL; - } - - mutex_lock(&damon_dbgfs_lock); - if (!strncmp(kbuf, "on", count)) { - int i; - - for (i = 0; i < dbgfs_nr_ctxs; i++) { - if (damon_targets_empty(dbgfs_ctxs[i])) { - kfree(kbuf); - mutex_unlock(&damon_dbgfs_lock); - return -EINVAL; - } - } - ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true); - } else if (!strncmp(kbuf, "off", count)) { - ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); - } else { - ret = -EINVAL; - } - mutex_unlock(&damon_dbgfs_lock); - - if (!ret) - ret = count; - kfree(kbuf); - return ret; -} - -static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file) -{ - damon_dbgfs_warn_deprecation(); - return nonseekable_open(inode, file); -} - -static const struct file_operations deprecated_fops = { - .read = damon_dbgfs_deprecated_read, -}; - -static const struct file_operations mk_contexts_fops = { - .open = damon_dbgfs_static_file_open, - .write = dbgfs_mk_context_write, -}; - -static const struct file_operations rm_contexts_fops = { - .open = damon_dbgfs_static_file_open, - .write = dbgfs_rm_context_write, -}; - -static const struct file_operations monitor_on_fops = { - .open = damon_dbgfs_static_file_open, - .read = dbgfs_monitor_on_read, - .write = dbgfs_monitor_on_write, -}; - -static int __init __damon_dbgfs_init(void) -{ - struct dentry *dbgfs_root; - const char * const file_names[] = {"mk_contexts", "rm_contexts", - "monitor_on_DEPRECATED", "DEPRECATED"}; - const struct file_operations *fops[] = {&mk_contexts_fops, - &rm_contexts_fops, &monitor_on_fops, &deprecated_fops}; - int i; - - dbgfs_root = debugfs_create_dir("damon", NULL); - - for (i = 0; i < ARRAY_SIZE(file_names); i++) - debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL, - fops[i]); - dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); - - dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL); - if (!dbgfs_dirs) { - debugfs_remove(dbgfs_root); - return -ENOMEM; - } - dbgfs_dirs[0] = dbgfs_root; - - return 0; -} - -/* - * Functions for the initialization - */ - -static int __init damon_dbgfs_init(void) -{ - int rc = -ENOMEM; - - mutex_lock(&damon_dbgfs_lock); - dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); - if (!dbgfs_ctxs) - goto out; - dbgfs_ctxs[0] = dbgfs_new_ctx(); - if (!dbgfs_ctxs[0]) { - kfree(dbgfs_ctxs); - goto out; - } - dbgfs_nr_ctxs = 1; - - rc = __damon_dbgfs_init(); - if (rc) { - kfree(dbgfs_ctxs[0]); - kfree(dbgfs_ctxs); - pr_err("%s: dbgfs init failed\n", __func__); - } - -out: - mutex_unlock(&damon_dbgfs_lock); - return rc; -} - -module_init(damon_dbgfs_init); - -#include "tests/dbgfs-kunit.h" diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index a9ff35341d65..6b4397de4199 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -198,7 +198,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static bool __damos_pa_filter_out(struct damos_filter *filter, +static bool damos_pa_filter_match(struct damos_filter *filter, struct folio *folio) { bool matched = false; @@ -237,13 +237,14 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) struct damos_filter *filter; 
damos_for_each_filter(filter, scheme) { - if (__damos_pa_filter_out(filter, folio)) - return true; + if (damos_pa_filter_match(filter, folio)) + return !filter->allow; } return false; } -static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) +static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) { unsigned long addr, applied; LIST_HEAD(folio_list); @@ -258,7 +259,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) } } if (install_young_filter) { - filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true); + filter = damos_new_filter( + DAMOS_FILTER_TYPE_YOUNG, true, false); if (!filter) return 0; damos_add_filter(s, filter); @@ -272,6 +274,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); folio_clear_referenced(folio); folio_test_clear_young(folio); @@ -292,7 +296,8 @@ put_folio: } static inline unsigned long damon_pa_mark_accessed_or_deactivate( - struct damon_region *r, struct damos *s, bool mark_accessed) + struct damon_region *r, struct damos *s, bool mark_accessed, + unsigned long *sz_filter_passed) { unsigned long addr, applied = 0; @@ -304,6 +309,8 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); if (mark_accessed) folio_mark_accessed(folio); @@ -317,15 +324,17 @@ put_folio: } static unsigned long damon_pa_mark_accessed(struct damon_region *r, - struct damos *s) + struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, true); + return damon_pa_mark_accessed_or_deactivate(r, s, true, + sz_filter_passed); } static unsigned long damon_pa_deactivate_pages(struct damon_region *r, - struct damos *s) + struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, false); + return damon_pa_mark_accessed_or_deactivate(r, s, false, + sz_filter_passed); } static unsigned int __damon_pa_migrate_folio_list( @@ -449,7 +458,8 @@ static unsigned long damon_pa_migrate_pages(struct list_head *folio_list, return nr_migrated; } -static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s) +static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) { unsigned long addr, applied; LIST_HEAD(folio_list); @@ -462,6 +472,8 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s) if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); if (!folio_isolate_lru(folio)) goto put_folio; @@ -474,23 +486,56 @@ put_folio: return applied * PAGE_SIZE; } +static bool damon_pa_scheme_has_filter(struct damos *s) +{ + struct damos_filter *f; + + damos_for_each_filter(f, s) + return true; + return false; +} + +static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) +{ + unsigned long addr; + LIST_HEAD(folio_list); + + if (!damon_pa_scheme_has_filter(s)) + return 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct folio *folio = damon_get_folio(PHYS_PFN(addr)); + + if (!folio) + continue; + + if (damos_pa_filter_out(s, folio)) + goto put_folio; + else + *sz_filter_passed += folio_size(folio); +put_folio: + folio_put(folio); + } + return 0; +} static 
unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damos *scheme, unsigned long *sz_filter_passed) { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pa_pageout(r, scheme); + return damon_pa_pageout(r, scheme, sz_filter_passed); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r, scheme); + return damon_pa_mark_accessed(r, scheme, sz_filter_passed); case DAMOS_LRU_DEPRIO: - return damon_pa_deactivate_pages(r, scheme); + return damon_pa_deactivate_pages(r, scheme, sz_filter_passed); case DAMOS_MIGRATE_HOT: case DAMOS_MIGRATE_COLD: - return damon_pa_migrate(r, scheme); + return damon_pa_migrate(r, scheme, sz_filter_passed); case DAMOS_STAT: - break; + return damon_pa_stat(r, scheme, sz_filter_passed); default: /* DAMOS actions that not yet supported by 'paddr'. */ break; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 9e0077a9404e..a675150965e0 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -221,7 +221,7 @@ static int damon_reclaim_apply_parameters(void) } if (skip_anon) { - filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false); if (!filter) goto out; damos_add_filter(scheme, filter); diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 9a18f3c535d3..70d84bdc9f5f 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -45,19 +45,13 @@ void damon_sysfs_schemes_update_stats( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); -int damon_sysfs_schemes_update_regions_start( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx, bool total_bytes_only); - -void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx); - -bool damos_sysfs_regions_upd_done(void); - -int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); +void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, + bool total_bytes_only, unsigned long sz_filter_passed); int damon_sysfs_schemes_clear_regions( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx); + struct damon_sysfs_schemes *sysfs_schemes); int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index b095457380b5..98f93ae9f59e 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -19,6 +19,7 @@ struct damon_sysfs_scheme_region { struct damon_addr_range ar; unsigned int nr_accesses; unsigned int age; + unsigned long sz_filter_passed; struct list_head list; }; @@ -74,6 +75,15 @@ static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr, return sysfs_emit(buf, "%u\n", region->age); } +static ssize_t sz_filter_passed_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->sz_filter_passed); +} + static void damon_sysfs_scheme_region_release(struct kobject *kobj) { struct damon_sysfs_scheme_region *region = container_of(kobj, @@ -95,11 +105,15 @@ static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr = static struct kobj_attribute damon_sysfs_scheme_region_age_attr = __ATTR_RO_MODE(age, 0400); +static struct kobj_attribute 
damon_sysfs_scheme_region_sz_filter_passed_attr = + __ATTR_RO_MODE(sz_filter_passed, 0400); + static struct attribute *damon_sysfs_scheme_region_attrs[] = { &damon_sysfs_scheme_region_start_attr.attr, &damon_sysfs_scheme_region_end_attr.attr, &damon_sysfs_scheme_region_nr_accesses_attr.attr, &damon_sysfs_scheme_region_age_attr.attr, + &damon_sysfs_scheme_region_sz_filter_passed_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); @@ -114,55 +128,11 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = { * scheme regions directory */ -/* - * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update - * status - * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request. - * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started. - * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished. - * - * Each DAMON-based operation scheme (&struct damos) has its own apply - * interval, and we need to expose the scheme tried regions based on only - * single snapshot. For this, we keep the tried regions update status for each - * scheme. The status becomes 'idle' at the beginning. - * - * Once the tried regions update request is received, the request handling - * start function (damon_sysfs_scheme_update_regions_start()) sets the status - * of all schemes as 'idle' again, and register ->before_damos_apply() - * callback. - * - * Then, the first followup ->before_damos_apply() callback - * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first - * ->after_sampling() or ->after_aggregation() callback - * (damon_sysfs_cmd_request_callback()) after the call is called only after - * the scheme is completely applied to the given snapshot. Hence the callback - * knows the situation by showing 'started' status, and sets the status as - * 'finished'. Then, damon_sysfs_before_damos_apply() understands the - * situation by showing the 'finished' status and do nothing. - * - * If DAMOS is not applied to any region due to any reasons including the - * access pattern, the watermarks, the quotas, and the filters, - * ->before_damos_apply() will not be called back. Until the situation is - * changed, the update will not be finished. To avoid this, - * damon_sysfs_after_sampling() set the status as 'finished' if more than two - * apply intervals of the scheme is passed while the state is 'idle'. - * - * Finally, the tried regions request handling finisher function - * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks. 
- */ -enum damos_sysfs_regions_upd_status { - DAMOS_TRIED_REGIONS_UPD_IDLE, - DAMOS_TRIED_REGIONS_UPD_STARTED, - DAMOS_TRIED_REGIONS_UPD_FINISHED, -}; - struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; unsigned long total_bytes; - enum damos_sysfs_regions_upd_status upd_status; - unsigned long upd_timeout_jiffies; }; static struct damon_sysfs_scheme_regions * @@ -178,7 +148,6 @@ damon_sysfs_scheme_regions_alloc(void) INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; regions->total_bytes = 0; - regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; return regions; } @@ -233,6 +202,7 @@ struct damon_sysfs_stats { unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; }; @@ -277,6 +247,15 @@ static ssize_t sz_applied_show(struct kobject *kobj, return sysfs_emit(buf, "%lu\n", stats->sz_applied); } +static ssize_t sz_ops_filter_passed_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_ops_filter_passed); +} + static ssize_t qt_exceeds_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -303,6 +282,9 @@ static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = __ATTR_RO_MODE(sz_applied, 0400); +static struct kobj_attribute damon_sysfs_stats_sz_ops_filter_passed_attr = + __ATTR_RO_MODE(sz_ops_filter_passed, 0400); + static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = __ATTR_RO_MODE(qt_exceeds, 0400); @@ -311,6 +293,7 @@ static struct attribute *damon_sysfs_stats_attrs[] = { &damon_sysfs_stats_sz_tried_attr.attr, &damon_sysfs_stats_nr_applied_attr.attr, &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_sz_ops_filter_passed_attr.attr, &damon_sysfs_stats_qt_exceeds_attr.attr, NULL, }; @@ -330,6 +313,7 @@ struct damon_sysfs_scheme_filter { struct kobject kobj; enum damos_filter_type type; bool matching; + bool allow; char *memcg_path; struct damon_addr_range addr_range; int target_idx; @@ -402,6 +386,30 @@ static ssize_t matching_store(struct kobject *kobj, return count; } +static ssize_t allow_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->allow ? 
'Y' : 'N'); +} + +static ssize_t allow_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + bool allow; + int err = kstrtobool(buf, &allow); + + if (err) + return err; + + filter->allow = allow; + return count; +} + static ssize_t memcg_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -499,6 +507,9 @@ static struct kobj_attribute damon_sysfs_scheme_filter_type_attr = static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr = __ATTR_RW_MODE(matching, 0600); +static struct kobj_attribute damon_sysfs_scheme_filter_allow_attr = + __ATTR_RW_MODE(allow, 0600); + static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr = __ATTR_RW_MODE(memcg_path, 0600); @@ -514,6 +525,7 @@ static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr = static struct attribute *damon_sysfs_scheme_filter_attrs[] = { &damon_sysfs_scheme_filter_type_attr.attr, &damon_sysfs_scheme_filter_matching_attr.attr, + &damon_sysfs_scheme_filter_allow_attr.attr, &damon_sysfs_scheme_filter_memcg_path_attr.attr, &damon_sysfs_scheme_filter_addr_start_attr.attr, &damon_sysfs_scheme_filter_addr_end_attr.attr, @@ -1918,7 +1930,8 @@ static int damon_sysfs_add_scheme_filters(struct damos *scheme, sysfs_filters->filters_arr[i]; struct damos_filter *filter = damos_new_filter(sysfs_filter->type, - sysfs_filter->matching); + sysfs_filter->matching, + sysfs_filter->allow); int err; if (!filter) @@ -2122,32 +2135,32 @@ void damon_sysfs_schemes_update_stats( sysfs_stats->sz_tried = scheme->stat.sz_tried; sysfs_stats->nr_applied = scheme->stat.nr_applied; sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->sz_ops_filter_passed = + scheme->stat.sz_ops_filter_passed; sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } } -/* - * damon_sysfs_schemes that need to update its schemes regions dir. Protected - * by damon_sysfs_lock +/** + * damos_sysfs_populate_region_dir() - Populate a schemes tried region dir. + * @sysfs_schemes: Schemes directory to populate regions directory. + * @ctx: Corresponding DAMON context. + * @t: DAMON target of @r. + * @r: DAMON region to populate the directory for. + * @s: Corresponding scheme. + * @total_bytes_only: Whether the request is for bytes update only. + * @sz_filter_passed: Bytes of @r that passed filters of @s. + * + * Called from DAMOS walk callback while holding damon_sysfs_lock. */ -static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; -static int damon_sysfs_schemes_region_idx; -static bool damos_regions_upd_total_bytes_only; - -/* - * DAMON callback that called before damos apply. While this callback is - * registered, damon_sysfs_lock should be held to ensure the regions - * directories exist. 
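Since allow_store() above parses its input with kstrtobool(), the new file takes the usual Y/N (or 1/0) spellings. A hedged user-space sketch that installs a single filter and flips it to allow-mode follows; the directory indices and the "anon" type string are assumptions for illustration, and the scheme directory must already exist:

/* Sketch: configure one DAMOS filter that explicitly allows anonymous memory. */
#include <stdio.h>

#define FILTERS \
        "/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/filters"

static int write_str(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");
        int ret = 0;

        if (!f)
                return -1;
        if (fputs(val, f) == EOF)
                ret = -1;
        fclose(f);
        return ret;
}

int main(void)
{
        /* create one filter directory, then describe the filter */
        if (write_str(FILTERS "/nr_filters", "1"))
                return 1;
        write_str(FILTERS "/0/type", "anon");   /* assumed type string */
        write_str(FILTERS "/0/matching", "Y");  /* filter targets memory of the type */
        write_str(FILTERS "/0/allow", "Y");     /* let the action apply to it */
        return 0;
}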
- */ -static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - struct damos *s) +void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, bool total_bytes_only, + unsigned long sz_filter_passed) { struct damos *scheme; struct damon_sysfs_scheme_regions *sysfs_regions; struct damon_sysfs_scheme_region *region; - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; int schemes_idx = 0; damon_for_each_scheme(scheme, ctx) { @@ -2158,152 +2171,40 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, /* user could have removed the scheme sysfs dir */ if (schemes_idx >= sysfs_schemes->nr) - return 0; + return; sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; - if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED) - return 0; - if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE) - sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED; sysfs_regions->total_bytes += r->ar.end - r->ar.start; - if (damos_regions_upd_total_bytes_only) - return 0; + if (total_bytes_only) + return; region = damon_sysfs_scheme_region_alloc(r); if (!region) - return 0; + return; + region->sz_filter_passed = sz_filter_passed; list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; if (kobject_init_and_add(®ion->kobj, &damon_sysfs_scheme_region_ktype, &sysfs_regions->kobj, "%d", - damon_sysfs_schemes_region_idx++)) { + sysfs_regions->nr_regions++)) { kobject_put(®ion->kobj); } - return 0; -} - -/* - * DAMON callback that called after each accesses sampling. While this - * callback is registered, damon_sysfs_lock should be held to ensure the - * regions directories exist. 
- */ -void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx) -{ - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; - struct damon_sysfs_scheme_regions *sysfs_regions; - int i; - - for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - if (sysfs_regions->upd_status == - DAMOS_TRIED_REGIONS_UPD_STARTED || - time_after(jiffies, - sysfs_regions->upd_timeout_jiffies)) - sysfs_regions->upd_status = - DAMOS_TRIED_REGIONS_UPD_FINISHED; - } } /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_clear_regions( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) + struct damon_sysfs_schemes *sysfs_schemes) { - struct damos *scheme; - int schemes_idx = 0; + int i; - damon_for_each_scheme(scheme, ctx) { + for (i = 0; i < sysfs_schemes->nr; i++) { struct damon_sysfs_scheme *sysfs_scheme; - /* user could have removed the scheme sysfs dir */ - if (schemes_idx >= sysfs_schemes->nr) - break; - - sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; + sysfs_scheme = sysfs_schemes->schemes_arr[i]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); sysfs_scheme->tried_regions->total_bytes = 0; } return 0; } - -static struct damos *damos_sysfs_nth_scheme(int n, struct damon_ctx *ctx) -{ - struct damos *scheme; - int i = 0; - - damon_for_each_scheme(scheme, ctx) { - if (i == n) - return scheme; - i++; - } - return NULL; -} - -static void damos_tried_regions_init_upd_status( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) -{ - int i; - struct damos *scheme; - struct damon_sysfs_scheme_regions *sysfs_regions; - - for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - scheme = damos_sysfs_nth_scheme(i, ctx); - if (!scheme) { - sysfs_regions->upd_status = - DAMOS_TRIED_REGIONS_UPD_FINISHED; - continue; - } - sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; - sysfs_regions->upd_timeout_jiffies = jiffies + - 2 * usecs_to_jiffies(scheme->apply_interval_us ? - scheme->apply_interval_us : - ctx->attrs.aggr_interval); - } -} - -/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ -int damon_sysfs_schemes_update_regions_start( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx, bool total_bytes_only) -{ - damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); - damon_sysfs_schemes_for_damos_callback = sysfs_schemes; - damos_tried_regions_init_upd_status(sysfs_schemes, ctx); - damos_regions_upd_total_bytes_only = total_bytes_only; - ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; - return 0; -} - -bool damos_sysfs_regions_upd_done(void) -{ - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; - struct damon_sysfs_scheme_regions *sysfs_regions; - int i; - - for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - if (sysfs_regions->upd_status != - DAMOS_TRIED_REGIONS_UPD_FINISHED) - return false; - } - return true; -} - -/* - * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. 
Caller - * should unlock damon_sysfs_lock which held before - * damon_sysfs_schemes_update_regions_start() - */ -int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) -{ - damon_sysfs_schemes_for_damos_callback = NULL; - ctx->callback.before_damos_apply = NULL; - damon_sysfs_schemes_region_idx = 0; - return 0; -} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 58145d59881d..deeab04d3b46 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1181,25 +1181,9 @@ static int damon_sysfs_add_targets(struct damon_ctx *ctx, return 0; } -static bool damon_sysfs_schemes_regions_updating; - static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - struct damon_sysfs_kdamond *kdamond; - enum damon_sysfs_cmd cmd; - - /* damon_sysfs_schemes_update_regions_stop() might not yet called */ - kdamond = damon_sysfs_cmd_request.kdamond; - cmd = damon_sysfs_cmd_request.cmd; - if (kdamond && ctx == kdamond->damon_ctx && - (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS || - cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES) && - damon_sysfs_schemes_regions_updating) { - damon_sysfs_schemes_update_regions_stop(ctx); - damon_sysfs_schemes_regions_updating = false; - mutex_unlock(&damon_sysfs_lock); - } if (!damon_target_has_pid(ctx)) return; @@ -1214,57 +1198,24 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) /* * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. - * @kdamond: The kobject wrapper that associated to the kdamond thread. + * @data: The kobject wrapper that associated to the kdamond thread. * * This function reads the schemes stats of specific kdamond and update the * related values for sysfs files. This function should be called from DAMON - * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON - * contexts-internal data and DAMON sysfs variables. + * worker thread,to safely access the DAMON contexts-internal data. Caller + * should also ensure holding ``damon_syfs_lock``, and ->damon_ctx of @data is + * not NULL but a valid pointer, to safely access DAMON sysfs variables. 
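With the stats handler above now running in the kdamond worker context, user space still drives it exactly as before: write the command to the kdamond 'state' file and read the stat files once the write returns. A minimal sketch for the new sz_ops_filter_passed counter; the directory indices are assumptions, and the command string is derived from the DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS name used below:

/* Sketch: refresh scheme stats and read sz_ops_filter_passed. */
#include <stdio.h>

#define KDAMOND "/sys/kernel/mm/damon/admin/kdamonds/0"
#define STAT    KDAMOND "/contexts/0/schemes/0/stats/sz_ops_filter_passed"

int main(void)
{
        unsigned long bytes = 0;
        FILE *f;

        f = fopen(KDAMOND "/state", "w");
        if (!f)
                return 1;
        fputs("update_schemes_stats", f);
        fclose(f);

        f = fopen(STAT, "r");
        if (!f)
                return 1;
        if (fscanf(f, "%lu", &bytes) == 1)
                printf("%lu bytes passed ops-level filters so far\n", bytes);
        fclose(f);
        return 0;
}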
*/ -static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +static int damon_sysfs_upd_schemes_stats(void *data) { + struct damon_sysfs_kdamond *kdamond = data; struct damon_ctx *ctx = kdamond->damon_ctx; - if (!ctx) - return -EINVAL; damon_sysfs_schemes_update_stats( kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } -static int damon_sysfs_upd_schemes_regions_start( - struct damon_sysfs_kdamond *kdamond, bool total_bytes_only) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_update_regions_start( - kdamond->contexts->contexts_arr[0]->schemes, ctx, - total_bytes_only); -} - -static int damon_sysfs_upd_schemes_regions_stop( - struct damon_sysfs_kdamond *kdamond) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_update_regions_stop(ctx); -} - -static int damon_sysfs_clear_schemes_regions( - struct damon_sysfs_kdamond *kdamond) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_clear_regions( - kdamond->contexts->contexts_arr[0]->schemes, ctx); -} - static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1318,9 +1269,9 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) return err; } -static int damon_sysfs_commit_schemes_quota_goals( - struct damon_sysfs_kdamond *sysfs_kdamond) +static int damon_sysfs_commit_schemes_quota_goals(void *data) { + struct damon_sysfs_kdamond *sysfs_kdamond = data; struct damon_ctx *ctx; struct damon_sysfs_context *sysfs_ctx; @@ -1338,20 +1289,18 @@ static int damon_sysfs_commit_schemes_quota_goals( /* * damon_sysfs_upd_schemes_effective_quotas() - Update schemes effective quotas * sysfs files. - * @kdamond: The kobject wrapper that associated to the kdamond thread. + * @data: The kobject wrapper that associated to the kdamond thread. * * This function reads the schemes' effective quotas of specific kdamond and * update the related values for sysfs files. This function should be called * from DAMON callbacks while holding ``damon_syfs_lock``, to safely access the * DAMON contexts-internal data and DAMON sysfs variables. 
*/ -static int damon_sysfs_upd_schemes_effective_quotas( - struct damon_sysfs_kdamond *kdamond) +static int damon_sysfs_upd_schemes_effective_quotas(void *data) { + struct damon_sysfs_kdamond *kdamond = data; struct damon_ctx *ctx = kdamond->damon_ctx; - if (!ctx) - return -EINVAL; damos_sysfs_update_effective_quotas( kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; @@ -1371,67 +1320,27 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active, bool after_aggregation) { struct damon_sysfs_kdamond *kdamond; - bool total_bytes_only = false; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ - if (!damon_sysfs_schemes_regions_updating && - !mutex_trylock(&damon_sysfs_lock)) + if (!mutex_trylock(&damon_sysfs_lock)) return 0; kdamond = damon_sysfs_cmd_request.kdamond; if (!kdamond || kdamond->damon_ctx != c) goto out; switch (damon_sysfs_cmd_request.cmd) { - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: - err = damon_sysfs_upd_schemes_stats(kdamond); - break; case DAMON_SYSFS_CMD_COMMIT: if (!after_aggregation) goto out; err = damon_sysfs_commit_input(kdamond); break; - case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: - err = damon_sysfs_commit_schemes_quota_goals(kdamond); - break; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: - total_bytes_only = true; - fallthrough; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: - if (!damon_sysfs_schemes_regions_updating) { - err = damon_sysfs_upd_schemes_regions_start(kdamond, - total_bytes_only); - if (!err) { - damon_sysfs_schemes_regions_updating = true; - goto keep_lock_out; - } - } else { - damos_sysfs_mark_finished_regions_updates(c); - /* - * Continue regions updating if DAMON is till - * active and the update for all schemes is not - * finished. - */ - if (active && !damos_sysfs_regions_upd_done()) - goto keep_lock_out; - err = damon_sysfs_upd_schemes_regions_stop(kdamond); - damon_sysfs_schemes_regions_updating = false; - } - break; - case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: - err = damon_sysfs_clear_schemes_regions(kdamond); - break; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: - err = damon_sysfs_upd_schemes_effective_quotas(kdamond); - break; default: break; } /* Mark the request as invalid now. 
*/ damon_sysfs_cmd_request.kdamond = NULL; out: - if (!damon_sysfs_schemes_regions_updating) - mutex_unlock(&damon_sysfs_lock); -keep_lock_out: + mutex_unlock(&damon_sysfs_lock); return err; } @@ -1525,6 +1434,58 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) */ } +static int damon_sysfs_damon_call(int (*fn)(void *data), + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_call_control call_control = {}; + + if (!kdamond->damon_ctx) + return -EINVAL; + call_control.fn = fn; + call_control.data = kdamond; + return damon_call(kdamond->damon_ctx, &call_control); +} + +struct damon_sysfs_schemes_walk_data { + struct damon_sysfs_kdamond *sysfs_kdamond; + bool total_bytes_only; +}; + +/* populate the region directory */ +static void damon_sysfs_schemes_tried_regions_upd_one(void *data, struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s, unsigned long sz_filter_passed) +{ + struct damon_sysfs_schemes_walk_data *walk_data = data; + struct damon_sysfs_kdamond *sysfs_kdamond = walk_data->sysfs_kdamond; + + damos_sysfs_populate_region_dir( + sysfs_kdamond->contexts->contexts_arr[0]->schemes, + ctx, t, r, s, walk_data->total_bytes_only, + sz_filter_passed); +} + +static int damon_sysfs_update_schemes_tried_regions( + struct damon_sysfs_kdamond *sysfs_kdamond, bool total_bytes_only) +{ + struct damon_sysfs_schemes_walk_data walk_data = { + .sysfs_kdamond = sysfs_kdamond, + .total_bytes_only = total_bytes_only, + }; + struct damos_walk_control control = { + .walk_fn = damon_sysfs_schemes_tried_regions_upd_one, + .data = &walk_data, + }; + struct damon_ctx *ctx = sysfs_kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + + damon_sysfs_schemes_clear_regions( + sysfs_kdamond->contexts->contexts_arr[0]->schemes); + return damos_walk(ctx, &control); +} + /* * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond. * @cmd: The command to handle. 
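damon_sysfs_update_schemes_tried_regions() above is the in-tree template for the new walk API: fill a struct damos_walk_control with a callback plus an opaque cookie, hand it to damos_walk() against a live context, and the callback is invoked once per scheme-applied region together with the bytes that passed the operations-layer filters. A hedged, module-style sketch of another user; my_walk_fn(), my_report() and the already-running ctx pointer are illustrative, not part of this patch:

/* Sketch: sum up how much memory passed a scheme's ops-level filters. */
#include <linux/damon.h>
#include <linux/printk.h>

static void my_walk_fn(void *data, struct damon_ctx *ctx,
                       struct damon_target *t, struct damon_region *r,
                       struct damos *s, unsigned long sz_filter_passed)
{
        unsigned long *total = data;

        *total += sz_filter_passed;
        pr_info("region [%lu, %lu): %lu bytes passed filters\n",
                r->ar.start, r->ar.end, sz_filter_passed);
}

static int my_report(struct damon_ctx *ctx)
{
        unsigned long total = 0;
        struct damos_walk_control control = {
                .walk_fn = my_walk_fn,
                .data = &total,
        };
        int err;

        err = damos_walk(ctx, &control);        /* returns once the walk is serviced */
        if (!err)
                pr_info("total: %lu bytes\n", total);
        return err;
}

damon_sysfs_damon_call() above follows the same shape for one-shot requests, with struct damon_call_control's .fn/.data in place of a per-region callback.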
@@ -1543,12 +1504,29 @@ static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, { bool need_wait = true; - /* Handle commands that doesn't access DAMON context-internal data */ switch (cmd) { case DAMON_SYSFS_CMD_ON: return damon_sysfs_turn_damon_on(kdamond); case DAMON_SYSFS_CMD_OFF: return damon_sysfs_turn_damon_off(kdamond); + case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: + return damon_sysfs_damon_call( + damon_sysfs_commit_schemes_quota_goals, + kdamond); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: + return damon_sysfs_damon_call( + damon_sysfs_upd_schemes_stats, kdamond); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: + return damon_sysfs_update_schemes_tried_regions(kdamond, true); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: + return damon_sysfs_update_schemes_tried_regions(kdamond, false); + case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: + return damon_sysfs_schemes_clear_regions( + kdamond->contexts->contexts_arr[0]->schemes); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: + return damon_sysfs_damon_call( + damon_sysfs_upd_schemes_effective_quotas, + kdamond); default: break; } diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig index a73be044fc9b..36a450f57b58 100644 --- a/mm/damon/tests/.kunitconfig +++ b/mm/damon/tests/.kunitconfig @@ -13,10 +13,3 @@ CONFIG_DAMON_VADDR_KUNIT_TEST=y CONFIG_SYSFS=y CONFIG_DAMON_SYSFS=y CONFIG_DAMON_SYSFS_KUNIT_TEST=y - -# for DAMON debugfs interface -CONFIG_DEBUG_FS=y -CONFIG_DAMON_PADDR=y -CONFIG_DAMON_DBGFS_DEPRECATED=y -CONFIG_DAMON_DBGFS=y -CONFIG_DAMON_DBGFS_KUNIT_TEST=y diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index cf22e09a3507..532c6a6f21f9 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -411,7 +411,7 @@ static void damos_test_new_filter(struct kunit *test) { struct damos_filter *filter; - filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false); KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON); KUNIT_EXPECT_EQ(test, filter->matching, true); KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list); @@ -425,7 +425,7 @@ static void damos_test_filter_out(struct kunit *test) struct damon_region *r, *r2; struct damos_filter *f; - f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true); + f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false); f->addr_range = (struct damon_addr_range){ .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6}; @@ -434,25 +434,25 @@ static void damos_test_filter_out(struct kunit *test) damon_add_region(r, t); /* region in the range */ - KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 2; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region after the range */ r->ar.start = DAMON_MIN_REGION * 6; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region started before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 4; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + 
KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); @@ -465,7 +465,7 @@ static void damos_test_filter_out(struct kunit *test) /* region started in the range */ r->ar.start = DAMON_MIN_REGION * 2; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); diff --git a/mm/damon/tests/dbgfs-kunit.h b/mm/damon/tests/dbgfs-kunit.h deleted file mode 100644 index 087e53f641a8..000000000000 --- a/mm/damon/tests/dbgfs-kunit.h +++ /dev/null @@ -1,173 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * DAMON Debugfs Interface Unit Tests - * - * Author: SeongJae Park - */ - -#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST - -#ifndef _DAMON_DBGFS_TEST_H -#define _DAMON_DBGFS_TEST_H - -#include - -static void damon_dbgfs_test_str_to_ints(struct kunit *test) -{ - char *question; - int *answers; - int expected[] = {12, 35, 46}; - ssize_t nr_integers = 0, i; - - question = "123"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123, answers[0]); - kfree(answers); - - question = "123abc"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123, answers[0]); - kfree(answers); - - question = "a123"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); - - question = "12 35"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); - for (i = 0; i < nr_integers; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = "12 35 46"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); - for (i = 0; i < nr_integers; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = "12 35 abc 46"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); - for (i = 0; i < 2; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = ""; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); - - question = "\n"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); -} - -static void damon_dbgfs_test_set_targets(struct kunit *test) -{ - struct damon_ctx *ctx = dbgfs_new_ctx(); - char buf[64]; - - if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { - dbgfs_destroy_ctx(ctx); - kunit_skip(test, "PADDR not registered"); - } - - /* Make DAMON consider target has no pid */ - damon_select_ops(ctx, DAMON_OPS_PADDR); - - dbgfs_set_targets(ctx, 0, NULL); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - - dbgfs_set_targets(ctx, 1, NULL); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n"); - - dbgfs_set_targets(ctx, 0, NULL); - sprint_target_ids(ctx, buf, 64); - 
KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - - dbgfs_destroy_ctx(ctx); -} - -static void damon_dbgfs_test_set_init_regions(struct kunit *test) -{ - struct damon_ctx *ctx = damon_new_ctx(); - /* Each line represents one region in `` `` */ - char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", - "1 10 20\n", - "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n", - ""}; - /* Reading the file again will show sorted, clean output */ - char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n", - "1 10 20\n", - "0 39 59\n0 70 134\n1 10 20\n1 20 25\n", - ""}; - char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */ - "1 10 20\n 1 14 26\n", /* regions overlap */ - "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */ - char *input, *expect; - int i, rc; - char buf[256]; - - if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - kunit_skip(test, "PADDR not registered"); - } - - damon_select_ops(ctx, DAMON_OPS_PADDR); - - dbgfs_set_targets(ctx, 3, NULL); - - /* Put valid inputs and check the results */ - for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { - input = valid_inputs[i]; - expect = valid_expects[i]; - - rc = set_init_regions(ctx, input, strnlen(input, 256)); - KUNIT_EXPECT_EQ(test, rc, 0); - - memset(buf, 0, 256); - sprint_init_regions(ctx, buf, 256); - - KUNIT_EXPECT_STREQ(test, (char *)buf, expect); - } - /* Put invalid inputs and check the return error code */ - for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { - input = invalid_inputs[i]; - pr_info("input: %s\n", input); - rc = set_init_regions(ctx, input, strnlen(input, 256)); - KUNIT_EXPECT_EQ(test, rc, -EINVAL); - - memset(buf, 0, 256); - sprint_init_regions(ctx, buf, 256); - - KUNIT_EXPECT_STREQ(test, (char *)buf, ""); - } - - dbgfs_set_targets(ctx, 0, NULL); - damon_destroy_ctx(ctx); -} - -static struct kunit_case damon_test_cases[] = { - KUNIT_CASE(damon_dbgfs_test_str_to_ints), - KUNIT_CASE(damon_dbgfs_test_set_targets), - KUNIT_CASE(damon_dbgfs_test_set_init_regions), - {}, -}; - -static struct kunit_suite damon_test_suite = { - .name = "damon-dbgfs", - .test_cases = damon_test_cases, -}; -kunit_test_suite(damon_test_suite); - -#endif /* _DAMON_DBGFS_TEST_H */ - -#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index b9fe3bc8472b..7cd944266a92 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -68,7 +68,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) static struct mm_struct mm; struct damon_addr_range regions[3] = {0}; /* 10-20-25, 200-210-220, 300-305, 307-330 */ - struct vm_area_struct vmas[] = { + static struct vm_area_struct vmas[] = { (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, (struct vm_area_struct) {.vm_start = 20, .vm_end = 25}, (struct vm_area_struct) {.vm_start = 200, .vm_end = 210}, diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index b9eaa20b73b9..a6174f725bd7 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -655,7 +655,7 @@ static unsigned long damos_madvise(struct damon_target *target, static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damos *scheme, unsigned long *sz_filter_passed) { int madv_action; diff --git a/mm/debug.c b/mm/debug.c index 95b6ab809c0e..325d7bf22038 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -178,6 +178,17 @@ EXPORT_SYMBOL(dump_page); void dump_vma(const struct vm_area_struct *vma) { +#ifdef CONFIG_PER_VMA_LOCK + pr_emerg("vma 
%px start %px end %px mm %px\n" + "prot %lx anon_vma %px vm_ops %px\n" + "pgoff %lx file %px private_data %px\n" + "flags: %#lx(%pGv) refcnt %x\n", + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm, + (unsigned long)pgprot_val(vma->vm_page_prot), + vma->anon_vma, vma->vm_ops, vma->vm_pgoff, + vma->vm_file, vma->vm_private_data, + vma->vm_flags, &vma->vm_flags, refcount_read(&vma->vm_refcnt)); +#else pr_emerg("vma %px start %px end %px mm %px\n" "prot %lx anon_vma %px vm_ops %px\n" "pgoff %lx file %px private_data %px\n" @@ -187,6 +198,7 @@ void dump_vma(const struct vm_area_struct *vma) vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data, vma->vm_flags, &vma->vm_flags); +#endif } EXPORT_SYMBOL(dump_vma); @@ -249,6 +261,77 @@ void dump_mm(const struct mm_struct *mm) } EXPORT_SYMBOL(dump_mm); +void dump_vmg(const struct vma_merge_struct *vmg, const char *reason) +{ + if (reason) + pr_warn("vmg %px dumped because: %s\n", vmg, reason); + + if (!vmg) { + pr_warn("vmg %px state: (NULL)\n", vmg); + return; + } + + pr_warn("vmg %px state: mm %px pgoff %lx\n" + "vmi %px [%lx,%lx)\n" + "prev %px next %px vma %px\n" + "start %lx end %lx flags %lx\n" + "file %px anon_vma %px policy %px\n" + "uffd_ctx %px\n" + "anon_name %px\n" + "merge_flags %x state %x\n", + vmg, vmg->mm, vmg->pgoff, + vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0, + vmg->vmi ? vma_iter_end(vmg->vmi) : 0, + vmg->prev, vmg->next, vmg->vma, + vmg->start, vmg->end, vmg->flags, + vmg->file, vmg->anon_vma, vmg->policy, +#ifdef CONFIG_USERFAULTFD + vmg->uffd_ctx.ctx, +#else + (void *)0, +#endif + vmg->anon_name, + (int)vmg->merge_flags, (int)vmg->state); + + if (vmg->mm) { + pr_warn("vmg %px mm:\n", vmg); + dump_mm(vmg->mm); + } else { + pr_warn("vmg %px mm: (NULL)\n", vmg); + } + + if (vmg->vma) { + pr_warn("vmg %px vma:\n", vmg); + dump_vma(vmg->vma); + } else { + pr_warn("vmg %px vma: (NULL)\n", vmg); + } + + if (vmg->prev) { + pr_warn("vmg %px prev:\n", vmg); + dump_vma(vmg->prev); + } else { + pr_warn("vmg %px prev: (NULL)\n", vmg); + } + + if (vmg->next) { + pr_warn("vmg %px next:\n", vmg); + dump_vma(vmg->next); + } else { + pr_warn("vmg %px next: (NULL)\n", vmg); + } + +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE + if (vmg->vmi) { + pr_warn("vmg %px vmi:\n", vmg); + vma_iter_dump_tree(vmg->vmi); + } else { + pr_warn("vmg %px vmi: (NULL)\n", vmg); + } +#endif +} +EXPORT_SYMBOL(dump_vmg); + static bool page_init_poisoning __read_mostly = true; static int __init setup_vm_debug(char *str) diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index ce06b2884789..ff35b84a7b50 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -245,7 +245,10 @@ early_memremap_prot(resource_size_t phys_addr, unsigned long size, #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) -void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) +/* + * If no empty slot, handle that and return -ENOMEM. 
+ */ +int __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) { unsigned long slop, clen; char *p; @@ -256,12 +259,15 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) if (clen > MAX_MAP_CHUNK - slop) clen = MAX_MAP_CHUNK - slop; p = early_memremap(src & PAGE_MASK, clen + slop); + if (!p) + return -ENOMEM; memcpy(dest, p + slop, clen); early_memunmap(p, clen + slop); dest += clen; src += clen; size -= clen; } + return 0; } #else /* CONFIG_MMU */ diff --git a/mm/filemap.c b/mm/filemap.c index 4f476411a9a2..279959cf9300 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -440,6 +440,24 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, } EXPORT_SYMBOL(filemap_fdatawrite_range); +/** + * filemap_fdatawrite_range_kick - start writeback on a range + * @mapping: target address_space + * @start: index to start writeback on + * @end: last (non-inclusive) index for writeback + * + * This is a non-integrity writeback helper, to start writing back folios + * for the indicated range. + * + * Return: %0 on success, negative error code otherwise. + */ +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end) +{ + return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); +} +EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); + /** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space @@ -1463,25 +1481,6 @@ static int folio_put_wait_locked(struct folio *folio, int state) return folio_wait_bit_common(folio, PG_locked, state, DROP); } -/** - * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue - * @folio: Folio defining the wait queue of interest - * @waiter: Waiter to add to the queue - * - * Add an arbitrary @waiter to the wait queue for the nominated @folio. - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) -{ - wait_queue_head_t *q = folio_waitqueue(folio); - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, waiter); - folio_set_waiters(folio); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(folio_add_wait_queue); - /** * folio_unlock - Unlock a locked folio. * @folio: The folio. @@ -1590,6 +1589,27 @@ int folio_wait_private_2_killable(struct folio *folio) } EXPORT_SYMBOL(folio_wait_private_2_killable); +/* + * If folio was marked as dropbehind, then pages should be dropped when writeback + * completes. Do that now. If we fail, it's likely because of a big folio - + * just reset dropbehind for that case and latter completions should invalidate. + */ +static void folio_end_dropbehind_write(struct folio *folio) +{ + /* + * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, + * but can happen if normal writeback just happens to find dirty folios + * that were created as part of uncached writeback, and that writeback + * would otherwise not need non-IRQ handling. Just skip the + * invalidation in that case. + */ + if (in_task() && folio_trylock(folio)) { + if (folio->mapping) + folio_unmap_invalidate(folio->mapping, folio, 0); + folio_unlock(folio); + } +} + /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. 
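Because copy_from_early_mem() can now fail (early_memremap() returning NULL becomes -ENOMEM instead of a NULL dereference), its callers have to start checking the result; none of them are visible in this hunk. A hedged caller-side fragment, with copy_setup_data() being an illustrative name:

/* Sketch: propagate the new copy_from_early_mem() error to the caller. */
#include <linux/init.h>
#include <linux/printk.h>

static int __init copy_setup_data(void *dst, phys_addr_t src, unsigned long len)
{
        int ret;

        ret = copy_from_early_mem(dst, src, len);
        if (ret)        /* -ENOMEM: no early fixmap slot could be mapped */
                pr_warn("early copy of %lu bytes at %pa failed: %d\n",
                        len, &src, ret);
        return ret;
}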
@@ -1600,6 +1620,8 @@ EXPORT_SYMBOL(folio_wait_private_2_killable); */ void folio_end_writeback(struct folio *folio) { + bool folio_dropbehind = false; + VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* @@ -1621,9 +1643,14 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). */ folio_get(folio); + if (!folio_test_dirty(folio)) + folio_dropbehind = folio_test_clear_dropbehind(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); + + if (folio_dropbehind) + folio_end_dropbehind_write(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); @@ -1946,6 +1973,8 @@ no_page: /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); + if (fgp_flags & FGP_DONTCACHE) + __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) @@ -1968,6 +1997,9 @@ no_page: if (!folio) return ERR_PTR(-ENOENT); + /* not an uncached lookup, clear uncached if set */ + if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) + folio_clear_dropbehind(folio); return folio; } EXPORT_SYMBOL(__filemap_get_folio); @@ -2450,18 +2482,22 @@ unlock_mapping: return error; } -static int filemap_create_folio(struct file *file, - struct address_space *mapping, loff_t pos, - struct folio_batch *fbatch) +static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; + if (iocb->ki_flags & IOCB_DONTCACHE) + __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock @@ -2477,7 +2513,7 @@ static int filemap_create_folio(struct file *file, * well to keep locking rules simple. 
*/ filemap_invalidate_lock_shared(mapping); - index = (pos >> (PAGE_SHIFT + min_order)) << min_order; + index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2485,7 +2521,8 @@ static int filemap_create_folio(struct file *file, if (error) goto error; - error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); + error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, + folio); if (error) goto error; @@ -2506,6 +2543,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file, if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } @@ -2515,7 +2554,6 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; @@ -2530,20 +2568,21 @@ retry: filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); - page_cache_sync_readahead(mapping, ra, filp, index, - last_index - index); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; + page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { - if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) - return -EAGAIN; - err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); + err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; @@ -2584,6 +2623,20 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) return (pos1 >> shift == pos2 >> shift); } +static void filemap_end_dropbehind_read(struct address_space *mapping, + struct folio *folio) +{ + if (!folio_test_dropbehind(folio)) + return; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (folio_trylock(folio)) { + if (folio_test_clear_dropbehind(folio)) + folio_unmap_invalidate(mapping, folio, 0); + folio_unlock(folio); + } +} + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. @@ -2697,8 +2750,12 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) - folio_put(fbatch.folios[i]); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + filemap_end_dropbehind_read(mapping, folio); + folio_put(folio); + } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); diff --git a/mm/gup.c b/mm/gup.c index 3b75e631f369..2cc3a9d28e70 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -596,6 +596,33 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs, } #endif /* CONFIG_HAVE_GUP_FAST */ +/* Common code for can_follow_write_* */ +static inline bool can_follow_write_common(struct page *page, + struct vm_area_struct *vma, unsigned int flags) +{ + /* Maybe FOLL_FORCE is set to override it? 
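All of the IOCB_DONTCACHE handling above (the dropbehind mark in filemap_create_folio(), the dropbehind readahead control, and filemap_end_dropbehind_read() after the copy) is what a per-call uncached read ends up exercising. User space is expected to reach it through an RWF_DONTCACHE flag to preadv2()/pwritev2(); that flag is not part of this hunk, so the sketch below treats both its availability and its value as assumptions and should really take the definition from an up-to-date <linux/fs.h>:

/* Sketch: buffered read that asks the page cache not to keep the data. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_DONTCACHE
#define RWF_DONTCACHE 0x00000080        /* assumed value; take it from <linux/fs.h> */
#endif

int main(int argc, char **argv)
{
        struct iovec iov;
        ssize_t ret;
        char *buf;
        int fd;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0)
                return 1;

        buf = malloc(1 << 20);
        if (!buf)
                return 1;
        iov.iov_base = buf;
        iov.iov_len = 1 << 20;

        ret = preadv2(fd, &iov, 1, 0, RWF_DONTCACHE);
        if (ret < 0)
                perror("preadv2");      /* kernels without the flag reject it */
        else
                printf("read %zd bytes, dropbehind\n", ret);

        free(buf);
        close(fd);
        return 0;
}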
*/ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + return page && PageAnon(page) && PageAnonExclusive(page); +} + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags, unsigned long address) { @@ -622,6 +649,18 @@ static struct page *no_page_table(struct vm_area_struct *vma, } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */ +static inline bool can_follow_write_pud(pud_t pud, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) +{ + /* If the pud is writable, we can write to the page. */ + if (pud_write(pud)) + return true; + + return can_follow_write_common(page, vma, flags); +} + static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, struct follow_page_context *ctx) @@ -634,10 +673,11 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, assert_spin_locked(pud_lockptr(mm, pudp)); - if ((flags & FOLL_WRITE) && !pud_write(pud)) + if (!pud_present(pud)) return NULL; - if (!pud_present(pud)) + if ((flags & FOLL_WRITE) && + !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags)) return NULL; pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; @@ -686,27 +726,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, if (pmd_write(pmd)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ @@ -807,27 +827,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page, if (pte_write(pte)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. 
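With the FOLL_FORCE checks factored into can_follow_write_common() and hooked up for PUD leaves as well, forced writes to read-only private mappings behave the same across page sizes (the hugetlb carve-out in check_vma_flags() is dropped just below). The classic way to hit this path from user space is a write through /proc/self/mem; a hedged sketch follows, assuming a 2MiB default hugepage size and at least one pre-reserved hugetlb page:

/* Sketch: FOLL_FORCE write to a read-only MAP_PRIVATE hugetlb mapping. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t sz = 2UL << 20;          /* assumed default hugepage size */
        char val = 0x5a;
        char *map;
        int fd;

        map = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (map == MAP_FAILED)
                return 1;               /* no hugetlb pages reserved? */
        map[0] = 0;                     /* fault the page in while writable */
        if (mprotect(map, sz, PROT_READ))
                return 1;

        /* this write is served by GUP with FOLL_FORCE | FOLL_WRITE */
        fd = open("/proc/self/mem", O_RDWR);
        if (fd < 0)
                return 1;
        if (pwrite(fd, &val, 1, (off_t)(uintptr_t)map) != 1)
                perror("pwrite");       /* kernels without this change refuse hugetlb here */
        close(fd);

        printf("first byte is now %#x\n", map[0]);
        return 0;
}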
*/ @@ -1294,9 +1294,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; - /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ - if (is_vm_hugetlb_page(vma)) - return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could @@ -2347,7 +2344,7 @@ static unsigned long collect_longterm_unpinnable_folios( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(folio, movable_folio_list); + folio_isolate_hugetlb(folio, movable_folio_list); continue; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index db64116a4f84..3d3ebdc002d5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -617,6 +617,8 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); +DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK); +DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM @@ -637,6 +639,8 @@ static struct attribute *anon_stats_attrs[] = { #ifndef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, + &swpin_fallback_attr.attr, + &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -669,6 +673,8 @@ static struct attribute *any_stats_attrs[] = { #ifdef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, + &swpin_fallback_attr.attr, + &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -2003,7 +2009,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) spin_unlock(vmf->ptl); writable = false; - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { flags |= TNF_MIGRATED; nid = target_nid; task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); @@ -3284,7 +3290,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ lruvec = folio_lruvec_lock(folio); - ClearPageHasHWPoisoned(head); + folio_clear_has_hwpoisoned(folio); for (i = nr - new_nr; i >= new_nr; i -= new_nr) { struct folio *tail; @@ -4175,20 +4181,21 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, if (input_buf[0] == '/') { char *tok; - char *buf = input_buf; + char *tok_buf = input_buf; char file_path[MAX_INPUT_BUF_SZ]; pgoff_t off_start = 0, off_end = 0; size_t input_len = strlen(input_buf); - tok = strsep(&buf, ","); - if (tok && buf) { + tok = strsep(&tok_buf, ","); + if (tok && tok_buf) { strscpy(file_path, tok); } else { ret = -EINVAL; goto out; } - ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); + ret = sscanf(tok_buf, "0x%lx,0x%lx,%d", &off_start, + &off_end, &new_order); if (ret != 2 && ret != 3) { ret = -EINVAL; goto out; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eaaec19caa7c..6a0ea28f5bac 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -48,6 +48,7 @@ #include #include "internal.h" #include "hugetlb_vmemmap.h" +#include int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; @@ -1246,69 +1247,6 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) 
hugetlb_dup_vma_private(vma); } -/* Returns true if the VMA has associated reserve pages */ -static bool vma_has_reserves(struct vm_area_struct *vma, long chg) -{ - if (vma->vm_flags & VM_NORESERVE) { - /* - * This address is already reserved by other process(chg == 0), - * so, we should decrement reserved count. Without decrementing, - * reserve count remains after releasing inode, because this - * allocated page will go into page cache and is regarded as - * coming from reserved pool in releasing step. Currently, we - * don't have any other solution to deal with this situation - * properly, so add work-around here. - */ - if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return true; - else - return false; - } - - /* Shared mappings always use reserves */ - if (vma->vm_flags & VM_MAYSHARE) { - /* - * We know VM_NORESERVE is not set. Therefore, there SHOULD - * be a region map for all pages. The only situation where - * there is no region map is if a hole was punched via - * fallocate. In this case, there really are no reserves to - * use. This situation is indicated if chg != 0. - */ - if (chg) - return false; - else - return true; - } - - /* - * Only the process that called mmap() has reserves for - * private mappings. - */ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Like the shared case above, a hole punch or truncate - * could have been performed on the private mapping. - * Examine the value of chg to determine if reserves - * actually exist or were previously consumed. - * Very Subtle - The value of chg comes from a previous - * call to vma_needs_reserves(). The reserve map for - * private mappings has different (opposite) semantics - * than that of shared mappings. vma_needs_reserves() - * has already taken this difference in semantics into - * account. Therefore, the meaning of chg is the same - * as in the shared case above. Code could easily be - * combined, but keeping it separate draws attention to - * subtle differences. - */ - if (chg) - return false; - else - return true; - } - - return false; -} - static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); @@ -1336,6 +1274,9 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, if (folio_test_hwpoison(folio)) continue; + if (is_migrate_isolate_page(&folio->page)) + continue; + list_move(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); folio_clear_hugetlb_freed(folio); @@ -1394,8 +1335,7 @@ static unsigned long available_huge_pages(struct hstate *h) static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve, - long chg) + unsigned long address, long gbl_chg) { struct folio *folio = NULL; struct mempolicy *mpol; @@ -1404,15 +1344,10 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, int nid; /* - * A child process with MAP_PRIVATE mappings created by their parent - * have no page reserves. This check ensures that reservations are - * not "stolen". The child may still get SIGKILLed + * gbl_chg==1 means the allocation requires a new page that was not + * reserved before. Making sure there's at least one free page. 
*/ - if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) - goto err; - - /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && !available_huge_pages(h)) + if (gbl_chg && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -1430,11 +1365,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); - if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } - mpol_cond_put(mpol); return folio; @@ -2463,7 +2393,13 @@ static int gather_surplus_pages(struct hstate *h, long delta) long needed, allocated; bool alloc_ok = true; int node; - nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); + nodemask_t *mbind_nodemask, alloc_nodemask; + + mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); + if (mbind_nodemask) + nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed); + else + alloc_nodemask = cpuset_current_mems_allowed; lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; @@ -2479,8 +2415,16 @@ retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { folio = NULL; - for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) { + + /* Prioritize current node */ + if (node_isset(numa_mem_id(), alloc_nodemask)) + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + numa_mem_id(), NULL); + + if (!folio) { + for_each_node_mask(node, alloc_nodemask) { + if (node == numa_mem_id()) + continue; folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), node, NULL); if (folio) @@ -2868,7 +2812,7 @@ retry: * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - isolated = isolate_hugetlb(old_folio, list); + isolated = folio_isolate_hugetlb(old_folio, list); ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; @@ -2953,7 +2897,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) + if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -2961,69 +2905,129 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } +/* + * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn + * range with new folios. + * @start_pfn: start pfn of the given pfn range + * @end_pfn: end pfn of the given pfn range + * Returns 0 on success, otherwise negated error. + */ +int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) +{ + struct hstate *h; + struct folio *folio; + int ret = 0; + + LIST_HEAD(isolate_list); + + while (start_pfn < end_pfn) { + folio = pfn_folio(start_pfn); + if (folio_test_hugetlb(folio)) { + h = folio_hstate(folio); + } else { + start_pfn++; + continue; + } + + if (!folio_ref_count(folio)) { + ret = alloc_and_dissolve_hugetlb_folio(h, folio, + &isolate_list); + if (ret) + break; + + putback_movable_pages(&isolate_list); + } + start_pfn++; + } + + return ret; +} + +typedef enum { + /* + * For either 0/1: we checked the per-vma resv map, and one resv + * count either can be reused (0), or an extra needed (1). 
+ */ + MAP_CHG_REUSE = 0, + MAP_CHG_NEEDED = 1, + /* + * Cannot use per-vma resv count can be used, hence a new resv + * count is enforced. + * + * NOTE: This is mostly identical to MAP_CHG_NEEDED, except + * that currently vma_needs_reservation() has an unwanted side + * effect to either use end() or commit() to complete the + * transaction. Hence it needs to differenciate from NEEDED. + */ + MAP_CHG_ENFORCED = 2, +} map_chg_state; + +/* + * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW + * faults of hugetlb private mappings on top of a non-page-cache folio (in + * which case even if there's a private vma resv map it won't cover such + * allocation). New call sites should (probably) never set it to true!! + * When it's set, the allocation will bypass all vma level reservations. + */ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) + unsigned long addr, bool cow_from_owner) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit, nr_pages = pages_per_huge_page(h); - long gbl_chg; - int memcg_charge_ret, ret, idx; + long retval, gbl_chg; + map_chg_state map_chg; + int ret, idx; struct hugetlb_cgroup *h_cg = NULL; - struct mem_cgroup *memcg; - bool deferred_reserve; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; - memcg = get_mem_cgroup_from_current(); - memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); - if (memcg_charge_ret == -ENOMEM) { - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); - } - idx = hstate_index(h); - /* - * Examine the region/reserve map to determine if the process - * has a reservation for the page to be allocated. A return - * code of zero indicates a reservation exists (no change). - */ - map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) { - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); + + /* Whether we need a separate per-vma reservation? */ + if (cow_from_owner) { + /* + * Special case! Since it's a CoW on top of a reserved + * page, the private resv map doesn't count. So it cannot + * consume the per-vma resv map even if it's reserved. + */ + map_chg = MAP_CHG_ENFORCED; + } else { + /* + * Examine the region/reserve map to determine if the process + * has a reservation for the page to be allocated. A return + * code of zero indicates a reservation exists (no change). + */ + retval = vma_needs_reservation(h, vma, addr); + if (retval < 0) + return ERR_PTR(-ENOMEM); + map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE; } /* + * Whether we need a separate global reservation? + * * Processes that did not create the mapping will have no * reserves as indicated by the region/reserve map. Check * that the allocation will not exceed the subpool limit. - * Allocations for MAP_NORESERVE mappings also need to be - * checked against any subpool limit. + * Or if it can get one from the pool reservation directly. */ - if (map_chg || avoid_reserve) { + if (map_chg) { gbl_chg = hugepage_subpool_get_pages(spool, 1); if (gbl_chg < 0) goto out_end_reservation; - + } else { /* - * Even though there was no reservation in the region/reserve - * map, there could be reservations associated with the - * subpool that can be used. This would be indicated if the - * return value of hugepage_subpool_get_pages() is zero. - * However, if avoid_reserve is specified we still avoid even - * the subpool reservations. 
+ * If we have the vma reservation ready, no need for extra + * global reservation. */ - if (avoid_reserve) - gbl_chg = 1; + gbl_chg = 0; } - /* If this allocation is not consuming a reservation, charge it now. + /* + * If this allocation is not consuming a per-vma reservation, + * charge the hugetlb cgroup now. */ - deferred_reserve = map_chg || avoid_reserve; - if (deferred_reserve) { + if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); if (ret) @@ -3040,27 +3044,32 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ - folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg); if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } + /* + * Either dequeued or buddy-allocated folio needs to add special + * mark to the folio when it consumes a global reservation. + */ + if (!gbl_chg) { + folio_set_hugetlb_restore_reserve(folio); + h->resv_huge_pages--; + } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ - if (deferred_reserve) { + if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), h_cg, folio); } @@ -3069,50 +3078,61 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_set_folio_subpool(folio, spool); - map_commit = vma_commit_reservation(h, vma, addr); - if (unlikely(map_chg > map_commit)) { + if (map_chg != MAP_CHG_ENFORCED) { + /* commit() is only needed if the map_chg is not enforced */ + retval = vma_commit_reservation(h, vma, addr); /* + * Check for possible race conditions. When it happens.. * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. * This indicates a race with hugetlb_reserve_pages. * Adjust for the subpool count incremented above AND - * in hugetlb_reserve_pages for the same page. Also, + * in hugetlb_reserve_pages for the same page. Also, * the reservation count added in hugetlb_reserve_pages * no longer applies. */ - long rsv_adjust; + if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) { + long rsv_adjust; - rsv_adjust = hugepage_subpool_put_pages(spool, 1); - hugetlb_acct_memory(h, -rsv_adjust); - if (deferred_reserve) { - spin_lock_irq(&hugetlb_lock); - hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), - pages_per_huge_page(h), folio); - spin_unlock_irq(&hugetlb_lock); + rsv_adjust = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -rsv_adjust); + if (map_chg) { + spin_lock_irq(&hugetlb_lock); + hugetlb_cgroup_uncharge_folio_rsvd( + hstate_index(h), pages_per_huge_page(h), + folio); + spin_unlock_irq(&hugetlb_lock); + } } } - if (!memcg_charge_ret) - mem_cgroup_commit_charge(folio, memcg); + ret = mem_cgroup_charge_hugetlb(folio, gfp); + /* + * Unconditionally increment NR_HUGETLB here. If it turns out that + * mem_cgroup_charge_hugetlb failed, then immediately free the page and + * decrement NR_HUGETLB. 
+ */ lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); - mem_cgroup_put(memcg); + + if (ret == -ENOMEM) { + free_huge_folio(folio); + return ERR_PTR(-ENOMEM); + } return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_uncharge_cgroup_reservation: - if (deferred_reserve) + if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg || avoid_reserve) + if (map_chg) hugepage_subpool_put_pages(spool, 1); out_end_reservation: - vma_end_reservation(h, vma, addr); - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); + if (map_chg != MAP_CHG_ENFORCED) + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -3806,13 +3826,15 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { struct page *page = folio_page(folio, i); + struct folio *new_folio; page->mapping = NULL; clear_compound_head(page); prep_compound_page(page, dst->order); + new_folio = page_folio(page); - init_new_hugetlb_folio(dst, page_folio(page)); - list_add(&page->lru, &dst_list); + init_new_hugetlb_folio(dst, new_folio); + list_add(&new_folio->lru, &dst_list); } } @@ -5141,12 +5163,12 @@ const struct vm_operations_struct hugetlb_vm_ops = { }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, - int writable) + bool try_mkwrite) { pte_t entry; unsigned int shift = huge_page_shift(hstate_vma(vma)); - if (writable) { + if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, vma->vm_page_prot))); } else { @@ -5169,6 +5191,13 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } +static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + if (vma->vm_flags & VM_WRITE) + set_huge_ptep_writable(vma, address, ptep); +} + bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; @@ -5199,7 +5228,7 @@ static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, 1); + pte_t newpte = make_huge_pte(vma, &new_folio->page, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); @@ -5333,7 +5362,7 @@ again: spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + new_folio = alloc_hugetlb_folio(dst_vma, addr, false); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); @@ -5799,7 +5828,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; - int outside_reserve = 0; + bool cow_from_owner = 0; vm_fault_t ret = 0; struct mmu_notifier_range range; @@ -5814,13 +5843,6 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, if (!unshare && huge_pte_uffd_wp(pte)) return 0; - /* - * hugetlb does not support FOLL_FORCE-style write faults that keep the - * PTE mapped R/O such as maybe_mkwrite() would do. - */ - if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) - return VM_FAULT_SIGSEGV; - /* Let's take out MAP_SHARED mappings first. 
*/ if (vma->vm_flags & VM_MAYSHARE) { set_huge_ptep_writable(vma, vmf->address, vmf->pte); @@ -5849,7 +5871,8 @@ retry_avoidcopy: SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) - set_huge_ptep_writable(vma, vmf->address, vmf->pte); + set_huge_ptep_maybe_writable(vma, vmf->address, + vmf->pte); delayacct_wpcopy_end(); return 0; @@ -5868,7 +5891,7 @@ retry_avoidcopy: */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_folio != pagecache_folio) - outside_reserve = 1; + cow_from_owner = true; folio_get(old_folio); @@ -5877,7 +5900,7 @@ retry_avoidcopy: * be acquired again before returning to the caller, as expected. */ spin_unlock(vmf->ptl); - new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* @@ -5887,7 +5910,7 @@ retry_avoidcopy: * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ - if (outside_reserve) { + if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; @@ -6138,7 +6161,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, goto out; } - folio = alloc_hugetlb_folio(vma, vmf->address, 0); + folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being @@ -6235,8 +6258,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); - new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) - && (vma->vm_flags & VM_SHARED))); + new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. @@ -6568,7 +6590,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, spinlock_t *ptl; int ret = -ENOMEM; struct folio *folio; - int writable; bool folio_in_pagecache = false; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { @@ -6606,7 +6627,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; @@ -6648,7 +6669,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; @@ -6722,12 +6743,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. 
*/ - if (wp_enabled || (is_continue && !vm_shared)) - writable = 0; - else - writable = dst_vma->vm_flags & VM_WRITE; - - _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); + _dst_pte = make_huge_pte(dst_vma, &folio->page, + !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not @@ -7406,7 +7423,24 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ -bool isolate_hugetlb(struct folio *folio, struct list_head *list) +/** + * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio + * @folio: the folio to isolate + * @list: the list to add the folio to on success + * + * Isolate an allocated (refcount > 0) hugetlb folio, marking it as + * isolated/non-migratable, and moving it from the active list to the + * given list. + * + * Isolation will fail if @folio is not an allocated hugetlb folio, or if + * it is already isolated/non-migratable. + * + * On success, an additional folio reference is taken that must be dropped + * using folio_putback_hugetlb() to undo the isolation. + * + * Return: True if isolation worked, otherwise False. + */ +bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; @@ -7454,7 +7488,18 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return ret; } -void folio_putback_active_hugetlb(struct folio *folio) +/** + * folio_putback_hugetlb - unisolate a hugetlb folio + * @folio: the isolated hugetlb folio + * + * Putback/un-isolate the hugetlb folio that was previously isolated using + * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it + * back onto the active list. + * + * Will drop the additional folio reference obtained through + * folio_isolate_hugetlb(). + */ +void folio_putback_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(folio); @@ -7501,6 +7546,11 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re } spin_unlock_irq(&hugetlb_lock); } + /* + * Our old folio is isolated and has "migratable" cleared until it + * is putback. As migration succeeded, set the new folio "migratable". + */ + folio_set_hugetlb_migratable(new_folio); } static void hugetlb_unshare_pmds(struct vm_area_struct *vma, diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index e716c4671a15..bb9578bd99f9 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -195,24 +195,23 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) * cannot fail. */ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { unsigned int nr_pages; struct page_counter *counter; - struct hugetlb_cgroup *page_hcg; + struct hugetlb_cgroup *hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); - struct folio *folio = page_folio(page); - page_hcg = hugetlb_cgroup_from_folio(folio); + hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely * ignore those pages.
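A minimal caller-side sketch of the renamed isolation API documented above, assuming "folio" is an allocated hugetlb folio the caller already holds a reference on:

	LIST_HEAD(isolated);

	if (folio_isolate_hugetlb(folio, &isolated)) {
		/* folio is now off the active list and carries an extra reference */
		folio_putback_hugetlb(folio);	/* undo: re-activate and drop that reference */
	}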
*/ - if (!page_hcg || page_hcg != h_cg) + if (!hcg || hcg != h_cg) goto out; - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); if (!parent) { parent = root_h_cgroup; /* root has no limit */ @@ -235,13 +234,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; - struct page *page; + struct folio *folio; do { for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); - list_for_each_entry(page, &h->hugepage_activelist, lru) - hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page); + list_for_each_entry(folio, &h->hugepage_activelist, lru) + hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); spin_unlock_irq(&hugetlb_lock); } @@ -917,7 +916,6 @@ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); - return; } static struct cftype hugetlb_files[] = { diff --git a/mm/init-mm.c b/mm/init-mm.c index 24c809379274..4600e7605cab 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -40,7 +40,8 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), #ifdef CONFIG_PER_VMA_LOCK - .mm_lock_seq = 0, + .vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait), + .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), #endif .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, diff --git a/mm/internal.h b/mm/internal.h index 9826f7dce607..109ef30fee11 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -392,6 +392,8 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); +int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, + gfp_t gfp); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, unsigned int order); @@ -735,15 +737,30 @@ static inline void prep_compound_tail(struct page *head, int tail_idx) extern void prep_compound_page(struct page *page, unsigned int order); -extern void post_alloc_hook(struct page *page, unsigned int order, - gfp_t gfp_flags); +void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; -void free_unref_page(struct page *page, unsigned int order); +struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, + nodemask_t *); +#define __alloc_frozen_pages(...) \ + alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__)) +void free_frozen_pages(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); +#ifdef CONFIG_NUMA +struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order); +#else +static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order) +{ + return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL); +} +#endif + +#define alloc_frozen_pages(...) 
\ + alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) + extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); @@ -824,10 +841,6 @@ int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); -int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, - int migratetype); - /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); @@ -1440,22 +1453,6 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); -#ifdef CONFIG_64BIT -static inline int can_do_mseal(unsigned long flags) -{ - if (flags) - return -EINVAL; - - return 0; -} - -#else -static inline int can_do_mseal(unsigned long flags) -{ - return -EPERM; -} -#endif - #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) @@ -1530,4 +1527,23 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +/* pt_reclaim.c */ +bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); +void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, + pmd_t pmdval); +void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, + struct mmu_gather *tlb); + +#ifdef CONFIG_PT_RECLAIM +bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details); +#else +static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details) +{ + return false; +} +#endif /* CONFIG_PT_RECLAIM */ + + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 8b9e348113b1..d54e89f8c3e7 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -524,7 +524,11 @@ size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) sizeof(struct kasan_free_meta) : 0); } -static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) +/* + * This function avoids dynamic memory allocations and thus can be called from + * contexts that do not allow allocating memory. 
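A minimal sketch of how the frozen-page helpers declared above are meant to pair up, assuming an mm-internal caller that never exposes the page, so its refcount can stay frozen at zero (the wrapper function is hypothetical):

	static int example_use_frozen_page(void)
	{
		struct page *page;

		page = alloc_frozen_pages(GFP_KERNEL, 0);	/* refcount is left frozen (zero) */
		if (!page)
			return -ENOMEM;
		/* use the page internally; never hand it out via get_page()/put_page() */
		free_frozen_pages(page, 0);
		return 0;
	}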
+ */ +void kasan_record_aux_stack(void *addr) { struct slab *slab = kasan_addr_to_slab(addr); struct kmem_cache *cache; @@ -541,17 +545,7 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags); -} - -void kasan_record_aux_stack(void *addr) -{ - return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC); -} - -void kasan_record_aux_stack_noalloc(void *addr) -{ - return __kasan_record_aux_stack(addr, 0); + alloc_meta->aux_stack[0] = kasan_save_stack(0, 0); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b7e4b81421b3..129178be5e64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -501,18 +501,18 @@ static inline bool kasan_byte_accessible(const void *addr) /** * kasan_poison - mark the memory range as inaccessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size, must be aligned to KASAN_GRANULE_SIZE - * @value - value that's written to metadata for the range - * @init - whether to initialize the memory range (only for hardware tag-based) + * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size, must be aligned to KASAN_GRANULE_SIZE + * @value: value that's written to metadata for the range + * @init: whether to initialize the memory range (only for hardware tag-based) */ void kasan_poison(const void *addr, size_t size, u8 value, bool init); /** * kasan_unpoison - mark the memory range as accessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size, can be unaligned - * @init - whether to initialize the memory range (only for hardware tag-based) + * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size, can be unaligned + * @init: whether to initialize the memory range (only for hardware tag-based) * * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before * marking the range. @@ -530,8 +530,8 @@ bool kasan_byte_accessible(const void *addr); /** * kasan_poison_last_granule - mark the last granule of the memory range as * inaccessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size + * @address: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size * * This function is only available for the generic mode, as it's the only mode * that has partially poisoned memory granules. diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 99d4ff0ed57a..59d673400085 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -47,8 +47,8 @@ static struct { * Some tests use these global variables to store return values from function * calls that could otherwise be eliminated by the compiler as dead code. */ -void *kasan_ptr_result; -int kasan_int_result; +static volatile void *kasan_ptr_result; +static volatile int kasan_int_result; /* Probe for console output: obtains test_status lines of interest. 
*/ static void probe_console(void *ignore, const char *buf, size_t len) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index bad1e130eda8..5f0be134141e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -948,17 +948,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } -static int find_pmd_or_thp_or_none(struct mm_struct *mm, - unsigned long address, - pmd_t **pmd) +static inline int check_pmd_state(pmd_t *pmd) { - pmd_t pmde; + pmd_t pmde = pmdp_get_lockless(pmd); - *pmd = mm_find_pmd(mm, address); - if (!*pmd) - return SCAN_PMD_NULL; - - pmde = pmdp_get_lockless(*pmd); if (pmd_none(pmde)) return SCAN_PMD_NONE; if (!pmd_present(pmde)) @@ -972,6 +965,17 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, return SCAN_SUCCEED; } +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_PMD_NULL; + + return check_pmd_state(*pmd); +} + static int check_pmd_still_valid(struct mm_struct *mm, unsigned long address, pmd_t *pmd) @@ -1721,7 +1725,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) pmd_t *pmd, pgt_pmd; spinlock_t *pml; spinlock_t *ptl; - bool skipped_uffd = false; + bool success = false; /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that @@ -1758,6 +1762,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) mmu_notifier_invalidate_range_start(&range); pml = pmd_lock(mm, pmd); + /* + * The lock of new_folio is still held, we will be blocked in + * the page fault path, which prevents the pte entries from + * being set again. So even though the old empty PTE page may be + * concurrently freed and a new PTE page is filled into the pmd + * entry, it is still empty and can be removed. + * + * So here we only need to recheck if the state of pmd entry + * still meets our requirements, rather than checking pmd_same() + * like elsewhere. + */ + if (check_pmd_state(pmd) != SCAN_SUCCEED) + goto drop_pml; ptl = pte_lockptr(mm, pmd); if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); @@ -1771,20 +1788,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * repeating the anon_vma check protects from one category, * and repeating the userfaultfd_wp() check from another. */ - if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) { - skipped_uffd = true; - } else { + if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); + success = true; } if (ptl != pml) spin_unlock(ptl); +drop_pml: spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); - if (!skipped_uffd) { + if (success) { mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 820ba3b5cbfc..982bb5ef3233 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1855,7 +1855,7 @@ static int kmemleak_scan_thread(void *arg) * Wait before the first scan to allow the system to fully initialize. 
*/ if (first_run) { - signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000); + signed long timeout = secs_to_jiffies(SECS_FIRST_SCAN); first_run = 0; while (timeout && !kthread_should_stop()) timeout = schedule_timeout_interruptible(timeout); @@ -2241,7 +2241,7 @@ void __init kmemleak_init(void) return; jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); - jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); + jiffies_scan_wait = secs_to_jiffies(SECS_SCAN_WAIT); object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 9c58f081d84f..1bb505a08415 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -280,12 +280,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end) start = (void *)PAGE_ALIGN_DOWN((u64)start); size = PAGE_ALIGN((u64)end - (u64)start); - shadow = memblock_alloc(size, PAGE_SIZE); - origin = memblock_alloc(size, PAGE_SIZE); - - if (!shadow || !origin) - panic("%s: Failed to allocate metadata memory for early boot range of size %llu", - __func__, size); + shadow = memblock_alloc_or_panic(size, PAGE_SIZE); + origin = memblock_alloc_or_panic(size, PAGE_SIZE); for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { page = virt_to_page_or_null((char *)start + addr); diff --git a/mm/ksm.c b/mm/ksm.c index 31a9bc365437..8be2b144fefd 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3262,6 +3262,25 @@ static void wait_while_offlining(void) #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_PROC_FS +/* + * The process is mergeable only if any VMA is currently + * applicable to KSM. + * + * The mmap lock must be held in read mode. + */ +bool ksm_process_mergeable(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + VMA_ITERATOR(vmi, mm, 0); + for_each_vma(vmi, vma) + if (vma->vm_flags & VM_MERGEABLE) + return true; + + return false; +} + long ksm_process_profit(struct mm_struct *mm) { return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - diff --git a/mm/madvise.c b/mm/madvise.c index 0ceae57da7da..49f3a75046f6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -851,7 +851,12 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, static long madvise_dontneed_single_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - zap_page_range_single(vma, start, end - start, NULL); + struct zap_details details = { + .reclaim_pt = true, + .even_cows = true, + }; + + zap_page_range_single(vma, start, end - start, &details); return 0; } diff --git a/mm/memblock.c b/mm/memblock.c index 095c18b5c430..95af35fd1389 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1691,6 +1691,26 @@ void * __init memblock_alloc_try_nid( return ptr; } +/** + * __memblock_alloc_or_panic - Try to allocate memory and panic on failure + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @func: caller func name + * + * This function attempts to allocate memory using memblock_alloc, + * and in case of failure, it calls panic with the formatted message. + * This function should not be used directly, please use the macro memblock_alloc_or_panic. 
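The memblock_alloc_or_panic() macro referenced in the kernel-doc above is not part of this hunk; presumably it forwards the caller's name to __memblock_alloc_or_panic(), roughly along these lines (an assumption), with the kmsan conversion earlier in this patch showing the intended usage:

	#define memblock_alloc_or_panic(size, align)	\
		__memblock_alloc_or_panic(size, align, __func__)

	/* early boot usage, where an allocation failure is fatal anyway */
	shadow = memblock_alloc_or_panic(size, PAGE_SIZE);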
+ */ +void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func) +{ + void *addr = memblock_alloc(size, align); + + if (unlikely(!addr)) + panic("%s: Failed to allocate %pap bytes\n", func, &size); + return addr; +} + /** * memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index a071fa43d479..2be6b9112808 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -899,7 +899,7 @@ static void memcg_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct mem_cgroup_event *event = @@ -1040,13 +1040,13 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, } else if (!strcmp(name, "memory.oom_control")) { pr_warn_once("oom_control is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org" - " if you depend on this functionality. \n"); + " if you depend on this functionality.\n"); event->register_event = mem_cgroup_oom_register_event; event->unregister_event = mem_cgroup_oom_unregister_event; } else if (!strcmp(name, "memory.pressure_level")) { pr_warn_once("pressure_level is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org " - "if you depend on this functionality. \n"); + "if you depend on this functionality.\n"); event->register_event = vmpressure_register_event; event->unregister_event = vmpressure_unregister_event; } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { @@ -1134,8 +1134,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) failed = iter; mem_cgroup_iter_break(memcg, iter); break; - } else - iter->oom_lock = true; + } + iter->oom_lock = true; } if (failed) { @@ -1202,7 +1202,7 @@ struct oom_wait_info { }; static int memcg_oom_wake_function(wait_queue_entry_t *wait, - unsigned mode, int sync, void *arg) + unsigned int mode, int sync, void *arg) { struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; struct mem_cgroup *oom_wait_memcg; @@ -1644,7 +1644,7 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, unsigned long nr = 0; enum lru_list lru; - VM_BUG_ON((unsigned)nid >= nr_node_ids); + VM_BUG_ON((unsigned int)nid >= nr_node_ids); for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) @@ -1881,7 +1881,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, pr_warn_once("oom_control is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org if you " - "depend on this functionality. 
\n"); + "depend on this functionality.\n"); /* cannot set to root cgroup and only 0 and 1 are allowed */ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7b3503d12aaf..46f8b372d212 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1161,6 +1161,7 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, { struct mem_cgroup *iter; int ret = 0; + int i = 0; BUG_ON(mem_cgroup_is_root(memcg)); @@ -1169,8 +1170,12 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct task_struct *task; css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); - while (!ret && (task = css_task_iter_next(&it))) + while (!ret && (task = css_task_iter_next(&it))) { + /* Avoid potential softlockup warning */ + if ((++i & 1023) == 0) + cond_resched(); ret = fn(task, arg); + } css_task_iter_end(&it); if (ret) { mem_cgroup_iter_break(memcg, iter); @@ -1448,6 +1453,18 @@ unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item) memcg_page_state_output_unit(item); } +#ifdef CONFIG_HUGETLB_PAGE +static bool memcg_accounts_hugetlb(void) +{ + return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; +} +#else /* CONFIG_HUGETLB_PAGE */ +static bool memcg_accounts_hugetlb(void) +{ + return false; +} +#endif /* CONFIG_HUGETLB_PAGE */ + static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { int i; @@ -1469,7 +1486,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) #ifdef CONFIG_HUGETLB_PAGE if (unlikely(memory_stats[i].idx == NR_HUGETLB) && - !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + !memcg_accounts_hugetlb()) continue; #endif size = memcg_page_state_output(memcg, memory_stats[i].idx); @@ -2371,21 +2388,6 @@ done_restock: return 0; } -/** - * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call. - * @memcg: memcg previously charged. - * @nr_pages: number of pages previously charged. - */ -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) -{ - if (mem_cgroup_is_root(memcg)) - return; - - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); -} - static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio); @@ -2399,18 +2401,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } -/** - * mem_cgroup_commit_charge - commit a previously successful try_charge(). - * @folio: folio to commit the charge to. - * @memcg: memcg previously charged. - */ -void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) -{ - css_get(&memcg->css); - commit_charge(folio, memcg); - memcg1_commit_charge(folio, memcg); -} - static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) @@ -4498,7 +4488,9 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, if (ret) goto out; - mem_cgroup_commit_charge(folio, memcg); + css_get(&memcg->css); + commit_charge(folio, memcg); + memcg1_commit_charge(folio, memcg); out: return ret; } @@ -4516,38 +4508,37 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) } /** - * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio - * @memcg: memcg to charge. - * @gfp: reclaim mode. - * @nr_pages: number of pages to charge. 
+ * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio + * @folio: folio being charged + * @gfp: reclaim mode * - * This function is called when allocating a huge page folio to determine if - * the memcg has the capacity for it. It does not commit the charge yet, - * as the hugetlb folio itself has not been obtained from the hugetlb pool. + * This function is called when allocating a huge page folio, after the page has + * already been obtained and charged to the appropriate hugetlb cgroup + * controller (if it is enabled). * - * Once we have obtained the hugetlb folio, we can call - * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the - * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect - * of try_charge(). - * - * Returns 0 on success. Otherwise, an error code is returned. + * Returns ENOMEM if the memcg is already full. + * Returns 0 if either the charge was successful, or if we skip the charging. */ -int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, - long nr_pages) +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) { + struct mem_cgroup *memcg = get_mem_cgroup_from_current(); + int ret = 0; + /* - * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation, - * but do not attempt to commit charge later (or cancel on error) either. + * Even memcg does not account for hugetlb, we still want to update + * system-level stats via lruvec_stat_mod_folio. Return 0, and skip + * charging the memcg. */ - if (mem_cgroup_disabled() || !memcg || - !cgroup_subsys_on_dfl(memory_cgrp_subsys) || - !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) - return -EOPNOTSUPP; + if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() || + !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + goto out; - if (try_charge(memcg, gfp, nr_pages)) - return -ENOMEM; + if (charge_memcg(folio, memcg, gfp)) + ret = -ENOMEM; - return 0; +out: + mem_cgroup_put(memcg); + return ret; } /** @@ -4609,7 +4600,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) * correspond 1:1 to page and swap slot lifetimes: we charge the * page to memory here, and uncharge swap when the slot is freed. */ - if (!mem_cgroup_disabled() && do_memsw_account()) { + if (do_memsw_account()) { /* * The swap entry might not get freed for a long time, * let's not wait for it. 
The page already received a @@ -4973,7 +4964,6 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; unsigned int nr_entries; - unsigned short oldid; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); @@ -5000,11 +4990,10 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) /* Get references for the tail pages, too */ if (nr_entries > 1) mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); - oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), - nr_entries); - VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); + swap_cgroup_record(folio, entry); + folio_unqueue_deferred_split(folio); folio->memcg_data = 0; @@ -5035,7 +5024,6 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) unsigned int nr_pages = folio_nr_pages(folio); struct page_counter *counter; struct mem_cgroup *memcg; - unsigned short oldid; if (do_memsw_account()) return 0; @@ -5064,10 +5052,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) /* Get references for the tail pages, too */ if (nr_pages > 1) mem_cgroup_id_get_many(memcg, nr_pages - 1); - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); - VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); + swap_cgroup_record(folio, entry); + return 0; } @@ -5081,7 +5069,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; - id = swap_cgroup_record(entry, 0, nr_pages); + id = swap_cgroup_clear(entry, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { diff --git a/mm/memfd.c b/mm/memfd.c index 35a370d75c9a..37f7be57c2f5 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) return error; } -unsigned int *memfd_file_seals_ptr(struct file *file) +static unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; @@ -327,15 +327,51 @@ static int check_sysctl_memfd_noexec(unsigned int *flags) return 0; } -SYSCALL_DEFINE2(memfd_create, - const char __user *, uname, - unsigned int, flags) +static inline bool is_write_sealed(unsigned int seals) { - unsigned int *file_seals; - struct file *file; - int fd, error; - char *name; - long len; + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +static int check_write_seal(unsigned long *vm_flags_ptr) +{ + unsigned long vm_flags = *vm_flags_ptr; + unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE); + + /* If a private mapping then writability is irrelevant. */ + if (!(mask & VM_SHARED)) + return 0; + + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if (mask & VM_WRITE) + return -EPERM; + + /* + * This is a read-only mapping, disallow mprotect() from making a + * write-sealed mapping writable in future. + */ + *vm_flags_ptr &= ~VM_MAYWRITE; + + return 0; +} + +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr) +{ + int err = 0; + unsigned int *seals_ptr = memfd_file_seals_ptr(file); + unsigned int seals = seals_ptr ?
*seals_ptr : 0; + + if (is_write_sealed(seals)) + err = check_write_seal(vm_flags_ptr); + + return err; +} + +static int sanitize_flags(unsigned int *flags_ptr) +{ + unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { if (flags & ~(unsigned int)MFD_ALL_FLAGS) @@ -351,50 +387,52 @@ SYSCALL_DEFINE2(memfd_create, if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) return -EINVAL; - error = check_sysctl_memfd_noexec(&flags); - if (error < 0) - return error; + return check_sysctl_memfd_noexec(flags_ptr); +} - /* length includes terminating zero */ - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); - if (len <= 0) - return -EFAULT; - if (len > MFD_NAME_MAX_LEN + 1) - return -EINVAL; +static char *alloc_name(const char __user *uname) +{ + int error; + char *name; + long len; - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); strcpy(name, MFD_NAME_PREFIX); - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + /* returned length does not include terminating zero */ + len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); + if (len < 0) { error = -EFAULT; goto err_name; + } else if (len > MFD_NAME_MAX_LEN) { + error = -EINVAL; + goto err_name; } - /* terminating-zero may have changed after strnlen_user() returned */ - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { - error = -EFAULT; - goto err_name; - } + return name; - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_name; - } +err_name: + kfree(name); + return ERR_PTR(error); +} + +static struct file *alloc_file(const char *name, unsigned int flags) +{ + unsigned int *file_seals; + struct file *file; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); - } else + } else { file = shmem_file_setup(name, 0, VM_NORESERVE); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_fd; } + if (IS_ERR(file)) + return file; file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; @@ -414,6 +452,37 @@ SYSCALL_DEFINE2(memfd_create, *file_seals &= ~F_SEAL_SEAL; } + return file; +} + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + struct file *file; + int fd, error; + char *name; + + error = sanitize_flags(&flags); + if (error < 0) + return error; + + name = alloc_name(uname); + if (IS_ERR(name)) + return PTR_ERR(name); + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + file = alloc_file(name, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + fd_install(fd, file); kfree(name); return fd; diff --git a/mm/memory.c b/mm/memory.c index 398c031be9ba..67cfcebb0f94 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1436,7 +1436,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details) + if (!details || details->reclaim_pt) return true; /* Or, we zap COWed pages only if the caller wants to */ @@ -1466,34 +1466,42 @@ static inline bool zap_drop_markers(struct zap_details *details) /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. 
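A caller-side sketch of the new memfd_check_seals_mmap() hook above, assuming an mmap() path that has already computed its prospective vm_flags; "desired_vm_flags" is a hypothetical placeholder:

	unsigned long vm_flags = desired_vm_flags;
	int err;

	err = memfd_check_seals_mmap(file, &vm_flags);
	if (err)
		return err;	/* -EPERM: MAP_SHARED + PROT_WRITE on a write-sealed memfd */
	/* on success, VM_MAYWRITE may have been cleared for a sealed read-only mapping */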
+ * + * Returns true if uffd-wp ptes was installed, false otherwise. */ -static inline void +static inline bool zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { + bool was_installed = false; + +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) - return; + return false; if (zap_drop_markers(details)) - return; + return false; for (;;) { /* the PFN in the PTE is irrelevant. */ - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval)) + was_installed = true; if (--nr == 0) break; pte++; addr += PAGE_SIZE; } +#endif + return was_installed; } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, - bool *force_flush, bool *force_break) + bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; @@ -1519,8 +1527,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, - ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, + nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); @@ -1544,7 +1552,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, - bool *force_break) + bool *force_break, bool *any_skipped) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; @@ -1559,15 +1567,17 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) - zap_install_uffd_wp_if_needed(vma, addr, pte, 1, - details, ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, + pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) + if (unlikely(!should_zap_folio(details, folio))) { + *any_skipped = true; return 1; + } /* * Make sure that the common "small folio" case is as fast as possible @@ -1579,14 +1589,121 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, - force_break); + force_break, any_skipped); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, - details, rss, force_flush, force_break); + details, rss, force_flush, force_break, any_skipped); return 1; } +static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *any_skipped) +{ + swp_entry_t entry; + int nr = 1; + + *any_skipped = true; + entry = pte_to_swp_entry(ptent); + if (is_device_private_entry(entry) || + is_device_exclusive_entry(entry)) { + struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = page_folio(page); + + if 
(unlikely(!should_zap_folio(details, folio))) + return 1; + /* + * Both device private/exclusive mappings should only + * work with anonymous page so far, so we don't need to + * consider uffd-wp bit when zap. For more information, + * see zap_install_uffd_wp_if_needed(). + */ + WARN_ON_ONCE(!vma_is_anonymous(vma)); + rss[mm_counter(folio)]--; + if (is_device_private_entry(entry)) + folio_remove_rmap_pte(folio, page, vma); + folio_put(folio); + } else if (!non_swap_entry(entry)) { + /* Genuine swap entries, hence a private anon pages */ + if (!should_zap_cows(details)) + return 1; + + nr = swap_pte_batch(pte, max_nr, ptent); + rss[MM_SWAPENTS] -= nr; + free_swap_and_cache_nr(entry, nr); + } else if (is_migration_entry(entry)) { + struct folio *folio = pfn_swap_entry_folio(entry); + + if (!should_zap_folio(details, folio)) + return 1; + rss[mm_counter(folio)]--; + } else if (pte_marker_entry_uffd_wp(entry)) { + /* + * For anon: always drop the marker; for file: only + * drop the marker if explicitly requested. + */ + if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) + return 1; + } else if (is_guard_swp_entry(entry)) { + /* + * Ordinary zapping should not remove guard PTE + * markers. Only do so if we should remove PTE markers + * in general. + */ + if (!zap_drop_markers(details)) + return 1; + } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { + if (!should_zap_cows(details)) + return 1; + } else { + /* We should have covered all the swap entry types */ + pr_alert("unrecognized swap entry 0x%lx\n", entry.val); + WARN_ON_ONCE(1); + } + clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); + + return nr; +} + +static inline int do_zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, unsigned long end, + struct zap_details *details, int *rss, + bool *force_flush, bool *force_break, + bool *any_skipped) +{ + pte_t ptent = ptep_get(pte); + int max_nr = (end - addr) / PAGE_SIZE; + int nr = 0; + + /* Skip all consecutive none ptes */ + if (pte_none(ptent)) { + for (nr = 1; nr < max_nr; nr++) { + ptent = ptep_get(pte + nr); + if (!pte_none(ptent)) + break; + } + max_nr -= nr; + if (!max_nr) + return nr; + pte += nr; + addr += nr * PAGE_SIZE; + } + + if (pte_present(ptent)) + nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, force_flush, force_break, + any_skipped); + else + nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, any_skipped); + + return nr; +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1598,9 +1715,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; - swp_entry_t entry; + pmd_t pmdval; + unsigned long start = addr; + bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); + bool direct_reclaim = false; int nr; +retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1610,90 +1731,24 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = ptep_get(pte); - struct folio *folio; - struct page *page; - int max_nr; - - nr = 1; - if (pte_none(ptent)) - continue; + bool any_skipped = false; if (need_resched()) break; - if (pte_present(ptent)) { - 
max_nr = (end - addr) / PAGE_SIZE; - nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, - addr, details, rss, &force_flush, - &force_break); - if (unlikely(force_break)) { - addr += nr * PAGE_SIZE; - break; - } - continue; + nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, + &force_flush, &force_break, &any_skipped); + if (any_skipped) + can_reclaim_pt = false; + if (unlikely(force_break)) { + addr += nr * PAGE_SIZE; + break; } - - entry = pte_to_swp_entry(ptent); - if (is_device_private_entry(entry) || - is_device_exclusive_entry(entry)) { - page = pfn_swap_entry_to_page(entry); - folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) - continue; - /* - * Both device private/exclusive mappings should only - * work with anonymous page so far, so we don't need to - * consider uffd-wp bit when zap. For more information, - * see zap_install_uffd_wp_if_needed(). - */ - WARN_ON_ONCE(!vma_is_anonymous(vma)); - rss[mm_counter(folio)]--; - if (is_device_private_entry(entry)) - folio_remove_rmap_pte(folio, page, vma); - folio_put(folio); - } else if (!non_swap_entry(entry)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = swap_pte_batch(pte, max_nr, ptent); - /* Genuine swap entries, hence a private anon pages */ - if (!should_zap_cows(details)) - continue; - rss[MM_SWAPENTS] -= nr; - free_swap_and_cache_nr(entry, nr); - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); - if (!should_zap_folio(details, folio)) - continue; - rss[mm_counter(folio)]--; - } else if (pte_marker_entry_uffd_wp(entry)) { - /* - * For anon: always drop the marker; for file: only - * drop the marker if explicitly requested. - */ - if (!vma_is_anonymous(vma) && - !zap_drop_markers(details)) - continue; - } else if (is_guard_swp_entry(entry)) { - /* - * Ordinary zapping should not remove guard PTE - * markers. Only do so if we should remove PTE markers - * in general. 
- */ - if (!zap_drop_markers(details)) - continue; - } else if (is_hwpoison_entry(entry) || - is_poisoned_swp_entry(entry)) { - if (!should_zap_cows(details)) - continue; - } else { - /* We should have covered all the swap entry types */ - pr_alert("unrecognized swap entry 0x%lx\n", entry.val); - WARN_ON_ONCE(1); - } - clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); + if (can_reclaim_pt && addr == end) + direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); @@ -1713,6 +1768,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (force_flush) tlb_flush_mmu(tlb); + if (addr != end) { + cond_resched(); + force_flush = false; + force_break = false; + goto retry; + } + + if (can_reclaim_pt) { + if (direct_reclaim) + free_pte(mm, start, tlb, pmdval); + else + try_to_free_pte(mm, pmd, start, tlb); + } + return addr; } @@ -1935,7 +2004,6 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, struct mmu_notifier_range range; struct mmu_gather tlb; - lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); @@ -3013,7 +3081,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, { return __apply_to_page_range(mm, addr, size, fn, data, false); } -EXPORT_SYMBOL_GPL(apply_to_existing_page_range); /* * handle_pte_fault chooses page fault handler according to an entry which was @@ -4189,8 +4256,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); folio_put(folio); } + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order = next_order(&orders, order); } @@ -5625,7 +5694,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) ignore_writable = true; /* Migrate to the requested node */ - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); @@ -6069,7 +6138,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, } /* - * By the time we get here, we already hold the mm semaphore + * By the time we get here, we already hold either the VMA lock or the + * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). @@ -6185,7 +6255,7 @@ static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_r /* * Helper for page fault handling. * - * This is kind of equivalend to "mmap_read_lock()" followed + * This is kind of equivalent to "mmap_read_lock()" followed * by "find_extend_vma()", except it's a lot more careful about * the locking (and will drop the lock on failure). * @@ -6258,6 +6328,88 @@ fail: #endif #ifdef CONFIG_PER_VMA_LOCK +static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) +{ + unsigned int tgt_refcnt = VMA_LOCK_OFFSET; + + /* Additional refcnt if the vma is attached. */ + if (!detaching) + tgt_refcnt++; + + /* + * If vma is detached then only vma_mark_attached() can raise the + * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). 
+ */ + if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) + return false; + + rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); + rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, + refcount_read(&vma->vm_refcnt) == tgt_refcnt, + TASK_UNINTERRUPTIBLE); + lock_acquired(&vma->vmlock_dep_map, _RET_IP_); + + return true; +} + +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) +{ + *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); +} + +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) +{ + bool locked; + + /* + * __vma_enter_locked() returns false immediately if the vma is not + * attached, otherwise it waits until refcnt is indicating that vma + * is attached with no readers. + */ + locked = __vma_enter_locked(vma, false); + + /* + * We should use WRITE_ONCE() here because we can have concurrent reads + * from the early lockless pessimistic check in vma_start_read(). + * We don't really care about the correctness of that early check, but + * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. + */ + WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); + + if (locked) { + bool detached; + + __vma_exit_locked(vma, &detached); + VM_BUG_ON_VMA(detached, vma); /* vma should remain attached */ + } +} +EXPORT_SYMBOL_GPL(__vma_start_write); + +void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* + * We are the only writer, so no need to use vma_refcount_put(). + * The condition below is unlikely because the vma has been already + * write-locked and readers can increment vm_refcnt only temporarily + * before they check vm_lock_seq, realize the vma is locked and drop + * back the vm_refcnt. That is a narrow window for observing a raised + * vm_refcnt. + */ + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + /* Wait until vma is detached with no readers. */ + if (__vma_enter_locked(vma, true)) { + bool detached; + + __vma_exit_locked(vma, &detached); + VM_BUG_ON_VMA(!detached, vma); + } + } +} + /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the @@ -6270,21 +6422,13 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, struct vm_area_struct *vma; rcu_read_lock(); -retry: vma = mas_walk(&mas); if (!vma) goto inval; - if (!vma_start_read(vma)) + if (!vma_start_read(mm, vma)) goto inval; - /* Check if the VMA got isolated after we found it */ - if (vma->detached) { - vma_end_read(vma); - count_vm_vma_lock_event(VMA_LOCK_MISS); - /* The area was replaced with another one */ - goto retry; - } /* * At this point, we have a stable reference to a VMA: The VMA is * locked and we know it hasn't already been isolated. @@ -6292,8 +6436,9 @@ retry: * fields are accessible for RCU readers. */ - /* Check since vm_start/vm_end might change before we lock the VMA */ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + /* Check if the vma we locked is the right one. 
*/ + if (unlikely(vma->vm_mm != mm || + address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); @@ -6961,7 +7106,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc) void ptlock_free(struct ptdesc *ptdesc) { - kmem_cache_free(page_ptl_cachep, ptdesc->ptl); + if (ptdesc->ptl) + kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c43b4e7fb298..e3655f07dd6e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -219,11 +219,30 @@ void put_online_mems(void) bool movable_node_enabled = false; -#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -int mhp_default_online_type = MMOP_OFFLINE; -#else -int mhp_default_online_type = MMOP_ONLINE; -#endif +static int mhp_default_online_type = -1; +int mhp_get_default_online_type(void) +{ + if (mhp_default_online_type >= 0) + return mhp_default_online_type; + + if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE)) + mhp_default_online_type = MMOP_OFFLINE; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO)) + mhp_default_online_type = MMOP_ONLINE; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL)) + mhp_default_online_type = MMOP_ONLINE_KERNEL; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE)) + mhp_default_online_type = MMOP_ONLINE_MOVABLE; + else + mhp_default_online_type = MMOP_OFFLINE; + + return mhp_default_online_type; +} + +void mhp_set_default_online_type(int online_type) +{ + mhp_default_online_type = online_type; +} static int __init setup_memhp_default_state(char *str) { @@ -650,6 +669,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * this and the first chunk to online will be pageblock_nr_pages. */ for (pfn = start_pfn; pfn < end_pfn;) { + struct page *page = pfn_to_page(pfn); int order; /* @@ -664,7 +684,14 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) else order = MAX_PAGE_ORDER; - (*online_page_callback)(pfn_to_page(pfn), order); + /* + * Exposing the page to the buddy by freeing can cause + * issues with debug_pagealloc enabled: some archs don't + * like double-unmappings. So treat them like any pages that + * were allocated from the buddy. 
+ */ + debug_pagealloc_map_pages(page, 1 << order); + (*online_page_callback)(page, order); pfn += (1UL << order); } @@ -1320,7 +1347,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { - mem->online_type = mhp_default_online_type; + mem->online_type = mhp_get_default_online_type(); return device_online(&mem->dev); } @@ -1567,7 +1594,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) merge_system_ram_resource(res); /* online pages if requested */ - if (mhp_default_online_type != MMOP_OFFLINE) + if (mhp_get_default_online_type() != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; @@ -1830,7 +1857,7 @@ put_folio: nodemask_t nmask = node_states[N_MEMORY]; struct migration_target_control mtc = { .nmask = &nmask, - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, .reason = MR_MEMORY_HOTPLUG, }; int ret; @@ -1992,8 +2019,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - MEMORY_OFFLINE | REPORT_FAILURE, - GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL); + MEMORY_OFFLINE | REPORT_FAILURE); if (ret) { reason = "failure to isolate range"; goto failed_removal_pcplists_disabled; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 162407fbf2bc..bbaadbeeb291 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -647,7 +647,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, */ if ((flags & MPOL_MF_MOVE_ALL) || (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) - if (!isolate_hugetlb(folio, qp->pagelist)) + if (!folio_isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); @@ -2205,9 +2205,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask); + page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) - page = __alloc_pages_noprof(gfp, order, nid, NULL); + page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); return page; } @@ -2222,7 +2222,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, +static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; @@ -2253,8 +2253,9 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. 
*/ - page = __alloc_pages_node_noprof(nid, - gfp | __GFP_THISNODE | __GFP_NORETRY, order); + page = __alloc_frozen_pages_noprof( + gfp | __GFP_THISNODE | __GFP_NORETRY, order, + nid, NULL); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* @@ -2266,7 +2267,7 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, } } - page = __alloc_pages_noprof(gfp, order, nid, nodemask); + page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { @@ -2285,8 +2286,13 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { - return page_rmappable_folio(alloc_pages_mpol_noprof(gfp | __GFP_COMP, - order, pol, ilx, nid)); + struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, + ilx, nid); + if (!page) + return NULL; + + set_page_refcounted(page); + return page_rmappable_folio(page); } /** @@ -2300,7 +2306,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and - * excepting where direct use of alloc_pages_mpol() is more appropriate. + * excepting where direct use of folio_alloc_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. */ @@ -2321,6 +2327,21 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct } EXPORT_SYMBOL(vma_alloc_folio_noprof); +struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) +{ + struct mempolicy *pol = &default_policy; + + /* + * No reference counting needed for current->mempolicy + * nor system default_policy + */ + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); + + return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, + numa_node_id()); +} + /** * alloc_pages - Allocate pages. * @gfp: GFP flags. 
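The hunks above make alloc_pages_mpol() hand back frozen pages (refcount of zero) and add alloc_frozen_pages_noprof(); folio_alloc_mpol_noprof() re-inflates the count with set_page_refcounted() before exposing the folio. A minimal sketch of the intended pairing for a core-mm caller that never shares the page, assuming the usual alloc_hooks() wrapper alloc_frozen_pages() accompanies the _noprof variant and that free_frozen_pages() (the renamed free_unref_page(), later in this patch) is reachable via mm/internal.h:

	/* Sketch only: the wrapper names are assumptions, see the note above. */
	static void *grab_scratch_buffer(gfp_t gfp, unsigned int order)
	{
		struct page *page = alloc_frozen_pages(gfp, order);

		/* page_ref_count(page) == 0 here; nobody else may take a reference. */
		return page ? page_address(page) : NULL;
	}

	static void drop_scratch_buffer(void *addr, unsigned int order)
	{
		if (addr)
			free_frozen_pages(virt_to_page(addr), order);
	}

This skips the set_page_refcounted()/put_page_testzero() round-trip that __alloc_pages_noprof() and __free_pages() still perform for refcounted users.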
@@ -2337,17 +2358,11 @@ EXPORT_SYMBOL(vma_alloc_folio_noprof); */ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { - struct mempolicy *pol = &default_policy; + struct page *page = alloc_frozen_pages_noprof(gfp, order); - /* - * No reference counting needed for current->mempolicy - * nor system default_policy - */ - if (!in_interrupt() && !(gfp & __GFP_THISNODE)) - pol = get_task_policy(current); - - return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX, - numa_node_id()); + if (page) + set_page_refcounted(page); + return page; } EXPORT_SYMBOL(alloc_pages_noprof); @@ -2357,7 +2372,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) } EXPORT_SYMBOL(folio_alloc_noprof); -static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, +static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2376,13 +2391,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, if (delta) { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, - nr_pages_per_node + 1, NULL, + nr_pages_per_node + 1, page_array); delta--; } else { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, - nr_pages_per_node, NULL, page_array); + nr_pages_per_node, page_array); } page_array += nr_allocated; @@ -2392,7 +2407,7 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } -static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, +static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2431,7 +2446,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, if (weight && node_isset(node, nodes)) { node_pages = min(rem_pages, weight); nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, - NULL, page_array); + page_array); page_array += nr_allocated; total_allocated += nr_allocated; /* if that's all the pages, no need to interleave */ @@ -2494,7 +2509,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, if (!node_pages) break; nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, - NULL, page_array); + page_array); page_array += nr_allocated; total_allocated += nr_allocated; if (total_allocated == nr_pages) @@ -2507,7 +2522,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, return total_allocated; } -static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, +static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2518,11 +2533,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, - nr_pages, NULL, page_array); + nr_pages, page_array); if (nr_allocated < nr_pages) nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, - nr_pages - nr_allocated, NULL, + nr_pages - nr_allocated, page_array + nr_allocated); return nr_allocated; } @@ -2533,7 +2548,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, * It can accelerate memory allocation especially interleaving * allocate memory. 
*/ -unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; @@ -2544,21 +2559,21 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) - return alloc_pages_bulk_array_interleave(gfp, pol, + return alloc_pages_bulk_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) - return alloc_pages_bulk_array_weighted_interleave( + return alloc_pages_bulk_weighted_interleave( gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) - return alloc_pages_bulk_array_preferred_many(gfp, + return alloc_pages_bulk_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return alloc_pages_bulk_noprof(gfp, nid, nodemask, - nr_pages, NULL, page_array); + nr_pages, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) diff --git a/mm/migrate.c b/mm/migrate.c index cc68583c86f9..be9e3b48cd62 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -68,10 +68,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) if (!folio) goto out; - if (unlikely(folio_test_slab(folio))) - goto out_putfolio; - /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ - smp_rmb(); /* * Check movable flag before taking the page lock because * we use non-atomic bitops on newly allocated page flags so @@ -79,10 +75,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) */ if (unlikely(!__folio_test_movable(folio))) goto out_putfolio; - /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ - smp_rmb(); - if (unlikely(folio_test_slab(folio))) - goto out_putfolio; /* * As movable pages are not isolated from LRU lists, concurrent @@ -136,7 +128,7 @@ static void putback_movable_folio(struct folio *folio) * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() - * and isolate_hugetlb(). + * and folio_isolate_hugetlb(). */ void putback_movable_pages(struct list_head *l) { @@ -145,7 +137,7 @@ void putback_movable_pages(struct list_head *l) list_for_each_entry_safe(folio, folio2, l, lru) { if (unlikely(folio_test_hugetlb(folio))) { - folio_putback_active_hugetlb(folio); + folio_putback_hugetlb(folio); continue; } list_del(&folio->lru); @@ -177,7 +169,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list) bool isolated, lru; if (folio_test_hugetlb(folio)) - return isolate_hugetlb(folio, list); + return folio_isolate_hugetlb(folio, list); lru = !__folio_test_movable(folio); if (lru) @@ -1262,7 +1254,10 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, */ switch (mode) { case MIGRATE_SYNC: - break; + if (!src->mapping || + !mapping_writeback_indeterminate(src->mapping)) + break; + fallthrough; default: rc = -EBUSY; goto out; @@ -1459,7 +1454,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. 
*/ - folio_putback_active_hugetlb(src); + folio_putback_hugetlb(src); return MIGRATEPAGE_SUCCESS; } @@ -1542,19 +1537,19 @@ out_unlock: folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) - folio_putback_active_hugetlb(src); + folio_putback_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); /* - * If migration was not successful and there's a freeing callback, use - * it. Otherwise, put_page() will drop the reference grabbed during - * isolation. + * If migration was not successful and there's a freeing callback, + * return the folio to that special allocator. Otherwise, simply drop + * our additional reference. */ if (put_new_folio) put_new_folio(dst, private); else - folio_putback_active_hugetlb(dst); + folio_put(dst); return rc; } @@ -2208,7 +2203,7 @@ static int __add_folio_for_migration(struct folio *folio, int node, return -EACCES; if (folio_test_hugetlb(folio)) { - if (isolate_hugetlb(folio, pagelist)) + if (folio_isolate_hugetlb(folio, pagelist)) return 1; } else if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, pagelist); @@ -2683,8 +2678,7 @@ int migrate_misplaced_folio_prepare(struct folio *folio, * elevated reference count on the folio. This function will un-isolate the * folio, dereferencing the folio before returning. */ -int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, - int node) +int migrate_misplaced_folio(struct folio *folio, int node) { pg_data_t *pgdat = NODE_DATA(node); int nr_remaining; diff --git a/mm/mm_init.c b/mm/mm_init.c index 24b68b425afb..2630cc30147e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1585,13 +1585,17 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, { void *ptr; + /* + * Kmemleak will explicitly scan mem_map by traversing all valid + * `struct *page`,so memblock does not need to be added to the scan list. + */ if (exact_nid) ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, - MEMBLOCK_ALLOC_ACCESSIBLE, + MEMBLOCK_ALLOC_NOLEAKTRACE, nid); else ptr = memblock_alloc_try_nid_raw(size, align, min_addr, - MEMBLOCK_ALLOC_ACCESSIBLE, + MEMBLOCK_ALLOC_NOLEAKTRACE, nid); if (ptr && size > 0) diff --git a/mm/mmap.c b/mm/mmap.c index aec208f90337..7aa36216ecc0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -111,8 +111,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len) return mlock_future_ok(current->mm, current->mm->def_flags, len) ? 0 : -EAGAIN; } -static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, - unsigned long addr, unsigned long request, unsigned long flags); + SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; @@ -278,8 +277,62 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, return true; } -/* +/** + * do_mmap() - Perform a userland memory mapping into the current process + * address space of length @len with protection bits @prot, mmap flags @flags + * (from which VMA flags will be inferred), and any additional VMA flags to + * apply @vm_flags. If this is a file-backed mapping then the file is specified + * in @file and page offset into the file via @pgoff. + * + * This function does not perform security checks on the file and assumes, if + * @uf is non-NULL, the caller has provided a list head to track unmap events + * for userfaultfd @uf. + * + * It also simply indicates whether memory population is required by setting + * @populate, which must be non-NULL, expecting the caller to actually perform + * this task itself if appropriate. 
+ * + * This function will invoke architecture-specific (and if provided and + * relevant, file system-specific) logic to determine the most appropriate + * unmapped area in which to place the mapping if not MAP_FIXED. + * + * Callers which require userland mmap() behaviour should invoke vm_mmap(), + * which is also exported for module use. + * + * Those which require this behaviour less security checks, userfaultfd and + * populate behaviour, and who handle the mmap write lock themselves, should + * call this function. + * + * Note that the returned address may reside within a merged VMA if an + * appropriate merge were to take place, so it doesn't necessarily specify the + * start of a VMA, rather only the start of a valid mapped range of length + * @len bytes, rounded down to the nearest page size. + * * The caller must write-lock current->mm->mmap_lock. + * + * @file: An optional struct file pointer describing the file which is to be + * mapped, if a file-backed mapping. + * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the + * address at which to perform this mapping. See mmap (2) for details. Must be + * page-aligned. + * @len: The length of the mapping. Will be page-aligned and must be at least 1 + * page in size. + * @prot: Protection bits describing access required to the mapping. See mmap + * (2) for details. + * @flags: Flags specifying how the mapping should be performed, see mmap (2) + * for details. + * @vm_flags: VMA flags which should be set by default, or 0 otherwise. + * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise. + * @populate: A pointer to a value which will be set to 0 if no population of + * the range is required, or the number of bytes to populate if it is. Must be + * non-NULL. See mmap (2) for details as to under what circumstances population + * of the range occurs. + * @uf: An optional pointer to a list head to track userfaultfd unmap events + * should unmapping events arise. If provided, it is up to the caller to manage + * this. + * + * Returns: Either an error, or the address at which the requested mapping has + * been performed. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -292,6 +345,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, *populate = 0; + mmap_assert_write_locked(mm); + if (!len) return -EINVAL; @@ -369,8 +424,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); - unsigned int seals = memfd_file_seals(file); unsigned long flags_mask; + int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; @@ -410,8 +465,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); - else if (is_readonly_sealed(seals, vm_flags)) - vm_flags &= ~VM_MAYWRITE; fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) @@ -431,6 +484,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr, default: return -EINVAL; } + + /* + * Check to see if we are violating any seals and update VMA + * flags if necessary to avoid future seal violations. 
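As a concrete illustration of the calling convention documented above, roughly what vm_mmap_pgoff() does once the security hook is out of the way (a hedged sketch; parameter names are illustrative and error handling is trimmed):

	static unsigned long example_mmap(struct file *file, unsigned long hint,
					  unsigned long len, unsigned long prot,
					  unsigned long flags, unsigned long pgoff)
	{
		struct mm_struct *mm = current->mm;
		unsigned long populate = 0;
		unsigned long addr;
		LIST_HEAD(uf);

		if (mmap_write_lock_killable(mm))
			return -EINTR;
		addr = do_mmap(file, hint, len, prot, flags, /* vm_flags */ 0,
			       pgoff, &populate, &uf);
		mmap_write_unlock(mm);

		/* Caller-owned follow-up work, as the comment above requires. */
		userfaultfd_unmap_complete(mm, &uf);
		if (!IS_ERR_VALUE(addr) && populate)
			mm_populate(addr, populate);
		return addr;
	}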
+ */ + err = memfd_check_seals_mmap(file, &vm_flags); + if (err) + return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: @@ -581,115 +642,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ -/** - * unmapped_area() - Find an area between the low_limit and the high_limit with - * the correct alignment and offset, all from @info. Note: current->mm is used - * for the search. - * - * @info: The unmapped area information including the range [low_limit - - * high_limit), the alignment offset and mask. - * - * Return: A memory address or -ENOMEM. - */ -static unsigned long unmapped_area(struct vm_unmapped_area_info *info) -{ - unsigned long length, gap; - unsigned long low_limit, high_limit; - struct vm_area_struct *tmp; - VMA_ITERATOR(vmi, current->mm, 0); - - /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask + info->start_gap; - if (length < info->length) - return -ENOMEM; - - low_limit = info->low_limit; - if (low_limit < mmap_min_addr) - low_limit = mmap_min_addr; - high_limit = info->high_limit; -retry: - if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) - return -ENOMEM; - - /* - * Adjust for the gap first so it doesn't interfere with the - * later alignment. The first step is the minimum needed to - * fulill the start gap, the next steps is the minimum to align - * that. It is the minimum needed to fulill both. - */ - gap = vma_iter_addr(&vmi) + info->start_gap; - gap += (info->align_offset - gap) & info->align_mask; - tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ - if (vm_start_gap(tmp) < gap + length - 1) { - low_limit = tmp->vm_end; - vma_iter_reset(&vmi); - goto retry; - } - } else { - tmp = vma_prev(&vmi); - if (tmp && vm_end_gap(tmp) > gap) { - low_limit = vm_end_gap(tmp); - vma_iter_reset(&vmi); - goto retry; - } - } - - return gap; -} - -/** - * unmapped_area_topdown() - Find an area between the low_limit and the - * high_limit with the correct alignment and offset at the highest available - * address, all from @info. Note: current->mm is used for the search. - * - * @info: The unmapped area information including the range [low_limit - - * high_limit), the alignment offset and mask. - * - * Return: A memory address or -ENOMEM. 
- */ -static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) -{ - unsigned long length, gap, gap_end; - unsigned long low_limit, high_limit; - struct vm_area_struct *tmp; - VMA_ITERATOR(vmi, current->mm, 0); - - /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask + info->start_gap; - if (length < info->length) - return -ENOMEM; - - low_limit = info->low_limit; - if (low_limit < mmap_min_addr) - low_limit = mmap_min_addr; - high_limit = info->high_limit; -retry: - if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) - return -ENOMEM; - - gap = vma_iter_end(&vmi) - info->length; - gap -= (gap - info->align_offset) & info->align_mask; - gap_end = vma_iter_end(&vmi); - tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ - if (vm_start_gap(tmp) < gap_end) { - high_limit = vm_start_gap(tmp); - vma_iter_reset(&vmi); - goto retry; - } - } else { - tmp = vma_prev(&vmi); - if (tmp && vm_end_gap(tmp) > gap) { - high_limit = tmp->vm_start; - vma_iter_reset(&vmi); - goto retry; - } - } - - return gap; -} - /* * Determine if the allocation needs to ensure that there is no * existing mapping within it's guard gaps, for use as start_gap. @@ -989,211 +941,6 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, return vma; } -/* - * Verify that the stack growth is acceptable and - * update accounting. This is shared with both the - * grow-up and grow-down cases. - */ -static int acct_stack_growth(struct vm_area_struct *vma, - unsigned long size, unsigned long grow) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long new_start; - - /* address space limit tests */ - if (!may_expand_vm(mm, vma->vm_flags, grow)) - return -ENOMEM; - - /* Stack limit test */ - if (size > rlimit(RLIMIT_STACK)) - return -ENOMEM; - - /* mlock limit tests */ - if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) - return -ENOMEM; - - /* Check to ensure the stack will not grow into a hugetlb-only region */ - new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : - vma->vm_end - size; - if (is_hugepage_only_range(vma->vm_mm, new_start, size)) - return -EFAULT; - - /* - * Overcommit.. This must be the final test, as it will - * update security statistics. - */ - if (security_vm_enough_memory_mm(mm, grow)) - return -ENOMEM; - - return 0; -} - -#if defined(CONFIG_STACK_GROWSUP) -/* - * PA-RISC uses this for its stack. - * vma is the last one with address > vma->vm_end. Have to extend vma. - */ -static int expand_upwards(struct vm_area_struct *vma, unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next; - unsigned long gap_addr; - int error = 0; - VMA_ITERATOR(vmi, mm, vma->vm_start); - - if (!(vma->vm_flags & VM_GROWSUP)) - return -EFAULT; - - mmap_assert_write_locked(mm); - - /* Guard against exceeding limits of the address space. */ - address &= PAGE_MASK; - if (address >= (TASK_SIZE & PAGE_MASK)) - return -ENOMEM; - address += PAGE_SIZE; - - /* Enforce stack_guard_gap */ - gap_addr = address + stack_guard_gap; - - /* Guard against overflow */ - if (gap_addr < address || gap_addr > TASK_SIZE) - gap_addr = TASK_SIZE; - - next = find_vma_intersection(mm, vma->vm_end, gap_addr); - if (next && vma_is_accessible(next)) { - if (!(next->vm_flags & VM_GROWSUP)) - return -ENOMEM; - /* Check that both stack segments have the same anon_vma? 
*/ - } - - if (next) - vma_iter_prev_range_limit(&vmi, address); - - vma_iter_config(&vmi, vma->vm_start, address); - if (vma_iter_prealloc(&vmi, vma)) - return -ENOMEM; - - /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) { - vma_iter_free(&vmi); - return -ENOMEM; - } - - /* Lock the VMA before expanding to prevent concurrent page faults */ - vma_start_write(vma); - /* We update the anon VMA tree. */ - anon_vma_lock_write(vma->anon_vma); - - /* Somebody else might have raced and expanded it already */ - if (address > vma->vm_end) { - unsigned long size, grow; - - size = address - vma->vm_start; - grow = (address - vma->vm_end) >> PAGE_SHIFT; - - error = -ENOMEM; - if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { - error = acct_stack_growth(vma, size, grow); - if (!error) { - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, grow); - anon_vma_interval_tree_pre_update_vma(vma); - vma->vm_end = address; - /* Overwrite old entry in mtree. */ - vma_iter_store(&vmi, vma); - anon_vma_interval_tree_post_update_vma(vma); - - perf_event_mmap(vma); - } - } - } - anon_vma_unlock_write(vma->anon_vma); - vma_iter_free(&vmi); - validate_mm(mm); - return error; -} -#endif /* CONFIG_STACK_GROWSUP */ - -/* - * vma is the first one with address < vma->vm_start. Have to extend vma. - * mmap_lock held for writing. - */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *prev; - int error = 0; - VMA_ITERATOR(vmi, mm, vma->vm_start); - - if (!(vma->vm_flags & VM_GROWSDOWN)) - return -EFAULT; - - mmap_assert_write_locked(mm); - - address &= PAGE_MASK; - if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) - return -EPERM; - - /* Enforce stack_guard_gap */ - prev = vma_prev(&vmi); - /* Check that both stack segments have the same anon_vma? */ - if (prev) { - if (!(prev->vm_flags & VM_GROWSDOWN) && - vma_is_accessible(prev) && - (address - prev->vm_end < stack_guard_gap)) - return -ENOMEM; - } - - if (prev) - vma_iter_next_range_limit(&vmi, vma->vm_start); - - vma_iter_config(&vmi, address, vma->vm_end); - if (vma_iter_prealloc(&vmi, vma)) - return -ENOMEM; - - /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) { - vma_iter_free(&vmi); - return -ENOMEM; - } - - /* Lock the VMA before expanding to prevent concurrent page faults */ - vma_start_write(vma); - /* We update the anon VMA tree. */ - anon_vma_lock_write(vma->anon_vma); - - /* Somebody else might have raced and expanded it already */ - if (address < vma->vm_start) { - unsigned long size, grow; - - size = vma->vm_end - address; - grow = (vma->vm_start - address) >> PAGE_SHIFT; - - error = -ENOMEM; - if (grow <= vma->vm_pgoff) { - error = acct_stack_growth(vma, size, grow); - if (!error) { - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, grow); - anon_vma_interval_tree_pre_update_vma(vma); - vma->vm_start = address; - vma->vm_pgoff -= grow; - /* Overwrite old entry in mtree. */ - vma_iter_store(&vmi, vma); - anon_vma_interval_tree_post_update_vma(vma); - - perf_event_mmap(vma); - } - } - } - anon_vma_unlock_write(vma->anon_vma); - vma_iter_free(&vmi); - validate_mm(mm); - return error; -} - /* enforced gap between the expanding stack and other mappings. 
*/ unsigned long stack_guard_gap = 256UL<f_mapping); - - if (error) - return error; - writable_file_mapping = true; - } - - ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); - - /* Clear our write mapping regardless of error. */ - if (writable_file_mapping) - mapping_unmap_writable(file->f_mapping); - - validate_mm(current->mm); - return ret; -} - -static int __vm_munmap(unsigned long start, size_t len, bool unlock) -{ - int ret; - struct mm_struct *mm = current->mm; - LIST_HEAD(uf); - VMA_ITERATOR(vmi, mm, start); - - if (mmap_write_lock_killable(mm)) - return -EINTR; - - ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); - if (ret || !unlock) - mmap_write_unlock(mm); - - userfaultfd_unmap_complete(mm, &uf); - return ret; -} - int vm_munmap(unsigned long start, size_t len) { return __vm_munmap(start, len, false); @@ -1512,88 +1207,6 @@ out: return ret; } -/* - * do_brk_flags() - Increase the brk vma if the flags match. - * @vmi: The vma iterator - * @addr: The start address - * @len: The length of the increase - * @vma: The vma, - * @flags: The VMA Flags - * - * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags - * do not match then create a new anonymous VMA. Eventually we may be able to - * do some brk-specific accounting here. - */ -static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, unsigned long flags) -{ - struct mm_struct *mm = current->mm; - - /* - * Check against address space limits by the changed size - * Note: This happens *after* clearing old mappings in some code paths. - */ - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) - return -ENOMEM; - - if (mm->map_count > sysctl_max_map_count) - return -ENOMEM; - - if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) - return -ENOMEM; - - /* - * Expand the existing vma if possible; Note that singular lists do not - * occur after forking, so the expand will only happen on new VMAs. - */ - if (vma && vma->vm_end == addr) { - VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); - - vmg.prev = vma; - /* vmi is positioned at prev, which this mode expects. */ - vmg.merge_flags = VMG_FLAG_JUST_EXPAND; - - if (vma_merge_new_range(&vmg)) - goto out; - else if (vmg_nomem(&vmg)) - goto unacct_fail; - } - - if (vma) - vma_iter_next_range(vmi); - /* create a vma struct for an anonymous mapping */ - vma = vm_area_alloc(mm); - if (!vma) - goto unacct_fail; - - vma_set_anonymous(vma); - vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); - vm_flags_init(vma, flags); - vma->vm_page_prot = vm_get_page_prot(flags); - vma_start_write(vma); - if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) - goto mas_store_fail; - - mm->map_count++; - validate_mm(mm); - ksm_add_vma(vma); -out: - perf_event_mmap(vma); - mm->total_vm += len >> PAGE_SHIFT; - mm->data_vm += len >> PAGE_SHIFT; - if (flags & VM_LOCKED) - mm->locked_vm += (len >> PAGE_SHIFT); - vm_flags_set(vma, VM_SOFTDIRTY); - return 0; - -mas_store_fail: - vm_area_free(vma); -unacct_fail: - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; -} - int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; @@ -1664,7 +1277,6 @@ void exit_mmap(struct mm_struct *mm) goto destroy; } - lru_add_drain(); flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? 
but nobody should be looking */ @@ -1693,7 +1305,8 @@ void exit_mmap(struct mm_struct *mm) do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - remove_vma(vma, /* unreachable = */ true); + vma_mark_detached(vma); + remove_vma(vma); count++; cond_resched(); vma = vma_next(&vmi); @@ -2107,7 +1720,6 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) vma, new_start, length, false, true)) return -ENOMEM; - lru_add_drain(); tlb_gather_mmu(&tlb, mm); next = vma_next(&vmi); if (new_end > old_start) { @@ -2132,3 +1744,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) /* Shrink the vma to just the new range */ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } + +#ifdef CONFIG_MMU +/* + * Obtain a read lock on mm->mmap_lock, if the specified address is below the + * start of the VMA, the intent is to perform a write, and it is a + * downward-growing stack, then attempt to expand the stack to contain it. + * + * This function is intended only for obtaining an argument page from an ELF + * image, and is almost certainly NOT what you want to use for any other + * purpose. + * + * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the + * VMA referenced must not be linked in any user-visible tree, i.e. it must be a + * new VMA being mapped. + * + * The function assumes that addr is either contained within the VMA or below + * it, and makes no attempt to validate this value beyond that. + * + * Returns true if the read lock was obtained and a stack was perhaps expanded, + * false if the stack expansion failed. + * + * On stack expansion the function temporarily acquires an mmap write lock + * before downgrading it. + */ +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, + struct vm_area_struct *new_vma, + unsigned long addr, bool write) +{ + if (!write || addr >= new_vma->vm_start) { + mmap_read_lock(mm); + return true; + } + + if (!(new_vma->vm_flags & VM_GROWSDOWN)) + return false; + + mmap_write_lock(mm); + if (expand_downwards(new_vma, addr)) { + mmap_write_unlock(mm); + return false; + } + + mmap_write_downgrade(mm); + return true; +} +#else +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write) +{ + return false; +} +#endif diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index f186d57df2c6..e7dbaf96aa17 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); -#ifdef CONFIG_MEMCG - -/* - * Size of the buffer for memcg path names. Ignoring stack trace support, - * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. - */ -#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - do { \ - if (trace_mmap_lock_##type##_enabled()) { \ - char buf[MEMCG_PATH_BUF_SIZE]; \ - get_mm_memcg_path(mm, buf, sizeof(buf)); \ - trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ - } \ - } while (0) - -#else /* !CONFIG_MEMCG */ - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) - -#endif /* CONFIG_MEMCG */ - #ifdef CONFIG_TRACING -#ifdef CONFIG_MEMCG -/* - * Write the given mm_struct's memcg path to a buffer. If the path cannot be - * determined, empty string is written. 
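The mmap_read_lock_maybe_expand() helper added above is, per its own comment, meant only for fetching an argument page during exec. A hedged sketch of such a caller, modelled on fs/exec.c's get_arg_page() (names are illustrative, not part of this patch):

	static struct page *example_get_arg_page(struct linux_binprm *bprm,
						 unsigned long pos, bool write)
	{
		unsigned int gup_flags = write ? FOLL_WRITE : 0;
		struct page *page;

		/*
		 * bprm->vma is brand new and not yet visible to any other
		 * thread, which is what the unlocked vm_start/vm_flags checks
		 * inside mmap_read_lock_maybe_expand() rely on.
		 */
		if (!mmap_read_lock_maybe_expand(bprm->mm, bprm->vma, pos, write))
			return NULL;

		if (get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
					  &page, NULL) <= 0)
			page = NULL;

		mmap_read_unlock(bprm->mm);
		return page;
	}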
- */ -static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) -{ - struct mem_cgroup *memcg; - - buf[0] = '\0'; - memcg = get_mem_cgroup_from_mm(mm); - if (memcg == NULL) - return; - if (memcg->css.cgroup) - cgroup_path(memcg->css.cgroup, buf, buflen); - css_put(&memcg->css); -} - -#endif /* CONFIG_MEMCG */ - /* * Trace calls must be in a separate file, as otherwise there's a circular * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. @@ -69,20 +25,20 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) { - TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); + trace_mmap_lock_start_locking(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { - TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); + trace_mmap_lock_acquire_returned(mm, write, success); } EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) { - TRACE_MMAP_LOCK_EVENT(released, mm, write); + trace_mmap_lock_released(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_released); #endif /* CONFIG_TRACING */ diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 99b3e9408aa0..7aa6f18c500b 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -311,11 +311,34 @@ static inline void tlb_table_invalidate(struct mmu_gather *tlb) } } -static void tlb_remove_table_one(void *table) +#ifdef CONFIG_PT_RECLAIM +static inline void __tlb_remove_table_one_rcu(struct rcu_head *head) +{ + struct ptdesc *ptdesc; + + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + __tlb_remove_table(ptdesc); +} + +static inline void __tlb_remove_table_one(void *table) +{ + struct ptdesc *ptdesc; + + ptdesc = table; + call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu); +} +#else +static inline void __tlb_remove_table_one(void *table) { tlb_remove_table_sync_one(); __tlb_remove_table(table); } +#endif /* CONFIG_PT_RECLAIM */ + +static void tlb_remove_table_one(void *table) +{ + __tlb_remove_table_one(table); +} static void tlb_table_flush(struct mmu_gather *tlb) { diff --git a/mm/mseal.c b/mm/mseal.c index 81d6e980e8a9..c27197ac04e8 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -217,9 +217,9 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) unsigned long end; struct mm_struct *mm = current->mm; - ret = can_do_mseal(flags); - if (ret) - return ret; + /* Verify flags not set. 
*/ + if (flags) + return -EINVAL; start = untagged_addr(start); if (!PAGE_ALIGNED(start)) diff --git a/mm/numa.c b/mm/numa.c index e2eec07707d1..f1787d7713a6 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -37,13 +37,7 @@ void __init alloc_node_data(int nid) void __init alloc_offline_node_data(int nid) { pg_data_t *pgdat; - - pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); - if (!pgdat) - panic("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - - node_data[nid] = pgdat; + node_data[nid] = memblock_alloc_or_panic(sizeof(*pgdat), SMP_CACHE_BYTES); } /* Stub functions: */ diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c index 031fb9961bf7..9d55679d99ce 100644 --- a/mm/numa_emulation.c +++ b/mm/numa_emulation.c @@ -8,11 +8,12 @@ #include #include #include +#include #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -static int emu_nid_to_phys[MAX_NUMNODES]; +int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; int __init numa_emu_cmdline(char *str) @@ -379,6 +380,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); int max_emu_nid, dfl_phys_nid; int i, j, ret; + nodemask_t physnode_mask = numa_nodes_parsed; if (!emu_cmdline) goto no_emu; @@ -395,7 +397,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * split the system RAM into N fake nodes. */ if (strchr(emu_cmdline, 'U')) { - nodemask_t physnode_mask = numa_nodes_parsed; unsigned long n; int nid = 0; @@ -465,9 +466,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) */ max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); - /* commit */ - *numa_meminfo = ei; - /* Make sure numa_nodes_parsed only contains emulated nodes */ nodes_clear(numa_nodes_parsed); for (i = 0; i < ARRAY_SIZE(ei.blk); i++) @@ -475,10 +473,21 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) ei.blk[i].nid != NUMA_NO_NODE) node_set(ei.blk[i].nid, numa_nodes_parsed); - numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys)); + /* fix pxm_to_node_map[] and node_to_pxm_map[] to avoid collision + * with faked numa nodes, particularly during later memory hotplug + * handling, and also update numa_nodes_parsed accordingly. 
+ */ + ret = fix_pxm_node_maps(max_emu_nid); + if (ret < 0) + goto no_emu; + + /* commit */ + *numa_meminfo = ei; + + numa_emu_update_cpu_to_node(emu_nid_to_phys, max_emu_nid + 1); /* make sure all emulated nodes are mapped to a physical node */ - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + for (i = 0; i < max_emu_nid + 1; i++) if (emu_nid_to_phys[i] == NUMA_NO_NODE) emu_nid_to_phys[i] = dfl_phys_nid; @@ -501,12 +510,34 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) numa_set_distance(i, j, dist); } } + for (i = 0; i < numa_distance_cnt; i++) { + for (j = 0; j < numa_distance_cnt; j++) { + int physi, physj; + u8 dist; + + /* distance between fake nodes is already ok */ + if (emu_nid_to_phys[i] != NUMA_NO_NODE && + emu_nid_to_phys[j] != NUMA_NO_NODE) + continue; + if (emu_nid_to_phys[i] != NUMA_NO_NODE) + physi = emu_nid_to_phys[i]; + else + physi = i - max_emu_nid; + if (emu_nid_to_phys[j] != NUMA_NO_NODE) + physj = emu_nid_to_phys[j]; + else + physj = j - max_emu_nid; + dist = phys_dist[physi * numa_dist_cnt + physj]; + numa_set_distance(i, j, dist); + } + } /* free the copied physical distance table */ memblock_free(phys_dist, phys_size); return; no_emu: + numa_nodes_parsed = physnode_mask; /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) emu_nid_to_phys[i] = i; diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index a3877e9bc878..ff4054f4334d 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -7,7 +7,7 @@ #include #include -static int numa_distance_cnt; +int numa_distance_cnt; static u8 *numa_distance; nodemask_t numa_nodes_parsed __initdata; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1c485beb0b93..044ebab2c941 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "internal.h" @@ -430,10 +431,15 @@ static void dump_tasks(struct oom_control *oc) mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); else { struct task_struct *p; + int i = 0; rcu_read_lock(); - for_each_process(p) + for_each_process(p) { + /* Avoid potential softlockup warning */ + if ((++i & 1023) == 0) + touch_softlockup_watchdog(); dump_task(p, oc); + } rcu_read_unlock(); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d9861e42b2bd..4f5970723cf2 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -942,26 +942,25 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); + + /* + * It's very possible that wb_thresh is close to 0 not because the + * device is slow, but that it has remained inactive for long time. + * Honour such devices a reasonable good (hopefully IO efficient) + * threshold, so that the occasional writes won't be blocked and active + * writes can rampup the threshold quickly. + */ + if (thresh > dtc->dirty) { + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100); + else + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8); + } + wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); if (wb_thresh > wb_max_thresh) wb_thresh = wb_max_thresh; - /* - * With strictlimit flag, the wb_thresh is treated as - * a hard limit in balance_dirty_pages() and wb_position_ratio(). 
- * It's possible that wb_thresh is close to zero, not because - * the device is slow, but because it has been inactive. - * To prevent occasional writes from being blocked, we raise wb_thresh. - */ - if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { - unsigned long limit = hard_dirty_limit(dom, dtc->thresh); - u64 wb_scale_thresh = 0; - - if (limit > dtc->dirty) - wb_scale_thresh = (limit - dtc->dirty) / 100; - wb_thresh = max(wb_thresh, min(wb_scale_thresh, wb_max_thresh / 4)); - } - return wb_thresh; } @@ -969,6 +968,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + domain_dirty_avail(&gdtc, true); return __wb_calc_thresh(&gdtc, thresh); } @@ -1145,12 +1145,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; - if (dtc->wb_dirty < 8) { - dtc->pos_ratio = min_t(long long, pos_ratio * 2, - 2 << RATELIMIT_CALC_SHIFT); - return; - } - if (dtc->wb_dirty >= wb_thresh) return; @@ -1221,14 +1215,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) */ if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; - /* - * It's very possible that wb_thresh is close to 0 not because the - * device is slow, but that it has remained inactive for long time. - * Honour such devices a reasonable good (hopefully IO efficient) - * threshold, so that the occasional writes won't be blocked and active - * writes can rampup the threshold quickly. - */ - wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh @@ -1484,17 +1470,10 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". - * - * We rampup dirty_ratelimit forcibly if wb_dirty is low because - * it's possible that wb_thresh is close to zero due to inactivity - * of backing device. */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = dtc->wb_dirty; - if (dtc->wb_dirty < 8) - setpoint = dtc->wb_dirty + 1; - else - setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; + setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 01eab25edf89..c9d5f2450a08 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1295,12 +1295,6 @@ void __meminit __free_pages_core(struct page *page, unsigned int order, set_page_count(p, 0); } - /* - * Freeing the page with debug_pagealloc enabled will try to - * unmap it; some archs don't like double-unmappings, so - * map it first. - */ - debug_pagealloc_map_pages(page, nr_pages); adjust_managed_page_count(page, nr_pages); } else { for (loop = 0; loop < nr_pages; loop++, p++) { @@ -1508,7 +1502,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order, int i; set_page_private(page, 0); - set_page_refcounted(page); arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); @@ -1856,6 +1849,14 @@ static bool can_steal_fallback(unsigned int order, int start_mt) if (order >= pageblock_order) return true; + /* + * Movable pages won't cause permanent fragmentation, so when you alloc + * small pages, you just need to temporarily steal unmovable or + * reclaimable pages that are closest to the request size. 
After a + * while, memory compaction may occur to form large contiguous pages, + * and the next movable allocation may not need to steal. Unmovable and + * reclaimable allocations need to actually steal pages. + */ if (order >= pageblock_order / 2 || start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE || @@ -2592,9 +2593,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return high; } -static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, - struct page *page, int migratetype, - unsigned int order) +static void free_frozen_page_commit(struct zone *zone, + struct per_cpu_pages *pcp, struct page *page, int migratetype, + unsigned int order) { int high, batch; int pindex; @@ -2643,7 +2644,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, /* * Free a pcp page */ -void free_unref_page(struct page *page, unsigned int order) +void free_frozen_pages(struct page *page, unsigned int order) { unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; @@ -2666,20 +2667,20 @@ void free_unref_page(struct page *page, unsigned int order) * get those areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ + zone = page_zone(page); migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; } - zone = page_zone(page); pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, migratetype, order); + free_frozen_page_commit(zone, pcp, page, migratetype, order); pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, FPI_NONE); @@ -2743,7 +2744,7 @@ void free_unref_folios(struct folio_batch *folios) /* * Free isolated pages directly to the - * allocator, see comment in free_unref_page. + * allocator, see comment in free_frozen_pages. */ if (is_migrate_isolate(migratetype)) { free_one_page(zone, &folio->page, pfn, @@ -2774,7 +2775,7 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_unref_page_commit(zone, pcp, &folio->page, migratetype, + free_frozen_page_commit(zone, pcp, &folio->page, migratetype, order); } @@ -3567,7 +3568,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, if (!page) page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - return page; } @@ -4531,28 +4531,23 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array * @gfp: GFP flags for the allocation * @preferred_nid: The preferred NUMA node ID to allocate from * @nodemask: Set of nodes to allocate from, may be NULL - * @nr_pages: The number of pages desired on the list or array - * @page_list: Optional list to store the allocated pages - * @page_array: Optional array to store the pages + * @nr_pages: The number of pages desired in the array + * @page_array: Array to store the pages * * This is a batched version of the page allocator that attempts to - * allocate nr_pages quickly. 
Pages are added to page_list if page_list - * is not NULL, otherwise it is assumed that the page_array is valid. + * allocate nr_pages quickly. Pages are added to the page_array. * - * For lists, nr_pages is the number of pages that should be allocated. - * - * For arrays, only NULL elements are populated with pages and nr_pages + * Note that only NULL elements are populated with pages and nr_pages * is the maximum number of pages that will be stored in the array. * - * Returns the number of pages on the list or array. + * Returns the number of pages in the array. */ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, struct page **page_array) { struct page *page; @@ -4570,7 +4565,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, * Skip populated array elements to determine if any pages need * to be allocated before disabling IRQs. */ - while (page_array && nr_populated < nr_pages && page_array[nr_populated]) + while (nr_populated < nr_pages && page_array[nr_populated]) nr_populated++; /* No pages requested? */ @@ -4578,7 +4573,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, goto out; /* Already populated array? */ - if (unlikely(page_array && nr_pages - nr_populated == 0)) + if (unlikely(nr_pages - nr_populated == 0)) goto out; /* Bulk allocator does not support memcg accounting. */ @@ -4660,7 +4655,7 @@ retry_this_zone: while (nr_populated < nr_pages) { /* Skip existing pages */ - if (page_array && page_array[nr_populated]) { + if (page_array[nr_populated]) { nr_populated++; continue; } @@ -4678,11 +4673,8 @@ retry_this_zone: nr_account++; prep_new_page(page, 0, gfp, 0); - if (page_list) - list_add(&page->lru, page_list); - else - page_array[nr_populated] = page; - nr_populated++; + set_page_refcounted(page); + page_array[nr_populated++] = page; } pcp_spin_unlock(pcp); @@ -4699,14 +4691,8 @@ failed_irq: failed: page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); - if (page) { - if (page_list) - list_add(&page->lru, page_list); - else - page_array[nr_populated] = page; - nr_populated++; - } - + if (page) + page_array[nr_populated++] = page; goto out; } EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); @@ -4714,8 +4700,8 @@ EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); /* * This is the 'heart' of the zoned buddy allocator. 
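With the list variant gone, the bulk allocator is array-only. A hedged round-trip sketch, calling the _noprof spelling shown above directly (real callers would normally go through the matching alloc_hooks() wrapper):

	static void example_bulk_roundtrip(void)
	{
		struct page *pages[8] = { NULL };	/* only NULL slots get filled */
		unsigned long nr, i;

		nr = alloc_pages_bulk_noprof(GFP_KERNEL, numa_node_id(), NULL,
					     ARRAY_SIZE(pages), pages);

		/*
		 * Bulk pages come back with their refcount already set (see the
		 * set_page_refcounted() call above), so plain __free_page() is
		 * the matching release.
		 */
		for (i = 0; i < nr; i++)
			__free_page(pages[i]);
	}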
*/ -struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, - int preferred_nid, nodemask_t *nodemask) +struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4768,7 +4754,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, out: if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { - __free_pages(page, order); + free_frozen_pages(page, order); page = NULL; } @@ -4777,6 +4763,18 @@ out: return page; } +EXPORT_SYMBOL(__alloc_frozen_pages_noprof); + +struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) +{ + struct page *page; + + page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask); + if (page) + set_page_refcounted(page); + return page; +} EXPORT_SYMBOL(__alloc_pages_noprof); struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, @@ -4837,11 +4835,11 @@ void __free_pages(struct page *page, unsigned int order) struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) - free_unref_page(page, order); + free_frozen_pages(page, order); else if (!head) { pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) - free_unref_page(page + (1 << order), order); + free_frozen_pages(page + (1 << order), order); } } EXPORT_SYMBOL(__free_pages); @@ -6270,9 +6268,8 @@ static void alloc_contig_dump_pages(struct list_head *page_list) * @migratetype: using migratetype to filter the type of migration in * trace_mm_alloc_contig_migrate_range_info. */ -int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, - int migratetype) +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end, int migratetype) { /* This function is based on compact_zone() from compaction.c. */ unsigned int nr_reclaimed; @@ -6281,7 +6278,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, int ret = 0; struct migration_target_control mtc = { .nid = zone_to_nid(cc->zone), - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .gfp_mask = cc->gfp_mask, .reason = MR_CONTIG_RANGE, }; struct page *page; @@ -6351,7 +6348,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, return (ret < 0) ? ret : 0; } -static void split_free_pages(struct list_head *list) +static void split_free_pages(struct list_head *list, gfp_t gfp_mask) { int order; @@ -6362,7 +6359,8 @@ static void split_free_pages(struct list_head *list) list_for_each_entry_safe(page, next, &list[order], lru) { int i; - post_alloc_hook(page, order, __GFP_MOVABLE); + post_alloc_hook(page, order, gfp_mask); + set_page_refcounted(page); if (!order) continue; @@ -6376,6 +6374,40 @@ static void split_free_pages(struct list_head *list) } } +static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) +{ + const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO; + const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN; + + /* + * We are given the range to allocate; node, mobility and placement + * hints are irrelevant at this point. We'll simply ignore them. 
+ */ + gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE | + __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE); + + /* + * We only support most reclaim flags (but not NOFAIL/NORETRY), and + * selected action flags. + */ + if (gfp_mask & ~(reclaim_mask | action_mask)) + return -EINVAL; + + /* + * Flags to control page compaction/migration/reclaim, to free up our + * page range. Migratable pages are movable, __GFP_MOVABLE is implied + * for them. + * + * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that + * to not degrade callers. + */ + *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) | + __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; + return 0; +} + /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate @@ -6384,7 +6416,9 @@ static void split_free_pages(struct list_head *list) * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. - * @gfp_mask: GFP mask to use during compaction + * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some + * action and reclaim modifiers are supported. Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. * * The PFN range does not have to be pageblock aligned. The PFN range must * belong to a single zone. @@ -6410,11 +6444,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .no_set_skip_hint = true, - .gfp_mask = current_gfp_context(gfp_mask), .alloc_contig = true, }; INIT_LIST_HEAD(&cc.migratepages); + gfp_mask = current_gfp_context(gfp_mask); + if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) + return -EINVAL; + /* * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may @@ -6436,7 +6473,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, * put back to page allocator so that buddy can use them. */ - ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask); + ret = start_isolate_page_range(start, end, migratetype, 0); if (ret) goto done; @@ -6455,7 +6492,17 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); if (ret && ret != -EBUSY) goto done; - ret = 0; + + /* + * When in-use hugetlb pages are migrated, they may simply be released + * back into the free hugepage pool instead of being returned to the + * buddy system. After the migration of in-use huge pages is completed, + * we will invoke replace_free_hugepage_folios() to ensure that these + * hugepages are properly released to the buddy system. 
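 * (Illustrative note, not part of this patch: e.g. an in-use 2MB hugetlb
 *  folio inside [start, end) that gets migrated has its source folio
 *  returned to the hugetlb free pool, not to buddy; the call below is
 *  expected to replace such in-range free hugepages with ones allocated
 *  elsewhere so the PFNs in the range can actually be handed out.)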
+ */ + ret = replace_free_hugepage_folios(start, end); + if (ret) + goto done; /* * Pages from [start, end) are within a pageblock_nr_pages @@ -6489,7 +6536,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, } if (!(gfp_mask & __GFP_COMP)) { - split_free_pages(cc.freepages); + split_free_pages(cc.freepages, gfp_mask); /* Free head and tail (if any) */ if (start != outer_start) @@ -6502,6 +6549,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); + set_page_refcounted(head); } else { ret = -EINVAL; WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", @@ -6556,7 +6604,9 @@ static bool zone_spans_last_pfn(const struct zone *zone, /** * alloc_contig_pages() -- tries to find and allocate contiguous range of pages * @nr_pages: Number of contiguous pages to allocate - * @gfp_mask: GFP mask to limit search and used during compaction + * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some + * action and reclaim modifiers are supported. Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. * @nid: Target node * @nodemask: Mask for other possible nodes * diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c index 3f7a203d35c6..d2423f30577e 100644 --- a/mm/page_frag_cache.c +++ b/mm/page_frag_cache.c @@ -86,7 +86,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) - free_unref_page(page, compound_order(page)); + free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -138,7 +138,7 @@ refill: goto refill; if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) { - free_unref_page(page, + free_frozen_pages(page, encoded_page_decode_order(encoded_page)); goto refill; } @@ -166,6 +166,6 @@ void page_frag_free(void *addr) struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) - free_unref_page(page, compound_order(page)); + free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free); diff --git a/mm/page_idle.c b/mm/page_idle.c index 41ea77f22011..947c7c7a3728 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -112,7 +112,7 @@ static void page_idle_clear_pte_refs(struct folio *folio) } static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t pos, size_t count) { u64 *out = (u64 *)buf; @@ -157,7 +157,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, } static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t pos, size_t count) { const u64 *in = (u64 *)buf; @@ -193,17 +193,17 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, return (char *)in - buf; } -static struct bin_attribute page_idle_bitmap_attr = +static const struct bin_attribute page_idle_bitmap_attr = __BIN_ATTR(bitmap, 0600, page_idle_bitmap_read, page_idle_bitmap_write, 0); -static struct bin_attribute *page_idle_bin_attrs[] = { +static const struct bin_attribute *const page_idle_bin_attrs[] = { &page_idle_bitmap_attr, NULL, }; static const struct attribute_group page_idle_attr_group = { - .bin_attrs = page_idle_bin_attrs, + .bin_attrs_new = page_idle_bin_attrs, .name = 
"page_idle", }; diff --git a/mm/page_io.c b/mm/page_io.c index 4b4ea8e49cf6..9b983de351f9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -163,7 +163,6 @@ reprobe: page_no = 1; /* force Empty message */ sis->max = page_no; sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; out: return ret; bad_bmap: diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 7e04047977cf..c608e9d72865 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -286,7 +286,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * within a free or in-use page. * @boundary_pfn: pageblock-aligned pfn that a page might cross * @flags: isolation flags - * @gfp_flags: GFP flags used for migrating pages * @isolate_before: isolate the pageblock before the boundary_pfn * @skip_isolation: the flag to skip the pageblock isolation in second * isolate_single_pageblock() @@ -306,8 +305,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * the in-use page then splitting the free page. */ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - gfp_t gfp_flags, bool isolate_before, bool skip_isolation, - int migratetype) + bool isolate_before, bool skip_isolation, int migratetype) { unsigned long start_pfn; unsigned long isolate_pageblock; @@ -444,8 +442,6 @@ failed: * and PageOffline() pages. * REPORT_FAILURE - report details about the failure to * isolate the range - * @gfp_flags: GFP flags used for migrating pages that sit across the - * range boundaries. * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the @@ -478,7 +474,7 @@ failed: * Return: 0 on success and -EBUSY if any part of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags, gfp_t gfp_flags) + int migratetype, int flags) { unsigned long pfn; struct page *page; @@ -489,7 +485,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, + ret = isolate_single_pageblock(isolate_start, flags, false, skip_isolation, migratetype); if (ret) return ret; @@ -498,7 +494,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, skip_isolation = true; /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, + ret = isolate_single_pageblock(isolate_end, flags, true, skip_isolation, migratetype); if (ret) { unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); diff --git a/mm/percpu.c b/mm/percpu.c index d8dd31a2e407..ac61e3fc5f15 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1359,10 +1359,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, /* allocate chunk */ alloc_size = struct_size(chunk, populated, BITS_TO_LONGS(region_size >> PAGE_SHIFT)); - chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); INIT_LIST_HEAD(&chunk->list); @@ -1374,24 +1371,14 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_bits = pcpu_chunk_map_bits(chunk); alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); - 
chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->alloc_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); - chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->bound_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); - chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->md_blocks) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); - + chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); #ifdef NEED_PCPUOBJ_EXT /* first chunk is free to use */ chunk->obj_exts = NULL; @@ -2595,28 +2582,16 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* process group information and build config tables accordingly */ alloc_size = ai->nr_groups * sizeof(group_offsets[0]); - group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!group_offsets) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = ai->nr_groups * sizeof(group_sizes[0]); - group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!group_sizes) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_map[0]); - unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!unit_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_off[0]); - unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!unit_off) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2685,12 +2660,9 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_free_slot = pcpu_sidelined_slot + 1; pcpu_to_depopulate_slot = pcpu_free_slot + 1; pcpu_nr_slots = pcpu_to_depopulate_slot + 1; - pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots * + pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]), SMP_CACHE_BYTES); - if (!pcpu_chunk_lists) - panic("%s: Failed to allocate %zu bytes\n", __func__, - pcpu_nr_slots * sizeof(pcpu_chunk_lists[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_chunk_lists[i]); @@ -3155,25 +3127,19 @@ void __init __weak pcpu_populate_pte(unsigned long addr) pmd_t *pmd; if (pgd_none(*pgd)) { - p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); - if (!p4d) - goto err_alloc; + p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE); pgd_populate(&init_mm, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { - pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - if (!pud) - goto err_alloc; + pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); p4d_populate(&init_mm, p4d, pud); } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); - if (!pmd) - goto err_alloc; + pmd = 
memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); pud_populate(&init_mm, pud, pmd); } @@ -3181,16 +3147,11 @@ void __init __weak pcpu_populate_pte(unsigned long addr) if (!pmd_present(*pmd)) { pte_t *new; - new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); - if (!new) - goto err_alloc; + new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmd, new); } return; - -err_alloc: - panic("%s: Failed to allocate memory\n", __func__); } /** @@ -3237,10 +3198,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); - if (!pages) - panic("%s: Failed to allocate %zu bytes\n", __func__, - pages_size); + pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES); /* allocate pages */ j = 0; diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c new file mode 100644 index 000000000000..7e9455a18aae --- /dev/null +++ b/mm/pt_reclaim.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "internal.h" + +bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details) +{ + return details && details->reclaim_pt && (end - start >= PMD_SIZE); +} + +bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval) +{ + spinlock_t *pml = pmd_lockptr(mm, pmd); + + if (!spin_trylock(pml)) + return false; + + *pmdval = pmdp_get_lockless(pmd); + pmd_clear(pmd); + spin_unlock(pml); + + return true; +} + +void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, + pmd_t pmdval) +{ + pte_free_tlb(tlb, pmd_pgtable(pmdval), addr); + mm_dec_nr_ptes(mm); +} + +void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, + struct mmu_gather *tlb) +{ + pmd_t pmdval; + spinlock_t *pml, *ptl = NULL; + pte_t *start_pte, *pte; + int i; + + pml = pmd_lock(mm, pmd); + start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl); + if (!start_pte) + goto out_ptl; + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + + /* Check if it is empty PTE page */ + for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(ptep_get(pte))) + goto out_ptl; + } + pte_unmap(start_pte); + + pmd_clear(pmd); + + if (ptl != pml) + spin_unlock(ptl); + spin_unlock(pml); + + free_pte(mm, addr, tlb, pmdval); + + return; +out_ptl: + if (start_pte) + pte_unmap_unlock(start_pte, ptl); + if (ptl != pml) + spin_unlock(pml); +} diff --git a/mm/readahead.c b/mm/readahead.c index e151f4b13ca4..6a4e96b69702 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -158,20 +158,10 @@ static void read_pages(struct readahead_control *rac) if (aops->readahead) { aops->readahead(rac); - /* - * Clean up the remaining folios. The sizes in ->ra - * may be used to size the next readahead, so make sure - * they accurately reflect what happened. - */ + /* Clean up the remaining folios. 
*/ while ((folio = readahead_folio(rac)) != NULL) { - unsigned long nr = folio_nr_pages(folio); - folio_get(folio); - rac->ra->size -= nr; - if (rac->ra->async_size >= nr) { - rac->ra->async_size -= nr; - filemap_remove_folio(folio); - } + filemap_remove_folio(folio); folio_unlock(folio); folio_put(folio); } @@ -188,6 +178,18 @@ static void read_pages(struct readahead_control *rac) BUG_ON(readahead_count(rac)); } +static struct folio *ractl_alloc_folio(struct readahead_control *ractl, + gfp_t gfp_mask, unsigned int order) +{ + struct folio *folio; + + folio = filemap_alloc_folio(gfp_mask, order); + if (folio && ractl->dropbehind) + __folio_set_dropbehind(folio); + + return folio; +} + /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. @@ -265,8 +267,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, continue; } - folio = filemap_alloc_folio(gfp_mask, - mapping_min_folio_order(mapping)); + folio = ractl_alloc_folio(ractl, gfp_mask, + mapping_min_folio_order(mapping)); if (!folio) break; @@ -436,7 +438,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; - struct folio *folio = filemap_alloc_folio(gfp, order); + struct folio *folio = ractl_alloc_folio(ractl, gfp, order); if (!folio) return -ENOMEM; @@ -458,7 +460,8 @@ void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) { struct address_space *mapping = ractl->mapping; - pgoff_t index = readahead_index(ractl); + pgoff_t start = readahead_index(ractl); + pgoff_t index = start; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; @@ -516,12 +519,18 @@ void page_cache_ra_order(struct readahead_control *ractl, /* * If there were already pages in the page cache, then we may have * left some gaps. Let the regular readahead code take care of this - * situation. + * situation below. */ if (!err) return; fallback: - do_page_cache_ra(ractl, ra->size, ra->async_size); + /* + * ->readahead() may have updated readahead window size so we have to + * check there's still something to read. 
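 * (Illustrative example, not part of this patch: if the request started
 *  at index 'start' with ra->size == 32 and the large folio allocation
 *  above already covered 16 pages, then index - start == 16 and only the
 *  remaining 16 pages are passed to do_page_cache_ra(); if the whole
 *  window was covered, nothing more is issued.)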
+ */ + if (ra->size > index - start) + do_page_cache_ra(ractl, ra->size - (index - start), + ra->async_size); } static unsigned long ractl_max_pages(struct readahead_control *ractl, @@ -754,7 +763,7 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, min_order); + folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; @@ -783,7 +792,7 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, min_order); + folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 6d783436951f..e7173fcd210c 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -12,7 +12,8 @@ #include #include -static const int rodata_test_data = 0xC3; +#define TEST_VALUE 0xC3 +static const int rodata_test_data = TEST_VALUE; void rodata_test(void) { @@ -20,7 +21,7 @@ void rodata_test(void) /* test 1: read the value */ /* If this test fails, some previous testrun has clobbered the state */ - if (!rodata_test_data) { + if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) { pr_err("test 1 fails (start data)\n"); return; } @@ -33,7 +34,7 @@ void rodata_test(void) } /* test 3: check the value hasn't changed */ - if (rodata_test_data == zero) { + if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) { pr_err("test data was changed\n"); return; } diff --git a/mm/shmem.c b/mm/shmem.c index fdb5afa1cfe9..44379bee5b96 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -553,38 +553,105 @@ static bool shmem_confirm_swap(struct address_space *mapping, /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; +static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; -static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - unsigned long vm_flags) +/** + * shmem_mapping_size_orders - Get allowable folio orders for the given file size. + * @mapping: Target address_space. + * @index: The page index. + * @write_end: end of a write, could extend inode size. + * + * This returns huge orders for folios (when supported) based on the file size + * which the mapping currently allows at the given index. The index is relevant + * due to alignment considerations the mapping might have. The returned order + * may be less than the size passed. + * + * Return: The orders. + */ +static inline unsigned int +shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end) { + unsigned int order; + size_t size; + + if (!mapping_large_folio_support(mapping) || !write_end) + return 0; + + /* Calculate the write size based on the write_end */ + size = write_end - (index << PAGE_SHIFT); + order = filemap_get_order(size); + if (!order) + return 0; + + /* If we're not aligned, allocate a smaller folio */ + if (index & ((1UL << order) - 1)) + order = __ffs(index); + + order = min_t(size_t, order, MAX_PAGECACHE_ORDER); + return order > 0 ? BIT(order + 1) - 1 : 0; +} + +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) +{ + unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 
+ 0 : BIT(HPAGE_PMD_ORDER); + unsigned long within_size_orders; + unsigned int order; + pgoff_t aligned_index; loff_t i_size; - if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) - return false; if (!S_ISREG(inode->i_mode)) - return false; + return 0; if (shmem_huge == SHMEM_HUGE_DENY) - return false; + return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) - return true; + return maybe_pmd_order; + /* + * The huge order allocation for anon shmem is controlled through + * the mTHP interface, so we still use PMD-sized huge order to + * check whether global control is enabled. + * + * For tmpfs mmap()'s huge order, we still use PMD-sized order to + * allocate huge pages due to lack of a write size hint. + * + * Otherwise, tmpfs will allow getting a highest order hint based on + * the size of write and fallocate paths, then will try each allowable + * huge orders. + */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: - return true; + if (vma) + return maybe_pmd_order; + + return shmem_mapping_size_orders(inode->i_mapping, index, write_end); case SHMEM_HUGE_WITHIN_SIZE: - index = round_up(index + 1, HPAGE_PMD_NR); - i_size = max(write_end, i_size_read(inode)); - i_size = round_up(i_size, PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) - return true; + if (vma) + within_size_orders = maybe_pmd_order; + else + within_size_orders = shmem_mapping_size_orders(inode->i_mapping, + index, write_end); + + order = highest_order(within_size_orders); + while (within_size_orders) { + aligned_index = round_up(index + 1, 1 << order); + i_size = max(write_end, i_size_read(inode)); + i_size = round_up(i_size, PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= aligned_index) + return within_size_orders; + + order = next_order(&within_size_orders, order); + } fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) - return true; + return maybe_pmd_order; fallthrough; default: - return false; + return 0; } } @@ -779,11 +846,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, return 0; } -static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - unsigned long vm_flags) +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) { - return false; + return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -1180,7 +1248,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); - if (shmem_huge_global_enabled(inode, 0, 0, false, 0)) + if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1690,22 +1758,18 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma ? vma->vm_flags : 0; pgoff_t aligned_index; - bool global_huge; + unsigned int global_orders; loff_t i_size; int order; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) return 0; - global_huge = shmem_huge_global_enabled(inode, index, write_end, - shmem_huge_force, vm_flags); - if (!vma || !vma_is_anon_shmem(vma)) { - /* - * For tmpfs, we now only support PMD sized THP if huge page - * is enabled, otherwise fallback to order 0. - */ - return global_huge ? 
BIT(HPAGE_PMD_ORDER) : 0; - } + global_orders = shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); + /* Tmpfs huge pages allocation */ + if (!vma || !vma_is_anon_shmem(vma)) + return global_orders; /* * Following the 'deny' semantics of the top level, force the huge @@ -1737,7 +1801,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_shmem_orders_madvise); - if (global_huge) + if (global_orders > 0) mask |= READ_ONCE(huge_shmem_orders_inherit); return THP_ORDERS_ALL_FILE_DEFAULT & mask; @@ -1903,6 +1967,65 @@ unlock: return ERR_PTR(error); } +static struct folio *shmem_swap_alloc_folio(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + swp_entry_t entry, int order, gfp_t gfp) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct folio *new; + void *shadow; + int nr_pages; + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success with further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } + + new = shmem_alloc_folio(gfp, order, info, index); + if (!new) + return ERR_PTR(-ENOMEM); + + nr_pages = folio_nr_pages(new); + if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, + gfp, entry)) { + folio_put(new); + return ERR_PTR(-ENOMEM); + } + + /* + * Prevent parallel swapin from proceeding with the swap cache flag. + * + * Of course there is another possible concurrent scenario as well, + * that is to say, the swap cache flag of a large folio has already + * been set by swapcache_prepare(), while another thread may have + * already split the large swap entry stored in the shmem mapping. + * In this case, shmem_add_to_page_cache() will help identify the + * concurrent swapin and return -EEXIST. + */ + if (swapcache_prepare(entry, nr_pages)) { + folio_put(new); + return ERR_PTR(-EEXIST); + } + + __folio_set_locked(new); + __folio_set_swapbacked(new); + new->swap = entry; + + mem_cgroup_swapin_uncharge_swap(entry, nr_pages); + shadow = get_shadow_from_swap_cache(entry); + if (shadow) + workingset_refault(new, shadow); + folio_add_lru(new); + swap_read_folio(new, NULL); + return new; +} + /* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of @@ -2006,7 +2129,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, - struct folio *folio, swp_entry_t swap) + struct folio *folio, swp_entry_t swap, + bool skip_swapcache) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; @@ -2022,7 +2146,8 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); - delete_from_swap_cache(folio); + if (!skip_swapcache) + delete_from_swap_cache(folio); /* * Don't treat swapin error folio as alloced. 
Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) @@ -2126,6 +2251,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct shmem_inode_info *info = SHMEM_I(inode); struct swap_info_struct *si; struct folio *folio = NULL; + bool skip_swapcache = false; swp_entry_t swap; int error, nr_pages; @@ -2147,6 +2273,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { + int order = xa_get_order(&mapping->i_pages, index); + bool fallback_order0 = false; int split_order; /* Or update major stats only when swapin succeeds?? */ @@ -2156,6 +2284,33 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(fault_mm, PGMAJFAULT); } + /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + */ + if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled())) + fallback_order0 = true; + + /* Skip swapcache for synchronous device. */ + if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); + if (!IS_ERR(folio)) { + skip_swapcache = true; + goto alloced; + } + + /* + * Fallback to swapin order-0 folio unless the swap entry + * already exists. + */ + error = PTR_ERR(folio); + folio = NULL; + if (error == -EEXIST) + goto failed; + } + /* * Now swap device can only swap in order 0 folio, then we * should split the large swap entry stored in the pagecache @@ -2186,9 +2341,10 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } } +alloced: /* We have to do this with folio locked to prevent races */ folio_lock(folio); - if (!folio_test_swapcache(folio) || + if ((!skip_swapcache && !folio_test_swapcache(folio)) || folio->swap.val != swap.val || !shmem_confirm_swap(mapping, index, swap)) { error = -EEXIST; @@ -2224,7 +2380,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) folio_mark_accessed(folio); - delete_from_swap_cache(folio); + if (skip_swapcache) { + folio->swap.val = 0; + swapcache_clear(si, swap, nr_pages); + } else { + delete_from_swap_cache(folio); + } folio_mark_dirty(folio); swap_free_nr(swap, nr_pages); put_swap_device(si); @@ -2235,8 +2396,11 @@ failed: if (!shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; if (error == -EIO) - shmem_set_folio_swapin_error(inode, index, folio, swap); + shmem_set_folio_swapin_error(inode, index, folio, swap, + skip_swapcache); unlock: + if (skip_swapcache) + swapcache_clear(si, swap, folio_nr_pages(folio)); if (folio) { folio_unlock(folio); folio_put(folio); @@ -2752,12 +2916,6 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); - int ret; - - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ @@ -4583,48 +4741,37 @@ bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } -static int shmem_parse_options(struct fs_context *fc, void *data) +static char *shmem_next_opt(char **s) { - char *options = data; + char *sbegin = *s; + char *p; - if (options) { - int err = security_sb_eat_lsm_opts(options, &fc->security); - if (err) - 
return err; - } + if (sbegin == NULL) + return NULL; - while (options != NULL) { - char *this_char = options; - for (;;) { - /* - * NUL-terminate this option: unfortunately, - * mount options form a comma-separated list, - * but mpol's nodelist may also contain commas. - */ - options = strchr(options, ','); - if (options == NULL) - break; - options++; - if (!isdigit(*options)) { - options[-1] = '\0'; - break; - } - } - if (*this_char) { - char *value = strchr(this_char, '='); - size_t len = 0; - int err; - - if (value) { - *value++ = '\0'; - len = strlen(value); - } - err = vfs_parse_fs_string(fc, this_char, value, len); - if (err < 0) - return err; + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + for (;;) { + p = strchr(*s, ','); + if (p == NULL) + break; + *s = p + 1; + if (!isdigit(*(p+1))) { + *p = '\0'; + return sbegin; } } - return 0; + + *s = NULL; + return sbegin; +} + +static int shmem_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* @@ -4891,7 +5038,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; - sbinfo->huge = ctx->huge; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + else + sbinfo->huge = tmpfs_huge; +#endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; @@ -4969,7 +5121,7 @@ static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS - .parse_monolithic = shmem_parse_options, + .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif @@ -5442,6 +5594,21 @@ static int __init setup_transparent_hugepage_shmem(char *str) } __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); +static int __init setup_transparent_hugepage_tmpfs(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge < 0) { + pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); + return huge; + } + + tmpfs_huge = huge; + return 1; +} +__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); + static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_shmem(char *str) { diff --git a/mm/slub.c b/mm/slub.c index c2151c9fee22..996691c137eb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2311,7 +2311,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, * We have to do this manually because the rcu_head is * not located inside the object. 
*/ - kasan_record_aux_stack_noalloc(x); + kasan_record_aux_stack(x); delayed_free->object = x; call_rcu(&delayed_free->head, slab_free_after_rcu_debug); @@ -2420,17 +2420,15 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, unsigned int order = oo_order(oo); if (node == NUMA_NO_NODE) - folio = (struct folio *)alloc_pages(flags, order); + folio = (struct folio *)alloc_frozen_pages(flags, order); else - folio = (struct folio *)__alloc_pages_node(node, flags, order); + folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); if (!folio) return NULL; slab = folio_slab(folio); __folio_set_slab(folio); - /* Make the flag visible before any changes to folio->mapping */ - smp_wmb(); if (folio_is_pfmemalloc(folio)) slab_set_pfmemalloc(slab); @@ -2651,12 +2649,10 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) __slab_clear_pfmemalloc(slab); folio->mapping = NULL; - /* Make the mapping reset visible before clearing the flag */ - smp_wmb(); __folio_clear_slab(folio); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - __free_pages(&folio->page, order); + free_frozen_pages(&folio->page, order); } static void rcu_free_slab(struct rcu_head *h) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index cec67c5f37d8..3287ebadd167 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -31,6 +31,8 @@ #include #include +#include "internal.h" + /* * Allocate a block of memory to be used to back the virtual memory map * or to back the page tables that are used to create the mapping. @@ -42,8 +44,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return memblock_alloc_try_nid_raw(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, node); + return memmap_alloc(size, align, goal, node, false); } void * __meminit vmemmap_alloc_block(unsigned long size, int node) diff --git a/mm/sparse.c b/mm/sparse.c index 13b6624d3562..133b033d0cba 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -257,10 +257,7 @@ static void __init memblocks_present(void) size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_alloc(size, align); - if (!mem_section) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, align); + mem_section = memblock_alloc_or_panic(size, align); } #endif diff --git a/mm/swap.c b/mm/swap.c index 10decd9dffa1..fc8281ef4241 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -109,7 +109,7 @@ void __folio_put(struct folio *folio) page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, folio_order(folio)); + free_frozen_pages(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); @@ -379,37 +379,58 @@ static void __lru_cache_activate_folio(struct folio *folio) } #ifdef CONFIG_LRU_GEN -static void folio_inc_refs(struct folio *folio) + +static void lru_gen_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { - folio_set_referenced(folio); + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); return; } - if (!folio_test_workingset(folio)) { - folio_set_workingset(folio); - return; - } - - /* see the comment on MAX_NR_TIERS */ do { - new_flags = old_flags & LRU_REFS_MASK; - if (new_flags == LRU_REFS_MASK) - break; + if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) { + if 
(!folio_test_workingset(folio)) + folio_set_workingset(folio); + return; + } - new_flags += BIT(LRU_REFS_PGOFF); - new_flags |= old_flags & ~LRU_REFS_MASK; + new_flags = old_flags + BIT(LRU_REFS_PGOFF); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } -#else -static void folio_inc_refs(struct folio *folio) + +static bool lru_gen_clear_refs(struct folio *folio) +{ + struct lru_gen_folio *lrugen; + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + + if (gen < 0) + return true; + + set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); + + lrugen = &folio_lruvec(folio)->lrugen; + /* whether can do without shuffling under the LRU lock */ + return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); +} + +#else /* !CONFIG_LRU_GEN */ + +static void lru_gen_inc_refs(struct folio *folio) { } + +static bool lru_gen_clear_refs(struct folio *folio) +{ + return false; +} + #endif /* CONFIG_LRU_GEN */ /** @@ -427,8 +448,10 @@ static void folio_inc_refs(struct folio *folio) */ void folio_mark_accessed(struct folio *folio) { + if (folio_test_dropbehind(folio)) + return; if (lru_gen_enabled()) { - folio_inc_refs(folio); + lru_gen_inc_refs(folio); return; } @@ -474,7 +497,7 @@ void folio_add_lru(struct folio *folio) folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - /* see the comment in lru_gen_add_folio() */ + /* see the comment in lru_gen_folio_seq() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); @@ -524,7 +547,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) */ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { - bool active = folio_test_active(folio); + bool active = folio_test_active(folio) || lru_gen_enabled(); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) @@ -589,7 +612,10 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) lruvec_del_folio(lruvec, folio); folio_clear_active(folio); - folio_clear_referenced(folio); + if (lru_gen_enabled()) + lru_gen_clear_refs(folio); + else + folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal @@ -657,6 +683,9 @@ void deactivate_file_folio(struct folio *folio) if (folio_test_unevictable(folio)) return; + if (lru_gen_enabled() && lru_gen_clear_refs(folio)) + return; + folio_batch_add_and_move(folio, lru_deactivate_file, true); } @@ -670,7 +699,10 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) + if (folio_test_unevictable(folio)) + return; + + if (lru_gen_enabled() ? 
lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; folio_batch_add_and_move(folio, lru_deactivate, true); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index da1278f0563b..be39078f255b 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -6,149 +6,106 @@ #include /* depends on mm.h include */ static DEFINE_MUTEX(swap_cgroup_mutex); + +/* Pack two cgroup id (short) of two entries in one swap_cgroup (atomic_t) */ +#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short)) +#define ID_SHIFT (BITS_PER_TYPE(unsigned short)) +#define ID_MASK (BIT(ID_SHIFT) - 1) +struct swap_cgroup { + atomic_t ids; +}; + struct swap_cgroup_ctrl { - struct page **map; - unsigned long length; - spinlock_t lock; + struct swap_cgroup *map; }; static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; -struct swap_cgroup { - unsigned short id; -}; -#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) - -/* - * SwapCgroup implements "lookup" and "exchange" operations. - * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge - * against SwapCache. At swap_free(), this is accessed directly from swap. - * - * This means, - * - we have no race in "exchange" when we're accessed via SwapCache because - * SwapCache(and its swp_entry) is under lock. - * - When called via swap_free(), there is no user of this entry and no race. - * Then, we don't need lock around "exchange". - * - * TODO: we can push these buffers out to HIGHMEM. - */ - -/* - * allocate buffer for swap_cgroup. - */ -static int swap_cgroup_prepare(int type) +static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map, + pgoff_t offset) { - struct page *page; - struct swap_cgroup_ctrl *ctrl; - unsigned long idx, max; + unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; + unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids); - ctrl = &swap_cgroup_ctrl[type]; + BUILD_BUG_ON(!is_power_of_2(ID_PER_SC)); + BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t)); - for (idx = 0; idx < ctrl->length; idx++) { - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto not_enough_page; - ctrl->map[idx] = page; - - if (!(idx % SWAP_CLUSTER_MAX)) - cond_resched(); - } - return 0; -not_enough_page: - max = idx; - for (idx = 0; idx < max; idx++) - __free_page(ctrl->map[idx]); - - return -ENOMEM; + return (old_ids >> shift) & ID_MASK; } -static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, - pgoff_t offset) +static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, + pgoff_t offset, + unsigned short new_id) { - struct page *mappage; - struct swap_cgroup *sc; + unsigned short old_id; + struct swap_cgroup *sc = &map[offset / ID_PER_SC]; + unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; + unsigned int new_ids, old_ids = atomic_read(&sc->ids); - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; -} + do { + old_id = (old_ids >> shift) & ID_MASK; + new_ids = (old_ids & ~(ID_MASK << shift)); + new_ids |= ((unsigned int)new_id) << shift; + } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids)); -static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, - struct swap_cgroup_ctrl **ctrlp) -{ - pgoff_t offset = swp_offset(ent); - struct swap_cgroup_ctrl *ctrl; - - ctrl = &swap_cgroup_ctrl[swp_type(ent)]; - if (ctrlp) - *ctrlp = ctrl; - return __lookup_swap_cgroup(ctrl, offset); + return old_id; } /** - * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. 
- * @ent: swap entry to be cmpxchged - * @old: old id - * @new: new id + * swap_cgroup_record - record mem_cgroup for a set of swap entries. + * These entries must belong to one single folio, and that folio + * must be being charged for swap space (swap out), and these + * entries must not have been charged * - * Returns old id at success, 0 at failure. - * (There is no mem_cgroup using 0 as its id) + * @folio: the folio that the swap entry belongs to + * @ent: the first swap entry to be recorded */ -unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new) +void swap_cgroup_record(struct folio *folio, swp_entry_t ent) { - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned long flags; - unsigned short retval; + unsigned int nr_ents = folio_nr_pages(folio); + struct swap_cgroup *map; + pgoff_t offset, end; + unsigned short old; - sc = lookup_swap_cgroup(ent, &ctrl); + offset = swp_offset(ent); + end = offset + nr_ents; + map = swap_cgroup_ctrl[swp_type(ent)].map; - spin_lock_irqsave(&ctrl->lock, flags); - retval = sc->id; - if (retval == old) - sc->id = new; - else - retval = 0; - spin_unlock_irqrestore(&ctrl->lock, flags); - return retval; + do { + old = __swap_cgroup_id_xchg(map, offset, + mem_cgroup_id(folio_memcg(folio))); + VM_BUG_ON(old); + } while (++offset != end); } /** - * swap_cgroup_record - record mem_cgroup for a set of swap entries + * swap_cgroup_clear - clear mem_cgroup for a set of swap entries. + * These entries must be being uncharged from swap. They either + * belongs to one single folio in the swap cache (swap in for + * cgroup v1), or no longer have any users (slot freeing). + * * @ent: the first swap entry to be recorded into - * @id: mem_cgroup to be recorded * @nr_ents: number of swap entries to be recorded * - * Returns old value at success, 0 at failure. - * (Of course, old value can be 0.) + * Returns the existing old value. 
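 * (Illustrative note, not part of this patch: with two 16-bit ids packed
 *  per atomic_t, clearing offsets 4 and 5 touches the same map[2] word;
 *  offset 5's id sits in bits 16..31 of that word and is atomically
 *  exchanged to 0 by __swap_cgroup_id_xchg().)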
*/ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents) +unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) { - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned short old; - unsigned long flags; pgoff_t offset = swp_offset(ent); pgoff_t end = offset + nr_ents; + struct swap_cgroup *map; + unsigned short old, iter = 0; - sc = lookup_swap_cgroup(ent, &ctrl); + offset = swp_offset(ent); + end = offset + nr_ents; + map = swap_cgroup_ctrl[swp_type(ent)].map; - spin_lock_irqsave(&ctrl->lock, flags); - old = sc->id; - for (;;) { - VM_BUG_ON(sc->id != old); - sc->id = id; - offset++; - if (offset == end) - break; - if (offset % SC_PER_PAGE) - sc++; - else - sc = __lookup_swap_cgroup(ctrl, offset); - } - spin_unlock_irqrestore(&ctrl->lock, flags); + do { + old = __swap_cgroup_id_xchg(map, offset, 0); + if (!iter) + iter = old; + VM_BUG_ON(iter != old); + } while (++offset != end); return old; } @@ -161,39 +118,33 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { - if (mem_cgroup_disabled()) - return 0; - return lookup_swap_cgroup(ent, NULL)->id; -} - -int swap_cgroup_swapon(int type, unsigned long max_pages) -{ - void *array; - unsigned long length; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return 0; - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent)); +} - array = vcalloc(length, sizeof(void *)); - if (!array) +int swap_cgroup_swapon(int type, unsigned long max_pages) +{ + struct swap_cgroup *map; + struct swap_cgroup_ctrl *ctrl; + + if (mem_cgroup_disabled()) + return 0; + + BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC != + sizeof(struct swap_cgroup)); + map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) * + sizeof(struct swap_cgroup)); + if (!map) goto nomem; ctrl = &swap_cgroup_ctrl[type]; mutex_lock(&swap_cgroup_mutex); - ctrl->length = length; - ctrl->map = array; - spin_lock_init(&ctrl->lock); - if (swap_cgroup_prepare(type)) { - /* memory shortage */ - ctrl->map = NULL; - ctrl->length = 0; - mutex_unlock(&swap_cgroup_mutex); - vfree(array); - goto nomem; - } + ctrl->map = map; mutex_unlock(&swap_cgroup_mutex); return 0; @@ -205,8 +156,7 @@ nomem: void swap_cgroup_swapoff(int type) { - struct page **map; - unsigned long i, length; + struct swap_cgroup *map; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) @@ -215,19 +165,8 @@ void swap_cgroup_swapoff(int type) mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; - length = ctrl->length; ctrl->map = NULL; - ctrl->length = 0; mutex_unlock(&swap_cgroup_mutex); - if (map) { - for (i = 0; i < length; i++) { - struct page *page = map[i]; - if (page) - __free_page(page); - if (!(i % SWAP_CLUSTER_MAX)) - cond_resched(); - } - vfree(map); - } + vfree(map); } diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 13ab3b771409..9c7c171df7ba 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -43,17 +43,15 @@ static DEFINE_MUTEX(swap_slots_cache_mutex); /* Serialize swap slots cache enable/disable operations */ static DEFINE_MUTEX(swap_slots_cache_enable_mutex); -static void __drain_swap_slots_cache(unsigned int type); +static void __drain_swap_slots_cache(void); #define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled) -#define SLOTS_CACHE 0x1 -#define SLOTS_CACHE_RET 0x2 static void deactivate_swap_slots_cache(void) 
{ mutex_lock(&swap_slots_cache_mutex); swap_slot_cache_active = false; - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + __drain_swap_slots_cache(); mutex_unlock(&swap_slots_cache_mutex); } @@ -72,7 +70,7 @@ void disable_swap_slots_cache_lock(void) if (swap_slot_cache_initialized) { /* serialize with cpu hotplug operations */ cpus_read_lock(); - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + __drain_swap_slots_cache(); cpus_read_unlock(); } } @@ -113,7 +111,7 @@ out: static int alloc_swap_slot_cache(unsigned int cpu) { struct swap_slots_cache *cache; - swp_entry_t *slots, *slots_ret; + swp_entry_t *slots; /* * Do allocation outside swap_slots_cache_mutex @@ -125,28 +123,19 @@ static int alloc_swap_slot_cache(unsigned int cpu) if (!slots) return -ENOMEM; - slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), - GFP_KERNEL); - if (!slots_ret) { - kvfree(slots); - return -ENOMEM; - } - mutex_lock(&swap_slots_cache_mutex); cache = &per_cpu(swp_slots, cpu); - if (cache->slots || cache->slots_ret) { + if (cache->slots) { /* cache already allocated */ mutex_unlock(&swap_slots_cache_mutex); kvfree(slots); - kvfree(slots_ret); return 0; } if (!cache->lock_initialized) { mutex_init(&cache->alloc_lock); - spin_lock_init(&cache->free_lock); cache->lock_initialized = true; } cache->nr = 0; @@ -160,19 +149,16 @@ static int alloc_swap_slot_cache(unsigned int cpu) */ mb(); cache->slots = slots; - cache->slots_ret = slots_ret; mutex_unlock(&swap_slots_cache_mutex); return 0; } -static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, - bool free_slots) +static void drain_slots_cache_cpu(unsigned int cpu, bool free_slots) { struct swap_slots_cache *cache; - swp_entry_t *slots = NULL; cache = &per_cpu(swp_slots, cpu); - if ((type & SLOTS_CACHE) && cache->slots) { + if (cache->slots) { mutex_lock(&cache->alloc_lock); swapcache_free_entries(cache->slots + cache->cur, cache->nr); cache->cur = 0; @@ -183,20 +169,9 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, } mutex_unlock(&cache->alloc_lock); } - if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { - spin_lock_irq(&cache->free_lock); - swapcache_free_entries(cache->slots_ret, cache->n_ret); - cache->n_ret = 0; - if (free_slots && cache->slots_ret) { - slots = cache->slots_ret; - cache->slots_ret = NULL; - } - spin_unlock_irq(&cache->free_lock); - kvfree(slots); - } } -static void __drain_swap_slots_cache(unsigned int type) +static void __drain_swap_slots_cache(void) { unsigned int cpu; @@ -224,13 +199,13 @@ static void __drain_swap_slots_cache(unsigned int type) * There are no slots on such cpu that need to be drained. */ for_each_online_cpu(cpu) - drain_slots_cache_cpu(cpu, type, false); + drain_slots_cache_cpu(cpu, false); } static int free_slot_cache(unsigned int cpu) { mutex_lock(&swap_slots_cache_mutex); - drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); + drain_slots_cache_cpu(cpu, true); mutex_unlock(&swap_slots_cache_mutex); return 0; } @@ -269,39 +244,6 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) return cache->nr; } -void free_swap_slot(swp_entry_t entry) -{ - struct swap_slots_cache *cache; - - /* Large folio swap slot is not covered. 
*/ - zswap_invalidate(entry); - - cache = raw_cpu_ptr(&swp_slots); - if (likely(use_swap_slot_cache && cache->slots_ret)) { - spin_lock_irq(&cache->free_lock); - /* Swap slots cache may be deactivated before acquiring lock */ - if (!use_swap_slot_cache || !cache->slots_ret) { - spin_unlock_irq(&cache->free_lock); - goto direct_free; - } - if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { - /* - * Return slots to global pool. - * The current swap_map value is SWAP_HAS_CACHE. - * Set it to 0 to indicate it is available for - * allocation in global pool - */ - swapcache_free_entries(cache->slots_ret, cache->n_ret); - cache->n_ret = 0; - } - cache->slots_ret[cache->n_ret++] = entry; - spin_unlock_irq(&cache->free_lock); - } else { -direct_free: - swapcache_free_entries(&entry, 1); - } -} - swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; diff --git a/mm/swap_state.c b/mm/swap_state.c index e0c0321b8ff7..ca42b2be64d9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) struct folio_batch folios; unsigned int refs[PAGEVEC_SIZE]; - lru_add_drain(); folio_batch_init(&folios); for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); diff --git a/mm/swapfile.c b/mm/swapfile.c index b0a9071cfe1d..d623f5b6dc4c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,15 +53,15 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, - unsigned int nr_pages); -static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, +static void swap_entry_range_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages); +static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); -static struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, unsigned long offset); -static void unlock_cluster_or_swap_info(struct swap_info_struct *si, - struct swap_cluster_info *ci); +static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset); +static inline void unlock_cluster(struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -129,6 +129,26 @@ static inline unsigned char swap_count(unsigned char ent) return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } +/* + * Use the second highest bit of inuse_pages counter as the indicator + * of if one swap device is on the available plist, so the atomic can + * still be updated arithmetic while having special data embedded. + * + * inuse_pages counter is the only thing indicating if a device should + * be on avail_lists or not (except swapon / swapoff). By embedding the + * on-list bit in the atomic counter, updates no longer need any lock + * to check the list status. + * + * This bit will be set if the device is not on the plist and not + * usable, will be cleared if the device is on the plist. 
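 * (Illustrative note, not part of this patch: with a 32-bit atomic_t the
 *  bit is 1 << 30, so a device that is off the plist with 100 pages in
 *  use stores 0x40000064 in inuse_pages, while swap_usage_in_pages()
 *  masks the bit off and still reads back 100.)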
+ */ +#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2)) +#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT) +static long swap_usage_in_pages(struct swap_info_struct *si) +{ + return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK; +} + /* Reclaim the swap entry anyway if possible */ #define TTRS_ANYWAY 0x1 /* @@ -222,9 +242,9 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. */ - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); need_reclaim = swap_is_has_cache(si, offset, nr_pages); - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); if (!need_reclaim) goto out_unlock; @@ -242,12 +262,9 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, folio_ref_sub(folio, nr_pages); folio_set_dirty(folio); - spin_lock(&si->lock); - /* Only sinple page folio can be backed by zswap */ - if (nr_pages == 1) - zswap_invalidate(entry); - swap_entry_range_free(si, entry, nr_pages); - spin_unlock(&si->lock); + ci = lock_cluster(si, offset); + swap_entry_range_free(si, ci, entry, nr_pages); + unlock_cluster(ci); ret = nr_pages; out_unlock: folio_unlock(folio); @@ -384,7 +401,21 @@ static void discard_swap_cluster(struct swap_info_struct *si, static inline bool cluster_is_free(struct swap_cluster_info *info) { - return info->flags & CLUSTER_FLAG_FREE; + return info->count == 0; +} + +static inline bool cluster_is_discard(struct swap_cluster_info *info) +{ + return info->flags == CLUSTER_FLAG_DISCARD; +} + +static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) +{ + if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) + return false; + if (!order) + return true; + return cluster_is_free(ci) || order == ci->order; } static inline unsigned int cluster_index(struct swap_info_struct *si, @@ -393,6 +424,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, return ci - si->cluster_info; } +static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, + unsigned long offset) +{ + return &si->cluster_info[offset / SWAPFILE_CLUSTER]; +} + static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { @@ -404,45 +441,37 @@ static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si { struct swap_cluster_info *ci; - ci = si->cluster_info; - if (ci) { - ci += offset / SWAPFILE_CLUSTER; - spin_lock(&ci->lock); - } + ci = offset_to_cluster(si, offset); + spin_lock(&ci->lock); + return ci; } static inline void unlock_cluster(struct swap_cluster_info *ci) { - if (ci) - spin_unlock(&ci->lock); + spin_unlock(&ci->lock); } -/* - * Determine the locking method in use for this device. Return - * swap_cluster_info if SSD-style cluster-based locking is in place. 
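
Editorial note: the SWAP_USAGE_OFFLIST_BIT scheme added earlier in this hunk folds an "off the avail plist" flag into the same atomic counter that tracks in-use slots, so usage updates and list-membership checks need no extra lock. Below is a minimal, standalone C11 sketch of that encoding. The struct swap_dev, usage_add()/usage_sub()/usage_in_pages() helpers and main() are invented for illustration (they only mirror the patch's swap_usage_* helpers in spirit); the real code additionally serializes the actual list moves under swap_avail_lock and uses cmpxchg to resolve races, which this model omits.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* A high bit of the counter marks "device is off the avail list". The patch
 * uses the second-highest bit of the counter type; the exact position does
 * not matter for this model. */
#define OFFLIST_BIT  (1L << (sizeof(long) * 8 - 2))
#define COUNTER_MASK (~OFFLIST_BIT)

struct swap_dev {
	long pages;               /* total usable slots, fixed after setup */
	atomic_long inuse_pages;  /* low bits: used slots; OFFLIST_BIT: list state */
};

static long usage_in_pages(struct swap_dev *d)
{
	return atomic_load(&d->inuse_pages) & COUNTER_MASK;
}

/* Returns true when this increment made the device full, so the caller
 * should take it off the avail list. */
static bool usage_add(struct swap_dev *d, long nr)
{
	return atomic_fetch_add(&d->inuse_pages, nr) + nr == d->pages;
}

/* Returns true when slots were freed while the device was flagged off-list,
 * so the caller should put it back on the avail list. */
static bool usage_sub(struct swap_dev *d, long nr)
{
	return (atomic_fetch_sub(&d->inuse_pages, nr) - nr) & OFFLIST_BIT;
}

int main(void)
{
	struct swap_dev d = { .pages = 4 };

	atomic_init(&d.inuse_pages, 0);
	usage_add(&d, 3);
	printf("in use: %ld\n", usage_in_pages(&d));
	if (usage_add(&d, 1)) {
		/* Device just became full: model del_from_avail_list(). */
		atomic_fetch_or(&d.inuse_pages, OFFLIST_BIT);
		printf("full, taken off the avail list\n");
	}
	if (usage_sub(&d, 2)) {
		/* Slots freed while off-list: model add_to_avail_list(). */
		atomic_fetch_and(&d.inuse_pages, COUNTER_MASK);
		printf("back on the avail list (%ld in use)\n", usage_in_pages(&d));
	}
	return 0;
}
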
- */ -static inline struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, unsigned long offset) +static void cluster_move(struct swap_info_struct *si, + struct swap_cluster_info *ci, struct list_head *list, + enum swap_cluster_flags new_flags) { - struct swap_cluster_info *ci; + VM_WARN_ON(ci->flags == new_flags); + BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX); + lockdep_assert_held(&ci->lock); - /* Try to use fine-grained SSD-style locking if available: */ - ci = lock_cluster(si, offset); - /* Otherwise, fall back to traditional, coarse locking: */ - if (!ci) - spin_lock(&si->lock); - - return ci; -} - -static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, - struct swap_cluster_info *ci) -{ - if (ci) - unlock_cluster(ci); + spin_lock(&si->lock); + if (ci->flags == CLUSTER_FLAG_NONE) + list_add_tail(&ci->list, list); else - spin_unlock(&si->lock); + list_move_tail(&ci->list, list); + spin_unlock(&si->lock); + + if (ci->flags == CLUSTER_FLAG_FRAG) + atomic_long_dec(&si->frag_cluster_nr[ci->order]); + else if (new_flags == CLUSTER_FLAG_FRAG) + atomic_long_inc(&si->frag_cluster_nr[ci->order]); + ci->flags = new_flags; } /* Add a cluster to discard list and schedule it to do discard */ @@ -458,51 +487,97 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, */ memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); - - VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); - list_move_tail(&ci->list, &si->discard_clusters); - ci->flags = 0; + VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); + cluster_move(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD); schedule_work(&si->discard_work); } static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - lockdep_assert_held(&si->lock); lockdep_assert_held(&ci->lock); - - if (ci->flags) - list_move_tail(&ci->list, &si->free_clusters); - else - list_add_tail(&ci->list, &si->free_clusters); - ci->flags = CLUSTER_FLAG_FREE; + cluster_move(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; } +/* + * Isolate and lock the first cluster that is not contented on a list, + * clean its flag before taken off-list. Cluster flag must be in sync + * with list status, so cluster updaters can always know the cluster + * list status without touching si lock. + * + * Note it's possible that all clusters on a list are contented so + * this returns NULL for an non-empty list. + */ +static struct swap_cluster_info *cluster_isolate_lock( + struct swap_info_struct *si, struct list_head *list) +{ + struct swap_cluster_info *ci, *ret = NULL; + + spin_lock(&si->lock); + + if (unlikely(!(si->flags & SWP_WRITEOK))) + goto out; + + list_for_each_entry(ci, list, list) { + if (!spin_trylock(&ci->lock)) + continue; + + /* We may only isolate and clear flags of following lists */ + VM_BUG_ON(!ci->flags); + VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE && + ci->flags != CLUSTER_FLAG_FULL); + + list_del(&ci->list); + ci->flags = CLUSTER_FLAG_NONE; + ret = ci; + break; + } +out: + spin_unlock(&si->lock); + + return ret; +} + /* * Doing discard actually. After a cluster discard is finished, the cluster - * will be added to free cluster list. caller should hold si->lock. -*/ -static void swap_do_scheduled_discard(struct swap_info_struct *si) + * will be added to free cluster list. 
Discard cluster is a bit special as + * they don't participate in allocation or reclaim, so clusters marked as + * CLUSTER_FLAG_DISCARD must remain off-list or on discard list. + */ +static bool swap_do_scheduled_discard(struct swap_info_struct *si) { struct swap_cluster_info *ci; + bool ret = false; unsigned int idx; + spin_lock(&si->lock); while (!list_empty(&si->discard_clusters)) { ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); + /* + * Delete the cluster from list but don't clear its flags until + * discard is done, so isolation and relocation will skip it. + */ list_del(&ci->list); idx = cluster_index(si, ci); spin_unlock(&si->lock); - discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); - spin_lock(&si->lock); spin_lock(&ci->lock); - __free_cluster(si, ci); + /* + * Discard is done, clear its flags as it's now off-list, + * then return the cluster to allocation list. + */ + ci->flags = CLUSTER_FLAG_NONE; memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); + __free_cluster(si, ci); spin_unlock(&ci->lock); + ret = true; + spin_lock(&si->lock); } + spin_unlock(&si->lock); + return ret; } static void swap_discard_work(struct work_struct *work) @@ -511,9 +586,7 @@ static void swap_discard_work(struct work_struct *work) si = container_of(work, struct swap_info_struct, discard_work); - spin_lock(&si->lock); swap_do_scheduled_discard(si); - spin_unlock(&si->lock); } static void swap_users_ref_free(struct percpu_ref *ref) @@ -524,15 +597,16 @@ static void swap_users_ref_free(struct percpu_ref *ref) complete(&si->comp); } +/* + * Must be called after freeing if ci->count == 0, moves the cluster to free + * or discard list. + */ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { VM_BUG_ON(ci->count != 0); - lockdep_assert_held(&si->lock); + VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); lockdep_assert_held(&ci->lock); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed @@ -547,6 +621,48 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * __free_cluster(si, ci); } +/* + * Must be called after freeing if ci->count != 0, moves the cluster to + * nonfull list. + */ +static void partial_free_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER); + lockdep_assert_held(&ci->lock); + + if (ci->flags != CLUSTER_FLAG_NONFULL) + cluster_move(si, ci, &si->nonfull_clusters[ci->order], + CLUSTER_FLAG_NONFULL); +} + +/* + * Must be called after allocation, moves the cluster to full or frag list. + * Note: allocation doesn't acquire si lock, and may drop the ci lock for + * reclaim, so the cluster could be any where when called. + */ +static void relocate_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + lockdep_assert_held(&ci->lock); + + /* Discard cluster must remain off-list or on discard list */ + if (cluster_is_discard(ci)) + return; + + if (!ci->count) { + free_cluster(si, ci); + } else if (ci->count != SWAPFILE_CLUSTER) { + if (ci->flags != CLUSTER_FLAG_FRAG) + cluster_move(si, ci, &si->frag_clusters[ci->order], + CLUSTER_FLAG_FRAG); + } else { + if (ci->flags != CLUSTER_FLAG_FULL) + cluster_move(si, ci, &si->full_clusters, + CLUSTER_FLAG_FULL); + } +} + /* * The cluster corresponding to page_nr will be used. 
The cluster will not be * added to free cluster list and its usage counter will be increased by 1. @@ -558,9 +674,6 @@ static void inc_cluster_info_page(struct swap_info_struct *si, unsigned long idx = page_nr / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; - if (!cluster_info) - return; - ci = cluster_info + idx; ci->count++; @@ -568,63 +681,33 @@ static void inc_cluster_info_page(struct swap_info_struct *si, VM_BUG_ON(ci->flags); } -/* - * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, - * which means no page in the cluster is in use, we can optionally discard - * the cluster and add it to free cluster list. - */ -static void dec_cluster_info_page(struct swap_info_struct *si, - struct swap_cluster_info *ci, int nr_pages) -{ - if (!si->cluster_info) - return; - - VM_BUG_ON(ci->count < nr_pages); - VM_BUG_ON(cluster_is_free(ci)); - lockdep_assert_held(&si->lock); - lockdep_assert_held(&ci->lock); - ci->count -= nr_pages; - - if (!ci->count) { - free_cluster(si, ci); - return; - } - - if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { - VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]); - ci->flags = CLUSTER_FLAG_NONFULL; - } -} - static bool cluster_reclaim_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long start, unsigned long end) { unsigned char *map = si->swap_map; - unsigned long offset; + unsigned long offset = start; + int nr_reclaim; spin_unlock(&ci->lock); - spin_unlock(&si->lock); - - for (offset = start; offset < end; offset++) { + do { switch (READ_ONCE(map[offset])) { case 0: - continue; + offset++; + break; case SWAP_HAS_CACHE: - if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) - continue; - goto out; + nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + if (nr_reclaim > 0) + offset += nr_reclaim; + else + goto out; + break; default: goto out; } - } + } while (offset < end); out: - spin_lock(&si->lock); spin_lock(&ci->lock); - /* * Recheck the range no matter reclaim succeeded or not, the slot * could have been be freed while we are not holding the lock. 
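
Editorial note: with the per-cluster locking introduced in the hunks above, a cluster's list membership is decided purely by its own count and flags: empty clusters go back to the free (or discard) list, partially used ones to nonfull, and clusters touched by an allocator end up on the frag or full lists. The kernel-free sketch below models that decision; CLUSTER_SLOTS, the enum values and the after_free()/after_alloc() helpers are illustrative names only, loosely following partial_free_cluster(), free_cluster() and relocate_cluster().

#include <stdio.h>

#define CLUSTER_SLOTS 512   /* stand-in for SWAPFILE_CLUSTER */

enum cluster_list { LIST_NONE, LIST_FREE, LIST_NONFULL, LIST_FRAG, LIST_FULL, LIST_DISCARD };

struct cluster {
	int count;              /* allocated slots in this cluster */
	enum cluster_list list; /* which list the cluster currently sits on */
};

/* After freeing slots: empty clusters become free, partial ones nonfull. */
static void after_free(struct cluster *c)
{
	c->list = c->count ? LIST_NONFULL : LIST_FREE;
}

/* After an allocation attempt: mirrors relocate_cluster()'s three-way choice. */
static void after_alloc(struct cluster *c)
{
	if (c->list == LIST_DISCARD)        /* discarding clusters stay put */
		return;
	if (c->count == 0)
		c->list = LIST_FREE;
	else if (c->count == CLUSTER_SLOTS)
		c->list = LIST_FULL;
	else
		c->list = LIST_FRAG;        /* partially used by an allocator */
}

int main(void)
{
	struct cluster c = { .count = 0, .list = LIST_FREE };

	c.count += 1;            after_alloc(&c);  /* -> frag */
	c.count = CLUSTER_SLOTS; after_alloc(&c);  /* -> full */
	c.count -= 10;           after_free(&c);   /* -> nonfull */
	printf("final list: %d (2 = nonfull)\n", c.list);
	return 0;
}
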
@@ -638,11 +721,11 @@ out: static bool cluster_scan_range(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long start, unsigned int nr_pages) + unsigned long start, unsigned int nr_pages, + bool *need_reclaim) { unsigned long offset, end = start + nr_pages; unsigned char *map = si->swap_map; - bool need_reclaim = false; for (offset = start; offset < end; offset++) { switch (READ_ONCE(map[offset])) { @@ -651,16 +734,13 @@ static bool cluster_scan_range(struct swap_info_struct *si, case SWAP_HAS_CACHE: if (!vm_swap_full()) return false; - need_reclaim = true; + *need_reclaim = true; continue; default: return false; } } - if (need_reclaim) - return cluster_reclaim_range(si, ci, start, end); - return true; } @@ -670,73 +750,75 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster { unsigned int nr_pages = 1 << order; + lockdep_assert_held(&ci->lock); + if (!(si->flags & SWP_WRITEOK)) return false; - if (cluster_is_free(ci)) { - if (nr_pages < SWAPFILE_CLUSTER) { - list_move_tail(&ci->list, &si->nonfull_clusters[order]); - ci->flags = CLUSTER_FLAG_NONFULL; - } + if (cluster_is_free(ci)) ci->order = order; - } memset(si->swap_map + start, usage, nr_pages); - swap_range_alloc(si, start, nr_pages); + swap_range_alloc(si, nr_pages); ci->count += nr_pages; - if (ci->count == SWAPFILE_CLUSTER) { - VM_BUG_ON(!(ci->flags & - (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - list_move_tail(&ci->list, &si->full_clusters); - ci->flags = CLUSTER_FLAG_FULL; - } - return true; } -static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, - unsigned int *foundp, unsigned int order, +/* Try use a new cluster for current CPU and allocate from it. */ +static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, + unsigned int order, unsigned char usage) { - unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); + unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; + unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); unsigned int nr_pages = 1 << order; - struct swap_cluster_info *ci; + bool need_reclaim, ret; - if (end < nr_pages) - return SWAP_NEXT_INVALID; - end -= nr_pages; + lockdep_assert_held(&ci->lock); - ci = lock_cluster(si, offset); - if (ci->count + nr_pages > SWAPFILE_CLUSTER) { - offset = SWAP_NEXT_INVALID; - goto done; - } + if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER) + goto out; - while (offset <= end) { - if (cluster_scan_range(si, ci, offset, nr_pages)) { - if (!cluster_alloc_range(si, ci, offset, usage, order)) { - offset = SWAP_NEXT_INVALID; - goto done; - } - *foundp = offset; - if (ci->count == SWAPFILE_CLUSTER) { - offset = SWAP_NEXT_INVALID; - goto done; - } - offset += nr_pages; - break; + for (end -= nr_pages; offset <= end; offset += nr_pages) { + need_reclaim = false; + if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) + continue; + if (need_reclaim) { + ret = cluster_reclaim_range(si, ci, start, end); + /* + * Reclaim drops ci->lock and cluster could be used + * by another order. Not checking flag as off-list + * cluster has no flag set, and change of list + * won't cause fragmentation. 
+ */ + if (!cluster_is_usable(ci, order)) + goto out; + if (cluster_is_free(ci)) + offset = start; + /* Reclaim failed but cluster is usable, try next */ + if (!ret) + continue; } + if (!cluster_alloc_range(si, ci, offset, usage, order)) + break; + found = offset; offset += nr_pages; + if (ci->count < SWAPFILE_CLUSTER && offset <= end) + next = offset; + break; } - if (offset > end) - offset = SWAP_NEXT_INVALID; -done: +out: + relocate_cluster(si, ci); unlock_cluster(ci); - return offset; + if (si->flags & SWP_SOLIDSTATE) + __this_cpu_write(si->percpu_cluster->next[order], next); + else + si->global_cluster->next[order] = next; + return found; } /* Return true if reclaimed a whole cluster */ @@ -749,20 +831,19 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) int nr_reclaim; if (force) - to_scan = si->inuse_pages / SWAPFILE_CLUSTER; + to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; - while (!list_empty(&si->full_clusters)) { - ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); - list_move_tail(&ci->list, &si->full_clusters); + while ((ci = cluster_isolate_lock(si, &si->full_clusters))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; - spin_unlock(&si->lock); while (offset < end) { if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { + spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + spin_lock(&ci->lock); if (nr_reclaim) { offset += abs(nr_reclaim); continue; @@ -770,8 +851,8 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) } offset++; } - spin_lock(&si->lock); + unlock_cluster(ci); if (to_scan <= 0) break; } @@ -783,9 +864,7 @@ static void swap_reclaim_work(struct work_struct *work) si = container_of(work, struct swap_info_struct, reclaim_work); - spin_lock(&si->lock); swap_reclaim_full_clusters(si, true); - spin_unlock(&si->lock); } /* @@ -796,29 +875,41 @@ static void swap_reclaim_work(struct work_struct *work) static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, unsigned char usage) { - struct percpu_cluster *cluster; struct swap_cluster_info *ci; unsigned int offset, found = 0; -new_cluster: - lockdep_assert_held(&si->lock); - cluster = this_cpu_ptr(si->percpu_cluster); - offset = cluster->next[order]; + if (si->flags & SWP_SOLIDSTATE) { + /* Fast path using per CPU cluster */ + local_lock(&si->percpu_cluster->lock); + offset = __this_cpu_read(si->percpu_cluster->next[order]); + } else { + /* Serialize HDD SWAP allocation for each device. */ + spin_lock(&si->global_cluster_lock); + offset = si->global_cluster->next[order]; + } + if (offset) { - offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); + ci = lock_cluster(si, offset); + /* Cluster could have been used by another order */ + if (cluster_is_usable(ci, order)) { + if (cluster_is_free(ci)) + offset = cluster_offset(si, ci); + found = alloc_swap_scan_cluster(si, ci, offset, + order, usage); + } else { + unlock_cluster(ci); + } if (found) goto done; } - if (!list_empty(&si->free_clusters)) { - ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); - /* - * Either we didn't touch the cluster due to swapoff, - * or the allocation must success. 
- */ - VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); - goto done; +new_cluster: + ci = cluster_isolate_lock(si, &si->free_clusters); + if (ci) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; } /* Try reclaim from full clusters if free clusters list is drained */ @@ -826,56 +917,45 @@ new_cluster: swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { - unsigned int frags = 0; + unsigned int frags = 0, frags_existing; - while (!list_empty(&si->nonfull_clusters[order])) { - ci = list_first_entry(&si->nonfull_clusters[order], - struct swap_cluster_info, list); - list_move_tail(&ci->list, &si->frag_clusters[order]); - ci->flags = CLUSTER_FLAG_FRAG; - si->frag_cluster_nr[order]++; - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, order, usage); - frags++; - if (found) - break; - } - - if (!found) { + while ((ci = cluster_isolate_lock(si, &si->nonfull_clusters[order]))) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); /* - * Nonfull clusters are moved to frag tail if we reached - * here, count them too, don't over scan the frag list. + * With `fragmenting` set to true, it will surely take + * the cluster off nonfull list */ - while (frags < si->frag_cluster_nr[order]) { - ci = list_first_entry(&si->frag_clusters[order], - struct swap_cluster_info, list); - /* - * Rotate the frag list to iterate, they were all failing - * high order allocation or moved here due to per-CPU usage, - * this help keeping usable cluster ahead. - */ - list_move_tail(&ci->list, &si->frag_clusters[order]); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, order, usage); - frags++; - if (found) - break; - } + if (found) + goto done; + frags++; + } + + frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); + while (frags < frags_existing && + (ci = cluster_isolate_lock(si, &si->frag_clusters[order]))) { + atomic_long_dec(&si->frag_cluster_nr[order]); + /* + * Rotate the frag list to iterate, they were all + * failing high order allocation or moved here due to + * per-CPU usage, but they could contain newly released + * reclaimable (eg. lazy-freed swap cache) slots. + */ + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; + frags++; } } - if (found) - goto done; - - if (!list_empty(&si->discard_clusters)) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them, then - * reread cluster_next_cpu since we dropped si->lock - */ - swap_do_scheduled_discard(si); + /* + * We don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock + */ + if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) goto new_cluster; - } if (order) goto done; @@ -886,76 +966,153 @@ new_cluster: * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. 
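
Editorial note: read together, these hunks give cluster_alloc_swap_entry() a fixed fallback order per allocation order: the remembered per-CPU (or global) cluster first, then a free cluster, then reclaim of full clusters, the nonfull and frag lists of that order, a retry after pending discards, and finally, for order-0 only, the frag/nonfull clusters of other orders. The compact control-flow sketch below is an approximation of that ordering; alloc_entry(), try_alloc(), the enum list_id values and the loop bound of 4 are all invented stand-ins, not kernel functions.

#include <stdbool.h>
#include <stdio.h>

/* Placeholder list identifiers; the real lists live in swap_info_struct. */
enum list_id { FREE, NONFULL, FRAG, DISCARD };

/* Stub standing in for cluster_isolate_lock() + alloc_swap_scan_cluster(). */
static bool try_alloc(enum list_id list, int order, unsigned long *found)
{
	(void)list; (void)order; (void)found;
	return false;            /* pretend every list is empty or contended */
}

static unsigned long alloc_entry(int order)
{
	unsigned long found = 0;

	/* 0. (the remembered per-CPU / global 'next' cluster is tried first) */
	if (try_alloc(FREE, order, &found))      /* 1. a fresh free cluster */
		return found;
	/* 2. (full clusters may be reclaimed when the free list is drained) */
	if (try_alloc(NONFULL, order, &found))   /* 3. same-order nonfull */
		return found;
	if (try_alloc(FRAG, order, &found))      /* 4. same-order frag */
		return found;
	if (try_alloc(DISCARD, order, &found))   /* 5. flush pending discards,
						  *    then retry from step 1 */
		return found;
	if (order)                               /* high orders give up here */
		return 0;
	for (int o = 1; o < 4; o++)              /* 6. order-0 last resort */
		if (try_alloc(FRAG, o, &found) || try_alloc(NONFULL, o, &found))
			return found;
	return 0;
}

int main(void)
{
	printf("order-2 attempt: %lu\n", alloc_entry(2));
	printf("order-0 attempt: %lu\n", alloc_entry(0));
	return 0;
}
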
*/ - while (!list_empty(&si->frag_clusters[o])) { - ci = list_first_entry(&si->frag_clusters[o], - struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, 0, usage); + while ((ci = cluster_isolate_lock(si, &si->frag_clusters[o]))) { + atomic_long_dec(&si->frag_cluster_nr[o]); + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + 0, usage); if (found) goto done; } - while (!list_empty(&si->nonfull_clusters[o])) { - ci = list_first_entry(&si->nonfull_clusters[o], - struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, 0, usage); + while ((ci = cluster_isolate_lock(si, &si->nonfull_clusters[o]))) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + 0, usage); if (found) goto done; } } - done: - cluster->next[order] = offset; + if (si->flags & SWP_SOLIDSTATE) + local_unlock(&si->percpu_cluster->lock); + else + spin_unlock(&si->global_cluster_lock); return found; } -static void __del_from_avail_list(struct swap_info_struct *si) +/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ +static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { int nid; + unsigned long pages; + + spin_lock(&swap_avail_lock); + + if (swapoff) { + /* + * Forcefully remove it. Clear the SWP_WRITEOK flags for + * swapoff here so it's synchronized by both si->lock and + * swap_avail_lock, to ensure the result can be seen by + * add_to_avail_list. + */ + lockdep_assert_held(&si->lock); + si->flags &= ~SWP_WRITEOK; + atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); + } else { + /* + * If not called by swapoff, take it off-list only if it's + * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly + * si->inuse_pages == pages), any concurrent slot freeing, + * or device already removed from plist by someone else + * will make this return false. + */ + pages = si->pages; + if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages, + pages | SWAP_USAGE_OFFLIST_BIT)) + goto skip; + } - assert_spin_locked(&si->lock); for_each_node(nid) plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); -} -static void del_from_avail_list(struct swap_info_struct *si) -{ - spin_lock(&swap_avail_lock); - __del_from_avail_list(si); +skip: spin_unlock(&swap_avail_lock); } -static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, +/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ +static void add_to_avail_list(struct swap_info_struct *si, bool swapon) +{ + int nid; + long val; + unsigned long pages; + + spin_lock(&swap_avail_lock); + + /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */ + if (swapon) { + lockdep_assert_held(&si->lock); + si->flags |= SWP_WRITEOK; + } else { + if (!(READ_ONCE(si->flags) & SWP_WRITEOK)) + goto skip; + } + + if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT)) + goto skip; + + val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); + + /* + * When device is full and device is on the plist, only one updater will + * see (inuse_pages == si->pages) and will call del_from_avail_list. If + * that updater happen to be here, just skip adding. 
+ */ + pages = si->pages; + if (val == pages) { + /* Just like the cmpxchg in del_from_avail_list */ + if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages, + pages | SWAP_USAGE_OFFLIST_BIT)) + goto skip; + } + + for_each_node(nid) + plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); + +skip: + spin_unlock(&swap_avail_lock); +} + +/* + * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock + * within each cluster, so the total contribution to the global counter should + * always be positive and cannot exceed the total number of usable slots. + */ +static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries) +{ + long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages); + + /* + * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set, + * remove it from the plist. + */ + if (unlikely(val == si->pages)) { + del_from_avail_list(si, false); + return true; + } + + return false; +} + +static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries) +{ + long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages); + + /* + * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set, + * remove it from the plist. + */ + if (unlikely(val & SWAP_USAGE_OFFLIST_BIT)) + add_to_avail_list(si, false); +} + +static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries) { - unsigned int end = offset + nr_entries - 1; - - if (offset == si->lowest_bit) - si->lowest_bit += nr_entries; - if (end == si->highest_bit) - WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); - WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); - if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; - del_from_avail_list(si); - + if (swap_usage_add(si, nr_entries)) { if (si->cluster_info && vm_swap_full()) schedule_work(&si->reclaim_work); } } -static void add_to_avail_list(struct swap_info_struct *si) -{ - int nid; - - spin_lock(&swap_avail_lock); - for_each_node(nid) - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); - spin_unlock(&swap_avail_lock); -} - static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { @@ -968,18 +1125,11 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, * Use atomic clear_bit operations only on zeromap instead of non-atomic * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. 
*/ - for (i = 0; i < nr_entries; i++) + for (i = 0; i < nr_entries; i++) { clear_bit(offset + i, si->zeromap); - - if (offset < si->lowest_bit) - si->lowest_bit = offset; - if (end > si->highest_bit) { - bool was_full = !si->highest_bit; - - WRITE_ONCE(si->highest_bit, end); - if (was_full && (si->flags & SWP_WRITEOK)) - add_to_avail_list(si); + zswap_invalidate(swp_entry(si->type, offset + i)); } + if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; @@ -999,50 +1149,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, */ smp_wmb(); atomic_long_add(nr_entries, &nr_swap_pages); - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); -} - -static void set_cluster_next(struct swap_info_struct *si, unsigned long next) -{ - unsigned long prev; - - if (!(si->flags & SWP_SOLIDSTATE)) { - si->cluster_next = next; - return; - } - - prev = this_cpu_read(*si->cluster_next_cpu); - /* - * Cross the swap address space size aligned trunk, choose - * another trunk randomly to avoid lock contention on swap - * address space if possible. - */ - if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != - (next >> SWAP_ADDRESS_SPACE_SHIFT)) { - /* No free swap slots available */ - if (si->highest_bit <= si->lowest_bit) - return; - next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); - next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); - next = max_t(unsigned int, next, si->lowest_bit); - } - this_cpu_write(*si->cluster_next_cpu, next); -} - -static bool swap_offset_available_and_locked(struct swap_info_struct *si, - unsigned long offset) -{ - if (data_race(!si->swap_map[offset])) { - spin_lock(&si->lock); - return true; - } - - if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { - spin_lock(&si->lock); - return true; - } - - return false; + swap_usage_sub(si, nr_entries); } static int cluster_alloc_swap(struct swap_info_struct *si, @@ -1051,10 +1158,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si, { int n_ret = 0; - VM_BUG_ON(!si->cluster_info); - - si->flags += SWP_SCANNING; - while (n_ret < nr) { unsigned long offset = cluster_alloc_swap_entry(si, order, usage); @@ -1063,8 +1166,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si, slots[n_ret++] = swp_entry(si->type, offset); } - si->flags -= SWP_SCANNING; - return n_ret; } @@ -1072,13 +1173,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { - unsigned long offset; - unsigned long scan_base; - unsigned long last_in_cluster = 0; - int latency_ration = LATENCY_LIMIT; unsigned int nr_pages = 1 << order; - int n_ret = 0; - bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially @@ -1090,7 +1185,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, * But we do now try to find an empty cluster. -Andrea * And we let swap pages go all over an SSD partition. Hugh */ - if (order > 0) { /* * Should not even be attempting large allocations when huge @@ -1103,165 +1197,30 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } /* - * Swapfile is not block device or not using clusters so unable + * Swapfile is not block device so unable * to allocate large entries. 
*/ - if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) + if (!(si->flags & SWP_BLKDEV)) return 0; } - if (si->cluster_info) - return cluster_alloc_swap(si, usage, nr, slots, order); - - si->flags += SWP_SCANNING; - - /* For HDD, sequential access is more important. */ - scan_base = si->cluster_next; - offset = scan_base; - - if (unlikely(!si->cluster_nr--)) { - if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - - spin_unlock(&si->lock); - - /* - * If seek is expensive, start searching for new cluster from - * start of partition, to minimize the span of allocated swap. - */ - scan_base = offset = si->lowest_bit; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - - /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) { - if (si->swap_map[offset]) - last_in_cluster = offset + SWAPFILE_CLUSTER; - else if (offset == last_in_cluster) { - spin_lock(&si->lock); - offset -= SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - } - } - - offset = scan_base; - spin_lock(&si->lock); - si->cluster_nr = SWAPFILE_CLUSTER - 1; - } - -checks: - if (!(si->flags & SWP_WRITEOK)) - goto no_page; - if (!si->highest_bit) - goto no_page; - if (offset > si->highest_bit) - scan_base = offset = si->lowest_bit; - - /* reuse swap entry of cache-only swap if not busy. */ - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - int swap_was_freed; - spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); - spin_lock(&si->lock); - /* entry was freed successfully, try to use this again */ - if (swap_was_freed > 0) - goto checks; - goto scan; /* check next one */ - } - - if (si->swap_map[offset]) { - if (!n_ret) - goto scan; - else - goto done; - } - memset(si->swap_map + offset, usage, nr_pages); - - swap_range_alloc(si, offset, nr_pages); - slots[n_ret++] = swp_entry(si->type, offset); - - /* got enough slots or reach max slots? */ - if ((n_ret == nr) || (offset >= si->highest_bit)) - goto done; - - /* search for next available slot */ - - /* time to take a break? */ - if (unlikely(--latency_ration < 0)) { - if (n_ret) - goto done; - spin_unlock(&si->lock); - cond_resched(); - spin_lock(&si->lock); - latency_ration = LATENCY_LIMIT; - } - - if (si->cluster_nr && !si->swap_map[++offset]) { - /* non-ssd case, still more slots in cluster? */ - --si->cluster_nr; - goto checks; - } + return cluster_alloc_swap(si, usage, nr, slots, order); +} +static bool get_swap_device_info(struct swap_info_struct *si) +{ + if (!percpu_ref_tryget_live(&si->users)) + return false; /* - * Even if there's no free clusters available (fragmented), - * try to scan a little more quickly with lock held unless we - * have scanned too many slots already. + * Guarantee the si->users are checked before accessing other + * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is + * up to dated. + * + * Paired with the spin_unlock() after setup_swap_info() in + * enable_swap_info(), and smp_wmb() in swapoff. 
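
Editorial note: the get_swap_device_info() helper being introduced here pairs a tryget on the device's percpu_ref with a read barrier, so that once the reference is obtained, later reads of the swap_info fields (including SWP_WRITEOK) cannot be ordered before the liveness check. The userspace analogue below models that acquire-style pattern with C11 atomics; struct device, device_tryget() and device_put() are illustrative stand-ins, not the kernel's percpu_ref API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct device {
	atomic_int users;    /* 0 means the device is being torn down */
	int flags;           /* published before users is made non-zero */
};

/* Take a reference only if the device is still live, and order later reads
 * after the check (the kernel uses percpu_ref_tryget_live() + smp_rmb()). */
static bool device_tryget(struct device *d)
{
	int old = atomic_load_explicit(&d->users, memory_order_relaxed);

	do {
		if (old == 0)
			return false;
	} while (!atomic_compare_exchange_weak_explicit(&d->users, &old, old + 1,
							memory_order_acquire,
							memory_order_relaxed));
	return true;
}

static void device_put(struct device *d)
{
	atomic_fetch_sub_explicit(&d->users, 1, memory_order_release);
}

int main(void)
{
	struct device d = { .flags = 1 };

	atomic_init(&d.users, 1);            /* publisher's initial reference */
	if (device_tryget(&d)) {
		printf("live, flags=%d\n", d.flags); /* safe to read after acquire */
		device_put(&d);
	}
	return 0;
}
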
*/ - if (!scanned_many) { - unsigned long scan_limit; - - if (offset < scan_base) - scan_limit = scan_base; - else - scan_limit = si->highest_bit; - for (; offset <= scan_limit && --latency_ration > 0; - offset++) { - if (!si->swap_map[offset]) - goto checks; - } - } - -done: - if (order == 0) - set_cluster_next(si, offset + 1); - si->flags -= SWP_SCANNING; - return n_ret; - -scan: - VM_WARN_ON(order > 0); - spin_unlock(&si->lock); - while (++offset <= READ_ONCE(si->highest_bit)) { - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - scanned_many = true; - } - if (swap_offset_available_and_locked(si, offset)) - goto checks; - } - offset = si->lowest_bit; - while (offset < scan_base) { - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - scanned_many = true; - } - if (swap_offset_available_and_locked(si, offset)) - goto checks; - offset++; - } - spin_lock(&si->lock); - -no_page: - si->flags -= SWP_SCANNING; - return n_ret; + smp_rmb(); + return true; } int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) @@ -1291,32 +1250,15 @@ start_over: /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); - spin_lock(&si->lock); - if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - if (plist_node_empty(&si->avail_lists[node])) { - spin_unlock(&si->lock); - goto nextsi; - } - WARN(!si->highest_bit, - "swap_info %d in list but !highest_bit\n", - si->type); - WARN(!(si->flags & SWP_WRITEOK), - "swap_info %d in list but !SWP_WRITEOK\n", - si->type); - __del_from_avail_list(si); - spin_unlock(&si->lock); - goto nextsi; + if (get_swap_device_info(si)) { + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries, order); + put_swap_device(si); + if (n_ret || size > 1) + goto check_out; } - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries, order); - spin_unlock(&si->lock); - if (n_ret || size > 1) - goto check_out; - cond_resched(); spin_lock(&swap_avail_lock); -nextsi: /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map_slots() can drop the si->lock, @@ -1376,22 +1318,6 @@ out: return NULL; } -static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, - struct swap_info_struct *q) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - - if (p != q) { - if (q != NULL) - spin_unlock(&q->lock); - if (p != NULL) - spin_lock(&p->lock); - } - return p; -} - static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, unsigned long offset, unsigned char usage) @@ -1481,16 +1407,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) si = swp_swap_info(entry); if (!si) goto bad_nofile; - if (!percpu_ref_tryget_live(&si->users)) + if (!get_swap_device_info(si)) goto out; - /* - * Guarantee the si->users are checked before accessing other - * fields of swap_info_struct. - * - * Paired with the spin_unlock() after setup_swap_info() in - * enable_swap_info(). 
- */ - smp_rmb(); offset = swp_offset(entry); if (offset >= si->max) goto put_out; @@ -1513,11 +1431,11 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si, unsigned long offset = swp_offset(entry); unsigned char usage; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); usage = __swap_entry_free_locked(si, offset, 1); - unlock_cluster_or_swap_info(si, ci); if (!usage) - free_swap_slot(entry); + swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + unlock_cluster(ci); return usage; } @@ -1538,22 +1456,17 @@ static bool __swap_entries_free(struct swap_info_struct *si, if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) goto fallback; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); goto fallback; } for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - unlock_cluster_or_swap_info(si, ci); + if (!has_cache) + swap_entry_range_free(si, ci, entry, nr); + unlock_cluster(ci); - if (!has_cache) { - for (i = 0; i < nr; i++) - zswap_invalidate(swp_entry(si->type, offset + i)); - spin_lock(&si->lock); - swap_entry_range_free(si, entry, nr); - spin_unlock(&si->lock); - } return has_cache; fallback: @@ -1573,24 +1486,32 @@ fallback: * Drop the last HAS_CACHE flag of swap entries, caller have to * ensure all entries belong to the same cgroup. */ -static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, - unsigned int nr_pages) +static void swap_entry_range_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; - struct swap_cluster_info *ci; - ci = lock_cluster(si, offset); + /* It should never free entries across different clusters */ + VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(cluster_is_free(ci)); + VM_BUG_ON(ci->count < nr_pages); + + ci->count -= nr_pages; do { VM_BUG_ON(*map != SWAP_HAS_CACHE); *map = 0; } while (++map < map_end); - dec_cluster_info_page(si, ci, nr_pages); - unlock_cluster(ci); mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); + + if (!ci->count) + free_cluster(si, ci); + else + partial_free_cluster(si, ci); } static void cluster_swap_free_nr(struct swap_info_struct *si, @@ -1598,29 +1519,14 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, unsigned char usage) { struct swap_cluster_info *ci; - DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; - int i, nr; + unsigned long end = offset + nr_pages; - ci = lock_cluster_or_swap_info(si, offset); - while (nr_pages) { - nr = min(BITS_PER_LONG, nr_pages); - for (i = 0; i < nr; i++) { - if (!__swap_entry_free_locked(si, offset + i, usage)) - bitmap_set(to_free, i, 1); - } - if (!bitmap_empty(to_free, BITS_PER_LONG)) { - unlock_cluster_or_swap_info(si, ci); - for_each_set_bit(i, to_free, BITS_PER_LONG) - free_swap_slot(swp_entry(si->type, offset + i)); - if (nr == nr_pages) - return; - bitmap_clear(to_free, 0, BITS_PER_LONG); - ci = lock_cluster_or_swap_info(si, offset); - } - offset += nr; - nr_pages -= nr; - } - unlock_cluster_or_swap_info(si, ci); + ci = lock_cluster(si, offset); + do { + if (!__swap_entry_free_locked(si, offset, usage)) + swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + } while (++offset < end); 
+ unlock_cluster(ci); } /* @@ -1659,59 +1565,35 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (!si) return; - ci = lock_cluster_or_swap_info(si, offset); - if (size > 1 && swap_is_has_cache(si, offset, size)) { - unlock_cluster_or_swap_info(si, ci); - spin_lock(&si->lock); - swap_entry_range_free(si, entry, size); - spin_unlock(&si->lock); - return; - } - for (int i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { - unlock_cluster_or_swap_info(si, ci); - free_swap_slot(entry); - if (i == size - 1) - return; - lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); + if (swap_is_has_cache(si, offset, size)) + swap_entry_range_free(si, ci, entry, size); + else { + for (int i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) + swap_entry_range_free(si, ci, entry, 1); } } - unlock_cluster_or_swap_info(si, ci); -} - -static int swp_entry_cmp(const void *ent1, const void *ent2) -{ - const swp_entry_t *e1 = ent1, *e2 = ent2; - - return (int)swp_type(*e1) - (int)swp_type(*e2); + unlock_cluster(ci); } void swapcache_free_entries(swp_entry_t *entries, int n) { - struct swap_info_struct *p, *prev; int i; + struct swap_cluster_info *ci; + struct swap_info_struct *si = NULL; if (n <= 0) return; - prev = NULL; - p = NULL; - - /* - * Sort swap entries by swap device, so each lock is only taken once. - * nr_swapfiles isn't absolutely correct, but the overhead of sort() is - * so low that it isn't necessary to optimize further. - */ - if (nr_swapfiles > 1) - sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { - p = swap_info_get_cont(entries[i], prev); - if (p) - swap_entry_range_free(p, entries[i], 1); - prev = p; + si = _swap_info_get(entries[i]); + if (si) { + ci = lock_cluster(si, swp_offset(entries[i])); + swap_entry_range_free(si, ci, entries[i], 1); + unlock_cluster(ci); + } } - if (p) - spin_unlock(&p->lock); } int __swap_count(swp_entry_t entry) @@ -1733,9 +1615,9 @@ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) struct swap_cluster_info *ci; int count; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return count; } @@ -1758,7 +1640,7 @@ int swp_swapcount(swp_entry_t entry) offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) @@ -1781,7 +1663,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return count; } @@ -1796,8 +1678,8 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - ci = lock_cluster_or_swap_info(si, offset); - if (!ci || nr_pages == 1) { + ci = lock_cluster(si, offset); + if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; @@ -1809,7 +1691,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, } } unlock_out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return ret; } @@ -1963,10 +1845,8 @@ swp_entry_t get_swap_page_of_type(int type) goto fail; /* This is called for allocating swap entry, not cache */ - spin_lock(&si->lock); if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) 
atomic_long_dec(&nr_swap_pages); - spin_unlock(&si->lock); fail: return entry; } @@ -2057,7 +1937,7 @@ unsigned int count_swap_pages(int type, int free) if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) - n -= sis->inuse_pages; + n -= swap_usage_in_pages(sis); } spin_unlock(&sis->lock); } @@ -2392,7 +2272,7 @@ static int try_to_unuse(unsigned int type) swp_entry_t entry; unsigned int i; - if (!READ_ONCE(si->inuse_pages)) + if (!swap_usage_in_pages(si)) goto success; retry: @@ -2405,7 +2285,7 @@ retry: spin_lock(&mmlist_lock); p = &init_mm.mmlist; - while (READ_ONCE(si->inuse_pages) && + while (swap_usage_in_pages(si) && !signal_pending(current) && (p = p->next) != &init_mm.mmlist) { @@ -2433,7 +2313,7 @@ retry: mmput(prev_mm); i = 0; - while (READ_ONCE(si->inuse_pages) && + while (swap_usage_in_pages(si) && !signal_pending(current) && (i = find_next_to_unuse(si, i)) != 0) { @@ -2468,7 +2348,7 @@ retry: * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. */ - if (READ_ONCE(si->inuse_pages)) { + if (swap_usage_in_pages(si)) { if (!signal_pending(current)) goto retry; return -EINTR; @@ -2477,7 +2357,7 @@ retry: success: /* * Make sure that further cleanups after try_to_unuse() returns happen - * after swap_range_free() reduces si->inuse_pages to 0. + * after swap_range_free() reduces inuse_pages to 0. */ smp_mb(); return 0; @@ -2495,7 +2375,7 @@ static void drain_mmlist(void) unsigned int type; for (type = 0; type < nr_swapfiles; type++) - if (swap_info[type]->inuse_pages) + if (swap_usage_in_pages(swap_info[type])) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) @@ -2674,7 +2554,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio, static void _enable_swap_info(struct swap_info_struct *si) { - si->flags |= SWP_WRITEOK; atomic_long_add(si->pages, &nr_swap_pages); total_swap_pages += si->pages; @@ -2691,9 +2570,8 @@ static void _enable_swap_info(struct swap_info_struct *si) */ plist_add(&si->list, &swap_active_head); - /* add to available list iff swap device is not full */ - if (si->highest_bit) - add_to_avail_list(si); + /* Add back to available list */ + add_to_avail_list(si, true); } static void enable_swap_info(struct swap_info_struct *si, int prio, @@ -2742,6 +2620,25 @@ bool has_usable_swap(void) return ret; } +/* + * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range + * see the updated flags, so there will be no more allocations. 
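
Editorial note: wait_for_allocation(), whose body follows just below, relies on a simple idea: after SWP_WRITEOK has been cleared, briefly taking and releasing every cluster lock guarantees that any allocator which sampled the old flag has finished, because cluster_alloc_range() only runs with a cluster lock held. The single-threaded pthread-mutex model below illustrates that lock-sweep barrier; NR_CLUSTERS, sweep_barrier(), alloc_from() and the writeok variable are invented names for the sketch, and the real flag updates are of course protected by the kernel's own locking.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CLUSTERS 8

static pthread_mutex_t cluster_lock[NR_CLUSTERS];
static bool writeok = true;   /* models the SWP_WRITEOK flag */

/* Allocators only touch a cluster while holding its lock and seeing writeok. */
static bool alloc_from(int idx)
{
	bool ok;

	pthread_mutex_lock(&cluster_lock[idx]);
	ok = writeok;           /* checked under the cluster lock */
	pthread_mutex_unlock(&cluster_lock[idx]);
	return ok;
}

/* After clearing writeok, sweeping every cluster lock flushes in-flight
 * allocators: anyone who saw the old value has already dropped the lock. */
static void sweep_barrier(void)
{
	for (int i = 0; i < NR_CLUSTERS; i++) {
		pthread_mutex_lock(&cluster_lock[i]);
		pthread_mutex_unlock(&cluster_lock[i]);
	}
}

int main(void)
{
	for (int i = 0; i < NR_CLUSTERS; i++)
		pthread_mutex_init(&cluster_lock[i], NULL);

	printf("alloc while writable: %d\n", alloc_from(3));
	writeok = false;        /* swapoff clears SWP_WRITEOK... */
	sweep_barrier();        /* ...then waits out concurrent allocators */
	printf("alloc after swapoff: %d\n", alloc_from(3));
	return 0;
}
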
+ */ +static void wait_for_allocation(struct swap_info_struct *si) +{ + unsigned long offset; + unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER); + struct swap_cluster_info *ci; + + BUG_ON(si->flags & SWP_WRITEOK); + + for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { + ci = lock_cluster(si, offset); + unlock_cluster(ci); + offset += SWAPFILE_CLUSTER; + } +} + SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; @@ -2791,7 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out_dput; } spin_lock(&p->lock); - del_from_avail_list(p); + del_from_avail_list(p, true); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; @@ -2809,10 +2706,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; - p->flags &= ~SWP_WRITEOK; spin_unlock(&p->lock); spin_unlock(&swap_lock); + wait_for_allocation(p); + disable_swap_slots_cache_lock(); set_current_oom_origin(); @@ -2855,16 +2753,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_lock(&p->lock); drain_mmlist(); - /* wait for anyone still in scan_swap_map_slots */ - p->highest_bit = 0; /* cuts scans short */ - while (p->flags >= SWP_SCANNING) { - spin_unlock(&p->lock); - spin_unlock(&swap_lock); - schedule_timeout_uninterruptible(1); - spin_lock(&swap_lock); - spin_lock(&p->lock); - } - swap_file = p->swap_file; p->swap_file = NULL; p->max = 0; @@ -2881,8 +2769,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; - free_percpu(p->cluster_next_cpu); - p->cluster_next_cpu = NULL; + kfree(p->global_cluster); + p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); kvfree(cluster_info); @@ -2992,7 +2880,7 @@ static int swap_show(struct seq_file *swap, void *v) } bytes = K(si->pages); - inuse = K(READ_ONCE(si->inuse_pages)); + inuse = K(swap_usage_in_pages(si)); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); @@ -3109,6 +2997,7 @@ static struct swap_info_struct *alloc_swap_info(void) } spin_lock_init(&p->lock); spin_lock_init(&p->cont_lock); + atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); return p; @@ -3193,10 +3082,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return 0; } - si->lowest_bit = 1; - si->cluster_next = 1; - si->cluster_nr = 0; - maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { @@ -3213,7 +3098,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } - si->highest_bit = maxpages - 1; if (!maxpages) return 0; @@ -3281,7 +3165,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; struct swap_cluster_info *cluster_info; unsigned long i, j, k, idx; int cpu, err = -ENOMEM; @@ -3293,25 +3176,24 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); - si->cluster_next_cpu = alloc_percpu(unsigned int); - if (!si->cluster_next_cpu) - goto err_free; + if (si->flags & SWP_SOLIDSTATE) { + si->percpu_cluster = alloc_percpu(struct percpu_cluster); 
+ if (!si->percpu_cluster) + goto err_free; - /* Random start position to help with wear leveling */ - for_each_possible_cpu(cpu) - per_cpu(*si->cluster_next_cpu, cpu) = - get_random_u32_inclusive(1, si->highest_bit); + for_each_possible_cpu(cpu) { + struct percpu_cluster *cluster; - si->percpu_cluster = alloc_percpu(struct percpu_cluster); - if (!si->percpu_cluster) - goto err_free; - - for_each_possible_cpu(cpu) { - struct percpu_cluster *cluster; - - cluster = per_cpu_ptr(si->percpu_cluster, cpu); + cluster = per_cpu_ptr(si->percpu_cluster, cpu); + for (i = 0; i < SWAP_NR_ORDERS; i++) + cluster->next[i] = SWAP_ENTRY_INVALID; + local_lock_init(&cluster->lock); + } + } else { + si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL); for (i = 0; i < SWAP_NR_ORDERS; i++) - cluster->next[i] = SWAP_NEXT_INVALID; + si->global_cluster->next[i] = SWAP_ENTRY_INVALID; + spin_lock_init(&si->global_cluster_lock); } /* @@ -3335,7 +3217,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); - si->frag_cluster_nr[i] = 0; + atomic_long_set(&si->frag_cluster_nr[i], 0); } /* @@ -3343,7 +3225,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * sharing same address space. */ for (k = 0; k < SWAP_CLUSTER_COLS; k++) { - j = (k + col) % SWAP_CLUSTER_COLS; + j = k % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; @@ -3493,18 +3375,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (si->bdev && bdev_nonrot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; - - cluster_info = setup_clusters(si, swap_header, maxpages); - if (IS_ERR(cluster_info)) { - error = PTR_ERR(cluster_info); - cluster_info = NULL; - goto bad_swap_unlock_inode; - } } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } + cluster_info = setup_clusters(si, swap_header, maxpages); + if (IS_ERR(cluster_info)) { + error = PTR_ERR(cluster_info); + cluster_info = NULL; + goto bad_swap_unlock_inode; + } + if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* @@ -3585,8 +3467,8 @@ bad_swap_unlock_inode: bad_swap: free_percpu(si->percpu_cluster); si->percpu_cluster = NULL; - free_percpu(si->cluster_next_cpu); - si->cluster_next_cpu = NULL; + kfree(si->global_cluster); + si->global_cluster = NULL; inode = NULL; destroy_swap_extents(si); swap_cgroup_swapoff(si->type); @@ -3623,7 +3505,7 @@ void si_swapinfo(struct sysinfo *val) struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) - nr_to_be_unused += READ_ONCE(si->inuse_pages); + nr_to_be_unused += swap_usage_in_pages(si); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; @@ -3655,7 +3537,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); err = 0; for (i = 0; i < nr; i++) { @@ -3710,7 +3592,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) } unlock_out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return err; } @@ -3819,7 +3701,6 @@ int 
add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) */ goto outer; } - spin_lock(&si->lock); offset = swp_offset(entry); @@ -3884,7 +3765,6 @@ out_unlock_cont: spin_unlock(&si->cont_lock); out: unlock_cluster(ci); - spin_unlock(&si->lock); put_swap_device(si); outer: if (page) diff --git a/mm/truncate.c b/mm/truncate.c index 7c304d2f0052..e2e115adfbc5 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -525,6 +525,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, } EXPORT_SYMBOL(invalidate_mapping_pages); +static int folio_launder(struct address_space *mapping, struct folio *folio) +{ + if (!folio_test_dirty(folio)) + return 0; + if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) + return 0; + return mapping->a_ops->launder_folio(folio); +} + /* * This is like mapping_evict_folio(), except it ignores the folio's * refcount. We do this because invalidate_inode_pages2() needs stronger @@ -532,14 +541,26 @@ EXPORT_SYMBOL(invalidate_mapping_pages); * shrink_folio_list() has a temp ref on them, or because they're transiently * sitting in the folio_add_lru() caches. */ -static int invalidate_complete_folio2(struct address_space *mapping, - struct folio *folio) +int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, + gfp_t gfp) { - if (folio->mapping != mapping) - return 0; + int ret; - if (!filemap_release_folio(folio, GFP_KERNEL)) + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (folio_test_dirty(folio)) return 0; + if (folio_mapped(folio)) + unmap_mapping_folio(folio); + BUG_ON(folio_mapped(folio)); + + ret = folio_launder(mapping, folio); + if (ret) + return ret; + if (folio->mapping != mapping) + return -EBUSY; + if (!filemap_release_folio(folio, gfp)) + return -EBUSY; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); @@ -558,16 +579,7 @@ static int invalidate_complete_folio2(struct address_space *mapping, failed: xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); - return 0; -} - -static int folio_launder(struct address_space *mapping, struct folio *folio) -{ - if (!folio_test_dirty(folio)) - return 0; - if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) - return 0; - return mapping->a_ops->launder_folio(folio); + return -EBUSY; } /** @@ -631,16 +643,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); - - if (folio_mapped(folio)) - unmap_mapping_folio(folio); - BUG_ON(folio_mapped(folio)); - - ret2 = folio_launder(mapping, folio); - if (ret2 == 0) { - if (!invalidate_complete_folio2(mapping, folio)) - ret2 = -EBUSY; - } + ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL); if (ret2 < 0) ret = ret2; folio_unlock(folio); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 60a0be33766f..411a663932c4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -84,16 +84,9 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, mmap_read_lock(mm); vma = find_vma_and_prepare_anon(mm, address); - if (!IS_ERR(vma)) { - /* - * We cannot use vma_start_read() as it may fail due to - * false locked (see comment in vma_start_read()). We - * can avoid that by directly locking vm_lock under - * mmap_lock, which guarantees that nobody can lock the - * vma for write (vma_start_write()) under us. 
- */ - down_read(&vma->vm_lock->lock); - } + if (!IS_ERR(vma)) + if (!vma_start_read_locked(vma)) + vma = ERR_PTR(-EAGAIN); mmap_read_unlock(mm); return vma; @@ -1020,6 +1013,14 @@ void double_pt_unlock(spinlock_t *ptl1, __release(ptl2); } +static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval) +{ + return pte_same(ptep_get(src_pte), orig_src_pte) && + pte_same(ptep_get(dst_pte), orig_dst_pte) && + pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); +} static int move_present_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, @@ -1027,6 +1028,7 @@ static int move_present_pte(struct mm_struct *mm, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, struct folio *src_folio) { @@ -1034,8 +1036,8 @@ static int move_present_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { err = -EAGAIN; goto out; } @@ -1071,6 +1073,7 @@ static int move_swap_pte(struct mm_struct *mm, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl) { if (!pte_swp_exclusive(orig_src_pte)) @@ -1078,8 +1081,8 @@ static int move_swap_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } @@ -1097,13 +1100,14 @@ static int move_zeropage_pte(struct mm_struct *mm, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl) { pte_t zero_pte; double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } @@ -1136,6 +1140,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pte_t *src_pte = NULL; pte_t *dst_pte = NULL; pmd_t dummy_pmdval; + pmd_t dst_pmdval; struct folio *src_folio = NULL; struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; @@ -1148,11 +1153,11 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, retry: /* * Use the maywrite version to indicate that dst_pte will be modified, - * but since we will use pte_same() to detect the change of the pte - * entry, there is no need to get pmdval, so just pass a dummy variable - * to it. + * since dst_pte needs to be none, the subsequent pte_same() check + * cannot prevent the dst_pte page from being freed concurrently, so we + * also need to abtain dst_pmdval and recheck pmd_same() later. 
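[Illustrative aside, not part of the diff] The new is_pte_pages_stable() helper and the comment just above capture the pattern this hunk relies on: snapshot the PTE and PMD values locklessly, take the page-table locks, then revalidate the snapshot with pte_same()/pmd_same() and back off with -EAGAIN if anything changed underneath. A hedged, userspace-only sketch of that lock-and-revalidate pattern follows; the struct, field, and function names are invented for illustration and are not kernel APIs.

        #include <errno.h>
        #include <pthread.h>

        struct record {
                pthread_mutex_t lock;
                unsigned long seq;      /* stands in for the pte/pmd contents */
                int payload;
        };

        /* Lockless snapshot; the caller passes it back in for revalidation. */
        static unsigned long record_snapshot(struct record *r)
        {
                return __atomic_load_n(&r->seq, __ATOMIC_RELAXED);
        }

        static int record_update(struct record *r, int new_payload, unsigned long snap)
        {
                int err = 0;

                pthread_mutex_lock(&r->lock);
                if (r->seq != snap) {   /* like pte_same()/pmd_same() failing */
                        err = -EAGAIN;  /* caller re-snapshots and retries */
                        goto out;
                }
                r->payload = new_payload;
                r->seq++;
        out:
                pthread_mutex_unlock(&r->lock);
                return err;
        }

On -EAGAIN the caller simply re-reads and retries, mirroring how the -EAGAIN paths in this hunk lead back to a retry.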
*/ - dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval, + dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, &dst_ptl); /* Retry if a huge pmd materialized from under us */ @@ -1161,7 +1166,11 @@ retry: goto out; } - /* same as dst_pte */ + /* + * Unlike dst_pte, the subsequent pte_same() check can ensure the + * stability of the src_pte page, so there is no need to get pmdval, + * just pass a dummy variable to it. + */ src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, &src_ptl); @@ -1177,8 +1186,8 @@ retry: } /* Sanity checks before the operation */ - if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) || - WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) { + if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || + pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { err = -EINVAL; goto out; } @@ -1213,7 +1222,7 @@ retry: err = move_zeropage_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl); + dst_pmd, dst_pmdval, dst_ptl, src_ptl); goto out; } @@ -1303,8 +1312,8 @@ retry: err = move_present_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, - orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl, src_folio); + orig_dst_pte, orig_src_pte, dst_pmd, + dst_pmdval, dst_ptl, src_ptl, src_folio); } else { entry = pte_to_swp_entry(orig_src_pte); if (non_swap_entry(entry)) { @@ -1319,10 +1328,9 @@ retry: goto out; } - err = move_swap_pte(mm, dst_addr, src_addr, - dst_pte, src_pte, - orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl); + err = move_swap_pte(mm, dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, + dst_pmdval, dst_ptl, src_ptl); } out: @@ -1476,14 +1484,17 @@ static int uffd_move_lock(struct mm_struct *mm, mmap_read_lock(mm); err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); if (!err) { - /* - * See comment in uffd_lock_vma() as to why not using - * vma_start_read() here. - */ - down_read(&(*dst_vmap)->vm_lock->lock); - if (*dst_vmap != *src_vmap) - down_read_nested(&(*src_vmap)->vm_lock->lock, - SINGLE_DEPTH_NESTING); + if (vma_start_read_locked(*dst_vmap)) { + if (*dst_vmap != *src_vmap) { + if (!vma_start_read_locked_nested(*src_vmap, + SINGLE_DEPTH_NESTING)) { + vma_end_read(*dst_vmap); + err = -EAGAIN; + } + } + } else { + err = -EAGAIN; + } } mmap_read_unlock(mm); return err; diff --git a/mm/util.c b/mm/util.c index 60aa40f612b8..b6b9684a1438 100644 --- a/mm/util.c +++ b/mm/util.c @@ -582,6 +582,23 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, return ret; } +/* + * Perform a userland memory mapping into the current process address space. See + * the comment for do_mmap() for more details on this operation in general. + * + * This differs from do_mmap() in that: + * + * a. An offset parameter is provided rather than pgoff, which is both checked + * for overflow and page alignment. + * b. mmap locking is performed on the caller's behalf. + * c. Userfaultfd unmap events and memory population are handled. + * + * This means that this function performs essentially the same work as if + * userland were invoking mmap (2). + * + * Returns either an error, or the address at which the requested mapping has + * been performed. 
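[Illustrative aside, not part of the diff] The comment above likens vm_mmap() to a userland mmap(2) invocation. A minimal, self-contained userspace example of that equivalent operation is shown below; it uses only the standard mmap(2)/munmap(2) interfaces.

        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>

        int main(void)
        {
                size_t len = 4096;

                /* The userland call whose in-kernel counterpart vm_mmap() provides. */
                void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                if (p == MAP_FAILED) {
                        perror("mmap");
                        return 1;
                }

                strcpy(p, "hello");
                printf("%s at %p\n", (char *)p, p);

                return munmap(p, len);
        }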
+ */ unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) diff --git a/mm/vma.c b/mm/vma.c index bb2119e5a0d0..0a5158d611e3 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -202,6 +202,38 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, flush_dcache_mmap_unlock(mapping); } +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_lock and by + * the root anon_vma's mutex. + */ +static void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct @@ -295,7 +327,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, if (vp->remove) { again: - vma_mark_detached(vp->remove, true); + vma_mark_detached(vp->remove); if (vp->file) { uprobe_munmap(vp->remove, vp->remove->vm_start, vp->remove->vm_end); @@ -374,17 +406,14 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg, /* * Close a vm structure and free it. */ -void remove_vma(struct vm_area_struct *vma, bool unreachable) +void remove_vma(struct vm_area_struct *vma) { might_sleep(); vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - if (unreachable) - __vm_area_free(vma); - else - vm_area_free(vma); + vm_area_free(vma); } /* @@ -398,7 +427,6 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; - lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, @@ -415,8 +443,9 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ -static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static __must_check int +__split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; @@ -510,38 +539,6 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } -/* - * vma has some anon_vma assigned, and is already inserted on that - * anon_vma's interval trees. - * - * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the - * vma must be removed from the anon_vma's interval trees using - * anon_vma_interval_tree_pre_update_vma(). - * - * After the update, the vma will be reinserted using - * anon_vma_interval_tree_post_update_vma(). 
- * - * The entire update must be protected by exclusive mmap_lock and by - * the root anon_vma's mutex. - */ -void -anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); -} - -void -anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); -} - /* * dup_anon_vma() - Helper function to duplicate anon_vma * @dst: The destination VMA @@ -660,14 +657,14 @@ static int commit_merge(struct vma_merge_struct *vmg, vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff); if (expanded) - vma_iter_store(vmg->vmi, vmg->vma); + vma_iter_store_attached(vmg->vmi, vmg->vma); if (adj_start) { adjust->vm_start += adj_start; adjust->vm_pgoff += PHYS_PFN(adj_start); if (adj_start < 0) { WARN_ON(expanded); - vma_iter_store(vmg->vmi, adjust); + vma_iter_store_attached(vmg->vmi, adjust); } } @@ -710,7 +707,8 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end). */ -static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg) +static __must_check struct vm_area_struct *vma_merge_existing_range( + struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->vma; struct vm_area_struct *prev = vmg->prev; @@ -728,19 +726,20 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * bool expanded; mmap_assert_write_locked(vmg->mm); - VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */ - VM_WARN_ON(vmg->next); /* We set this. */ - VM_WARN_ON(prev && start <= prev->vm_start); - VM_WARN_ON(start >= end); + VM_WARN_ON_VMG(!vma, vmg); /* We are modifying a VMA, so caller must specify. */ + VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */ + VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg); + VM_WARN_ON_VMG(start >= end, vmg); + /* * If vma == prev, then we are offset into a VMA. Otherwise, if we are * not, we must span a portion of the VMA. */ - VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) || - vmg->end > vma->vm_end)); + VM_WARN_ON_VMG(vma && ((vma != prev && vmg->start != vma->vm_start) || + vmg->end > vma->vm_end), vmg); /* The vmi must be positioned within vmg->vma. */ - VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && - vma_iter_addr(vmg->vmi) < vma->vm_end)); + VM_WARN_ON_VMG(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && + vma_iter_addr(vmg->vmi) < vma->vm_end), vmg); vmg->state = VMA_MERGE_NOMERGE; @@ -857,9 +856,9 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); - VM_WARN_ON(!merge_right); + VM_WARN_ON_VMG(!merge_right, vmg); /* If we are offset into a VMA, then prev must be vma. 
*/ - VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev); + VM_WARN_ON_VMG(vmg->start > vma->vm_start && prev && vma != prev, vmg); if (merge_will_delete_vma) { vmg->vma = next; @@ -971,9 +970,9 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND; mmap_assert_write_locked(vmg->mm); - VM_WARN_ON(vmg->vma); + VM_WARN_ON_VMG(vmg->vma, vmg); /* vmi must point at or before the gap. */ - VM_WARN_ON(vma_iter_addr(vmg->vmi) > end); + VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; @@ -1055,7 +1054,7 @@ int vma_expand(struct vma_merge_struct *vmg) remove_next = true; /* This should already have been checked by this point. */ - VM_WARN_ON(!can_merge_remove_vma(next)); + VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); vma_start_write(next); ret = dup_anon_vma(vma, next, &anon_dup); if (ret) @@ -1063,10 +1062,10 @@ int vma_expand(struct vma_merge_struct *vmg) } /* Not merging but overwriting any part of next is not handled. */ - VM_WARN_ON(next && !remove_next && - next != vma && vmg->end > next->vm_start); + VM_WARN_ON_VMG(next && !remove_next && + next != vma && vmg->end > next->vm_start, vmg); /* Only handles expanding */ - VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end); + VM_WARN_ON_VMG(vma->vm_start < vmg->start || vma->vm_end > vmg->end, vmg); if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true)) goto nomem; @@ -1130,7 +1129,6 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, * were isolated before we downgraded mmap_lock. */ mas_set(mas_detach, 1); - lru_add_drain(); tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, @@ -1198,7 +1196,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) - remove_vma(vma, /* unreachable = */ false); + remove_vma(vma); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); @@ -1220,7 +1218,7 @@ static void reattach_vmas(struct ma_state *mas_detach) mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) - vma_mark_detached(vma, false); + vma_mark_attached(vma); __mt_destroy(mas_detach->tree); } @@ -1295,7 +1293,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, if (error) goto munmap_gather_failed; - vma_mark_detached(next, true); + vma_mark_detached(next); nrpages = vma_pages(next); vms->nr_pages += nrpages; @@ -2430,7 +2428,7 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } -unsigned long __mmap_region(struct file *file, unsigned long addr, +static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { @@ -2481,3 +2479,476 @@ abort_munmap: vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } + +/** + * mmap_region() - Actually perform the userland mapping of a VMA into + * current->mm with known, aligned and overflow-checked @addr and @len, and + * correctly determined VMA flags @vm_flags and page offset @pgoff. + * + * This is an internal memory management function, and should not be used + * directly. + * + * The caller must write-lock current->mm->mmap_lock. + * + * @file: If a file-backed mapping, a pointer to the struct file describing the + * file to be mapped, otherwise NULL. 
+ * @addr: The page-aligned address at which to perform the mapping. + * @len: The page-aligned, non-zero, length of the mapping. + * @vm_flags: The VMA flags which should be applied to the mapping. + * @pgoff: If @file is specified, the page offset into the file, if not then + * the virtual page offset in memory of the anonymous mapping. + * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap + * events. + * + * Returns: Either an error, or the address at which the requested mapping has + * been performed. + */ +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + unsigned long ret; + bool writable_file_mapping = false; + + mmap_assert_write_locked(current->mm); + + /* Check to see if MDWE is applicable. */ + if (map_deny_write_exec(vm_flags, vm_flags)) + return -EACCES; + + /* Allow architectures to sanity-check the vm_flags. */ + if (!arch_validate_flags(vm_flags)) + return -EINVAL; + + /* Map writable and ensure this isn't a sealed memfd. */ + if (file && is_shared_maywrite(vm_flags)) { + int error = mapping_map_writable(file->f_mapping); + + if (error) + return error; + writable_file_mapping = true; + } + + ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + + /* Clear our write mapping regardless of error. */ + if (writable_file_mapping) + mapping_unmap_writable(file->f_mapping); + + validate_mm(current->mm); + return ret; +} + +/* + * do_brk_flags() - Increase the brk vma if the flags match. + * @vmi: The vma iterator + * @addr: The start address + * @len: The length of the increase + * @vma: The vma, + * @flags: The VMA Flags + * + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags + * do not match then create a new anonymous VMA. Eventually we may be able to + * do some brk-specific accounting here. + */ +int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + + /* + * Check against address space limits by the changed size + * Note: This happens *after* clearing old mappings in some code paths. + */ + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) + return -ENOMEM; + + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) + return -ENOMEM; + + /* + * Expand the existing vma if possible; Note that singular lists do not + * occur after forking, so the expand will only happen on new VMAs. + */ + if (vma && vma->vm_end == addr) { + VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); + + vmg.prev = vma; + /* vmi is positioned at prev, which this mode expects. 
*/ + vmg.merge_flags = VMG_FLAG_JUST_EXPAND; + + if (vma_merge_new_range(&vmg)) + goto out; + else if (vmg_nomem(&vmg)) + goto unacct_fail; + } + + if (vma) + vma_iter_next_range(vmi); + /* create a vma struct for an anonymous mapping */ + vma = vm_area_alloc(mm); + if (!vma) + goto unacct_fail; + + vma_set_anonymous(vma); + vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); + vm_flags_init(vma, flags); + vma->vm_page_prot = vm_get_page_prot(flags); + vma_start_write(vma); + if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) + goto mas_store_fail; + + mm->map_count++; + validate_mm(mm); + ksm_add_vma(vma); +out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; + mm->data_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) + mm->locked_vm += (len >> PAGE_SHIFT); + vm_flags_set(vma, VM_SOFTDIRTY); + return 0; + +mas_store_fail: + vm_area_free(vma); +unacct_fail: + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; +} + +/** + * unmapped_area() - Find an area between the low_limit and the high_limit with + * the correct alignment and offset, all from @info. Note: current->mm is used + * for the search. + * + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. + */ +unsigned long unmapped_area(struct vm_unmapped_area_info *info) +{ + unsigned long length, gap; + unsigned long low_limit, high_limit; + struct vm_area_struct *tmp; + VMA_ITERATOR(vmi, current->mm, 0); + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask + info->start_gap; + if (length < info->length) + return -ENOMEM; + + low_limit = info->low_limit; + if (low_limit < mmap_min_addr) + low_limit = mmap_min_addr; + high_limit = info->high_limit; +retry: + if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) + return -ENOMEM; + + /* + * Adjust for the gap first so it doesn't interfere with the + * later alignment. The first step is the minimum needed to + * fulill the start gap, the next steps is the minimum to align + * that. It is the minimum needed to fulill both. + */ + gap = vma_iter_addr(&vmi) + info->start_gap; + gap += (info->align_offset - gap) & info->align_mask; + tmp = vma_next(&vmi); + if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap + length - 1) { + low_limit = tmp->vm_end; + vma_iter_reset(&vmi); + goto retry; + } + } else { + tmp = vma_prev(&vmi); + if (tmp && vm_end_gap(tmp) > gap) { + low_limit = vm_end_gap(tmp); + vma_iter_reset(&vmi); + goto retry; + } + } + + return gap; +} + +/** + * unmapped_area_topdown() - Find an area between the low_limit and the + * high_limit with the correct alignment and offset at the highest available + * address, all from @info. Note: current->mm is used for the search. + * + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. 
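[Illustrative aside, not part of the diff] The gap-alignment step in unmapped_area() and unmapped_area_topdown() above rounds the candidate address to the requested offset and alignment using mask arithmetic, assuming the mask is a power-of-two alignment minus one. A small standalone check of that arithmetic, using sample values rather than kernel code, is:

        #include <assert.h>
        #include <stdio.h>

        /* Round gap up to the next address congruent to off modulo (mask + 1). */
        static unsigned long align_up(unsigned long gap, unsigned long off,
                                      unsigned long mask)
        {
                return gap + ((off - gap) & mask);
        }

        /* Round gap down to the previous such address (the topdown case). */
        static unsigned long align_down(unsigned long gap, unsigned long off,
                                        unsigned long mask)
        {
                return gap - ((gap - off) & mask);
        }

        int main(void)
        {
                unsigned long mask = 0x1fffff;          /* 2 MiB alignment */

                assert(align_up(0x12345678, 0, mask) == 0x12400000);
                assert(align_down(0x12345678, 0, mask) == 0x12200000);
                /* A non-zero offset keeps the result congruent to that offset. */
                assert(align_up(0x12345678, 0x1000, mask) == 0x12401000);

                printf("alignment arithmetic checks passed\n");
                return 0;
        }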
+ */ +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +{ + unsigned long length, gap, gap_end; + unsigned long low_limit, high_limit; + struct vm_area_struct *tmp; + VMA_ITERATOR(vmi, current->mm, 0); + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask + info->start_gap; + if (length < info->length) + return -ENOMEM; + + low_limit = info->low_limit; + if (low_limit < mmap_min_addr) + low_limit = mmap_min_addr; + high_limit = info->high_limit; +retry: + if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) + return -ENOMEM; + + gap = vma_iter_end(&vmi) - info->length; + gap -= (gap - info->align_offset) & info->align_mask; + gap_end = vma_iter_end(&vmi); + tmp = vma_next(&vmi); + if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap_end) { + high_limit = vm_start_gap(tmp); + vma_iter_reset(&vmi); + goto retry; + } + } else { + tmp = vma_prev(&vmi); + if (tmp && vm_end_gap(tmp) > gap) { + high_limit = tmp->vm_start; + vma_iter_reset(&vmi); + goto retry; + } + } + + return gap; +} + +/* + * Verify that the stack growth is acceptable and + * update accounting. This is shared with both the + * grow-up and grow-down cases. + */ +static int acct_stack_growth(struct vm_area_struct *vma, + unsigned long size, unsigned long grow) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long new_start; + + /* address space limit tests */ + if (!may_expand_vm(mm, vma->vm_flags, grow)) + return -ENOMEM; + + /* Stack limit test */ + if (size > rlimit(RLIMIT_STACK)) + return -ENOMEM; + + /* mlock limit tests */ + if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) + return -ENOMEM; + + /* Check to ensure the stack will not grow into a hugetlb-only region */ + new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : + vma->vm_end - size; + if (is_hugepage_only_range(vma->vm_mm, new_start, size)) + return -EFAULT; + + /* + * Overcommit.. This must be the final test, as it will + * update security statistics. + */ + if (security_vm_enough_memory_mm(mm, grow)) + return -ENOMEM; + + return 0; +} + +#if defined(CONFIG_STACK_GROWSUP) +/* + * PA-RISC uses this for its stack. + * vma is the last one with address > vma->vm_end. Have to extend vma. + */ +int expand_upwards(struct vm_area_struct *vma, unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; + unsigned long gap_addr; + int error = 0; + VMA_ITERATOR(vmi, mm, vma->vm_start); + + if (!(vma->vm_flags & VM_GROWSUP)) + return -EFAULT; + + mmap_assert_write_locked(mm); + + /* Guard against exceeding limits of the address space. */ + address &= PAGE_MASK; + if (address >= (TASK_SIZE & PAGE_MASK)) + return -ENOMEM; + address += PAGE_SIZE; + + /* Enforce stack_guard_gap */ + gap_addr = address + stack_guard_gap; + + /* Guard against overflow */ + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { + if (!(next->vm_flags & VM_GROWSUP)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } + + if (next) + vma_iter_prev_range_limit(&vmi, address); + + vma_iter_config(&vmi, vma->vm_start, address); + if (vma_iter_prealloc(&vmi, vma)) + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. 
*/ + if (unlikely(anon_vma_prepare(vma))) { + vma_iter_free(&vmi); + return -ENOMEM; + } + + /* Lock the VMA before expanding to prevent concurrent page faults */ + vma_start_write(vma); + /* We update the anon VMA tree. */ + anon_vma_lock_write(vma->anon_vma); + + /* Somebody else might have raced and expanded it already */ + if (address > vma->vm_end) { + unsigned long size, grow; + + size = address - vma->vm_start; + grow = (address - vma->vm_end) >> PAGE_SHIFT; + + error = -ENOMEM; + if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, grow); + anon_vma_interval_tree_pre_update_vma(vma); + vma->vm_end = address; + /* Overwrite old entry in mtree. */ + vma_iter_store_attached(&vmi, vma); + anon_vma_interval_tree_post_update_vma(vma); + + perf_event_mmap(vma); + } + } + } + anon_vma_unlock_write(vma->anon_vma); + vma_iter_free(&vmi); + validate_mm(mm); + return error; +} +#endif /* CONFIG_STACK_GROWSUP */ + +/* + * vma is the first one with address < vma->vm_start. Have to extend vma. + * mmap_lock held for writing. + */ +int expand_downwards(struct vm_area_struct *vma, unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *prev; + int error = 0; + VMA_ITERATOR(vmi, mm, vma->vm_start); + + if (!(vma->vm_flags & VM_GROWSDOWN)) + return -EFAULT; + + mmap_assert_write_locked(mm); + + address &= PAGE_MASK; + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; + + /* Enforce stack_guard_gap */ + prev = vma_prev(&vmi); + /* Check that both stack segments have the same anon_vma? */ + if (prev) { + if (!(prev->vm_flags & VM_GROWSDOWN) && + vma_is_accessible(prev) && + (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; + } + + if (prev) + vma_iter_next_range_limit(&vmi, vma->vm_start); + + vma_iter_config(&vmi, address, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) { + vma_iter_free(&vmi); + return -ENOMEM; + } + + /* Lock the VMA before expanding to prevent concurrent page faults */ + vma_start_write(vma); + /* We update the anon VMA tree. */ + anon_vma_lock_write(vma->anon_vma); + + /* Somebody else might have raced and expanded it already */ + if (address < vma->vm_start) { + unsigned long size, grow; + + size = vma->vm_end - address; + grow = (vma->vm_start - address) >> PAGE_SHIFT; + + error = -ENOMEM; + if (grow <= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, grow); + anon_vma_interval_tree_pre_update_vma(vma); + vma->vm_start = address; + vma->vm_pgoff -= grow; + /* Overwrite old entry in mtree. 
*/ + vma_iter_store_attached(&vmi, vma); + anon_vma_interval_tree_post_update_vma(vma); + + perf_event_mmap(vma); + } + } + } + anon_vma_unlock_write(vma->anon_vma); + vma_iter_free(&vmi); + validate_mm(mm); + return error; +} + +int __vm_munmap(unsigned long start, size_t len, bool unlock) +{ + int ret; + struct mm_struct *mm = current->mm; + LIST_HEAD(uf); + VMA_ITERATOR(vmi, mm, start); + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); + if (ret || !unlock) + mmap_write_unlock(mm); + + userfaultfd_unmap_complete(mm, &uf); + return ret; +} diff --git a/mm/vma.h b/mm/vma.h index 388d34748674..f51005b95b39 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -139,15 +139,10 @@ void validate_mm(struct mm_struct *mm); #define validate_mm(mm) do { } while (0) #endif -/* Required for expand_downwards(). */ -void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); - -/* Required for expand_downwards(). */ -void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); - -int vma_expand(struct vma_merge_struct *vmg); -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff); +__must_check int vma_expand(struct vma_merge_struct *vmg); +__must_check int vma_shrink(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) @@ -162,6 +157,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; + vma_mark_attached(vma); return 0; } @@ -174,19 +170,20 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); -void remove_vma(struct vm_area_struct *vma, bool unreachable); +void remove_vma(struct vm_area_struct *vma); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /* We are about to modify the VMA's flags. */ -struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, +__must_check struct vm_area_struct +*vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags); /* We are about to modify the VMA's flags and/or anon_name. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -196,7 +193,7 @@ struct vm_area_struct struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -204,7 +201,7 @@ struct vm_area_struct struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. 
*/ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -212,11 +209,13 @@ struct vm_area_struct unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx); -struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); +__must_check struct vm_area_struct +*vma_merge_new_range(struct vma_merge_struct *vmg); -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, - struct vm_area_struct *vma, - unsigned long delta); +__must_check struct vm_area_struct +*vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); @@ -243,10 +242,16 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); -unsigned long __mmap_region(struct file *file, unsigned long addr, +unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); +int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, + unsigned long addr, unsigned long request, unsigned long flags); + +unsigned long unmapped_area(struct vm_unmapped_area_info *info); +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); + static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* @@ -360,9 +365,10 @@ static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) } /* Store a VMA with preallocated memory */ -static inline void vma_iter_store(struct vma_iterator *vmi, - struct vm_area_struct *vma) +static inline void vma_iter_store_attached(struct vma_iterator *vmi, + struct vm_area_struct *vma) { + vma_assert_attached(vma); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && @@ -387,6 +393,13 @@ static inline void vma_iter_store(struct vma_iterator *vmi, mas_store_prealloc(&vmi->mas, vma); } +static inline void vma_iter_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + vma_mark_attached(vma); + vma_iter_store_attached(vmi, vma); +} + static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; @@ -472,4 +485,12 @@ static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) #endif +#if defined(CONFIG_STACK_GROWSUP) +int expand_upwards(struct vm_area_struct *vma, unsigned long address); +#endif + +int expand_downwards(struct vm_area_struct *vma, unsigned long address); + +int __vm_munmap(unsigned long start, size_t len, bool unlock); + #endif /* __MM_VMA_H */ diff --git a/mm/vma_internal.h b/mm/vma_internal.h index fc5f172a36bd..2f05735ff190 100644 --- a/mm/vma_internal.h +++ b/mm/vma_internal.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5c88d0e90c20..a6e7acebe9ad 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3562,11 +3562,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. 
*/ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy_noprof(gfp, + nr = alloc_pages_bulk_mempolicy_noprof(gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node_noprof(gfp, nid, + nr = alloc_pages_bulk_node_noprof(gfp, nid, nr_pages_request, pages + nr_allocated); diff --git a/mm/vmscan.c b/mm/vmscan.c index 49f801b00d5d..01dce6f26ed3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -862,6 +862,31 @@ enum folio_references { FOLIOREF_ACTIVATE, }; +#ifdef CONFIG_LRU_GEN +/* + * Only used on a mapped folio in the eviction (rmap walk) path, where promotion + * needs to be done by taking the folio off the LRU list and then adding it back + * with PG_active set. In contrast, the aging (page table walk) path uses + * folio_update_gen(). + */ +static bool lru_gen_set_refs(struct folio *folio) +{ + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return false; + } + + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); + return true; +} +#else +static bool lru_gen_set_refs(struct folio *folio) +{ + return false; +} +#endif /* CONFIG_LRU_GEN */ + static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { @@ -870,7 +895,6 @@ static enum folio_references folio_check_references(struct folio *folio, referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); - referenced_folio = folio_test_clear_referenced(folio); /* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. @@ -888,6 +912,15 @@ static enum folio_references folio_check_references(struct folio *folio, if (referenced_ptes == -1) return FOLIOREF_KEEP; + if (lru_gen_enabled()) { + if (!referenced_ptes) + return FOLIOREF_RECLAIM; + + return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP; + } + + referenced_folio = folio_test_clear_referenced(folio); + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -1092,11 +1125,6 @@ retry: if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; - /* folio_update_gen() tried to promote this page? */ - if (lru_gen_enabled() && !ignore_references && - folio_mapped(folio) && folio_test_referenced(folio)) - goto keep_locked; - /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -1137,8 +1165,9 @@ retry: * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, - * not to fs). In this case mark the folio for immediate - * reclaim and continue scanning. + * not to fs), or the writeback may take an indeterminate + * amount of time to complete. In this case mark the folio + * for immediate reclaim and continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might @@ -1163,6 +1192,8 @@ retry: * takes to write them to disk. 
*/ if (folio_test_writeback(folio)) { + mapping = folio_mapping(folio); + /* Case 1 above */ if (current_is_kswapd() && folio_test_reclaim(folio) && @@ -1173,7 +1204,8 @@ retry: /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || - !may_enter_fs(folio, sc->gfp_mask)) { + !may_enter_fs(folio, sc->gfp_mask) || + (mapping && mapping_writeback_indeterminate(mapping))) { /* * This is slightly racy - * folio_end_writeback() might have @@ -2624,11 +2656,17 @@ static bool should_clear_pmd_young(void) READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ } +#define evictable_min_seq(min_seq, swappiness) \ + min((min_seq)[!(swappiness)], (min_seq)[(swappiness) != MAX_SWAPPINESS]) + #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) +#define for_each_evictable_type(type, swappiness) \ + for ((type) = !(swappiness); (type) <= ((swappiness) != MAX_SWAPPINESS); (type)++) + #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) @@ -2674,10 +2712,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type) static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { - /* see the comment on lru_gen_folio */ - return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && - get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && - get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + int n = get_nr_gens(lruvec, type); + + if (n < MIN_NR_GENS || n > MAX_NR_GENS) + return false; + } + + return true; } /****************************************************************************** @@ -3078,16 +3122,20 @@ struct ctrl_pos { static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { + int i; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); - pos->refaulted = lrugen->avg_refaulted[type][tier] + - atomic_long_read(&lrugen->refaulted[hist][type][tier]); - pos->total = lrugen->avg_total[type][tier] + - atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - pos->total += lrugen->protected[hist][type][tier - 1]; pos->gain = gain; + pos->refaulted = pos->total = 0; + + for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) { + pos->refaulted += lrugen->avg_refaulted[type][i] + + atomic_long_read(&lrugen->refaulted[hist][type][i]); + pos->total += lrugen->avg_total[type][i] + + lrugen->protected[hist][type][i] + + atomic_long_read(&lrugen->evicted[hist][type][i]); + } } static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) @@ -3113,17 +3161,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); sum = lrugen->avg_total[type][tier] + + lrugen->protected[hist][type][tier] + atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - sum += lrugen->protected[hist][type][tier - 1]; WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); } if (clear) { atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); atomic_long_set(&lrugen->evicted[hist][type][tier], 0); - if (tier) - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + WRITE_ONCE(lrugen->protected[hist][type][tier], 0); } } } @@ -3150,16 +3196,19 @@ static int folio_update_gen(struct folio *folio, int gen) 
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return -1; + } + do { /* lru_gen_del_folio() has isolated this page? */ - if (!(old_flags & LRU_GEN_MASK)) { - /* for shrink_folio_list() */ - new_flags = old_flags | BIT(PG_referenced); - continue; - } + if (!(old_flags & LRU_GEN_MASK)) + return -1; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); - new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); + new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; @@ -3183,7 +3232,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai new_gen = (old_gen + 1) % MAX_NR_GENS; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; /* for folio_end_writeback() */ if (reclaiming) @@ -3258,7 +3307,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (vma_is_anonymous(vma)) - return !walk->can_swap; + return !walk->swappiness; if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) return true; @@ -3268,7 +3317,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (shmem_mapping(mapping)) - return !walk->can_swap; + return !walk->swappiness; + + if (walk->swappiness == MAX_SWAPPINESS) + return true; /* to exclude special mappings like dax, etc. 
*/ return !mapping->a_ops->read_folio; @@ -3356,21 +3408,19 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned } static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, - struct pglist_data *pgdat, bool can_swap) + struct pglist_data *pgdat) { - struct folio *folio; + struct folio *folio = pfn_folio(pfn); + + if (folio_lru_gen(folio) < 0) + return NULL; - folio = pfn_folio(pfn); if (folio_nid(folio) != pgdat->node_id) return NULL; if (folio_memcg(folio) != memcg) return NULL; - /* file VMAs can contain anon pages from COW */ - if (!folio_is_file_lru(folio) && !can_swap) - return NULL; - return folio; } @@ -3382,29 +3432,55 @@ static bool suitable_to_scan(int total, int young) return young * n >= total; } +static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio, + int new_gen, bool dirty) +{ + int old_gen; + + if (!folio) + return; + + if (dirty && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + if (walk) { + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + } else if (lru_gen_set_refs(folio)) { + old_gen = folio_lru_gen(folio); + if (old_gen >= 0 && old_gen != new_gen) + folio_activate(folio); + } +} + static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *args) { int i; + bool dirty; pte_t *pte; spinlock_t *ptl; unsigned long addr; int total = 0; int young = 0; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); pmd_t pmdval; - pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, - &ptl); + pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); if (!pte) return false; + if (!spin_trylock(ptl)) { pte_unmap(pte); - return false; + return true; } if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { @@ -3426,26 +3502,30 @@ restart: if (pfn == -1) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (!ptep_clear_young_notify(args->vma, addr, pte + i)) continue; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } + + if (pte_dirty(ptent)) + dirty = true; + young++; walk->mm_stats[MM_LEAF_YOUNG]++; - - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); - - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); } + walk_update_folio(walk, last, gen, dirty); + last = NULL; + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; @@ -3459,13 +3539,15 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; + bool dirty; pmd_t *pmd; spinlock_t *ptl; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = 
lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3511,27 +3593,30 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (pfn == -1) goto next; - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) goto next; if (!pmdp_clear_young_notify(vma, addr, pmd + i)) goto next; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } + + if (pmd_dirty(pmd[i])) + dirty = true; + walk->mm_stats[MM_LEAF_YOUNG]++; - - if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); - - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); next: i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; } while (i <= MIN_LRU_BATCH); + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: @@ -3723,22 +3808,25 @@ static void clear_mm_walk(void) kfree(walk); } -static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) +static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) { int zone; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - if (type == LRU_GEN_ANON && !can_swap) + if (type ? swappiness == MAX_SWAPPINESS : !swappiness) goto done; - /* prevent cold/hot inversion if force_scan is true */ + /* prevent cold/hot inversion if the type is evictable */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); + int refs = folio_lru_refs(folio); + bool workingset = folio_test_workingset(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); @@ -3748,6 +3836,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) new_gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int tier = lru_tier_from_refs(refs, workingset); + int delta = folio_nr_pages(folio); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } + if (!--remaining) return false; } @@ -3759,7 +3856,7 @@ done: return true; } -static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; bool success = false; @@ -3769,7 +3866,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); /* find the oldest populated generation */ - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { gen = lru_gen_from_seq(min_seq[type]); @@ -3785,13 +3882,17 @@ next: } /* see the comment on lru_gen_folio */ - if (can_swap) { - min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], 
min_seq[LRU_GEN_FILE]); - min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); + if (swappiness && swappiness != MAX_SWAPPINESS) { + unsigned long seq = lrugen->max_seq - MIN_NR_GENS; + + if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq) + min_seq[LRU_GEN_ANON] = seq; + else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq) + min_seq[LRU_GEN_FILE] = seq; } - for (type = !can_swap; type < ANON_AND_FILE; type++) { - if (min_seq[type] == lrugen->min_seq[type]) + for_each_evictable_type(type, swappiness) { + if (min_seq[type] <= lrugen->min_seq[type]) continue; reset_ctrl_pos(lruvec, type, true); @@ -3802,8 +3903,7 @@ next: return success; } -static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) +static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) { bool success; int prev, next; @@ -3821,13 +3921,11 @@ restart: if (!success) goto unlock; - for (type = ANON_AND_FILE - 1; type >= 0; type--) { + for (type = 0; type < ANON_AND_FILE; type++) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; - VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); - - if (inc_min_seq(lruvec, type, can_swap)) + if (inc_min_seq(lruvec, type, swappiness)) continue; spin_unlock_irq(&lruvec->lru_lock); @@ -3871,7 +3969,7 @@ unlock: } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) + int swappiness, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; @@ -3882,7 +3980,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); if (!mm_state) - return inc_max_seq(lruvec, seq, can_swap, force_scan); + return inc_max_seq(lruvec, seq, swappiness); /* see the comment in iterate_mm_list() */ if (seq <= READ_ONCE(mm_state->seq)) @@ -3907,7 +4005,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, walk->lruvec = lruvec; walk->seq = seq; - walk->can_swap = can_swap; + walk->swappiness = swappiness; walk->force_scan = force_scan; do { @@ -3917,7 +4015,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, } while (mm); done: if (success) { - success = inc_max_seq(lruvec, seq, can_swap, force_scan); + success = inc_max_seq(lruvec, seq, swappiness); WARN_ON_ONCE(!success); } @@ -3958,13 +4056,13 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; unsigned long total = 0; - bool can_swap = get_swappiness(lruvec, sc); + int swappiness = get_swappiness(lruvec, sc); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { @@ -3984,6 +4082,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc { int gen; unsigned long birth; + int swappiness = get_swappiness(lruvec, sc); struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); @@ -3993,8 +4092,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc if (!lruvec_is_sizable(lruvec, sc)) return false; - /* see the comment on lru_gen_folio */ - gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness)); birth = 
READ_ONCE(lruvec->lrugen.timestamps[gen]); return time_is_before_jiffies(birth + min_ttl); @@ -4053,21 +4151,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; + bool dirty; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; + struct folio *last = NULL; int young = 1; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); - bool can_swap = !folio_is_file_lru(folio); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); DEFINE_MAX_SEQ(lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); @@ -4114,37 +4213,28 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (pfn == -1) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (!ptep_clear_young_notify(vma, addr, pte + i)) continue; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } + + if (pte_dirty(ptent)) + dirty = true; + young++; - - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); - - if (walk) { - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); - - continue; - } - - old_gen = folio_lru_gen(folio); - if (old_gen < 0) - folio_set_referenced(folio); - else if (old_gen != new_gen) { - folio_clear_lru_refs(folio); - folio_activate(folio); - } } + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); /* feedback from rmap walkers to page table walkers */ @@ -4302,7 +4392,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); @@ -4324,14 +4415,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* protected */ - if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) { - int hist = lru_hist_from_seq(lrugen->min_seq[type]); - + if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], - lrugen->protected[hist][type][tier - 1] + delta); + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } return true; } @@ -4351,8 +4445,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* waiting for writeback */ - if 
(folio_test_locked(folio) || writeback || - (type == LRU_GEN_FILE && dirty)) { + if (writeback || (type == LRU_GEN_FILE && dirty)) { gen = folio_inc_gen(lruvec, folio, true); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; @@ -4381,13 +4474,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca return false; } - /* see the comment on MAX_NR_TIERS */ + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - folio_clear_lru_refs(folio); + set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); - folio_clear_referenced(folio); success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); @@ -4483,13 +4575,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type) struct ctrl_pos sp, pv; /* - * To leave a margin for fluctuations, use a larger gain factor (1:2). + * To leave a margin for fluctuations, use a larger gain factor (2:3). * This value is chosen because any other tier would have at least twice * as many refaults as the first tier. */ - read_ctrl_pos(lruvec, type, 0, 1, &sp); + read_ctrl_pos(lruvec, type, 0, 2, &sp); for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, 2, &pv); + read_ctrl_pos(lruvec, type, tier, 3, &pv); if (!positive_ctrl_err(&sp, &pv)) break; } @@ -4497,79 +4589,45 @@ static int get_tier_idx(struct lruvec *lruvec, int type) return tier - 1; } -static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +static int get_type_to_scan(struct lruvec *lruvec, int swappiness) { - int type, tier; struct ctrl_pos sp, pv; - int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness }; + if (!swappiness) + return LRU_GEN_FILE; + + if (swappiness == MAX_SWAPPINESS) + return LRU_GEN_ANON; /* - * Compare the first tier of anon with that of file to determine which - * type to scan. Also need to compare other tiers of the selected type - * with the first tier of the other type to determine the last tier (of - * the selected type) to evict. + * Compare the sum of all tiers of anon with that of file to determine + * which type to scan. */ - read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); - read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); - type = positive_ctrl_err(&sp, &pv); + read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv); - read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); - for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, gain[type], &pv); - if (!positive_ctrl_err(&sp, &pv)) - break; - } - - *tier_idx = tier - 1; - - return type; + return positive_ctrl_err(&sp, &pv); } static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; - int type; - int scanned; - int tier = -1; - DEFINE_MIN_SEQ(lruvec); + int type = get_type_to_scan(lruvec, swappiness); - /* - * Try to make the obvious choice first, and if anon and file are both - * available from the same generation, - * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon - * first. - * 2. If !__GFP_IO, file first since clean pagecache is more likely to - * exist than clean swapcache. 
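The special cases enumerated in this removed comment collapse into get_type_to_scan(), which keeps only the swappiness extremes as shortcuts and otherwise weighs the two LRU types' refault feedback against each other. A minimal userspace sketch of that rule follows; the counters, the plain cross-multiplication standing in for positive_ctrl_err(), and the MAX_SWAPPINESS value are illustrative assumptions, not taken from this hunk.

#include <stdio.h>

#define MAX_SWAPPINESS	200	/* illustrative; matches the kernel value at the time of writing */
#define LRU_GEN_ANON	0
#define LRU_GEN_FILE	1

/*
 * Reduced model of get_type_to_scan(): swappiness 0 scans file only,
 * MAX_SWAPPINESS scans anon only, anything in between scans the type whose
 * weighted refault rate is lower.  The kernel compares struct ctrl_pos
 * values via positive_ctrl_err(); cross-multiplication keeps this sketch
 * self-contained.
 */
static int pick_type(int swappiness,
		     unsigned long long anon_refaulted, unsigned long long anon_total,
		     unsigned long long file_refaulted, unsigned long long file_total)
{
	if (!swappiness)
		return LRU_GEN_FILE;
	if (swappiness == MAX_SWAPPINESS)
		return LRU_GEN_ANON;

	/* a higher swappiness tilts the comparison toward scanning anon */
	if (file_refaulted * anon_total * swappiness <=
	    anon_refaulted * file_total * (MAX_SWAPPINESS - swappiness))
		return LRU_GEN_FILE;

	return LRU_GEN_ANON;
}

int main(void)
{
	/* illustrative numbers: file refaults far less than anon */
	printf("scan type: %d\n", pick_type(60, 400, 20000, 100, 40000));
	/* prints "scan type: 1", i.e. LRU_GEN_FILE */
	return 0;
}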
- */ - if (!swappiness) - type = LRU_GEN_FILE; - else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) - type = LRU_GEN_ANON; - else if (swappiness == 1) - type = LRU_GEN_FILE; - else if (swappiness == MAX_SWAPPINESS) - type = LRU_GEN_ANON; - else if (!(sc->gfp_mask & __GFP_IO)) - type = LRU_GEN_FILE; - else - type = get_type_to_scan(lruvec, swappiness, &tier); + for_each_evictable_type(i, swappiness) { + int scanned; + int tier = get_tier_idx(lruvec, type); - for (i = !swappiness; i < ANON_AND_FILE; i++) { - if (tier < 0) - tier = get_tier_idx(lruvec, type); + *type_scanned = type; scanned = scan_folios(lruvec, sc, type, tier, list); if (scanned) - break; + return scanned; type = !type; - tier = -1; } - *type_scanned = type; - - return scanned; + return 0; } static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) @@ -4585,6 +4643,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -4594,7 +4653,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap scanned += try_to_inc_min_seq(lruvec, swappiness); - if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) + if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; spin_unlock_irq(&lruvec->lru_lock); @@ -4610,31 +4669,24 @@ retry: type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { + DEFINE_MIN_SEQ(lruvec); + if (!folio_evictable(folio)) { list_del(&folio->lru); folio_putback_lru(folio); continue; } - if (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio))) { - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ - if (folio_test_workingset(folio)) - folio_set_referenced(folio); - continue; - } - - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || - folio_mapped(folio) || folio_test_locked(folio) || - folio_test_dirty(folio) || folio_test_writeback(folio)) { - /* don't add rejected folios to the oldest generation */ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, - BIT(PG_active)); - continue; - } - /* retry folios that may have missed folio_rotate_reclaimable() */ - list_move(&folio->lru, &clean); + if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) && + !folio_test_dirty(folio) && !folio_test_writeback(folio)) { + list_move(&folio->lru, &clean); + continue; + } + + /* don't add rejected folios to the oldest generation */ + if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); @@ -4671,63 +4723,32 @@ retry: } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - bool can_swap, unsigned long *nr_to_scan) + int swappiness, unsigned long *nr_to_scan) { int gen, type, zone; - unsigned long old = 0; - unsigned long young = 0; - unsigned long total = 0; + unsigned long size = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); - /* whether this lruvec is completely out of cold folios */ - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { - *nr_to_scan = 0; + *nr_to_scan = 0; + /* have to run aging, since eviction is not possible anymore */ + if (evictable_min_seq(min_seq, 
swappiness) + MIN_NR_GENS > max_seq) return true; - } - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { - unsigned long size = 0; - gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - - total += size; - if (seq == max_seq) - young += size; - else if (seq + MIN_NR_GENS == max_seq) - old += size; } } - *nr_to_scan = total; - - /* - * The aging tries to be lazy to reduce the overhead, while the eviction - * stalls when the number of generations reaches MIN_NR_GENS. Hence, the - * ideal number of generations is MIN_NR_GENS+1. - */ - if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) - return false; - - /* - * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) - * of the total number of pages for each generation. A reasonable range - * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The - * aging cares about the upper bound of hot pages, while the eviction - * cares about the lower bound of cold pages. - */ - if (young * MIN_NR_GENS > total) - return true; - if (old * (MIN_NR_GENS + 2) < total) - return true; - - return false; + *nr_to_scan = size; + /* better to run aging even though eviction is still possible */ + return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; } /* @@ -4735,7 +4756,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ -static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { bool success; unsigned long nr_to_scan; @@ -4745,7 +4766,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; - success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan); + success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); /* try to scrape all its memory if this memcg was deleted */ if (nr_to_scan && !mem_cgroup_online(memcg)) @@ -4756,7 +4777,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool return nr_to_scan >> sc->priority; /* stop scanning this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0; + return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? 
-1 : 0; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) @@ -5300,8 +5321,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, s = "rep"; n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); + n[2] = READ_ONCE(lrugen->protected[hist][type][tier]); } for (i = 0; i < 3; i++) @@ -5356,7 +5376,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) seq_printf(m, " node %5d\n", nid); if (!full) - seq = min_seq[LRU_GEN_ANON]; + seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2); else if (max_seq >= MAX_NR_GENS) seq = max_seq - MAX_NR_GENS + 1; else @@ -5396,23 +5416,14 @@ static const struct seq_operations lru_gen_seq_ops = { }; static int run_aging(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) + int swappiness, bool force_scan) { DEFINE_MAX_SEQ(lruvec); - DEFINE_MIN_SEQ(lruvec); - - if (seq < max_seq) - return 0; if (seq > max_seq) return -EINVAL; - if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) - return -ERANGE; - - try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan); - - return 0; + return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST; } static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, @@ -5428,7 +5439,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co while (!signal_pending(current)) { DEFINE_MIN_SEQ(lruvec); - if (seq < min_seq[!swappiness]) + if (seq < evictable_min_seq(min_seq, swappiness)) return 0; if (sc->nr_reclaimed >= nr_to_reclaim) diff --git a/mm/workingset.c b/mm/workingset.c index a4705e196545..4841ae8af411 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -239,7 +239,8 @@ static void *lru_gen_eviction(struct folio *folio) int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); @@ -253,18 +254,18 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); - return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); } /* * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. 
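The rewritten lru_gen_test_recent() below stops comparing the shadow token against one type's min_seq and instead asks whether the token's sequence part lies within MAX_NR_GENS of the current max_seq. A self-contained model of that check, with placeholder constants rather than the kernel's config-derived values:

#include <stdbool.h>
#include <stdio.h>

/* placeholder widths for the sketch, assuming a 64-bit shadow entry */
#define LRU_REFS_WIDTH	2
#define MAX_NR_GENS	4UL
#define EVICTION_MASK	((1UL << 36) - 1)

static unsigned long abs_diff(unsigned long a, unsigned long b)
{
	return a > b ? a - b : b - a;
}

/*
 * A shadow token packs (sequence << LRU_REFS_WIDTH | refs).  The entry is
 * considered recent if the generation it was evicted from is still within
 * MAX_NR_GENS of max_seq, i.e. it has not been recycled yet.
 */
static bool token_is_recent(unsigned long token, unsigned long max_seq)
{
	max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;

	return abs_diff(max_seq, token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
}

int main(void)
{
	unsigned long max_seq = 42;
	unsigned long token = (40UL << LRU_REFS_WIDTH) | 1;	/* evicted at seq 40 */

	printf("recent: %d\n", token_is_recent(token, max_seq));	/* prints 1 */
	return 0;
}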
*/ -static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, +static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { int memcg_id; - unsigned long min_seq; + unsigned long max_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; @@ -273,8 +274,10 @@ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, memcg = mem_cgroup_from_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); - min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]); - return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)); + max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); + max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH; + + return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; } static void lru_gen_refault(struct folio *folio, void *shadow) @@ -290,7 +293,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) rcu_read_lock(); - recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset); + recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset); if (lruvec != folio_lruvec(folio)) goto unlock; @@ -302,24 +305,20 @@ static void lru_gen_refault(struct folio *folio, void *shadow) lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); - /* see the comment in folio_lru_refs() */ - refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; - tier = lru_tier_from_refs(refs); + refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1; + tier = lru_tier_from_refs(refs, workingset); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); - mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); - /* - * Count the following two cases as stalls: - * 1. For pages accessed through page tables, hotter pages pushed out - * hot pages which refaulted immediately. - * 2. For pages accessed multiple times through file descriptors, - * they would have been protected by sort_folio(). - */ - if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) { - set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset)); + /* see folio_add_lru() where folio_set_active() will be called */ + if (lru_gen_in_fault()) + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); + + if (workingset) { + folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); - } + } else + set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } @@ -331,7 +330,7 @@ static void *lru_gen_eviction(struct folio *folio) return NULL; } -static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, +static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { return false; @@ -428,17 +427,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, struct pglist_data *pgdat; unsigned long eviction; - rcu_read_lock(); - if (lru_gen_enabled()) { - bool recent = lru_gen_test_recent(shadow, file, - &eviction_lruvec, &eviction, workingset); + bool recent; + rcu_read_lock(); + recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset); rcu_read_unlock(); return recent; } - + rcu_read_lock(); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; @@ -459,14 +457,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, * configurations instead. 
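The workingset_test_recent() rework a little further down leans on the tryget idiom: pin the memcg with a reference while RCU keeps it alive, then leave the RCU section before any work that may sleep. A generic userspace analogue using C11 atomics, with invented names and none of the RCU machinery:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_uint refs;	/* 0 means the object is being torn down */
};

/* Take a reference only if the object is still live (refs > 0). */
static bool obj_tryget(struct obj *o)
{
	unsigned int old = atomic_load(&o->refs);

	do {
		if (!old)
			return false;	/* already on its way to being freed */
	} while (!atomic_compare_exchange_weak(&o->refs, &old, old + 1));

	return true;
}

int main(void)
{
	struct obj live = { .refs = 1 }, dying = { .refs = 0 };

	/* in the kernel, the lookup plus tryget happens under rcu_read_lock() */
	printf("live:  %d\n", obj_tryget(&live));	/* 1 */
	printf("dying: %d\n", obj_tryget(&dying));	/* 0 */
	return 0;
}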
*/ eviction_memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && - (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) { - rcu_read_unlock(); - return false; - } - + if (!mem_cgroup_tryget(eviction_memcg)) + eviction_memcg = NULL; rcu_read_unlock(); + if (!mem_cgroup_disabled() && !eviction_memcg) + return false; /* * Flush stats (and potentially sleep) outside the RCU read section. * @@ -544,6 +540,8 @@ void workingset_refault(struct folio *folio, void *shadow) bool workingset; long nr; + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; @@ -558,7 +556,6 @@ void workingset_refault(struct folio *folio, void *shadow) * is actually experiencing the refault event. Make sure the folio is * locked to guarantee folio_memcg() stability throughout. */ - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); diff --git a/mm/zpdesc.h b/mm/zpdesc.h new file mode 100644 index 000000000000..fa47fece2237 --- /dev/null +++ b/mm/zpdesc.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* zpdesc.h: zswap.zpool memory descriptor + * + * Written by Alex Shi + * Hyeonggon Yoo <42.hyeyoo@gmail.com> + */ +#ifndef __MM_ZPDESC_H__ +#define __MM_ZPDESC_H__ + +/* + * struct zpdesc - Memory descriptor for zpool memory. + * @flags: Page flags, mostly unused by zsmalloc. + * @lru: Indirectly used by page migration. + * @movable_ops: Used by page migration. + * @next: Next zpdesc in a zspage in zsmalloc zpool. + * @handle: For huge zspage in zsmalloc zpool. + * @zspage: Points to the zspage this zpdesc is a part of. + * @first_obj_offset: First object offset in zsmalloc zpool. + * @_refcount: The number of references to this zpdesc. + * + * This struct overlays struct page for now. Do not modify without a good + * understanding of the issues. In particular, do not expand into the overlap + * with memcg_data. + * + * Page flags used: + * * PG_private identifies the first component page. + * * PG_locked is used by page migration code. + */ +struct zpdesc { + unsigned long flags; + struct list_head lru; + unsigned long movable_ops; + union { + struct zpdesc *next; + unsigned long handle; + }; + struct zspage *zspage; + /* + * Only the lower 24 bits are available for offset, limiting a page + * to 16 MiB. The upper 8 bits are reserved for PGTY_zsmalloc. + * + * Do not access this field directly. + * Instead, use {get,set}_first_obj_offset() helpers. + */ + unsigned int first_obj_offset; + atomic_t _refcount; +}; +#define ZPDESC_MATCH(pg, zp) \ + static_assert(offsetof(struct page, pg) == offsetof(struct zpdesc, zp)) + +ZPDESC_MATCH(flags, flags); +ZPDESC_MATCH(lru, lru); +ZPDESC_MATCH(mapping, movable_ops); +ZPDESC_MATCH(index, next); +ZPDESC_MATCH(index, handle); +ZPDESC_MATCH(private, zspage); +ZPDESC_MATCH(page_type, first_obj_offset); +ZPDESC_MATCH(_refcount, _refcount); +#undef ZPDESC_MATCH +static_assert(sizeof(struct zpdesc) <= sizeof(struct page)); + +/* + * zpdesc_page - The first struct page allocated for a zpdesc + * @zp: The zpdesc. + * + * A convenience wrapper for converting zpdesc to the first struct page of the + * underlying folio, to communicate with code not yet converted to folio or + * struct zpdesc. + * + */ +#define zpdesc_page(zp) (_Generic((zp), \ + const struct zpdesc *: (const struct page *)(zp), \ + struct zpdesc *: (struct page *)(zp))) + +/** + * zpdesc_folio - The folio allocated for a zpdesc + * @zp: The zpdesc. 
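ZPDESC_MATCH() above pins every zpdesc field to the offset of the struct page field it overlays, and the zpdesc_page()/zpdesc_folio() casts use _Generic so constness survives the conversion. Both tricks in a standalone form, with invented struct names standing in for struct page and struct zpdesc:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* the "real" layout being overlaid */
struct base {
	unsigned long flags;
	void *payload;
	int refcount;
};

/* a purpose-specific view of the same memory */
struct view {
	unsigned long flags;
	void *data;
	int refs;
};

/* fail the build if the overlay drifts out of sync with the base layout */
#define VIEW_MATCH(b, v) \
	static_assert(offsetof(struct base, b) == offsetof(struct view, v), \
		      "field mismatch")

VIEW_MATCH(flags, flags);
VIEW_MATCH(payload, data);
VIEW_MATCH(refcount, refs);
static_assert(sizeof(struct view) <= sizeof(struct base), "view too large");

/* _Generic keeps const pointers const across the cast, like zpdesc_page() */
#define view_base(v) (_Generic((v),				\
	const struct view *: (const struct base *)(v),		\
	struct view *: (struct base *)(v)))

int main(void)
{
	struct base b = { .flags = 1, .payload = NULL, .refcount = 2 };
	struct view *v = (struct view *)&b;

	printf("refs via view: %d\n", view_base(v)->refcount);	/* prints 2 */
	return 0;
}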
+ * + * Zpdescs are descriptors for zpool memory. The zpool memory itself is + * allocated as folios that contain the zpool objects, and zpdesc uses specific + * fields in the first struct page of the folio - those fields are now accessed + * by struct zpdesc. + * + * It is occasionally necessary convert to back to a folio in order to + * communicate with the rest of the mm. Please use this helper function + * instead of casting yourself, as the implementation may change in the future. + */ +#define zpdesc_folio(zp) (_Generic((zp), \ + const struct zpdesc *: (const struct folio *)(zp), \ + struct zpdesc *: (struct folio *)(zp))) +/** + * page_zpdesc - Converts from first struct page to zpdesc. + * @p: The first (either head of compound or single) page of zpdesc. + * + * A temporary wrapper to convert struct page to struct zpdesc in situations + * where we know the page is the compound head, or single order-0 page. + * + * Long-term ideally everything would work with struct zpdesc directly or go + * through folio to struct zpdesc. + * + * Return: The zpdesc which contains this page + */ +#define page_zpdesc(p) (_Generic((p), \ + const struct page *: (const struct zpdesc *)(p), \ + struct page *: (struct zpdesc *)(p))) + +static inline void zpdesc_lock(struct zpdesc *zpdesc) +{ + folio_lock(zpdesc_folio(zpdesc)); +} + +static inline bool zpdesc_trylock(struct zpdesc *zpdesc) +{ + return folio_trylock(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_unlock(struct zpdesc *zpdesc) +{ + folio_unlock(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_wait_locked(struct zpdesc *zpdesc) +{ + folio_wait_locked(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_get(struct zpdesc *zpdesc) +{ + folio_get(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_put(struct zpdesc *zpdesc) +{ + folio_put(zpdesc_folio(zpdesc)); +} + +static inline void *kmap_local_zpdesc(struct zpdesc *zpdesc) +{ + return kmap_local_page(zpdesc_page(zpdesc)); +} + +static inline unsigned long zpdesc_pfn(struct zpdesc *zpdesc) +{ + return page_to_pfn(zpdesc_page(zpdesc)); +} + +static inline struct zpdesc *pfn_zpdesc(unsigned long pfn) +{ + return page_zpdesc(pfn_to_page(pfn)); +} + +static inline void __zpdesc_set_movable(struct zpdesc *zpdesc, + const struct movable_operations *mops) +{ + __SetPageMovable(zpdesc_page(zpdesc), mops); +} + +static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc) +{ + __SetPageZsmalloc(zpdesc_page(zpdesc)); +} + +static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc) +{ + __ClearPageZsmalloc(zpdesc_page(zpdesc)); +} + +static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc) +{ + return PageIsolated(zpdesc_page(zpdesc)); +} + +static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc) +{ + return page_zone(zpdesc_page(zpdesc)); +} + +static inline bool zpdesc_is_locked(struct zpdesc *zpdesc) +{ + return folio_test_locked(zpdesc_folio(zpdesc)); +} +#endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 64b66a4d3e6e..dae32e051779 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -13,24 +13,6 @@ * Released under the terms of GNU General Public License Version 2.0 */ -/* - * Following is how we use various fields and flags of underlying - * struct page(s) to form a zspage. - * - * Usage of struct page fields: - * page->private: points to zspage - * page->index: links together all component pages of a zspage - * For the huge page, this is always 0, so we use this field - * to store handle. 
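The comment being deleted here records that page->index did double duty: a link to the next component page normally, the object handle for a huge zspage. struct zpdesc expresses that reuse with an anonymous union; a minimal model follows, with the huge/normal discriminator simplified into the descriptor itself (in zsmalloc it lives in struct zspage):

#include <stdbool.h>
#include <stdio.h>

/* one word reused for two mutually exclusive purposes, as in struct zpdesc */
struct desc {
	bool huge;			/* selects which union member is live */
	union {
		struct desc *next;	/* chain of component pages */
		unsigned long handle;	/* huge zspage: store the handle here */
	};
};

static unsigned long desc_handle(const struct desc *d)
{
	return d->huge ? d->handle : 0;
}

int main(void)
{
	struct desc tail = { .huge = false, .next = NULL };
	struct desc head = { .huge = false, .next = &tail };
	struct desc huge = { .huge = true, .handle = 0xabcdUL };

	printf("head->next set: %d, huge handle: %#lx\n",
	       head.next != NULL, desc_handle(&huge));
	return 0;
}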
- * page->page_type: PGTY_zsmalloc, lower 24 bits locate the first object - * offset in a subpage of a zspage - * - * Usage of struct page flags: - * PG_private: identifies the first component page - * PG_owner_priv_1: identifies the huge component page - * - */ - #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* @@ -67,6 +49,7 @@ #include #include #include +#include "zpdesc.h" #define ZSPAGE_MAGIC 0x58 @@ -245,6 +228,35 @@ struct zs_pool { atomic_t compaction_in_progress; }; +static inline void zpdesc_set_first(struct zpdesc *zpdesc) +{ + SetPagePrivate(zpdesc_page(zpdesc)); +} + +static inline void zpdesc_inc_zone_page_state(struct zpdesc *zpdesc) +{ + inc_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); +} + +static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc) +{ + dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); +} + +static inline struct zpdesc *alloc_zpdesc(gfp_t gfp) +{ + struct page *page = alloc_page(gfp); + + return page_zpdesc(page); +} + +static inline void free_zpdesc(struct zpdesc *zpdesc) +{ + struct page *page = zpdesc_page(zpdesc); + + __free_page(page); +} + struct zspage { struct { unsigned int huge:HUGE_BITS; @@ -254,7 +266,7 @@ struct zspage { }; unsigned int inuse; unsigned int freeobj; - struct page *first_page; + struct zpdesc *first_zpdesc; struct list_head list; /* fullness list */ struct zs_pool *pool; rwlock_t lock; @@ -440,9 +452,9 @@ static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { .lock = INIT_LOCAL_LOCK(lock), }; -static __maybe_unused int is_first_page(struct page *page) +static inline bool is_first_zpdesc(struct zpdesc *zpdesc) { - return PagePrivate(page); + return PagePrivate(zpdesc_page(zpdesc)); } /* Protected by class->lock */ @@ -451,36 +463,35 @@ static inline int get_zspage_inuse(struct zspage *zspage) return zspage->inuse; } - static inline void mod_zspage_inuse(struct zspage *zspage, int val) { zspage->inuse += val; } -static inline struct page *get_first_page(struct zspage *zspage) +static struct zpdesc *get_first_zpdesc(struct zspage *zspage) { - struct page *first_page = zspage->first_page; + struct zpdesc *first_zpdesc = zspage->first_zpdesc; - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - return first_page; + VM_BUG_ON_PAGE(!is_first_zpdesc(first_zpdesc), zpdesc_page(first_zpdesc)); + return first_zpdesc; } #define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff -static inline unsigned int get_first_obj_offset(struct page *page) +static inline unsigned int get_first_obj_offset(struct zpdesc *zpdesc) { - VM_WARN_ON_ONCE(!PageZsmalloc(page)); - return page->page_type & FIRST_OBJ_PAGE_TYPE_MASK; + VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); + return zpdesc->first_obj_offset & FIRST_OBJ_PAGE_TYPE_MASK; } -static inline void set_first_obj_offset(struct page *page, unsigned int offset) +static inline void set_first_obj_offset(struct zpdesc *zpdesc, unsigned int offset) { /* With 24 bits available, we can support offsets into 16 MiB pages. 
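set_first_obj_offset()/get_first_obj_offset() amount to a masked read-modify-write on one 32-bit word: the low 24 bits carry the first-object offset, the upper 8 bits stay reserved for the page-type tag. A standalone rendering, with a made-up tag value:

#include <stdint.h>
#include <stdio.h>

#define FIRST_OBJ_MASK	0xffffffu	/* low 24 bits: offset, max 16 MiB - 1 */
#define TYPE_TAG	0xf2000000u	/* stand-in for the PGTY_zsmalloc tag */

static void set_first_obj_offset(uint32_t *word, uint32_t offset)
{
	*word &= ~FIRST_OBJ_MASK;		/* keep the upper 8 tag bits */
	*word |= offset & FIRST_OBJ_MASK;
}

static uint32_t get_first_obj_offset(uint32_t word)
{
	return word & FIRST_OBJ_MASK;
}

int main(void)
{
	uint32_t page_type = TYPE_TAG;

	set_first_obj_offset(&page_type, 0x123456);
	printf("offset=%#x tag intact=%d\n", get_first_obj_offset(page_type),
	       (page_type & ~FIRST_OBJ_MASK) == TYPE_TAG);
	return 0;
}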
*/ BUILD_BUG_ON(PAGE_SIZE > SZ_16M); - VM_WARN_ON_ONCE(!PageZsmalloc(page)); + VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK); - page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK; - page->page_type |= offset & FIRST_OBJ_PAGE_TYPE_MASK; + zpdesc->first_obj_offset &= ~FIRST_OBJ_PAGE_TYPE_MASK; + zpdesc->first_obj_offset |= offset & FIRST_OBJ_PAGE_TYPE_MASK; } static inline unsigned int get_freeobj(struct zspage *zspage) @@ -733,52 +744,52 @@ out: return newfg; } -static struct zspage *get_zspage(struct page *page) +static struct zspage *get_zspage(struct zpdesc *zpdesc) { - struct zspage *zspage = (struct zspage *)page_private(page); + struct zspage *zspage = zpdesc->zspage; BUG_ON(zspage->magic != ZSPAGE_MAGIC); return zspage; } -static struct page *get_next_page(struct page *page) +static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc) { - struct zspage *zspage = get_zspage(page); + struct zspage *zspage = get_zspage(zpdesc); if (unlikely(ZsHugePage(zspage))) return NULL; - return (struct page *)page->index; + return zpdesc->next; } /** - * obj_to_location - get (, ) from encoded object value + * obj_to_location - get (, ) from encoded object value * @obj: the encoded object value - * @page: page object resides in zspage + * @zpdesc: zpdesc object resides in zspage * @obj_idx: object index */ -static void obj_to_location(unsigned long obj, struct page **page, +static void obj_to_location(unsigned long obj, struct zpdesc **zpdesc, unsigned int *obj_idx) { - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS); *obj_idx = (obj & OBJ_INDEX_MASK); } -static void obj_to_page(unsigned long obj, struct page **page) +static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc) { - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS); } /** - * location_to_obj - get obj value encoded from (, ) - * @page: page object resides in zspage + * location_to_obj - get obj value encoded from (, ) + * @zpdesc: zpdesc object resides in zspage * @obj_idx: object index */ -static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) +static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx) { unsigned long obj; - obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS; obj |= obj_idx & OBJ_INDEX_MASK; return obj; @@ -789,15 +800,15 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static inline bool obj_allocated(struct page *page, void *obj, +static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj, unsigned long *phandle) { unsigned long handle; - struct zspage *zspage = get_zspage(page); + struct zspage *zspage = get_zspage(zpdesc); if (unlikely(ZsHugePage(zspage))) { - VM_BUG_ON_PAGE(!is_first_page(page), page); - handle = page->index; + VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc)); + handle = zpdesc->handle; } else handle = *(unsigned long *)obj; @@ -809,8 +820,10 @@ static inline bool obj_allocated(struct page *page, void *obj, return true; } -static void reset_page(struct page *page) +static void reset_zpdesc(struct zpdesc *zpdesc) { + struct page *page = zpdesc_page(zpdesc); + __ClearPageMovable(page); ClearPagePrivate(page); set_page_private(page, 0); @@ -820,11 +833,11 @@ static void reset_page(struct page *page) static int trylock_zspage(struct zspage *zspage) { - struct page *cursor, *fail; + struct zpdesc *cursor, 
*fail; - for (cursor = get_first_page(zspage); cursor != NULL; cursor = - get_next_page(cursor)) { - if (!trylock_page(cursor)) { + for (cursor = get_first_zpdesc(zspage); cursor != NULL; cursor = + get_next_zpdesc(cursor)) { + if (!zpdesc_trylock(cursor)) { fail = cursor; goto unlock; } @@ -832,9 +845,9 @@ static int trylock_zspage(struct zspage *zspage) return 1; unlock: - for (cursor = get_first_page(zspage); cursor != fail; cursor = - get_next_page(cursor)) - unlock_page(cursor); + for (cursor = get_first_zpdesc(zspage); cursor != fail; cursor = + get_next_zpdesc(cursor)) + zpdesc_unlock(cursor); return 0; } @@ -842,23 +855,23 @@ unlock: static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { - struct page *page, *next; + struct zpdesc *zpdesc, *next; assert_spin_locked(&class->lock); VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0); - next = page = get_first_page(zspage); + next = zpdesc = get_first_zpdesc(zspage); do { - VM_BUG_ON_PAGE(!PageLocked(page), page); - next = get_next_page(page); - reset_page(page); - unlock_page(page); - dec_zone_page_state(page, NR_ZSPAGES); - put_page(page); - page = next; - } while (page != NULL); + VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc)); + next = get_next_zpdesc(zpdesc); + reset_zpdesc(zpdesc); + zpdesc_unlock(zpdesc); + zpdesc_dec_zone_page_state(zpdesc); + zpdesc_put(zpdesc); + zpdesc = next; + } while (zpdesc != NULL); cache_free_zspage(pool, zspage); @@ -891,16 +904,16 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) { unsigned int freeobj = 1; unsigned long off = 0; - struct page *page = get_first_page(zspage); + struct zpdesc *zpdesc = get_first_zpdesc(zspage); - while (page) { - struct page *next_page; + while (zpdesc) { + struct zpdesc *next_zpdesc; struct link_free *link; void *vaddr; - set_first_obj_offset(page, off); + set_first_obj_offset(zpdesc, off); - vaddr = kmap_local_page(page); + vaddr = kmap_local_zpdesc(zpdesc); link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { @@ -913,8 +926,8 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) * page, which must point to the first object on the next * page (if present) */ - next_page = get_next_page(page); - if (next_page) { + next_zpdesc = get_next_zpdesc(zpdesc); + if (next_zpdesc) { link->next = freeobj++ << OBJ_TAG_BITS; } else { /* @@ -924,7 +937,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) link->next = -1UL << OBJ_TAG_BITS; } kunmap_local(vaddr); - page = next_page; + zpdesc = next_zpdesc; off %= PAGE_SIZE; } @@ -932,35 +945,35 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) } static void create_page_chain(struct size_class *class, struct zspage *zspage, - struct page *pages[]) + struct zpdesc *zpdescs[]) { int i; - struct page *page; - struct page *prev_page = NULL; - int nr_pages = class->pages_per_zspage; + struct zpdesc *zpdesc; + struct zpdesc *prev_zpdesc = NULL; + int nr_zpdescs = class->pages_per_zspage; /* * Allocate individual pages and link them together as: - * 1. all pages are linked together using page->index - * 2. each sub-page point to zspage using page->private + * 1. all pages are linked together using zpdesc->next + * 2. each sub-page point to zspage using zpdesc->zspage * - * we set PG_private to identify the first page (i.e. no other sub-page + * we set PG_private to identify the first zpdesc (i.e. 
no other zpdesc * has this flag set). */ - for (i = 0; i < nr_pages; i++) { - page = pages[i]; - set_page_private(page, (unsigned long)zspage); - page->index = 0; + for (i = 0; i < nr_zpdescs; i++) { + zpdesc = zpdescs[i]; + zpdesc->zspage = zspage; + zpdesc->next = NULL; if (i == 0) { - zspage->first_page = page; - SetPagePrivate(page); + zspage->first_zpdesc = zpdesc; + zpdesc_set_first(zpdesc); if (unlikely(class->objs_per_zspage == 1 && class->pages_per_zspage == 1)) SetZsHugePage(zspage); } else { - prev_page->index = (unsigned long)page; + prev_zpdesc->next = zpdesc; } - prev_page = page; + prev_zpdesc = zpdesc; } } @@ -972,7 +985,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, gfp_t gfp) { int i; - struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE]; + struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; struct zspage *zspage = cache_alloc_zspage(pool, gfp); if (!zspage) @@ -982,25 +995,25 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, migrate_lock_init(zspage); for (i = 0; i < class->pages_per_zspage; i++) { - struct page *page; + struct zpdesc *zpdesc; - page = alloc_page(gfp); - if (!page) { + zpdesc = alloc_zpdesc(gfp); + if (!zpdesc) { while (--i >= 0) { - dec_zone_page_state(pages[i], NR_ZSPAGES); - __ClearPageZsmalloc(pages[i]); - __free_page(pages[i]); + zpdesc_dec_zone_page_state(zpdescs[i]); + __zpdesc_clear_zsmalloc(zpdescs[i]); + free_zpdesc(zpdescs[i]); } cache_free_zspage(pool, zspage); return NULL; } - __SetPageZsmalloc(page); + __zpdesc_set_zsmalloc(zpdesc); - inc_zone_page_state(page, NR_ZSPAGES); - pages[i] = page; + zpdesc_inc_zone_page_state(zpdesc); + zpdescs[i] = zpdesc; } - create_page_chain(class, zspage, pages); + create_page_chain(class, zspage, zpdescs); init_zspage(class, zspage); zspage->pool = pool; zspage->class = class->index; @@ -1044,7 +1057,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) } static void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) + struct zpdesc *zpdescs[2], int off, int size) { size_t sizes[2]; char *buf = area->vm_buf; @@ -1060,14 +1073,14 @@ static void *__zs_map_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy object to per-cpu buffer */ - memcpy_from_page(buf, pages[0], off, sizes[0]); - memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]); + memcpy_from_page(buf, zpdesc_page(zpdescs[0]), off, sizes[0]); + memcpy_from_page(buf + sizes[0], zpdesc_page(zpdescs[1]), 0, sizes[1]); out: return area->vm_buf; } static void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) + struct zpdesc *zpdescs[2], int off, int size) { size_t sizes[2]; char *buf; @@ -1085,8 +1098,8 @@ static void __zs_unmap_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy per-cpu buffer to object */ - memcpy_to_page(pages[0], off, buf, sizes[0]); - memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]); + memcpy_to_page(zpdesc_page(zpdescs[0]), off, buf, sizes[0]); + memcpy_to_page(zpdesc_page(zpdescs[1]), 0, buf + sizes[0], sizes[1]); out: /* enable page faults to match kunmap_local() return conditions */ @@ -1176,13 +1189,13 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { struct zspage *zspage; - struct page *page; + struct zpdesc *zpdesc; unsigned long obj, off; unsigned int obj_idx; struct size_class *class; struct mapping_area *area; - struct page *pages[2]; + struct zpdesc *zpdescs[2]; void *ret; /* @@ -1195,8 +1208,8 @@ void *zs_map_object(struct zs_pool *pool, 
unsigned long handle, /* It guarantees it can get zspage from handle safely */ read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); - obj_to_location(obj, &page, &obj_idx); - zspage = get_zspage(page); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); /* * migration cannot move any zpages in this zspage. Here, class->lock @@ -1215,17 +1228,17 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ - area->vm_addr = kmap_local_page(page); + area->vm_addr = kmap_local_zpdesc(zpdesc); ret = area->vm_addr + off; goto out; } /* this object spans two pages */ - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); + zpdescs[0] = zpdesc; + zpdescs[1] = get_next_zpdesc(zpdesc); + BUG_ON(!zpdescs[1]); - ret = __zs_map_object(area, pages, off, class->size); + ret = __zs_map_object(area, zpdescs, off, class->size); out: if (likely(!ZsHugePage(zspage))) ret += ZS_HANDLE_SIZE; @@ -1237,7 +1250,7 @@ EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; - struct page *page; + struct zpdesc *zpdesc; unsigned long obj, off; unsigned int obj_idx; @@ -1245,8 +1258,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) struct mapping_area *area; obj = handle_to_obj(handle); - obj_to_location(obj, &page, &obj_idx); - zspage = get_zspage(page); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); @@ -1254,13 +1267,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) if (off + class->size <= PAGE_SIZE) kunmap_local(area->vm_addr); else { - struct page *pages[2]; + struct zpdesc *zpdescs[2]; - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); + zpdescs[0] = zpdesc; + zpdescs[1] = get_next_zpdesc(zpdesc); + BUG_ON(!zpdescs[1]); - __zs_unmap_object(area, pages, off, class->size); + __zs_unmap_object(area, zpdescs, off, class->size); } local_unlock(&zs_map_area.lock); @@ -1290,12 +1303,12 @@ EXPORT_SYMBOL_GPL(zs_huge_class_size); static unsigned long obj_malloc(struct zs_pool *pool, struct zspage *zspage, unsigned long handle) { - int i, nr_page, offset; + int i, nr_zpdesc, offset; unsigned long obj; struct link_free *link; struct size_class *class; - struct page *m_page; + struct zpdesc *m_zpdesc; unsigned long m_offset; void *vaddr; @@ -1303,27 +1316,26 @@ static unsigned long obj_malloc(struct zs_pool *pool, obj = get_freeobj(zspage); offset = obj * class->size; - nr_page = offset >> PAGE_SHIFT; + nr_zpdesc = offset >> PAGE_SHIFT; m_offset = offset_in_page(offset); - m_page = get_first_page(zspage); + m_zpdesc = get_first_zpdesc(zspage); - for (i = 0; i < nr_page; i++) - m_page = get_next_page(m_page); + for (i = 0; i < nr_zpdesc; i++) + m_zpdesc = get_next_zpdesc(m_zpdesc); - vaddr = kmap_local_page(m_page); + vaddr = kmap_local_zpdesc(m_zpdesc); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); if (likely(!ZsHugePage(zspage))) /* record handle in the header of allocated chunk */ link->handle = handle | OBJ_ALLOCATED_TAG; else - /* record handle to page->index */ - zspage->first_page->index = handle | OBJ_ALLOCATED_TAG; + zspage->first_zpdesc->handle = handle | OBJ_ALLOCATED_TAG; kunmap_local(vaddr); mod_zspage_inuse(zspage, 1); - obj = location_to_obj(m_page, 
obj); + obj = location_to_obj(m_zpdesc, obj); record_obj(handle, obj); return obj; @@ -1402,23 +1414,24 @@ static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; - struct page *f_page; + struct zpdesc *f_zpdesc; unsigned long f_offset; unsigned int f_objidx; void *vaddr; - obj_to_location(obj, &f_page, &f_objidx); - f_offset = offset_in_page(class_size * f_objidx); - zspage = get_zspage(f_page); - vaddr = kmap_local_page(f_page); + obj_to_location(obj, &f_zpdesc, &f_objidx); + f_offset = offset_in_page(class_size * f_objidx); + zspage = get_zspage(f_zpdesc); + + vaddr = kmap_local_zpdesc(f_zpdesc); link = (struct link_free *)(vaddr + f_offset); /* Insert this object in containing zspage's freelist */ if (likely(!ZsHugePage(zspage))) link->next = get_freeobj(zspage) << OBJ_TAG_BITS; else - f_page->index = 0; + f_zpdesc->handle = 0; set_freeobj(zspage, f_objidx); kunmap_local(vaddr); @@ -1428,7 +1441,7 @@ static void obj_free(int class_size, unsigned long obj) void zs_free(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; - struct page *f_page; + struct zpdesc *f_zpdesc; unsigned long obj; struct size_class *class; int fullness; @@ -1442,8 +1455,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle) */ read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); - obj_to_page(obj, &f_page); - zspage = get_zspage(f_page); + obj_to_zpdesc(obj, &f_zpdesc); + zspage = get_zspage(f_zpdesc); class = zspage_class(pool, zspage); spin_lock(&class->lock); read_unlock(&pool->migrate_lock); @@ -1463,7 +1476,7 @@ EXPORT_SYMBOL_GPL(zs_free); static void zs_object_copy(struct size_class *class, unsigned long dst, unsigned long src) { - struct page *s_page, *d_page; + struct zpdesc *s_zpdesc, *d_zpdesc; unsigned int s_objidx, d_objidx; unsigned long s_off, d_off; void *s_addr, *d_addr; @@ -1472,8 +1485,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, s_size = d_size = class->size; - obj_to_location(src, &s_page, &s_objidx); - obj_to_location(dst, &d_page, &d_objidx); + obj_to_location(src, &s_zpdesc, &s_objidx); + obj_to_location(dst, &d_zpdesc, &d_objidx); s_off = offset_in_page(class->size * s_objidx); d_off = offset_in_page(class->size * d_objidx); @@ -1484,8 +1497,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, if (d_off + class->size > PAGE_SIZE) d_size = PAGE_SIZE - d_off; - s_addr = kmap_local_page(s_page); - d_addr = kmap_local_page(d_page); + s_addr = kmap_local_zpdesc(s_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); while (1) { size = min(s_size, d_size); @@ -1510,17 +1523,17 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, if (s_off >= PAGE_SIZE) { kunmap_local(d_addr); kunmap_local(s_addr); - s_page = get_next_page(s_page); - s_addr = kmap_local_page(s_page); - d_addr = kmap_local_page(d_page); + s_zpdesc = get_next_zpdesc(s_zpdesc); + s_addr = kmap_local_zpdesc(s_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); s_size = class->size - written; s_off = 0; } if (d_off >= PAGE_SIZE) { kunmap_local(d_addr); - d_page = get_next_page(d_page); - d_addr = kmap_local_page(d_page); + d_zpdesc = get_next_zpdesc(d_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); d_size = class->size - written; d_off = 0; } @@ -1535,18 +1548,18 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, * return handle. 
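obj_to_location()/location_to_obj(), used throughout this hunk, keep zsmalloc's encoding of an object reference: the page frame number shifted left by OBJ_INDEX_BITS with the object index in the low bits. A compact round-trip demo; the bit width and the sample PFN are arbitrary:

#include <stdio.h>

#define OBJ_INDEX_BITS	12UL				/* illustrative width */
#define OBJ_INDEX_MASK	((1UL << OBJ_INDEX_BITS) - 1)

/* encode (pfn, index) into one word, as location_to_obj() does */
static unsigned long encode_obj(unsigned long pfn, unsigned int idx)
{
	return (pfn << OBJ_INDEX_BITS) | (idx & OBJ_INDEX_MASK);
}

/* split it back apart, as obj_to_location() does */
static void decode_obj(unsigned long obj, unsigned long *pfn, unsigned int *idx)
{
	*pfn = obj >> OBJ_INDEX_BITS;
	*idx = obj & OBJ_INDEX_MASK;
}

int main(void)
{
	unsigned long pfn;
	unsigned int idx;
	unsigned long obj = encode_obj(0x1a2b3, 7);

	decode_obj(obj, &pfn, &idx);
	printf("pfn=%#lx idx=%u\n", pfn, idx);	/* pfn=0x1a2b3 idx=7 */
	return 0;
}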
*/ static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int *obj_idx) + struct zpdesc *zpdesc, int *obj_idx) { unsigned int offset; int index = *obj_idx; unsigned long handle = 0; - void *addr = kmap_local_page(page); + void *addr = kmap_local_zpdesc(zpdesc); - offset = get_first_obj_offset(page); + offset = get_first_obj_offset(zpdesc); offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_allocated(page, addr + offset, &handle)) + if (obj_allocated(zpdesc, addr + offset, &handle)) break; offset += class->size; @@ -1566,14 +1579,14 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, unsigned long used_obj, free_obj; unsigned long handle; int obj_idx = 0; - struct page *s_page = get_first_page(src_zspage); + struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage); struct size_class *class = pool->size_class[src_zspage->class]; while (1) { - handle = find_alloced_obj(class, s_page, &obj_idx); + handle = find_alloced_obj(class, s_zpdesc, &obj_idx); if (!handle) { - s_page = get_next_page(s_page); - if (!s_page) + s_zpdesc = get_next_zpdesc(s_zpdesc); + if (!s_zpdesc) break; obj_idx = 0; continue; @@ -1653,7 +1666,7 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage) */ static void lock_zspage(struct zspage *zspage) { - struct page *curr_page, *page; + struct zpdesc *curr_zpdesc, *zpdesc; /* * Pages we haven't locked yet can be migrated off the list while we're @@ -1665,24 +1678,24 @@ static void lock_zspage(struct zspage *zspage) */ while (1) { migrate_read_lock(zspage); - page = get_first_page(zspage); - if (trylock_page(page)) + zpdesc = get_first_zpdesc(zspage); + if (zpdesc_trylock(zpdesc)) break; - get_page(page); + zpdesc_get(zpdesc); migrate_read_unlock(zspage); - wait_on_page_locked(page); - put_page(page); + zpdesc_wait_locked(zpdesc); + zpdesc_put(zpdesc); } - curr_page = page; - while ((page = get_next_page(curr_page))) { - if (trylock_page(page)) { - curr_page = page; + curr_zpdesc = zpdesc; + while ((zpdesc = get_next_zpdesc(curr_zpdesc))) { + if (zpdesc_trylock(zpdesc)) { + curr_zpdesc = zpdesc; } else { - get_page(page); + zpdesc_get(zpdesc); migrate_read_unlock(zspage); - wait_on_page_locked(page); - put_page(page); + zpdesc_wait_locked(zpdesc); + zpdesc_put(zpdesc); migrate_read_lock(zspage); } } @@ -1720,26 +1733,28 @@ static void migrate_write_unlock(struct zspage *zspage) static const struct movable_operations zsmalloc_mops; static void replace_sub_page(struct size_class *class, struct zspage *zspage, - struct page *newpage, struct page *oldpage) + struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc) { - struct page *page; - struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + struct zpdesc *zpdesc; + struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + unsigned int first_obj_offset; int idx = 0; - page = get_first_page(zspage); + zpdesc = get_first_zpdesc(zspage); do { - if (page == oldpage) - pages[idx] = newpage; + if (zpdesc == oldzpdesc) + zpdescs[idx] = newzpdesc; else - pages[idx] = page; + zpdescs[idx] = zpdesc; idx++; - } while ((page = get_next_page(page)) != NULL); + } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); - create_page_chain(class, zspage, pages); - set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); + create_page_chain(class, zspage, zpdescs); + first_obj_offset = get_first_obj_offset(oldzpdesc); + set_first_obj_offset(newzpdesc, first_obj_offset); if (unlikely(ZsHugePage(zspage))) - newpage->index = oldpage->index; - 
__SetPageMovable(newpage, &zsmalloc_mops); + newzpdesc->handle = oldzpdesc->handle; + __zpdesc_set_movable(newzpdesc, &zsmalloc_mops); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) @@ -1759,20 +1774,22 @@ static int zs_page_migrate(struct page *newpage, struct page *page, struct zs_pool *pool; struct size_class *class; struct zspage *zspage; - struct page *dummy; + struct zpdesc *dummy; + struct zpdesc *newzpdesc = page_zpdesc(newpage); + struct zpdesc *zpdesc = page_zpdesc(page); void *s_addr, *d_addr, *addr; unsigned int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; - VM_BUG_ON_PAGE(!PageIsolated(page), page); + VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); /* We're committed, tell the world that this is a Zsmalloc page. */ - __SetPageZsmalloc(newpage); + __zpdesc_set_zsmalloc(newzpdesc); /* The page is locked, so this pointer must remain valid */ - zspage = get_zspage(page); + zspage = get_zspage(zpdesc); pool = zspage->pool; /* @@ -1789,30 +1806,29 @@ static int zs_page_migrate(struct page *newpage, struct page *page, /* the migrate_write_lock protects zpage access via zs_map_object */ migrate_write_lock(zspage); - offset = get_first_obj_offset(page); - s_addr = kmap_local_page(page); + offset = get_first_obj_offset(zpdesc); + s_addr = kmap_local_zpdesc(zpdesc); /* * Here, any user cannot access all objects in the zspage so let's move. */ - d_addr = kmap_local_page(newpage); + d_addr = kmap_local_zpdesc(newzpdesc); copy_page(d_addr, s_addr); kunmap_local(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { - if (obj_allocated(page, addr, &handle)) { + if (obj_allocated(zpdesc, addr, &handle)) { old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); - new_obj = (unsigned long)location_to_obj(newpage, - obj_idx); + new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx); record_obj(handle, new_obj); } } kunmap_local(s_addr); - replace_sub_page(class, zspage, newpage, page); + replace_sub_page(class, zspage, newzpdesc, zpdesc); /* * Since we complete the data copy and set up new zspage structure, * it's okay to release migration_lock. 
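The loop just above re-records every allocated object's handle so it names the destination page: the index within the zspage stays the same, only the PFN part of the encoded location changes. A toy model of that fix-up pass, reusing the illustrative encoding from the earlier sketch; the allocated[]/location[] arrays are inventions of the demo, not zsmalloc data structures:

#include <stdbool.h>
#include <stdio.h>

#define OBJ_INDEX_BITS	12UL	/* same illustrative width as the sketch above */

/*
 * Toy model of the fix-up pass in zs_page_migrate(): allocated slots keep
 * their index within the zspage, but their recorded location must now name
 * the destination page's PFN instead of the source page's.
 */
static void repoint_objects(const bool *allocated, unsigned long *location,
			    unsigned int nr_slots, unsigned long new_pfn)
{
	unsigned int idx;

	for (idx = 0; idx < nr_slots; idx++) {
		if (!allocated[idx])
			continue;	/* nothing recorded for free slots */
		location[idx] = (new_pfn << OBJ_INDEX_BITS) | idx;
	}
}

int main(void)
{
	bool allocated[3] = { true, false, true };
	unsigned long location[3] = {
		(0x100UL << OBJ_INDEX_BITS) | 0,
		0,
		(0x100UL << OBJ_INDEX_BITS) | 2,
	};

	repoint_objects(allocated, location, 3, 0x200);
	printf("%#lx %#lx %#lx\n", location[0], location[1], location[2]);
	return 0;
}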
@@ -1821,14 +1837,14 @@ static int zs_page_migrate(struct page *newpage, struct page *page, spin_unlock(&class->lock); migrate_write_unlock(zspage); - get_page(newpage); - if (page_zone(newpage) != page_zone(page)) { - dec_zone_page_state(page, NR_ZSPAGES); - inc_zone_page_state(newpage, NR_ZSPAGES); + zpdesc_get(newzpdesc); + if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) { + zpdesc_dec_zone_page_state(zpdesc); + zpdesc_inc_zone_page_state(newzpdesc); } - reset_page(page); - put_page(page); + reset_zpdesc(zpdesc); + zpdesc_put(zpdesc); return MIGRATEPAGE_SUCCESS; } @@ -1897,13 +1913,13 @@ static void init_deferred_free(struct zs_pool *pool) static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) { - struct page *page = get_first_page(zspage); + struct zpdesc *zpdesc = get_first_zpdesc(zspage); do { - WARN_ON(!trylock_page(page)); - __SetPageMovable(page, &zsmalloc_mops); - unlock_page(page); - } while ((page = get_next_page(page)) != NULL); + WARN_ON(!zpdesc_trylock(zpdesc)); + __zpdesc_set_movable(zpdesc, &zsmalloc_mops); + zpdesc_unlock(zpdesc); + } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); } #else static inline void zs_flush_migration(struct zs_pool *pool) { } diff --git a/mm/zswap.c b/mm/zswap.c index 30f5a27a6862..167ae641379f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1186,7 +1186,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o /* * It's safe to drop the lock here because we return either - * LRU_REMOVED_RETRY or LRU_RETRY. + * LRU_REMOVED_RETRY, LRU_RETRY or LRU_STOP. */ spin_unlock(&l->lock); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index de47ad999d7b..f4c5d1a8c01f 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -210,7 +210,7 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_EXP_FEATURE_CHANGED, }; -#define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) +#define CACHE_TIMEOUT secs_to_jiffies(2) #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f89cf93f6eb4..8a91c1972dc5 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -532,12 +532,11 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); - nr_pages = alloc_pages_bulk_array_node(gfp, - pool->p.nid, bulk, - (struct page **)pool->alloc.cache); + nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, + (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) return 0; diff --git a/net/mpls/internal.h b/net/mpls/internal.h index b9f492ddf93b..83c629529b57 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -33,7 +33,7 @@ struct mpls_dev { #define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ do { \ - __typeof__(*(mdev)->stats) *ptr = \ + TYPEOF_UNQUAL(*(mdev)->stats) *ptr = \ raw_cpu_ptr((mdev)->stats); \ local_bh_disable(); \ u64_stats_update_begin(&ptr->syncp); \ @@ -45,7 +45,7 @@ struct mpls_dev { #define MPLS_INC_STATS(mdev, field) \ do { \ - __typeof__(*(mdev)->stats) *ptr = \ + TYPEOF_UNQUAL(*(mdev)->stats) *ptr = \ raw_cpu_ptr((mdev)->stats); \ local_bh_disable(); \ u64_stats_update_begin(&ptr->syncp); \ diff --git a/net/netfilter/nf_conntrack_proto_sctp.c 
b/net/netfilter/nf_conntrack_proto_sctp.c index 4cc97f971264..7c6f7c9f7332 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -39,20 +39,15 @@ static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; -#define SECS * HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { - [SCTP_CONNTRACK_CLOSED] = 10 SECS, - [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, - [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, - [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, - [SCTP_CONNTRACK_SHUTDOWN_SENT] = 3 SECS, - [SCTP_CONNTRACK_SHUTDOWN_RECD] = 3 SECS, - [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, - [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, + [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10), + [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3), + [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210), + [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30), }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 79879b7d39cb..e7f9c295d13c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -651,8 +651,8 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) if (pages > RPCSVC_MAXPAGES) pages = RPCSVC_MAXPAGES; - ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages, - rqstp->rq_pages); + ret = alloc_pages_bulk_node(GFP_KERNEL, node, pages, + rqstp->rq_pages); return ret == pages; } diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 43c57124de52..aebc0d8ddff5 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -671,8 +671,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) } for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk_array(GFP_KERNEL, pages, - rqstp->rq_pages); + ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); if (ret > filled) /* Made progress, don't sleep yet */ continue; diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 3bb04b05c5ce..bea70eb6f034 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -640,10 +640,8 @@ EXPORT_SYMBOL(wireless_send_event); #ifdef CONFIG_CFG80211_WEXT static void wireless_warn_cfg80211_wext(void) { - char name[sizeof(current->comm)]; - pr_warn_once("warning: `%s' uses wireless extensions which will stop working for Wi-Fi 7 hardware; use nl80211\n", - get_task_comm(name, current)); + current->comm); } #endif diff --git a/samples/Kconfig b/samples/Kconfig index b288d9991d27..8d5a36f0e5d6 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -293,6 +293,8 @@ config SAMPLE_CGROUP source "samples/rust/Kconfig" +source "samples/damon/Kconfig" + endif # SAMPLES config HAVE_SAMPLE_FTRACE_DIRECT diff --git a/samples/Makefile b/samples/Makefile index b85fa64390c5..5af6bb8afb07 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -39,3 +39,5 @@ obj-$(CONFIG_SAMPLE_KMEMLEAK) += kmemleak/ obj-$(CONFIG_SAMPLE_CORESIGHT_SYSCFG) += coresight/ obj-$(CONFIG_SAMPLE_FPROBE) += fprobe/ obj-$(CONFIG_SAMPLES_RUST) += rust/ +obj-$(CONFIG_SAMPLE_DAMON_WSSE) += damon/ +obj-$(CONFIG_SAMPLE_DAMON_PRCL) += damon/ diff --git a/samples/damon/Kconfig b/samples/damon/Kconfig new file mode 100644 index 000000000000..63f6dcd71daa --- /dev/null +++ 
b/samples/damon/Kconfig @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "DAMON Samples" + +config SAMPLE_DAMON_WSSE + bool "DAMON sameple module for working set size estimation" + depends on DAMON && DAMON_VADDR + help + This builds DAMON sample module for working set size estimation. + + The module receives a pid, monitor access to the virtual address + space of the process, estimate working set size of the process, and + repeatedly prints the size on the kernel log. + + If unsure, say N. + +config SAMPLE_DAMON_PRCL + bool "DAMON sameple module for access-aware proactive reclamation" + depends on DAMON && DAMON_VADDR + help + This builds DAMON sample module for access-aware proactive + reclamation. + + The module receives a pid, monitor access to the virtual address + space of the process, find memory regions that not accessed, and + proactively reclaim the regions. + + If unsure, say N. + +endmenu diff --git a/samples/damon/Makefile b/samples/damon/Makefile new file mode 100644 index 000000000000..7f155143f237 --- /dev/null +++ b/samples/damon/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_SAMPLE_DAMON_WSSE) += wsse.o +obj-$(CONFIG_SAMPLE_DAMON_PRCL) += prcl.o diff --git a/samples/damon/prcl.c b/samples/damon/prcl.c new file mode 100644 index 000000000000..c3acbdab7a62 --- /dev/null +++ b/samples/damon/prcl.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * proactive reclamation: monitor access pattern of a given process, find + * regiosn that seems not accessed, and proactively page out the regions. + */ + +#define pr_fmt(fmt) "damon_sample_prcl: " fmt + +#include +#include +#include +#include + +static int target_pid __read_mostly; +module_param(target_pid, int, 0600); + +static int damon_sample_prcl_enable_store( + const char *val, const struct kernel_param *kp); + +static const struct kernel_param_ops enable_param_ops = { + .set = damon_sample_prcl_enable_store, + .get = param_get_bool, +}; + +static bool enable __read_mostly; +module_param_cb(enable, &enable_param_ops, &enable, 0600); +MODULE_PARM_DESC(enable, "Enable of disable DAMON_SAMPLE_WSSE"); + +static struct damon_ctx *ctx; +static struct pid *target_pidp; + +static int damon_sample_prcl_after_aggregate(struct damon_ctx *c) +{ + struct damon_target *t; + + damon_for_each_target(t, c) { + struct damon_region *r; + unsigned long wss = 0; + + damon_for_each_region(r, t) { + if (r->nr_accesses > 0) + wss += r->ar.end - r->ar.start; + } + pr_info("wss: %lu\n", wss); + } + return 0; +} + +static int damon_sample_prcl_start(void) +{ + struct damon_target *target; + struct damos *scheme; + + pr_info("start\n"); + + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + if (damon_select_ops(ctx, DAMON_OPS_VADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + target_pidp = find_get_pid(target_pid); + if (!target_pidp) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + target->pid = target_pidp; + + ctx->callback.after_aggregation = damon_sample_prcl_after_aggregate; + + scheme = damon_new_scheme( + &(struct damos_access_pattern) { + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + .min_nr_accesses = 0, + .max_nr_accesses = 0, + .min_age_region = 50, + .max_age_region = UINT_MAX}, + DAMOS_PAGEOUT, + 0, + &(struct damos_quota){}, + &(struct damos_watermarks){}, + NUMA_NO_NODE); + if (!scheme) { + damon_destroy_ctx(ctx); + return 
-ENOMEM; + } + damon_set_schemes(ctx, &scheme, 1); + + return damon_start(&ctx, 1, true); +} + +static void damon_sample_prcl_stop(void) +{ + pr_info("stop\n"); + if (ctx) { + damon_stop(&ctx, 1); + damon_destroy_ctx(ctx); + } + if (target_pidp) + put_pid(target_pidp); +} + +static int damon_sample_prcl_enable_store( + const char *val, const struct kernel_param *kp) +{ + bool enabled = enable; + int err; + + err = kstrtobool(val, &enable); + if (err) + return err; + + if (enable == enabled) + return 0; + + if (enable) + return damon_sample_prcl_start(); + damon_sample_prcl_stop(); + return 0; +} + +static int __init damon_sample_prcl_init(void) +{ + return 0; +} + +module_init(damon_sample_prcl_init); diff --git a/samples/damon/wsse.c b/samples/damon/wsse.c new file mode 100644 index 000000000000..11be25803274 --- /dev/null +++ b/samples/damon/wsse.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * working set size estimation: monitor access pattern of a given process and + * print estimated working set size (total size of regions showing some + * access). + */ + +#define pr_fmt(fmt) "damon_sample_wsse: " fmt + +#include <linux/damon.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> + +static int target_pid __read_mostly; +module_param(target_pid, int, 0600); + +static int damon_sample_wsse_enable_store( + const char *val, const struct kernel_param *kp); + +static const struct kernel_param_ops enable_param_ops = { + .set = damon_sample_wsse_enable_store, + .get = param_get_bool, +}; + +static bool enable __read_mostly; +module_param_cb(enable, &enable_param_ops, &enable, 0600); +MODULE_PARM_DESC(enable, "Enable or disable DAMON_SAMPLE_WSSE"); + +static struct damon_ctx *ctx; +static struct pid *target_pidp; + +static int damon_sample_wsse_after_aggregate(struct damon_ctx *c) +{ + struct damon_target *t; + + damon_for_each_target(t, c) { + struct damon_region *r; + unsigned long wss = 0; + + damon_for_each_region(r, t) { + if (r->nr_accesses > 0) + wss += r->ar.end - r->ar.start; + } + pr_info("wss: %lu\n", wss); + } + return 0; +} + +static int damon_sample_wsse_start(void) +{ + struct damon_target *target; + + pr_info("start\n"); + + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + if (damon_select_ops(ctx, DAMON_OPS_VADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + target_pidp = find_get_pid(target_pid); + if (!target_pidp) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + target->pid = target_pidp; + + ctx->callback.after_aggregation = damon_sample_wsse_after_aggregate; + return damon_start(&ctx, 1, true); +} + +static void damon_sample_wsse_stop(void) +{ + pr_info("stop\n"); + if (ctx) { + damon_stop(&ctx, 1); + damon_destroy_ctx(ctx); + } + if (target_pidp) + put_pid(target_pidp); +} + +static int damon_sample_wsse_enable_store( + const char *val, const struct kernel_param *kp) +{ + bool enabled = enable; + int err; + + err = kstrtobool(val, &enable); + if (err) + return err; + + if (enable == enabled) + return 0; + + if (enable) + return damon_sample_wsse_start(); + damon_sample_wsse_stop(); + return 0; +} + +static int __init damon_sample_wsse_init(void) +{ + return 0; +} + +module_init(damon_sample_wsse_init); diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c index 378e2d40271a..69105596e72e 100644 --- a/samples/livepatch/livepatch-callbacks-busymod.c +++
b/samples/livepatch/livepatch-callbacks-busymod.c @@ -44,8 +44,7 @@ static void busymod_work_func(struct work_struct *work) static int livepatch_callbacks_mod_init(void) { pr_info("%s\n", __func__); - schedule_delayed_work(&work, - msecs_to_jiffies(1000 * 0)); + schedule_delayed_work(&work, 0); return 0; } diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c index 6701641bf12d..f3f153895d6c 100644 --- a/samples/livepatch/livepatch-shadow-fix1.c +++ b/samples/livepatch/livepatch-shadow-fix1.c @@ -72,8 +72,7 @@ static struct dummy *livepatch_fix1_dummy_alloc(void) if (!d) return NULL; - d->jiffies_expire = jiffies + - msecs_to_jiffies(1000 * EXPIRE_PERIOD); + d->jiffies_expire = jiffies + secs_to_jiffies(EXPIRE_PERIOD); /* * Patch: save the extra memory location into a SV_LEAK shadow diff --git a/samples/livepatch/livepatch-shadow-mod.c b/samples/livepatch/livepatch-shadow-mod.c index 7e753b0d2fa6..5d83ad5a8118 100644 --- a/samples/livepatch/livepatch-shadow-mod.c +++ b/samples/livepatch/livepatch-shadow-mod.c @@ -101,8 +101,7 @@ static __used noinline struct dummy *dummy_alloc(void) if (!d) return NULL; - d->jiffies_expire = jiffies + - msecs_to_jiffies(1000 * EXPIRE_PERIOD); + d->jiffies_expire = jiffies + secs_to_jiffies(EXPIRE_PERIOD); /* Oops, forgot to save leak! */ leak = kzalloc(sizeof(*leak), GFP_KERNEL); @@ -152,8 +151,7 @@ static void alloc_work_func(struct work_struct *work) list_add(&d->list, &dummy_list); mutex_unlock(&dummy_list_mutex); - schedule_delayed_work(&alloc_dwork, - msecs_to_jiffies(1000 * ALLOC_PERIOD)); + schedule_delayed_work(&alloc_dwork, secs_to_jiffies(ALLOC_PERIOD)); } /* @@ -184,16 +182,13 @@ static void cleanup_work_func(struct work_struct *work) } mutex_unlock(&dummy_list_mutex); - schedule_delayed_work(&cleanup_dwork, - msecs_to_jiffies(1000 * CLEANUP_PERIOD)); + schedule_delayed_work(&cleanup_dwork, secs_to_jiffies(CLEANUP_PERIOD)); } static int livepatch_shadow_mod_init(void) { - schedule_delayed_work(&alloc_dwork, - msecs_to_jiffies(1000 * ALLOC_PERIOD)); - schedule_delayed_work(&cleanup_dwork, - msecs_to_jiffies(1000 * CLEANUP_PERIOD)); + schedule_delayed_work(&alloc_dwork, secs_to_jiffies(ALLOC_PERIOD)); + schedule_delayed_work(&cleanup_dwork, secs_to_jiffies(CLEANUP_PERIOD)); return 0; } diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9eed3683ad76..9d469c20871f 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -834,16 +834,6 @@ foreach my $entry (@mode_permission_funcs) { $mode_perms_search = "(?:${mode_perms_search})"; our %deprecated_apis = ( - "synchronize_rcu_bh" => "synchronize_rcu", - "synchronize_rcu_bh_expedited" => "synchronize_rcu_expedited", - "call_rcu_bh" => "call_rcu", - "rcu_barrier_bh" => "rcu_barrier", - "synchronize_sched" => "synchronize_rcu", - "synchronize_sched_expedited" => "synchronize_rcu_expedited", - "call_rcu_sched" => "call_rcu", - "rcu_barrier_sched" => "rcu_barrier", - "get_state_synchronize_sched" => "get_state_synchronize_rcu", - "cond_synchronize_sched" => "cond_synchronize_rcu", "kmap" => "kmap_local_page", "kunmap" => "kunmap_local", "kmap_atomic" => "kmap_local_page", @@ -2875,7 +2865,7 @@ sub process { if ($realfile =~ m@^include/asm/@) { ERROR("MODIFIED_INCLUDE_ASM", - "do not modify files in include/asm, change architecture specific files in include/asm-\n" . "$here$rawline\n"); + "do not modify files in include/asm, change architecture specific files in arch//include/asm\n" . 
"$here$rawline\n"); } $found_file = 1; } @@ -3237,12 +3227,12 @@ sub process { my ($cid, $ctitle) = git_commit_info($orig_commit, $id, $title); - if ($ctitle ne $title || $tag_case || $tag_space || - $id_length || $id_case || !$title_has_quotes) { + if (defined($cid) && ($ctitle ne $title || $tag_case || $tag_space || $id_length || $id_case || !$title_has_quotes)) { + my $fixed = "Fixes: $cid (\"$ctitle\")"; if (WARN("BAD_FIXES_TAG", - "Please use correct Fixes: style 'Fixes: <12 chars of sha1> (\"\")' - ie: 'Fixes: $cid (\"$ctitle\")'\n" . $herecurr) && + "Please use correct Fixes: style 'Fixes: <12+ chars of sha1> (\"<title line>\")' - ie: '$fixed'\n" . $herecurr) && $fix) { - $fixed[$fixlinenr] = "Fixes: $cid (\"$ctitle\")"; + $fixed[$fixlinenr] = $fixed; } } } @@ -5513,9 +5503,9 @@ sub process { } } -# check for unnecessary parentheses around comparisons in if uses -# when !drivers/staging or command-line uses --strict - if (($realfile !~ m@^(?:drivers/staging/)@ || $check_orig) && +# check for unnecessary parentheses around comparisons +# except in drivers/staging + if (($realfile !~ m@^(?:drivers/staging/)@) && $perl_version_ok && defined($stat) && $stat =~ /(^.\s*if\s*($balanced_parens))/) { my $if_stat = $1; diff --git a/scripts/coccinelle/misc/secs_to_jiffies.cocci b/scripts/coccinelle/misc/secs_to_jiffies.cocci new file mode 100644 index 000000000000..8bbb2884ea5d --- /dev/null +++ b/scripts/coccinelle/misc/secs_to_jiffies.cocci @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// +/// Find usages of: +/// - msecs_to_jiffies(value*1000) +/// - msecs_to_jiffies(value*MSEC_PER_SEC) +/// +// Confidence: High +// Copyright: (C) 2024 Easwar Hariharan, Microsoft +// Keywords: secs, seconds, jiffies +// + +virtual patch + +@depends on patch@ constant C; @@ + +- msecs_to_jiffies(C * 1000) ++ secs_to_jiffies(C) + +@depends on patch@ constant C; @@ + +- msecs_to_jiffies(C * MSEC_PER_SEC) ++ secs_to_jiffies(C) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 05bd9ca1fbfa..a290db720b0f 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -222,6 +222,7 @@ autonymous||autonomous auxillary||auxiliary auxilliary||auxiliary avaiable||available +avaialable||available avaible||available availabe||available availabled||available @@ -267,6 +268,7 @@ broadcase||broadcast broadcat||broadcast bufer||buffer bufferred||buffered +bufferur||buffer bufufer||buffer cacluated||calculated caculate||calculate @@ -405,6 +407,7 @@ configutation||configuration congiuration||configuration conider||consider conjuction||conjunction +connction||connection connecetd||connected connectinos||connections connetor||connector @@ -413,6 +416,7 @@ connnections||connections consistancy||consistency consistant||consistent consits||consists +constructred||constructed containes||contains containts||contains contaisn||contains @@ -450,6 +454,7 @@ creationg||creating cryptocraphic||cryptographic cummulative||cumulative cunter||counter +curent||current curently||currently cylic||cyclic dafault||default @@ -461,6 +466,7 @@ decendant||descendant decendants||descendants decompres||decompress decsribed||described +decrese||decrease decription||description detault||default dectected||detected @@ -485,6 +491,7 @@ delare||declare delares||declares delaring||declaring delemiter||delimiter +deley||delay delibrately||deliberately delievered||delivered demodualtor||demodulator @@ -551,6 +558,7 @@ disgest||digest disired||desired dispalying||displaying dissable||disable +dissapeared||disappeared 
diplay||display directon||direction direcly||directly @@ -606,6 +614,7 @@ eigth||eight elementry||elementary eletronic||electronic embeded||embedded +emtpy||empty enabledi||enabled enbale||enable enble||enable @@ -669,6 +678,7 @@ exmaple||example expecially||especially experies||expires explicite||explicit +explicity||explicitly explicitely||explicitly explict||explicit explictely||explicitly @@ -723,10 +733,12 @@ followign||following followings||following follwing||following fonud||found +forcebly||forcibly forseeable||foreseeable forse||force fortan||fortran forwardig||forwarding +forwared||forwarded frambuffer||framebuffer framming||framing framwork||framework @@ -767,6 +779,7 @@ grahpical||graphical granularty||granularity grapic||graphic grranted||granted +grups||groups guage||gauge guarenteed||guaranteed guarentee||guarantee @@ -780,6 +793,7 @@ hardare||hardware harware||hardware hardward||hardware havind||having +heigth||height heirarchically||hierarchically heirarchy||hierarchy heirachy||hierarchy @@ -788,9 +802,11 @@ hearbeat||heartbeat heterogenous||heterogeneous hexdecimal||hexadecimal hybernate||hibernate +hiearchy||hierarchy hierachy||hierarchy hierarchie||hierarchy homogenous||homogeneous +horizental||horizontal howver||however hsould||should hypervior||hypervisor @@ -842,6 +858,7 @@ independed||independent indiate||indicate indicat||indicate inexpect||inexpected +infalte||inflate inferface||interface infinit||infinite infomation||information @@ -861,6 +878,7 @@ initators||initiators initialiazation||initialization initializationg||initialization initializiation||initialization +initializtion||initialization initialze||initialize initialzed||initialized initialzing||initializing @@ -877,6 +895,7 @@ instanciate||instantiate instanciated||instantiated instuments||instruments insufficent||insufficient +intead||instead inteface||interface integreated||integrated integrety||integrity @@ -1081,6 +1100,7 @@ notications||notifications notifcations||notifications notifed||notified notity||notify +notfify||notify nubmer||number numebr||number numer||number @@ -1122,6 +1142,7 @@ orientatied||orientated orientied||oriented orignal||original originial||original +orphanded||orphaned otherise||otherwise ouput||output oustanding||outstanding @@ -1184,9 +1205,11 @@ peroid||period persistance||persistence persistant||persistent phoneticly||phonetically +pipline||pipeline plaform||platform plalform||platform platfoem||platform +platfomr||platform platfrom||platform plattform||platform pleaes||please @@ -1211,6 +1234,7 @@ preceeding||preceding preceed||precede precendence||precedence precission||precision +predicition||prediction preemptable||preemptible prefered||preferred prefferably||preferably @@ -1289,6 +1313,7 @@ querrying||querying queus||queues randomally||randomly raoming||roaming +readyness||readiness reasearcher||researcher reasearchers||researchers reasearch||research @@ -1305,8 +1330,10 @@ recieves||receives recieving||receiving recogniced||recognised recognizeable||recognizable +recompte||recompute recommanded||recommended recyle||recycle +redect||reject redircet||redirect redirectrion||redirection redundacy||redundancy @@ -1314,6 +1341,7 @@ reename||rename refcounf||refcount refence||reference refered||referred +referencce||reference referenace||reference refererence||reference refering||referring @@ -1348,11 +1376,13 @@ replys||replies reponse||response representaion||representation repsonse||response +reqested||requested reqeust||request reqister||register requed||requeued 
requestied||requested requiere||require +requieres||requires requirment||requirement requred||required requried||required @@ -1440,6 +1470,7 @@ sequencial||sequential serivce||service serveral||several servive||service +sesion||session setts||sets settting||setting shapshot||snapshot @@ -1602,11 +1633,13 @@ trys||tries thses||these tiggers||triggers tiggered||triggered +tiggerring||triggering tipically||typically timeing||timing timming||timing timout||timeout tmis||this +tolarance||tolerance toogle||toggle torerable||tolerable torlence||tolerance @@ -1633,6 +1666,7 @@ trasfer||transfer trasmission||transmission trasmitter||transmitter treshold||threshold +trigged||triggered triggerd||triggered trigerred||triggered trigerring||triggering @@ -1648,6 +1682,7 @@ uknown||unknown usccess||success uncommited||uncommitted uncompatible||incompatible +uncomressed||uncompressed unconditionaly||unconditionally undeflow||underflow undelying||underlying @@ -1715,6 +1750,7 @@ utitity||utility utitlty||utility vaid||valid vaild||valid +validationg||validating valide||valid variantions||variations varible||variable @@ -1724,6 +1760,7 @@ verbse||verbose veify||verify verfication||verification veriosn||version +versoin||version verisons||versions verison||version veritical||vertical diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index e1a5e13ea269..1a2d02fee09b 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -76,7 +76,6 @@ static void report_access(const char *access, struct task_struct *target, struct task_struct *agent) { struct access_report_info *info; - char agent_comm[sizeof(agent->comm)]; assert_spin_locked(&target->alloc_lock); /* for target->comm */ @@ -86,8 +85,7 @@ static void report_access(const char *access, struct task_struct *target, */ pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", - access, target->comm, target->pid, - get_task_comm(agent_comm, agent), agent->pid); + access, target->comm, target->pid, agent->comm, agent->pid); return; } diff --git a/sound/usb/line6/toneport.c b/sound/usb/line6/toneport.c index ca2c6f5de407..c073b38cd673 100644 --- a/sound/usb/line6/toneport.c +++ b/sound/usb/line6/toneport.c @@ -386,7 +386,7 @@ static int toneport_setup(struct usb_line6_toneport *toneport) toneport_update_led(toneport); schedule_delayed_work(&toneport->line6.startup_work, - msecs_to_jiffies(TONEPORT_PCM_DELAY * 1000)); + secs_to_jiffies(TONEPORT_PCM_DELAY)); return 0; } diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 1334214546d7..100ad3dc091a 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -192,60 +192,77 @@ static int get_family_id(int sd) } #define average_ms(t, c) (t / 1000000ULL / (c ? 
c : 1)) +#define delay_ms(t) (t / 1000000ULL) static void print_delayacct(struct taskstats *t) { - printf("\n\nCPU %15s%15s%15s%15s%15s\n" - " %15llu%15llu%15llu%15llu%15.3fms\n" - "IO %15s%15s%15s\n" - " %15llu%15llu%15.3fms\n" - "SWAP %15s%15s%15s\n" - " %15llu%15llu%15.3fms\n" - "RECLAIM %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" - "THRASHING%12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" - "COMPACT %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" - "WPCOPY %12s%15s%15s\n" - " %15llu%15llu%15.3fms\n" - "IRQ %15s%15s%15s\n" - " %15llu%15llu%15.3fms\n", + printf("\n\nCPU %15s%15s%15s%15s%15s%15s\n" + " %15llu%15llu%15llu%15llu%15.3fms%13.6fms\n" + "IO %15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms\n" + "SWAP %15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms\n" + "RECLAIM %12s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms\n" + "THRASHING%12s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms\n" + "COMPACT %12s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms\n" + "WPCOPY %12s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms\n" + "IRQ %15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms\n", "count", "real total", "virtual total", - "delay total", "delay average", + "delay total", "delay average", "delay max", "delay min", (unsigned long long)t->cpu_count, (unsigned long long)t->cpu_run_real_total, (unsigned long long)t->cpu_run_virtual_total, (unsigned long long)t->cpu_delay_total, average_ms((double)t->cpu_delay_total, t->cpu_count), - "count", "delay total", "delay average", + delay_ms((double)t->cpu_delay_max), + delay_ms((double)t->cpu_delay_min), + "count", "delay total", "delay average", "delay max", "delay min", (unsigned long long)t->blkio_count, (unsigned long long)t->blkio_delay_total, average_ms((double)t->blkio_delay_total, t->blkio_count), - "count", "delay total", "delay average", + delay_ms((double)t->blkio_delay_max), + delay_ms((double)t->blkio_delay_min), + "count", "delay total", "delay average", "delay max", "delay min", (unsigned long long)t->swapin_count, (unsigned long long)t->swapin_delay_total, average_ms((double)t->swapin_delay_total, t->swapin_count), - "count", "delay total", "delay average", + delay_ms((double)t->swapin_delay_max), + delay_ms((double)t->swapin_delay_min), + "count", "delay total", "delay average", "delay max", "delay min", (unsigned long long)t->freepages_count, (unsigned long long)t->freepages_delay_total, average_ms((double)t->freepages_delay_total, t->freepages_count), - "count", "delay total", "delay average", + delay_ms((double)t->freepages_delay_max), + delay_ms((double)t->freepages_delay_min), + "count", "delay total", "delay average", "delay max", "delay min", (unsigned long long)t->thrashing_count, (unsigned long long)t->thrashing_delay_total, average_ms((double)t->thrashing_delay_total, t->thrashing_count), - "count", "delay total", "delay average", + delay_ms((double)t->thrashing_delay_max), + delay_ms((double)t->thrashing_delay_min), + "count", "delay total", "delay average", "delay max", "delay min", (unsigned long long)t->compact_count, (unsigned long long)t->compact_delay_total, average_ms((double)t->compact_delay_total, t->compact_count), - "count", "delay total", "delay average", + delay_ms((double)t->compact_delay_max), + delay_ms((double)t->compact_delay_min), + "count", "delay total", "delay average", "delay max", "delay min", (unsigned long long)t->wpcopy_count, (unsigned long long)t->wpcopy_delay_total, average_ms((double)t->wpcopy_delay_total, t->wpcopy_count), - "count", "delay total", "delay average", + 
delay_ms((double)t->wpcopy_delay_max), + delay_ms((double)t->wpcopy_delay_min), + "count", "delay total", "delay average", "delay max", "delay min", (unsigned long long)t->irq_count, (unsigned long long)t->irq_delay_total, - average_ms((double)t->irq_delay_total, t->irq_count)); + average_ms((double)t->irq_delay_total, t->irq_count), + delay_ms((double)t->irq_delay_max), + delay_ms((double)t->irq_delay_min)); } static void task_context_switch_counts(struct taskstats *t) diff --git a/tools/accounting/procacct.c b/tools/accounting/procacct.c index 90c4a37f53d9..e8dee05a6264 100644 --- a/tools/accounting/procacct.c +++ b/tools/accounting/procacct.c @@ -274,12 +274,11 @@ int main(int argc, char *argv[]) int maskset = 0; char *logfile = NULL; int cfd = 0; - int forking = 0; struct msgtemplate msg; - while (!forking) { - c = getopt(argc, argv, "m:vr:"); + while (1) { + c = getopt(argc, argv, "m:vr:w:"); if (c < 0) break; diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config index b3b00269a52a..b0049be00c70 100644 --- a/tools/testing/kunit/configs/all_tests.config +++ b/tools/testing/kunit/configs/all_tests.config @@ -38,9 +38,6 @@ CONFIG_IWLWIFI=y CONFIG_DAMON=y CONFIG_DAMON_VADDR=y CONFIG_DAMON_PADDR=y -CONFIG_DEBUG_FS=y -CONFIG_DAMON_DBGFS=y -CONFIG_DAMON_DBGFS_DEPRECATED=y CONFIG_REGMAP_BUILD=y diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index cffaf2245d4f..eaff1b036989 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -227,6 +227,7 @@ static void *load_creator(void *ptr) unsigned long index = (3 << RADIX_TREE_MAP_SHIFT) - (1 << order); item_insert_order(tree, index, order); + xa_set_mark(tree, index, XA_MARK_1); item_delete_rcu(tree, index); } } @@ -242,8 +243,11 @@ static void *load_worker(void *ptr) rcu_register_thread(); while (!stop_iteration) { + unsigned long find_index = (2 << RADIX_TREE_MAP_SHIFT) + 1; struct item *item = xa_load(ptr, index); assert(!xa_is_internal(item)); + item = xa_find(ptr, &find_index, index, XA_MARK_1); + assert(!xa_is_internal(item)); } rcu_unregister_thread(); diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore index 2ab675fecb6b..2f0297657c81 100644 --- a/tools/testing/selftests/damon/.gitignore +++ b/tools/testing/selftests/damon/.gitignore @@ -1,6 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -huge_count_read_write -debugfs_target_ids_read_before_terminate_race -debugfs_target_ids_pid_leak access_memory access_memory_even diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 812f656260fb..ecbf07afc6dd 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -1,15 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for damon selftests -TEST_GEN_FILES += huge_count_read_write -TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race -TEST_GEN_FILES += debugfs_target_ids_pid_leak TEST_GEN_FILES += access_memory access_memory_even -TEST_FILES = _chk_dependency.sh _debugfs_common.sh _damon_sysfs.py +TEST_FILES = _chk_dependency.sh _damon_sysfs.py # functionality tests -TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += sysfs.sh TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py TEST_PROGS += damos_quota.py damos_quota_goal.py damos_apply_interval.py @@ -17,11 +13,6 @@ TEST_PROGS += damos_tried_regions.py damon_nr_regions.py 
TEST_PROGS += reclaim.sh lru_sort.sh # regression tests (reproducers of previously found bugs) -TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh -TEST_PROGS += debugfs_duplicate_context_creation.sh -TEST_PROGS += debugfs_rm_non_contexts.sh -TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh -TEST_PROGS += debugfs_target_ids_pid_leak.sh TEST_PROGS += sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py diff --git a/tools/testing/selftests/damon/config b/tools/testing/selftests/damon/config index 0daf38974eb0..a68a9fead5dc 100644 --- a/tools/testing/selftests/damon/config +++ b/tools/testing/selftests/damon/config @@ -1,6 +1,5 @@ CONFIG_DAMON=y CONFIG_DAMON_SYSFS=y -CONFIG_DAMON_DBGFS=y CONFIG_DAMON_PADDR=y CONFIG_DAMON_VADDR=y CONFIG_DAMON_RECLAIM=y diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh deleted file mode 100755 index 902e312bca89..000000000000 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test attrs file -# =============== - -file="$DBGFS/attrs" -orig_content=$(cat "$file") - -test_write_succ "$file" "1 2 3 4 5" "$orig_content" "valid input" -test_write_fail "$file" "1 2 3 4" "$orig_content" "no enough fields" -test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ - "min_nr_regions > max_nr_regions" -test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" -echo "$orig_content" > "$file" diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh deleted file mode 100755 index bd6c22d96ead..000000000000 --- a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test duplicated context creation -# ================================ - -if ! echo foo > "$DBGFS/mk_contexts" -then - echo "context creation failed" - exit 1 -fi - -if echo foo > "$DBGFS/mk_contexts" 2> /dev/null -then - echo "duplicate context creation success" - exit 1 -fi - -if ! 
echo foo > "$DBGFS/rm_contexts" -then - echo "context deletion failed" - exit 1 -fi - -exit 0 diff --git a/tools/testing/selftests/damon/debugfs_empty_targets.sh b/tools/testing/selftests/damon/debugfs_empty_targets.sh deleted file mode 100755 index effbea33dc16..000000000000 --- a/tools/testing/selftests/damon/debugfs_empty_targets.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test empty targets case -# ======================= - -orig_target_ids=$(cat "$DBGFS/target_ids") -echo "" > "$DBGFS/target_ids" - -if [ -f "$DBGFS/monitor_on_DEPRECATED" ] -then - monitor_on_file="$DBGFS/monitor_on_DEPRECATED" -else - monitor_on_file="$DBGFS/monitor_on" -fi - -orig_monitor_on=$(cat "$monitor_on_file") -test_write_fail "$monitor_on_file" "on" "orig_monitor_on" "empty target ids" -echo "$orig_target_ids" > "$DBGFS/target_ids" diff --git a/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh b/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh deleted file mode 100755 index 922cadac2950..000000000000 --- a/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test huge count read write -# ========================== - -dmesg -C - -for file in "$DBGFS/"* -do - ./huge_count_read_write "$file" -done - -if dmesg | grep -q WARNING -then - dmesg - exit 1 -else - exit 0 -fi diff --git a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh deleted file mode 100755 index f3ffeb1343cf..000000000000 --- a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test putting non-ctx files/dirs to rm_contexts file -# =================================================== - -dmesg -C - -for file in "$DBGFS/"* -do - (echo "$(basename "$f")" > "$DBGFS/rm_contexts") &> /dev/null - if dmesg | grep -q BUG - then - dmesg - exit 1 - fi -done diff --git a/tools/testing/selftests/damon/debugfs_schemes.sh b/tools/testing/selftests/damon/debugfs_schemes.sh deleted file mode 100755 index 5b39ab44731c..000000000000 --- a/tools/testing/selftests/damon/debugfs_schemes.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test schemes file -# ================= - -file="$DBGFS/schemes" -orig_content=$(cat "$file") - -test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \ - "$orig_content" "valid input" -test_write_fail "$file" "1 2 -3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines" -test_write_succ "$file" "" "$orig_content" "disabling" -test_write_fail "$file" "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" \ - "$orig_content" "wrong condition ranges" -echo "$orig_content" > "$file" diff --git a/tools/testing/selftests/damon/debugfs_target_ids.sh b/tools/testing/selftests/damon/debugfs_target_ids.sh deleted file mode 100755 index 49aeabdb0aae..000000000000 --- a/tools/testing/selftests/damon/debugfs_target_ids.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test target_ids file -# ==================== - -file="$DBGFS/target_ids" -orig_content=$(cat "$file") - -test_write_succ "$file" "1 2 3 4" "$orig_content" "valid input" -test_write_succ "$file" "1 2 abc 4" "$orig_content" "still valid input" 
-test_content "$file" "$orig_content" "1 2" "non-integer was there" -test_write_succ "$file" "abc 2 3" "$orig_content" "the file allows wrong input" -test_content "$file" "$orig_content" "" "wrong input written" -test_write_succ "$file" "" "$orig_content" "empty input" -test_content "$file" "$orig_content" "" "empty input written" -echo "$orig_content" > "$file" diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c deleted file mode 100644 index 0cc2eef7d142..000000000000 --- a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Author: SeongJae Park <sj@kernel.org> - */ - -#define _GNU_SOURCE - -#include <fcntl.h> -#include <stdbool.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <sys/types.h> -#include <sys/wait.h> -#include <sys/time.h> -#include <unistd.h> - -#define DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids" - -static void write_targetid_exit(void) -{ - int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); - char pid_str[128]; - - snprintf(pid_str, sizeof(pid_str), "%d", getpid()); - write(target_ids_fd, pid_str, sizeof(pid_str)); - close(target_ids_fd); - exit(0); -} - -unsigned long msec_timestamp(void) -{ - struct timeval tv; - - gettimeofday(&tv, NULL); - return tv.tv_sec * 1000UL + tv.tv_usec / 1000; -} - -int main(int argc, char *argv[]) -{ - unsigned long start_ms; - int time_to_run, nr_forks = 0; - - if (argc != 2) { - fprintf(stderr, "Usage: %s <msecs to run>\n", argv[0]); - exit(1); - } - time_to_run = atoi(argv[1]); - - start_ms = msec_timestamp(); - while (true) { - int pid = fork(); - - if (pid < 0) { - fprintf(stderr, "fork() failed\n"); - exit(1); - } - if (pid == 0) - write_targetid_exit(); - wait(NULL); - nr_forks++; - - if (msec_timestamp() - start_ms > time_to_run) - break; - } - printf("%d\n", nr_forks); - return 0; -} diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh deleted file mode 100755 index 31fe33c2b032..000000000000 --- a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -before=$(grep "^pid " /proc/slabinfo | awk '{print $2}') - -nr_leaks=$(./debugfs_target_ids_pid_leak 1000) -expected_after_max=$((before + nr_leaks / 2)) - -after=$(grep "^pid " /proc/slabinfo | awk '{print $2}') - -echo > /sys/kernel/debug/damon/target_ids - -echo "tried $nr_leaks pid leak" -echo "number of active pid slabs: $before -> $after" -echo "(up to $expected_after_max expected)" -if [ $after -gt $expected_after_max ] -then - echo "maybe pids are leaking" - exit 1 -else - exit 0 -fi diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c deleted file mode 100644 index b06f52a8ce2d..000000000000 --- a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Author: SeongJae Park <sj@kernel.org> - */ -#define _GNU_SOURCE - -#include <fcntl.h> -#include <stdbool.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <sys/types.h> -#include <sys/wait.h> -#include <time.h> -#include <unistd.h> - -#define DBGFS_MONITOR_ON "/sys/kernel/debug/damon/monitor_on_DEPRECATED" -#define 
DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids" - -static void turn_damon_on_exit(void) -{ - int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); - int monitor_on_fd = open(DBGFS_MONITOR_ON, O_RDWR); - char pid_str[128]; - - snprintf(pid_str, sizeof(pid_str), "%d", getpid()); - write(target_ids_fd, pid_str, sizeof(pid_str)); - write(monitor_on_fd, "on\n", 3); - close(target_ids_fd); - close(monitor_on_fd); - usleep(1000); - exit(0); -} - -static void try_race(void) -{ - int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); - int pid = fork(); - int buf[256]; - - if (pid < 0) { - fprintf(stderr, "fork() failed\n"); - exit(1); - } - if (pid == 0) - turn_damon_on_exit(); - while (true) { - int status; - - read(target_ids_fd, buf, sizeof(buf)); - if (waitpid(-1, &status, WNOHANG) == pid) - break; - } - close(target_ids_fd); -} - -static inline uint64_t ts_to_ms(struct timespec *ts) -{ - return (uint64_t)ts->tv_sec * 1000 + (uint64_t)ts->tv_nsec / 1000000; -} - -int main(int argc, char *argv[]) -{ - struct timespec start_time, now; - int runtime_ms; - - if (argc != 2) { - fprintf(stderr, "Usage: %s <runtime in ms>\n", argv[0]); - exit(1); - } - runtime_ms = atoi(argv[1]); - clock_gettime(CLOCK_MONOTONIC, &start_time); - while (true) { - try_race(); - clock_gettime(CLOCK_MONOTONIC, &now); - if (ts_to_ms(&now) - ts_to_ms(&start_time) > runtime_ms) - break; - } - return 0; -} diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh deleted file mode 100755 index fc793c4c9aea..000000000000 --- a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -dmesg -C - -./debugfs_target_ids_read_before_terminate_race 5000 - -if dmesg | grep -q dbgfs_target_ids_read -then - dmesg - exit 1 -else - exit 0 -fi diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c deleted file mode 100644 index 53e69a669668..000000000000 --- a/tools/testing/selftests/damon/huge_count_read_write.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Author: SeongJae Park <sj@kernel.org> - */ - -#include <fcntl.h> -#include <stdlib.h> -#include <unistd.h> -#include <stdio.h> - -#pragma GCC diagnostic push -#if __GNUC__ >= 11 && __GNUC_MINOR__ >= 1 -/* Ignore read(2) overflow and write(2) overread compile warnings */ -#pragma GCC diagnostic ignored "-Wstringop-overread" -#pragma GCC diagnostic ignored "-Wstringop-overflow" -#endif - -void write_read_with_huge_count(char *file) -{ - int filedesc = open(file, O_RDWR); - char buf[256]; - int ret; - - printf("%s %s\n", __func__, file); - if (filedesc < 0) { - fprintf(stderr, "failed opening %s\n", file); - exit(1); - } - - write(filedesc, "", 0xfffffffful); - ret = read(filedesc, buf, 0xfffffffful); - close(filedesc); -} - -#pragma GCC diagnostic pop - -int main(int argc, char *argv[]) -{ - if (argc != 2) { - fprintf(stderr, "Usage: %s <file>\n", argv[0]); - exit(1); - } - write_read_with_huge_count(argv[1]); - - return 0; -} diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index a5a72415e37b..66d444ae1676 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -174,7 +174,7 @@ static void test_name(struct __test_metadata *_metadata); \ static inline void 
wrapper_##test_name( \ struct __test_metadata *_metadata, \ - struct __fixture_variant_metadata *variant) \ + struct __fixture_variant_metadata __attribute__((unused)) *variant) \ { \ _metadata->setup_completed = true; \ if (setjmp(_metadata->env) == 0) \ @@ -756,7 +756,7 @@ /* Avoid multiple evaluation of the cases */ \ __typeof__(_expected) __exp = (_expected); \ __typeof__(_seen) __seen = (_seen); \ - if (!(__exp _t __seen)) { \ + if (!(__exp _t (__typeof__(_expected)) __seen)) { \ /* Report with actual signedness to avoid weird output. */ \ switch (is_signed_type(__exp) * 2 + is_signed_type(__seen)) { \ case 0: { \ @@ -965,7 +965,7 @@ static inline void __test_check_assert(struct __test_metadata *t) } struct __test_metadata *__active_test; -static void __timeout_handler(int sig, siginfo_t *info, void *ucontext) +static void __timeout_handler(int sig, siginfo_t *info, void __attribute__((unused)) *ucontext) { struct __test_metadata *t = __active_test; diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 8f01f4da1c0d..121000c28c10 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -27,6 +27,7 @@ protection_keys_64 madv_populate uffd-stress uffd-unit-tests +uffd-wp-mremap mlock-intersect-test mlock-random-test virtual_address_range @@ -36,6 +37,9 @@ map_fixed_noreplace write_to_hugetlbfs hmm-tests memfd_secret +hugetlb_dio +pkey_sighandler_tests_32 +pkey_sighandler_tests_64 soft-dirty split_huge_page_test ksm_tests @@ -49,7 +53,6 @@ va_high_addr_switch hugetlb_fault_after_madv hugetlb_madv_vs_map mseal_test -seal_elf droppable hugetlb_dio pkey_sighandler_tests_32 diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 3de23ea4663f..006ed2e8df87 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -33,9 +33,17 @@ endif # LDLIBS. MAKEFLAGS += --no-builtin-rules -CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +CFLAGS = -Wall -O2 -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +CFLAGS += -Wunreachable-code -Wunused -Wunused-parameter -Wunused-function -Wunused-variable LDLIBS = -lrt -lpthread -lm +# Some distributions (such as Ubuntu) configure GCC so that _FORTIFY_SOURCE is +# automatically enabled at -O1 or above. This triggers various unused-result +# warnings where functions such as read() or write() are called and their +# return value is not checked. Disable _FORTIFY_SOURCE to silence those +# warnings. 
+CFLAGS += -U_FORTIFY_SOURCE + KDIR ?= /lib/modules/$(shell uname -r)/build ifneq (,$(wildcard $(KDIR)/Module.symvers)) ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h)) @@ -75,13 +83,13 @@ TEST_GEN_FILES += mrelease_test TEST_GEN_FILES += mremap_dontunmap TEST_GEN_FILES += mremap_test TEST_GEN_FILES += mseal_test -TEST_GEN_FILES += seal_elf TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += uffd-stress TEST_GEN_FILES += uffd-unit-tests +TEST_GEN_FILES += uffd-wp-mremap TEST_GEN_FILES += split_huge_page_test TEST_GEN_FILES += ksm_tests TEST_GEN_FILES += ksm_functional_tests @@ -152,11 +160,16 @@ $(TEST_GEN_FILES): vm_util.c thp_settings.c $(OUTPUT)/uffd-stress: uffd-common.c $(OUTPUT)/uffd-unit-tests: uffd-common.c +$(OUTPUT)/uffd-wp-mremap: uffd-common.c +$(OUTPUT)/protection_keys: pkey_util.c +$(OUTPUT)/pkey_sighandler_tests: pkey_util.c ifeq ($(ARCH),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) +$(BINARIES_32) $(BINARIES_64): pkey_util.c + define gen-target-rule-32 $(1) $(1)_32: $(OUTPUT)/$(1)_32 .PHONY: $(1) $(1)_32 diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 2c3a0eb6b22d..f6f32a5732e9 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -134,7 +134,7 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size, lseek(fd, 0, SEEK_SET); if (write(fd, init_nr_hugepages, strlen(init_nr_hugepages)) - != strlen(init_nr_hugepages)) { + != (signed long int)strlen(init_nr_hugepages)) { ksft_print_msg("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n", strerror(errno)); goto close_fd; @@ -194,7 +194,7 @@ int set_zero_hugepages(unsigned long *initial_nr_hugepages) return ret; } -int main(int argc, char **argv) +int main(void) { struct rlimit lim; struct map_list *list = NULL, *entry; diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 1238e1c5aae1..e37eb863e66c 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -84,7 +84,7 @@ static void detect_huge_zeropage(void) return; ret = pread(fd, buf, sizeof(buf), 0); - if (ret > 0 && ret < sizeof(buf)) { + if (ret > 0 && (unsigned int)ret < sizeof(buf)) { buf[ret] = 0; enabled = strtoul(buf, NULL, 10); @@ -263,12 +263,14 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb) +static void test_cow_in_parent(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false); } -static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb) +static void test_cow_in_parent_mprotect(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false); } @@ -408,10 +410,11 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork) struct io_uring_cqe *cqe; struct io_uring_sqe *sqe; struct io_uring ring; - ssize_t cur, total; struct iovec iov; char *buf, *tmp; + size_t total; int ret, fd; + ssize_t cur; FILE *file; ret = setup_comm_pipes(&comm_pipes); @@ -515,7 +518,7 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork) goto quit_child; } - if (cqe->res != size) { + if ((unsigned int) cqe->res != size) { 
ksft_test_result_fail("write_fixed failed\n"); goto quit_child; } @@ -529,7 +532,7 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork) ksft_test_result_fail("pread() failed\n"); goto quit_child; } - total += cur; + total += (size_t)cur; } /* Finally, check if we read what we expected. */ @@ -553,12 +556,14 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb) +static void test_iouring_ro(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_iouring(mem, size, false); } -static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb) +static void test_iouring_fork(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_iouring(mem, size, true); } @@ -702,36 +707,38 @@ free_tmp: free(tmp); } -static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb) +static void test_ro_pin_on_shared(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); } -static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb) +static void test_ro_fast_pin_on_shared(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); } static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size, - bool is_hugetlb) + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); } static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size, - bool is_hugetlb) + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); } static void test_ro_pin_on_ro_exclusive(char *mem, size_t size, - bool is_hugetlb) + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); } static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size, - bool is_hugetlb) + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); } @@ -1192,7 +1199,7 @@ static void run_anon_test_case(struct test_case const *test_case) static void run_anon_test_cases(void) { - int i; + unsigned int i; ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); @@ -1420,7 +1427,7 @@ static const struct test_case anon_thp_test_cases[] = { static void run_anon_thp_test_cases(void) { - int i; + unsigned int i; if (!pmdsize) return; @@ -1457,13 +1464,14 @@ static void test_cow(char *mem, const char *smem, size_t size) "Other mapping not modified\n"); free(old); } +//typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); -static void test_ro_pin(char *mem, const char *smem, size_t size) +static void test_ro_pin(char *mem, const char __attribute__((unused)) *smem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST, false); } -static void test_ro_fast_pin(char *mem, const char *smem, size_t size) +static void test_ro_fast_pin(char *mem, const char __attribute__((unused)) *smem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST, true); } @@ -1752,7 +1760,7 @@ static void run_non_anon_test_case(struct non_anon_test_case const *test_case) static void run_non_anon_test_cases(void) { - int i; + unsigned int i; ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n"); @@ -1769,7 +1777,7 @@ static int tests_per_non_anon_test_case(void) return tests; } -int main(int argc, char **argv) +int main(void) { int err; struct 
thp_settings default_settings; diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c index f3d9ecf96890..90ea6377810c 100644 --- a/tools/testing/selftests/mm/droppable.c +++ b/tools/testing/selftests/mm/droppable.c @@ -15,7 +15,7 @@ #include "../kselftest.h" -int main(int argc, char *argv[]) +int main(void) { size_t alloc_size = 134217728; size_t page_size = getpagesize(); diff --git a/tools/testing/selftests/mm/guard-pages.c b/tools/testing/selftests/mm/guard-pages.c index 7cdf815d0d63..fc1165ef2015 100644 --- a/tools/testing/selftests/mm/guard-pages.c +++ b/tools/testing/selftests/mm/guard-pages.c @@ -55,6 +55,12 @@ static int pidfd_open(pid_t pid, unsigned int flags) return syscall(SYS_pidfd_open, pid, flags); } +static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, + size_t n, int advice, unsigned int flags) +{ + return syscall(__NR_process_madvise, pidfd, iovec, n, advice, flags); +} + /* * Enable our signal catcher and try to read/write the specified buffer. The * return value indicates whether the read/write succeeds without a fatal @@ -136,7 +142,7 @@ TEST_F(guard_pages, basic) const unsigned long NUM_PAGES = 10; const unsigned long page_size = self->page_size; char *ptr; - int i; + unsigned int i; ptr = mmap(NULL, NUM_PAGES * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); @@ -419,7 +425,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_EQ(munmap(&ptr_region[99 * page_size], page_size), 0); /* Now guard in one step. */ - count = process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0); + count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0); /* OK we don't have permission to do this, skip. */ if (count == -1 && errno == EPERM) @@ -440,7 +446,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_FALSE(try_read_write_buf(&ptr3[19 * page_size])); /* Now do the same with unguard... */ - count = process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0); + count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0); /* ...and everything should now succeed. */ @@ -990,7 +996,7 @@ TEST_F(guard_pages, fork) MAP_ANON | MAP_PRIVATE, -1, 0); ASSERT_NE(ptr, MAP_FAILED); - /* Establish guard apges in the first 5 pages. */ + /* Establish guard pages in the first 5 pages. */ ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0); pid = fork(); @@ -1029,6 +1035,77 @@ TEST_F(guard_pages, fork) ASSERT_EQ(munmap(ptr, 10 * page_size), 0); } +/* + * Assert expected behaviour after we fork populated ranges of anonymous memory + * and then guard and unguard the range. + */ +TEST_F(guard_pages, fork_cow) +{ + const unsigned long page_size = self->page_size; + char *ptr; + pid_t pid; + unsigned int i; + + /* Map 10 pages. */ + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Populate range. */ + for (i = 0; i < 10 * page_size; i++) { + char chr = 'a' + (i % 26); + + ptr[i] = chr; + } + + pid = fork(); + ASSERT_NE(pid, -1); + if (!pid) { + /* This is the child process now. */ + + /* Ensure the range is as expected. */ + for (i = 0; i < 10 * page_size; i++) { + char expected = 'a' + (i % 26); + char actual = ptr[i]; + + ASSERT_EQ(actual, expected); + } + + /* Establish guard pages across the whole range. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0); + /* Remove it. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0); + + /* + * By removing the guard pages, the page tables will be + * cleared. 
Assert that we are looking at the zero page now. + */ + for (i = 0; i < 10 * page_size; i++) { + char actual = ptr[i]; + + ASSERT_EQ(actual, '\0'); + } + + exit(0); + } + + /* Parent process. */ + + /* Parent simply waits on child. */ + waitpid(pid, NULL, 0); + + /* Ensure the range is unchanged in parent anon range. */ + for (i = 0; i < 10 * page_size; i++) { + char expected = 'a' + (i % 26); + char actual = ptr[i]; + + ASSERT_EQ(actual, expected); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + /* * Assert that forking a process with VMAs that do have VM_WIPEONFORK set * behave as expected. diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index 9423ad439a61..7f1b4ad7fcae 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -444,9 +444,10 @@ static int tests_per_test_case(void) return 3 + nr_hugetlbsizes; } -int main(int argc, char **argv) +int main(void) { - int i, err; + unsigned int i; + int err; pagesize = getpagesize(); nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 141bf63cbe05..3b4db583bd3b 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -796,7 +796,7 @@ TEST_F(hmm, anon_write_hugetlbfs) int ret; default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); - if (default_hsize < 0 || default_hsize*1024 < default_hsize) + if (default_hsize*1024 < default_hsize) SKIP(return, "Huge page size could not be determined"); default_hsize = default_hsize*1024; /* KB to B */ @@ -1579,7 +1579,7 @@ TEST_F(hmm, compound) /* Skip test if we can't allocate a hugetlbfs page. 
*/ default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); - if (default_hsize < 0 || default_hsize*1024 < default_hsize) + if (default_hsize*1024 < default_hsize) SKIP(return, "Huge page size could not be determined"); default_hsize = default_hsize*1024; /* KB to B */ diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c index df366a4d1b92..8d30ebfc9b86 100644 --- a/tools/testing/selftests/mm/hugepage-vmemmap.c +++ b/tools/testing/selftests/mm/hugepage-vmemmap.c @@ -51,7 +51,8 @@ static unsigned long virt_to_pfn(void *addr) static int check_page_flags(unsigned long pfn) { - int fd, i; + int fd; + unsigned int i; unsigned long pageflags; fd = open("/proc/kpageflags", O_RDONLY); @@ -87,7 +88,7 @@ static int check_page_flags(unsigned long pfn) return 0; } -int main(int argc, char **argv) +int main(void) { void *addr; unsigned long pfn; diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index e74107185324..8f527084858d 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -26,7 +26,7 @@ #define validate_free_pages(exp_free) \ do { \ - int fhp = get_free_hugepages(); \ + unsigned int fhp = get_free_hugepages(); \ if (fhp != (exp_free)) { \ printf("Unexpected number of free huge " \ "pages line %d\n", __LINE__); \ @@ -58,7 +58,7 @@ void read_fault_pages(void *addr, unsigned long nr_pages) } } -int main(int argc, char **argv) +int main(int __attribute__((unused)) argc, char **argv) { unsigned long free_hugepages; void *addr, *addr2; diff --git a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c index ba6cc6f9cabc..e2a2bb1989d5 100644 --- a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c +++ b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c @@ -72,7 +72,7 @@ static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size, { char buf[MAX_WRITE_READ_CHUNK_SIZE]; ssize_t ret_count = 0; - ssize_t total_ret_count = 0; + size_t total_ret_count = 0; char val = offset / wr_chunk_size + offset % wr_chunk_size; printf(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset); @@ -83,7 +83,7 @@ static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size, return false; } - while (offset + total_ret_count < len) { + while ((unsigned long)offset + total_ret_count < len) { ret_count = read(fd, buf, wr_chunk_size); if (ret_count == 0) { printf(PREFIX PREFIX "read reach end of the file\n"); @@ -109,7 +109,7 @@ static bool read_hugepage_filemap(int fd, size_t len, { char buf[MAX_WRITE_READ_CHUNK_SIZE]; ssize_t ret_count = 0; - ssize_t total_ret_count = 0; + size_t total_ret_count = 0; char val = 0; printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", diff --git a/tools/testing/selftests/mm/hugetlb-soft-offline.c b/tools/testing/selftests/mm/hugetlb-soft-offline.c index f086f0e04756..cb087303f5ed 100644 --- a/tools/testing/selftests/mm/hugetlb-soft-offline.c +++ b/tools/testing/selftests/mm/hugetlb-soft-offline.c @@ -216,7 +216,7 @@ static void test_soft_offline_common(int enable_soft_offline) enable_soft_offline); } -int main(int argc, char **argv) +int main(void) { ksft_print_header(); ksft_set_plan(2); diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c index db63abe5ee5e..62f368d4c8c1 100644 --- a/tools/testing/selftests/mm/hugetlb_dio.c +++ 
b/tools/testing/selftests/mm/hugetlb_dio.c @@ -63,7 +63,7 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) memset(buffer, 'A', writesize); /* Write the buffer to the file */ - if (write(fd, buffer, writesize) != (writesize)) { + if (write(fd, buffer, writesize) != (signed int)writesize) { munmap(orig_buffer, h_pagesize); close(fd); ksft_exit_fail_perror("Error writing to file\n"); diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c index e2640529dbb2..2b5acb13ee0b 100644 --- a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c +++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c @@ -28,7 +28,7 @@ static void signal_handler(int signal) } /* Touch the memory while it is being madvised() */ -void *touch(void *unused) +void *touch(void __attribute__((unused)) *unused) { char *ptr = (char *)huge_ptr; @@ -41,7 +41,7 @@ void *touch(void *unused) return NULL; } -void *madv(void *unused) +void *madv(void __attribute__((unused)) *unused) { usleep(rand() % 10); @@ -88,7 +88,7 @@ int main(void) MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if ((unsigned long)huge_ptr == -1) + if (huge_ptr == MAP_FAILED) ksft_exit_skip("Failed to allocated huge page\n"); pthread_create(&thread1, NULL, madv, NULL); diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c index 8f122a0f0828..eda38b63e9e8 100644 --- a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -33,7 +33,7 @@ size_t mmap_size; char *huge_ptr; /* Touch the memory while it is being madvised() */ -void *touch(void *unused) +void *touch(void __attribute__((unused)) *unused) { for (int i = 0; i < INLOOP_ITER; i++) huge_ptr[0] = '.'; @@ -41,7 +41,7 @@ void *touch(void *unused) return NULL; } -void *madv(void *unused) +void *madv(void __attribute__((unused)) *unused) { for (int i = 0; i < INLOOP_ITER; i++) madvise(huge_ptr, mmap_size, MADV_DONTNEED); @@ -54,7 +54,7 @@ void *madv(void *unused) * The other hugepage should be flipping from used <-> reserved, because * of madvise(DONTNEED). 
*/ -void *map_extra(void *unused) +void *map_extra(void __attribute__((unused)) *unused) { void *ptr; @@ -100,7 +100,7 @@ int main(void) MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if ((unsigned long)huge_ptr == -1) { + if (huge_ptr == MAP_FAILED) { ksft_test_result_fail("Failed to allocate huge page\n"); return KSFT_FAIL; } diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 8a4d34cce36b..dde7c7fbbac2 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -140,7 +140,7 @@ static void get_finfo(const char *dir) exit(EXIT_FAILURE); } if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, - finfo.dir) >= sizeof(finfo.path)) { + finfo.dir) >= (signed int)sizeof(finfo.path)) { printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } @@ -155,7 +155,7 @@ static void get_finfo(const char *dir) /* Find owning device's queue/read_ahead_kb control */ if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(path)) { + >= (signed int)sizeof(path)) { printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } @@ -169,7 +169,7 @@ static void get_finfo(const char *dir) sizeof(finfo.dev_queue_read_ahead_path), "/sys/dev/block/%d:%d/queue/read_ahead_kb", major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(finfo.dev_queue_read_ahead_path)) { + >= (signed int)sizeof(finfo.dev_queue_read_ahead_path)) { printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } @@ -197,7 +197,7 @@ static void get_finfo(const char *dir) if (snprintf(finfo.dev_queue_read_ahead_path, sizeof(finfo.dev_queue_read_ahead_path), "/sys/block/%s/queue/read_ahead_kb", - str) >= sizeof(finfo.dev_queue_read_ahead_path)) { + str) >= (signed int)sizeof(finfo.dev_queue_read_ahead_path)) { printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } @@ -271,7 +271,7 @@ static void *alloc_mapping(int nr) static void fill_memory(int *p, unsigned long start, unsigned long end) { - int i; + unsigned int i; for (i = start / page_size; i < end / page_size; i++) p[i * page_size / sizeof(*p)] = i + 0xdead0000; @@ -333,10 +333,10 @@ static void *alloc_hpage(struct mem_ops *ops) static void validate_memory(int *p, unsigned long start, unsigned long end) { - int i; + unsigned int i; for (i = start / page_size; i < end / page_size; i++) { - if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { + if ((unsigned int)p[i * page_size / sizeof(*p)] != i + 0xdead0000) { printf("Page %d is corrupted: %#x\n", i, p[i * page_size / sizeof(*p)]); exit(EXIT_FAILURE); @@ -537,7 +537,7 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, static bool wait_for_scan(const char *msg, char *p, int nr_hpages, struct mem_ops *ops) { - int full_scans; + unsigned int full_scans; int timeout = 6; /* 3 seconds */ /* Sanity check */ diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 66b4e111b5a2..4f96126e4e1f 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -306,7 +306,7 @@ static void test_unmerge_zero_pages(void) /* Check if ksm_zero_pages is updated correctly after KSM merging */ pages_expected = size / pagesize; - if (pages_expected != get_my_ksm_zero_pages()) { + if ((signed long)pages_expected != get_my_ksm_zero_pages()) { ksft_test_result_fail("'ksm_zero_pages' updated after 
merging\n"); goto unmap; } @@ -319,7 +319,7 @@ static void test_unmerge_zero_pages(void) /* Check if ksm_zero_pages is updated correctly after unmerging */ pages_expected /= 2; - if (pages_expected != get_my_ksm_zero_pages()) { + if ((signed long)pages_expected != get_my_ksm_zero_pages()) { ksft_test_result_fail("'ksm_zero_pages' updated after unmerging\n"); goto unmap; } @@ -625,7 +625,7 @@ static void test_prot_none(void) { const unsigned int size = 2 * MiB; char *map; - int i; + unsigned int i; ksft_print_msg("[RUN] %s\n", __func__); diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c index b748c48908d9..323cfcb14e4d 100644 --- a/tools/testing/selftests/mm/ksm_tests.c +++ b/tools/testing/selftests/mm/ksm_tests.c @@ -265,8 +265,7 @@ static int ksm_merge_pages(int merge_type, void *addr, size_t size, return 0; } -static int ksm_unmerge_pages(void *addr, size_t size, - struct timespec start_time, int timeout) +static int ksm_unmerge_pages(void *addr, size_t size) { if (madvise(addr, size, MADV_UNMERGEABLE)) { perror("madvise"); @@ -483,7 +482,7 @@ static int get_first_mem_node(void) return get_next_mem_node(numa_max_node()); } -static int check_ksm_numa_merge(int merge_type, int mapping, int prot, int timeout, +static int check_ksm_numa_merge(int merge_type, int timeout, bool merge_across_nodes, size_t page_size) { void *numa1_map_ptr, *numa2_map_ptr; @@ -547,8 +546,7 @@ err_out: return KSFT_FAIL; } -static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot, - int timeout, size_t map_size) +static int ksm_merge_hugepages_time(int merge_type, int timeout, size_t map_size) { void *map_ptr, *map_ptr_orig; struct timespec start_time, end_time; @@ -678,7 +676,7 @@ static int ksm_unmerge_time(int merge_type, int mapping, int prot, int timeout, perror("clock_gettime"); goto err_out; } - if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout)) + if (ksm_unmerge_pages(map_ptr, map_size)) goto err_out; if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { perror("clock_gettime"); @@ -776,7 +774,7 @@ err_out: int main(int argc, char *argv[]) { - int ret, opt; + int ret = 0, opt; int prot = 0; int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; int merge_type = KSM_MERGE_TYPE_DEFAULT; @@ -906,8 +904,8 @@ int main(int argc, char *argv[]) page_size); break; case CHECK_KSM_NUMA_MERGE: - ret = check_ksm_numa_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, - ksm_scan_limit_sec, merge_across_nodes, page_size); + ret = check_ksm_numa_merge(merge_type, ksm_scan_limit_sec, merge_across_nodes, + page_size); break; case KSM_MERGE_TIME: if (size_MB == 0) { @@ -922,8 +920,7 @@ int main(int argc, char *argv[]) printf("Option '-s' is required.\n"); return KSFT_FAIL; } - ret = ksm_merge_hugepages_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, - ksm_scan_limit_sec, size_MB); + ret = ksm_merge_hugepages_time(merge_type, ksm_scan_limit_sec, size_MB); break; case KSM_UNMERGE_TIME: if (size_MB == 0) { diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c index ef7d911da13e..c6a3ee56a54a 100644 --- a/tools/testing/selftests/mm/madv_populate.c +++ b/tools/testing/selftests/mm/madv_populate.c @@ -281,7 +281,7 @@ static int system_has_softdirty(void) #endif } -int main(int argc, char **argv) +int main(void) { int nr_tests = 16; int err; diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c index 5c8a53869b1b..0dd849b4affa 100644 --- 
a/tools/testing/selftests/mm/map_populate.c +++ b/tools/testing/selftests/mm/map_populate.c @@ -74,7 +74,7 @@ static int child_f(int sock, unsigned long *smap, int fd) return ksft_cnt.ksft_pass; } -int main(int argc, char **argv) +int main(void) { int sock[2], child, ret; FILE *ftmp; diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c index 74c911aa3aea..f9d728e18678 100644 --- a/tools/testing/selftests/mm/memfd_secret.c +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -121,7 +121,7 @@ close_pipe: close(pipefd[1]); } -static void try_process_vm_read(int fd, int pipefd[2]) +static void try_process_vm_read(int __attribute__((unused)) fd, int pipefd[2]) { struct iovec liov, riov; char buf[64]; @@ -145,7 +145,7 @@ static void try_process_vm_read(int fd, int pipefd[2]) exit(KSFT_FAIL); } -static void try_ptrace(int fd, int pipefd[2]) +static void try_ptrace(int __attribute__((unused)) fd, int pipefd[2]) { pid_t ppid = getppid(); int status; @@ -297,7 +297,7 @@ static void prepare(void) #define NUM_TESTS 6 -int main(int argc, char *argv[]) +int main(void) { int fd; diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index 64bcbb7151cf..1e3a595fbf01 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -204,4 +204,103 @@ TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME) ASSERT_EQ(pthread_cancel(self->threads[i]), 0); } +/* + * migration test with shared anon THP page + */ + +TEST_F_TIMEOUT(migration, shared_anon_thp, 2*RUNTIME) +{ + pid_t pid; + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, 2 * TWOMEG, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG); + ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) { + pid = fork(); + if (!pid) { + prctl(PR_SET_PDEATHSIG, SIGHUP); + /* Parent may have died before prctl so check now. 
*/ + if (getppid() == 1) + kill(getpid(), SIGHUP); + access_mem(ptr); + } else { + self->pids[i] = pid; + } + } + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); +} + +/* + * migration test with private anon hugetlb page + */ +TEST_F_TIMEOUT(migration, private_anon_htlb, 2*RUNTIME) +{ + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) + if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) + perror("Couldn't create thread"); + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(pthread_cancel(self->threads[i]), 0); +} + +/* + * migration test with shared anon hugetlb page + */ +TEST_F_TIMEOUT(migration, shared_anon_htlb, 2*RUNTIME) +{ + pid_t pid; + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) { + pid = fork(); + if (!pid) { + prctl(PR_SET_PDEATHSIG, SIGHUP); + /* Parent may have died before prctl so check now. */ + if (getppid() == 1) + kill(getpid(), SIGHUP); + access_mem(ptr); + } else { + self->pids[i] = pid; + } + } + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c index 1cd80b0f76c3..f410699458f2 100644 --- a/tools/testing/selftests/mm/mlock-random-test.c +++ b/tools/testing/selftests/mm/mlock-random-test.c @@ -138,7 +138,7 @@ static void test_mlock_within_limit(char *p, int alloc_size) int page_size = 0; getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur < alloc_size) + if (cur.rlim_cur < (unsigned int)alloc_size) ksft_exit_fail_msg("alloc_size[%d] < %u rlimit,lead to mlock failure\n", alloc_size, (unsigned int)cur.rlim_cur); @@ -204,7 +204,7 @@ static void test_mlock_outof_limit(char *p, int alloc_size) struct rlimit cur; getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur >= alloc_size) + if (cur.rlim_cur >= (unsigned int)alloc_size) ksft_exit_fail_msg("alloc_size[%d] >%u rlimit, violates test condition\n", alloc_size, (unsigned int)cur.rlim_cur); @@ -236,7 +236,7 @@ static void test_mlock_outof_limit(char *p, int alloc_size) ksft_test_result_pass("%s\n", __func__); } -int main(int argc, char **argv) +int main(void) { char *p = NULL; diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c index 7f0d50fa361d..358711e8191f 100644 --- a/tools/testing/selftests/mm/mlock2-tests.c +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -425,7 +425,7 @@ static void test_mlockall(void) munlockall(); } -int main(int argc, char **argv) +int main(void) { int ret, size = 3 * getpagesize(); void *map; diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 5a3a9bcba640..bb84476a177f 100644 --- a/tools/testing/selftests/mm/mremap_test.c 
+++ b/tools/testing/selftests/mm/mremap_test.c @@ -34,7 +34,7 @@ struct config { unsigned long long dest_alignment; unsigned long long region_size; int overlapping; - int dest_preamble_size; + unsigned int dest_preamble_size; }; struct test { @@ -328,7 +328,7 @@ static void mremap_move_within_range(unsigned int pattern_seed, char *rand_addr) { char *test_name = "mremap mremap move within range"; void *src, *dest; - int i, success = 1; + unsigned int i, success = 1; size_t size = SIZE_MB(20); void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, @@ -384,7 +384,7 @@ out: static long long remap_region(struct config c, unsigned int threshold_mb, char *rand_addr) { - void *addr, *src_addr, *dest_addr, *dest_preamble_addr; + void *addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; @@ -569,7 +569,7 @@ static void mremap_move_1mb_from_start(unsigned int pattern_seed, { char *test_name = "mremap move 1mb from start at 1MB+256KB aligned src"; void *src = NULL, *dest = NULL; - int i, success = 1; + unsigned int i, success = 1; /* Config to reuse get_source_mapping() to do an aligned mmap. */ struct config c = { @@ -636,7 +636,7 @@ out: static void run_mremap_test_case(struct test test_case, int *failures, unsigned int threshold_mb, - unsigned int pattern_seed, char *rand_addr) + char *rand_addr) { long long remap_time = remap_region(test_case.config, threshold_mb, rand_addr); @@ -708,7 +708,8 @@ static int parse_args(int argc, char **argv, unsigned int *threshold_mb, int main(int argc, char **argv) { int failures = 0; - int i, run_perf_tests; + unsigned int i; + int run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; /* hard-coded test configs */ @@ -831,7 +832,7 @@ int main(int argc, char **argv) for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, - pattern_seed, rand_addr); + rand_addr); maps_fp = fopen("/proc/self/maps", "r"); @@ -853,7 +854,7 @@ int main(int argc, char **argv) "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) run_mremap_test_case(perf_test_cases[i], &failures, - threshold_mb, pattern_seed, + threshold_mb, rand_addr); } diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index 01675c412b2a..ad17005521a8 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -802,7 +802,7 @@ static void test_seal_mprotect_partial_mprotect_tail(bool seal) } -static void test_seal_mprotect_two_vma_with_gap(bool seal) +static void test_seal_mprotect_two_vma_with_gap(void) { void *ptr; unsigned long page_size = getpagesize(); @@ -1864,7 +1864,7 @@ static void test_seal_madvise_nodiscard(bool seal) REPORT_TEST_PASS(); } -int main(int argc, char **argv) +int main(void) { bool test_seal = seal_support(); @@ -1913,8 +1913,8 @@ int main(int argc, char **argv) test_seal_mprotect_partial_mprotect(false); test_seal_mprotect_partial_mprotect(true); - test_seal_mprotect_two_vma_with_gap(false); - test_seal_mprotect_two_vma_with_gap(true); + test_seal_mprotect_two_vma_with_gap(); + test_seal_mprotect_two_vma_with_gap(); test_seal_mprotect_merge(false); test_seal_mprotect_merge(true); diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c index 431c1277d83a..ade160966c92 100644 --- 
a/tools/testing/selftests/mm/on-fault-limit.c +++ b/tools/testing/selftests/mm/on-fault-limit.c @@ -28,7 +28,7 @@ static void test_limit(void) munlockall(); } -int main(int argc, char **argv) +int main(void) { ksft_print_header(); ksft_set_plan(1); diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index bcc73b4e805c..57b4bba2b45f 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -34,8 +34,8 @@ #define PAGEMAP "/proc/self/pagemap" int pagemap_fd; int uffd; -int page_size; -int hpage_size; +unsigned int page_size; +unsigned int hpage_size; const char *progname; #define LEN(region) ((region.end - region.start)/page_size) @@ -235,7 +235,9 @@ int get_reads(struct page_region *vec, int vec_size) int sanity_tests_sd(void) { - int mem_size, vec_size, ret, ret2, ret3, i, num_pages = 1000, total_pages = 0; + unsigned long long mem_size, vec_size, i, total_pages = 0; + long ret, ret2, ret3; + int num_pages = 1000; int total_writes, total_reads, reads, count; struct page_region *vec, *vec2; char *mem, *m[2]; @@ -321,9 +323,9 @@ int sanity_tests_sd(void) ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); - ksft_test_result(ret == mem_size/(page_size * 2), + ksft_test_result((unsigned long long)ret == mem_size/(page_size * 2), "%s Repeated pattern of written and non-written pages\n", __func__); /* 4. Repeated pattern of written and non-written pages in parts */ @@ -331,21 +333,21 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, num_pages/2 - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ret2 = pagemap_ioctl(mem, mem_size, vec, 2, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret2 < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret2, errno, strerror(errno)); ret3 = pagemap_ioctl(mem, mem_size, vec, vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret3 < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret3, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret3, errno, strerror(errno)); ksft_test_result((ret + ret3) == num_pages/2 && ret2 == 2, - "%s Repeated pattern of written and non-written pages in parts %d %d %d\n", + "%s Repeated pattern of written and non-written pages in parts %ld %ld %ld\n", __func__, ret, ret3, ret2); /* 5. 
Repeated pattern of written and non-written pages max_pages */ @@ -357,13 +359,13 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, num_pages/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ret2 = pagemap_ioctl(mem, mem_size, vec, vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret2 < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret2, errno, strerror(errno)); ksft_test_result(ret == num_pages/2 && ret2 == 1, "%s Repeated pattern of written and non-written pages max_pages\n", @@ -378,12 +380,12 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ret2 = pagemap_ioctl(mem, mem_size, vec2, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret2 < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret2, errno, strerror(errno)); ksft_test_result(ret == 1 && LEN(vec[0]) == 2 && vec[0].start == (uintptr_t)(mem + page_size) && @@ -416,7 +418,7 @@ int sanity_tests_sd(void) ret = pagemap_ioctl(m[1], mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && LEN(vec[0]) == mem_size/page_size, "%s Two regions\n", __func__); @@ -448,7 +450,7 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); for (i = 0; i < mem_size/page_size; i += 2) mem[i * page_size]++; @@ -457,7 +459,7 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); total_pages += ret; @@ -465,7 +467,7 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); total_pages += ret; @@ -473,7 +475,7 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); total_pages += ret; @@ -515,9 +517,9 @@ int sanity_tests_sd(void) vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); - if (ret > vec_size) + if ((unsigned long)ret > vec_size) break; 
reads = get_reads(vec, ret); @@ -554,63 +556,63 @@ int sanity_tests_sd(void) ret = pagemap_ioc(mem, 0, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 0 && walk_end == (long)mem, "Walk_end: Same start and end address\n"); ret = pagemap_ioc(mem, 0, vec, vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 0 && walk_end == (long)mem, "Walk_end: Same start and end with WP\n"); ret = pagemap_ioc(mem, 0, vec, 0, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 0 && walk_end == (long)mem, "Walk_end: Same start and end with 0 output buffer\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), "Walk_end: Big vec\n"); ret = pagemap_ioc(mem, mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), "Walk_end: vec of minimum length\n"); ret = pagemap_ioc(mem, mem_size, vec, 1, 0, vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), "Walk_end: Max pages specified\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size/2), "Walk_end: Half max pages\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, 1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size), "Walk_end: 1 max page\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, -1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), "Walk_end: max pages\n"); @@ -621,49 +623,49 @@ int sanity_tests_sd(void) ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - 
ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); - ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); + ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size), "Walk_end sparse: Big vec\n"); ret = pagemap_ioc(mem, mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), "Walk_end sparse: vec of minimum length\n"); ret = pagemap_ioc(mem, mem_size, vec, 1, 0, vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), "Walk_end sparse: Max pages specified\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size/2, 0, vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); - ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); + ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size), "Walk_end sparse: Max pages specified\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); - ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); + ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size), "Walk_end sparse: Max pages specified\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); - ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); + ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size), "Walk_endsparse : Half max pages\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, 1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), "Walk_end: 1 max page\n"); @@ -674,9 +676,10 @@ int sanity_tests_sd(void) return 0; } -int base_tests(char *prefix, char *mem, int mem_size, int skip) +int base_tests(char *prefix, char *mem, unsigned long long mem_size, int skip) { - int vec_size, written; + unsigned long long vec_size; + int written; struct page_region *vec, *vec2; if (skip) { @@ -799,8 +802,8 @@ int hpage_unit_tests(void) char *map; int ret, ret2; size_t num_pages = 10; - int map_size = hpage_size * num_pages; - int vec_size = map_size/page_size; + unsigned long long map_size = hpage_size * num_pages; + unsigned long long vec_size = map_size/page_size; struct page_region *vec, *vec2; vec = malloc(sizeof(struct page_region) 
* vec_size); @@ -1047,7 +1050,8 @@ static void test_simple(void) int sanity_tests(void) { - int mem_size, vec_size, ret, fd, i, buf_size; + unsigned long long mem_size, vec_size; + int ret, fd, i, buf_size; struct page_region *vec; char *mem, *fmem; struct stat sbuf; @@ -1312,7 +1316,9 @@ static ssize_t get_dirty_pages_reset(char *mem, unsigned int count, { struct pm_scan_arg arg = {0}; struct page_region rgns[256]; - int i, j, cnt, ret; + unsigned long long i, j; + long ret; + int cnt; arg.size = sizeof(struct pm_scan_arg); arg.start = (uintptr_t)mem; @@ -1330,7 +1336,7 @@ static ssize_t get_dirty_pages_reset(char *mem, unsigned int count, ksft_exit_fail_msg("ioctl failed\n"); cnt = 0; - for (i = 0; i < ret; ++i) { + for (i = 0; i < (unsigned long)ret; ++i) { if (rgns[i].categories != PAGE_IS_WRITTEN) ksft_exit_fail_msg("wrong flags\n"); @@ -1384,9 +1390,10 @@ void *thread_proc(void *mem) static void transact_test(int page_size) { unsigned int i, count, extra_pages; + unsigned int c; pthread_t th; char *mem; - int ret, c; + int ret; if (pthread_barrier_init(&start_barrier, NULL, nthreads + 1)) ksft_exit_fail_msg("pthread_barrier_init\n"); @@ -1405,9 +1412,9 @@ static void transact_test(int page_size) memset(mem, 0, 0x1000 * nthreads * pages_per_thread); count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); - ksft_test_result(count > 0, "%s count %d\n", __func__, count); + ksft_test_result(count > 0, "%s count %u\n", __func__, count); count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); - ksft_test_result(count == 0, "%s count %d\n", __func__, count); + ksft_test_result(count == 0, "%s count %u\n", __func__, count); finish = 0; for (i = 0; i < nthreads; ++i) @@ -1429,7 +1436,7 @@ static void transact_test(int page_size) ksft_exit_fail_msg("pthread_barrier_wait\n"); if (count > nthreads * access_per_thread) - ksft_exit_fail_msg("Too big count %d expected %d, iter %d\n", + ksft_exit_fail_msg("Too big count %u expected %u, iter %u\n", count, nthreads * access_per_thread, i); c = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); @@ -1454,7 +1461,7 @@ static void transact_test(int page_size) * access and application gets page fault again for the same write. 
*/ if (count < nthreads * access_per_thread) { - ksft_test_result_fail("Lost update, iter %d, %d vs %d.\n", i, count, + ksft_test_result_fail("Lost update, iter %u, %u vs %u.\n", i, count, nthreads * access_per_thread); return; } @@ -1467,15 +1474,16 @@ static void transact_test(int page_size) finish = 1; pthread_barrier_wait(&end_barrier); - ksft_test_result_pass("%s Extra pages %u (%.1lf%%), extra thread faults %d.\n", __func__, + ksft_test_result_pass("%s Extra pages %u (%.1lf%%), extra thread faults %u.\n", __func__, extra_pages, 100.0 * extra_pages / (iter_count * nthreads * access_per_thread), extra_thread_faults); } -int main(int argc, char *argv[]) +int main(int __attribute__((unused)) argc, char *argv[]) { - int mem_size, shmid, buf_size, fd, i, ret; + int shmid, buf_size, fd, i, ret; + unsigned long long mem_size; char *mem, *map, *fmem; struct stat sbuf; diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h index d9d2100eafc0..8e9685e03c44 100644 --- a/tools/testing/selftests/mm/pkey-arm64.h +++ b/tools/testing/selftests/mm/pkey-arm64.h @@ -30,7 +30,7 @@ #define NR_PKEYS 8 #define NR_RESERVED_PKEYS 1 /* pkey-0 */ -#define PKEY_ALLOW_ALL 0x77777777 +#define PKEY_REG_ALLOW_ALL 0x77777777 #define PKEY_REG_ALLOW_NONE 0x0 #define PKEY_BITS_PER_PKEY 4 @@ -81,11 +81,11 @@ static inline int get_arch_reserved_keys(void) return NR_RESERVED_PKEYS; } -void expect_fault_on_read_execonly_key(void *p1, int pkey) +static inline void expect_fault_on_read_execonly_key(void *p1, int pkey) { } -void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +static inline void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) { return PTR_ERR_ENOTSUP; } diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index f7cfe163b0ff..f080e97b39be 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -13,22 +13,22 @@ #include <ucontext.h> #include <sys/mman.h> +#include <linux/types.h> + #include "../kselftest.h" /* Define some kernel-like types */ -#define u8 __u8 -#define u16 __u16 -#define u32 __u32 -#define u64 __u64 +typedef __u8 u8; +typedef __u16 u16; +typedef __u32 u32; +typedef __u64 u64; #define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) #ifndef DEBUG_LEVEL #define DEBUG_LEVEL 0 #endif -#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 extern int dprint_in_signal; -extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; extern int test_nr; extern int iteration_nr; @@ -83,17 +83,18 @@ extern void abort_hooks(void); #ifndef noinline # define noinline __attribute__((noinline)) #endif +#ifndef __maybe_unused +# define __maybe_unused __attribute__((__unused__)) +#endif -noinline int read_ptr(int *ptr) -{ - /* Keep GCC from optimizing this away somehow */ - barrier(); - return *ptr; -} - -void expected_pkey_fault(int pkey); int sys_pkey_alloc(unsigned long flags, unsigned long init_val); int sys_pkey_free(unsigned long pkey); +int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); + +/* For functions called from protection_keys.c only */ +noinline int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, unsigned long pkey); void record_pkey_malloc(void *ptr, long size, int prot); @@ -171,38 +172,6 @@ static inline void write_pkey_reg(u64 pkey_reg) pkey_reg, __read_pkey_reg()); } -/* - * These are technically racy. 
since something could - * change PKEY register between the read and the write. - */ -static inline void __pkey_access_allow(int pkey, int do_allow) -{ - u64 pkey_reg = read_pkey_reg(); - int bit = pkey * 2; - - if (do_allow) - pkey_reg &= (1<<bit); - else - pkey_reg |= (1<<bit); - - dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); - write_pkey_reg(pkey_reg); -} - -static inline void __pkey_write_allow(int pkey, int do_allow_write) -{ - u64 pkey_reg = read_pkey_reg(); - int bit = pkey * 2 + 1; - - if (do_allow_write) - pkey_reg &= (1<<bit); - else - pkey_reg |= (1<<bit); - - write_pkey_reg(pkey_reg); - dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); -} - #define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) #define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) #define ALIGN_PTR_UP(p, ptr_align_to) \ diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h index 3d0c0bdae5bc..1bad310d282a 100644 --- a/tools/testing/selftests/mm/pkey-powerpc.h +++ b/tools/testing/selftests/mm/pkey-powerpc.h @@ -91,7 +91,7 @@ static inline int get_arch_reserved_keys(void) return NR_RESERVED_PKEYS_64K_3KEYS; } -void expect_fault_on_read_execonly_key(void *p1, int pkey) +static inline void expect_fault_on_read_execonly_key(void *p1, int pkey) { /* * powerpc does not allow userspace to change permissions of exec-only @@ -105,7 +105,7 @@ void expect_fault_on_read_execonly_key(void *p1, int pkey) /* 4-byte instructions * 16384 = 64K page */ #define __page_o_noops() asm(".rept 16384 ; nop; .endr") -void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +static inline void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) { void *ptr; int ret; diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h index ac91777c8917..f7ecd335df1e 100644 --- a/tools/testing/selftests/mm/pkey-x86.h +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -113,7 +113,7 @@ static inline u32 pkey_bit_position(int pkey) #define XSTATE_PKEY 0x200 #define XSTATE_BV_OFFSET 512 -int pkey_reg_xstate_offset(void) +static inline int pkey_reg_xstate_offset(void) { unsigned int eax; unsigned int ebx; @@ -148,7 +148,7 @@ static inline int get_arch_reserved_keys(void) return NR_RESERVED_PKEYS; } -void expect_fault_on_read_execonly_key(void *p1, int pkey) +static inline void expect_fault_on_read_execonly_key(void *p1, int pkey) { int ptr_contents; @@ -157,7 +157,7 @@ void expect_fault_on_read_execonly_key(void *p1, int pkey) expected_pkey_fault(pkey); } -void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +static inline void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) { return PTR_ERR_ENOTSUP; } diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c index c593a426341c..600ef57f4baa 100644 --- a/tools/testing/selftests/mm/pkey_sighandler_tests.c +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -32,11 +32,9 @@ #define STACK_SIZE PTHREAD_STACK_MIN -void expected_pkey_fault(int pkey) {} - -pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; -pthread_cond_t cond = PTHREAD_COND_INITIALIZER; -siginfo_t siginfo = {0}; +static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +static siginfo_t siginfo = {0}; /* * We need to use inline assembly instead of glibc's syscall because glibc's @@ -163,7 +161,7 @@ static void *thread_segv_with_pkey0_disabled(void *ptr) 
__write_pkey_reg(pkey_reg_restrictive_default()); /* Segfault (with SEGV_MAPERR) */ - *(int *) (0x1) = 1; + *(volatile int *)NULL = 1; return NULL; } @@ -179,7 +177,6 @@ static void *thread_segv_pkuerr_stack(void *ptr) static void *thread_segv_maperr_ptr(void *ptr) { stack_t *stack = ptr; - int *bad = (int *)1; u64 pkey_reg; /* @@ -195,7 +192,7 @@ static void *thread_segv_maperr_ptr(void *ptr) __write_pkey_reg(pkey_reg); /* Segfault */ - *bad = 1; + *(volatile int *)NULL = 1; syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); return NULL; } @@ -234,7 +231,7 @@ static void test_sigsegv_handler_with_pkey0_disabled(void) ksft_test_result(siginfo.si_signo == SIGSEGV && siginfo.si_code == SEGV_MAPERR && - siginfo.si_addr == (void *)1, + siginfo.si_addr == NULL, "%s\n", __func__); } @@ -314,11 +311,11 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) __write_pkey_reg(pkey_reg); /* Protect the new stack with MPK 1 */ - pkey = pkey_alloc(0, 0); - pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + pkey = sys_pkey_alloc(0, 0); + sys_mprotect_pkey(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); /* Set up alternate signal stack that will use the default MPK */ - sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); sigstack.ss_flags = 0; sigstack.ss_size = STACK_SIZE; @@ -349,7 +346,7 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) ksft_test_result(siginfo.si_signo == SIGSEGV && siginfo.si_code == SEGV_MAPERR && - siginfo.si_addr == (void *)1, + siginfo.si_addr == NULL, "%s\n", __func__); } @@ -487,11 +484,11 @@ static void test_pkru_sigreturn(void) __write_pkey_reg(pkey_reg); /* Protect the stack with MPK 2 */ - pkey = pkey_alloc(0, 0); - pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + pkey = sys_pkey_alloc(0, 0); + sys_mprotect_pkey(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); /* Set up alternate signal stack that will use the default MPK */ - sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); sigstack.ss_flags = 0; sigstack.ss_size = STACK_SIZE; @@ -531,13 +528,16 @@ static void (*pkey_tests[])(void) = { test_pkru_sigreturn }; -int main(int argc, char *argv[]) +int main(void) { - int i; + unsigned int i; ksft_print_header(); ksft_set_plan(ARRAY_SIZE(pkey_tests)); + if (!is_pkeys_supported()) + ksft_exit_skip("pkeys not supported\n"); + for (i = 0; i < ARRAY_SIZE(pkey_tests); i++) (*pkey_tests[i])(); diff --git a/tools/testing/selftests/mm/pkey_util.c b/tools/testing/selftests/mm/pkey_util.c new file mode 100644 index 000000000000..ca4ad0d44ab2 --- /dev/null +++ b/tools/testing/selftests/mm/pkey_util.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <sys/syscall.h> +#include <unistd.h> + +#include "pkey-helpers.h" + +int sys_pkey_alloc(unsigned long flags, unsigned long init_val) +{ + int ret = syscall(SYS_pkey_alloc, flags, init_val); + dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", + __func__, flags, init_val, ret, errno); + return ret; +} + +int sys_pkey_free(unsigned long pkey) +{ + int ret = syscall(SYS_pkey_free, pkey); + dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); + return ret; +} + +int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int sret; + + dprintf2("%s(0x%p, %zx, 
prot=%lx, pkey=%lx)\n", __func__, + ptr, size, orig_prot, pkey); + + errno = 0; + sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey); + if (errno) { + dprintf2("SYS_mprotect_key sret: %d\n", sret); + dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); + dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); + if (DEBUG_LEVEL >= 2) + perror("SYS_mprotect_pkey"); + } + return sret; +} diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 4990f7ab4cb7..28960634044c 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -53,9 +53,15 @@ int test_nr; u64 shadow_pkey_reg; int dprint_in_signal; -char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -void cat_into_file(char *str, char *file) +noinline int read_ptr(int *ptr) +{ + /* Keep GCC from optimizing this away somehow */ + barrier(); + return *ptr; +} + +static void cat_into_file(char *str, char *file) { int fd = open(file, O_RDWR); int ret; @@ -72,7 +78,7 @@ void cat_into_file(char *str, char *file) } ret = write(fd, str, strlen(str)); - if (ret != strlen(str)) { + if (ret != (signed int)strlen(str)) { perror("write to file failed"); fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); exit(__LINE__); @@ -82,7 +88,7 @@ void cat_into_file(char *str, char *file) #if CONTROL_TRACING > 0 static int warned_tracing; -int tracing_root_ok(void) +static int tracing_root_ok(void) { if (geteuid() != 0) { if (!warned_tracing) @@ -95,7 +101,7 @@ int tracing_root_ok(void) } #endif -void tracing_on(void) +static void tracing_on(void) { #if CONTROL_TRACING > 0 #define TRACEDIR "/sys/kernel/tracing" @@ -119,7 +125,7 @@ void tracing_on(void) #endif } -void tracing_off(void) +static void tracing_off(void) { #if CONTROL_TRACING > 0 if (!tracing_root_ok()) @@ -153,7 +159,7 @@ __attribute__((__aligned__(65536))) #else __attribute__((__aligned__(PAGE_SIZE))) #endif -void lots_o_noops_around_write(int *write_to_me) +static void lots_o_noops_around_write(int *write_to_me) { dprintf3("running %s()\n", __func__); __page_o_noops(); @@ -164,7 +170,7 @@ void lots_o_noops_around_write(int *write_to_me) dprintf3("%s() done\n", __func__); } -void dump_mem(void *dumpme, int len_bytes) +static void dump_mem(void *dumpme, int len_bytes) { char *c = (void *)dumpme; int i; @@ -207,7 +213,7 @@ static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) return 0; } -void pkey_disable_set(int pkey, int flags) +static void pkey_disable_set(int pkey, int flags) { unsigned long syscall_flags = 0; int ret; @@ -245,7 +251,7 @@ void pkey_disable_set(int pkey, int flags) pkey, flags); } -void pkey_disable_clear(int pkey, int flags) +static void pkey_disable_clear(int pkey, int flags) { unsigned long syscall_flags = 0; int ret; @@ -271,19 +277,19 @@ void pkey_disable_clear(int pkey, int flags) pkey, read_pkey_reg()); } -void pkey_write_allow(int pkey) +__maybe_unused static void pkey_write_allow(int pkey) { pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); } -void pkey_write_deny(int pkey) +__maybe_unused static void pkey_write_deny(int pkey) { pkey_disable_set(pkey, PKEY_DISABLE_WRITE); } -void pkey_access_allow(int pkey) +__maybe_unused static void pkey_access_allow(int pkey) { pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); } -void pkey_access_deny(int pkey) +__maybe_unused static void pkey_access_deny(int pkey) { pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); } @@ -301,9 +307,9 @@ static char *si_code_str(int si_code) return "UNKNOWN"; } 
-int pkey_faults; -int last_si_pkey = -1; -void signal_handler(int signum, siginfo_t *si, void *vucontext) +static int pkey_faults; +static int last_si_pkey = -1; +static void signal_handler(int signum, siginfo_t *si, void *vucontext) { ucontext_t *uctxt = vucontext; int trapno; @@ -390,27 +396,21 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) /* restore access and let the faulting instruction continue */ pkey_access_allow(siginfo_pkey); #elif defined(__aarch64__) - aarch64_write_signal_pkey(uctxt, PKEY_ALLOW_ALL); + aarch64_write_signal_pkey(uctxt, PKEY_REG_ALLOW_ALL); #endif /* arch */ pkey_faults++; dprintf1("<<<<==================================================\n"); dprint_in_signal = 0; } -int wait_all_children(void) -{ - int status; - return waitpid(-1, &status, 0); -} - -void sig_chld(int x) +static void sig_chld(int x) { dprint_in_signal = 1; dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); dprint_in_signal = 0; } -void setup_sigsegv_handler(void) +static void setup_sigsegv_handler(void) { int r, rs; struct sigaction newact; @@ -436,13 +436,13 @@ void setup_sigsegv_handler(void) pkey_assert(r == 0); } -void setup_handlers(void) +static void setup_handlers(void) { signal(SIGCHLD, &sig_chld); setup_sigsegv_handler(); } -pid_t fork_lazy_child(void) +static pid_t fork_lazy_child(void) { pid_t forkret; @@ -460,35 +460,7 @@ pid_t fork_lazy_child(void) return forkret; } -int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, - unsigned long pkey) -{ - int sret; - - dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, - ptr, size, orig_prot, pkey); - - errno = 0; - sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey); - if (errno) { - dprintf2("SYS_mprotect_key sret: %d\n", sret); - dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); - dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); - if (DEBUG_LEVEL >= 2) - perror("SYS_mprotect_pkey"); - } - return sret; -} - -int sys_pkey_alloc(unsigned long flags, unsigned long init_val) -{ - int ret = syscall(SYS_pkey_alloc, flags, init_val); - dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", - __func__, flags, init_val, ret, errno); - return ret; -} - -int alloc_pkey(void) +static int alloc_pkey(void) { int ret; unsigned long init_val = 0x0; @@ -534,19 +506,12 @@ int alloc_pkey(void) return ret; } -int sys_pkey_free(unsigned long pkey) -{ - int ret = syscall(SYS_pkey_free, pkey); - dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); - return ret; -} - /* * I had a bug where pkey bits could be set by mprotect() but * not cleared. This ensures we get lots of random bit sets * and clears on the vma and pte pkey bits. 
*/ -int alloc_random_pkey(void) +static int alloc_random_pkey(void) { int max_nr_pkey_allocs; int ret; @@ -629,13 +594,13 @@ struct pkey_malloc_record { }; struct pkey_malloc_record *pkey_malloc_records; struct pkey_malloc_record *pkey_last_malloc_record; -long nr_pkey_malloc_records; +static long nr_pkey_malloc_records; void record_pkey_malloc(void *ptr, long size, int prot) { - long i; + unsigned long i; struct pkey_malloc_record *rec = NULL; - for (i = 0; i < nr_pkey_malloc_records; i++) { + for (i = 0; i < (unsigned long)nr_pkey_malloc_records; i++) { rec = &pkey_malloc_records[i]; /* find a free record */ if (rec) @@ -667,7 +632,7 @@ void record_pkey_malloc(void *ptr, long size, int prot) nr_pkey_malloc_records++; } -void free_pkey_malloc(void *ptr) +static void free_pkey_malloc(void *ptr) { long i; int ret; @@ -694,8 +659,7 @@ void free_pkey_malloc(void *ptr) pkey_assert(false); } - -void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) +static void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) { void *ptr; int ret; @@ -715,7 +679,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) return ptr; } -void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) +static void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) { int ret; void *ptr; @@ -745,10 +709,10 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) return ptr; } -int hugetlb_setup_ok; +static int hugetlb_setup_ok; #define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" #define GET_NR_HUGE_PAGES 10 -void setup_hugetlbfs(void) +static void setup_hugetlbfs(void) { int err; int fd; @@ -796,7 +760,7 @@ void setup_hugetlbfs(void) hugetlb_setup_ok = 1; } -void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) +static void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) { void *ptr; int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; @@ -817,42 +781,15 @@ void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) return ptr; } -void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) -{ - void *ptr; - int fd; - - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - pkey_assert(pkey < NR_PKEYS); - fd = open("/dax/foo", O_RDWR); - pkey_assert(fd >= 0); - - ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); - pkey_assert(ptr != (void *)-1); - - mprotect_pkey(ptr, size, prot, pkey); - - record_pkey_malloc(ptr, size, prot); - - dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); - close(fd); - return ptr; -} - -void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { +static void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { malloc_pkey_with_mprotect, malloc_pkey_with_mprotect_subpage, malloc_pkey_anon_huge, malloc_pkey_hugetlb -/* can not do direct with the pkey_mprotect() API: - malloc_pkey_mmap_direct, - malloc_pkey_mmap_dax, -*/ }; -void *malloc_pkey(long size, int prot, u16 pkey) +static void *malloc_pkey(long size, int prot, u16 pkey) { void *ret; static int malloc_type; @@ -882,7 +819,7 @@ void *malloc_pkey(long size, int prot, u16 pkey) return ret; } -int last_pkey_faults; +static int last_pkey_faults; #define UNKNOWN_PKEY -2 void expected_pkey_fault(int pkey) { @@ -905,7 +842,7 @@ void expected_pkey_fault(int pkey) */ if (__read_pkey_reg() != 0) #elif defined(__aarch64__) - if (__read_pkey_reg() != PKEY_ALLOW_ALL) + if (__read_pkey_reg() != PKEY_REG_ALLOW_ALL) #else if (__read_pkey_reg() != shadow_pkey_reg) #endif /* arch */ @@ -924,24 +861,24 @@ void expected_pkey_fault(int pkey) 
pkey_assert(last_pkey_faults == pkey_faults); \ } while (0) -int test_fds[10] = { -1 }; -int nr_test_fds; -void __save_test_fd(int fd) +static int test_fds[10] = { -1 }; +static int nr_test_fds; +static void __save_test_fd(int fd) { pkey_assert(fd >= 0); - pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); + pkey_assert(nr_test_fds < (signed int)ARRAY_SIZE(test_fds)); test_fds[nr_test_fds] = fd; nr_test_fds++; } -int get_test_read_fd(void) +static int get_test_read_fd(void) { int test_fd = open("/etc/passwd", O_RDONLY); __save_test_fd(test_fd); return test_fd; } -void close_test_fds(void) +static void close_test_fds(void) { int i; @@ -954,13 +891,13 @@ void close_test_fds(void) nr_test_fds = 0; } -void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +static void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) { int i, err; int max_nr_pkey_allocs; int alloced_pkeys[NR_PKEYS]; int nr_alloced = 0; - long size; + unsigned long size; pkey_assert(pkey_last_malloc_record); size = pkey_last_malloc_record->size; @@ -1006,7 +943,7 @@ void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) pkey_assert(!err); } -void test_read_of_write_disabled_region(int *ptr, u16 pkey) +static void test_read_of_write_disabled_region(int *ptr, u16 pkey) { int ptr_contents; @@ -1016,7 +953,7 @@ void test_read_of_write_disabled_region(int *ptr, u16 pkey) dprintf1("*ptr: %d\n", ptr_contents); dprintf1("\n"); } -void test_read_of_access_disabled_region(int *ptr, u16 pkey) +static void test_read_of_access_disabled_region(int *ptr, u16 pkey) { int ptr_contents; @@ -1028,7 +965,7 @@ void test_read_of_access_disabled_region(int *ptr, u16 pkey) expected_pkey_fault(pkey); } -void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, +static void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, u16 pkey) { int ptr_contents; @@ -1045,7 +982,7 @@ void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, expected_pkey_fault(pkey); } -void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, +static void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, u16 pkey) { *ptr = __LINE__; @@ -1056,14 +993,14 @@ void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, expected_pkey_fault(pkey); } -void test_write_of_write_disabled_region(int *ptr, u16 pkey) +static void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); pkey_write_deny(pkey); *ptr = __LINE__; expected_pkey_fault(pkey); } -void test_write_of_access_disabled_region(int *ptr, u16 pkey) +static void test_write_of_access_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); pkey_access_deny(pkey); @@ -1071,7 +1008,7 @@ void test_write_of_access_disabled_region(int *ptr, u16 pkey) expected_pkey_fault(pkey); } -void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, +static void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, u16 pkey) { *ptr = __LINE__; @@ -1082,7 +1019,7 @@ void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, expected_pkey_fault(pkey); } -void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) +static void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) { int ret; int test_fd = get_test_read_fd(); @@ -1094,7 +1031,8 @@ void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) dprintf1("read ret: %d\n", 
ret); pkey_assert(ret); } -void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) + +static void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) { int ret; int test_fd = get_test_read_fd(); @@ -1107,7 +1045,7 @@ void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) pkey_assert(ret); } -void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) +static void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) { int pipe_ret, vmsplice_ret; struct iovec iov; @@ -1129,7 +1067,7 @@ void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) close(pipe_fds[1]); } -void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) +static void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) { int ignored = 0xdada; int futex_ret; @@ -1147,7 +1085,7 @@ void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) } /* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) +static void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) { int err; int i; @@ -1170,7 +1108,7 @@ void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) } /* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) +static void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) { int err; int bad_pkey = NR_PKEYS+99; @@ -1180,7 +1118,7 @@ void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) pkey_assert(err); } -void become_child(void) +static void become_child(void) { pid_t forkret; @@ -1196,7 +1134,7 @@ void become_child(void) } /* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_alloc_exhaust(int *ptr, u16 pkey) +static void test_pkey_alloc_exhaust(int *ptr, u16 pkey) { int err; int allocated_pkeys[NR_PKEYS] = {0}; @@ -1263,7 +1201,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) } } -void arch_force_pkey_reg_init(void) +static void arch_force_pkey_reg_init(void) { #if defined(__i386__) || defined(__x86_64__) /* arch */ u64 *buf; @@ -1302,7 +1240,7 @@ void arch_force_pkey_reg_init(void) * a long-running test that continually checks the pkey * register. */ -void test_pkey_init_state(int *ptr, u16 pkey) +static void test_pkey_init_state(int *ptr, u16 pkey) { int err; int allocated_pkeys[NR_PKEYS] = {0}; @@ -1340,9 +1278,9 @@ void test_pkey_init_state(int *ptr, u16 pkey) * have to call pkey_alloc() to use it first. Make sure that it * is usable. 
*/ -void test_mprotect_with_pkey_0(int *ptr, u16 pkey) +static void test_mprotect_with_pkey_0(int *ptr, u16 pkey) { - long size; + unsigned long size; int prot; assert(pkey_last_malloc_record); @@ -1364,7 +1302,7 @@ void test_mprotect_with_pkey_0(int *ptr, u16 pkey) mprotect_pkey(ptr, size, prot, pkey); } -void test_ptrace_of_child(int *ptr, u16 pkey) +static void test_ptrace_of_child(int *ptr, u16 pkey) { __attribute__((__unused__)) int peek_result; pid_t child_pid; @@ -1440,7 +1378,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) free(plain_ptr_unaligned); } -void *get_pointer_to_instructions(void) +static void *get_pointer_to_instructions(void) { void *p1; @@ -1461,7 +1399,7 @@ void *get_pointer_to_instructions(void) return p1; } -void test_executing_on_unreadable_memory(int *ptr, u16 pkey) +static void test_executing_on_unreadable_memory(int *ptr, u16 pkey) { void *p1; int scratch; @@ -1493,7 +1431,7 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) pkey_assert(!ret); } -void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) +static void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) { void *p1; int scratch; @@ -1542,7 +1480,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) } #if defined(__i386__) || defined(__x86_64__) -void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +static void test_ptrace_modifies_pkru(int *ptr, u16 pkey) { u32 new_pkru; pid_t child; @@ -1590,7 +1528,7 @@ void test_ptrace_modifies_pkru(int *ptr, u16 pkey) pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); xsave = (void *)malloc(xsave_size); - pkey_assert(xsave > 0); + pkey_assert(xsave != NULL); /* Modify the PKRU register directly */ iov.iov_base = xsave; @@ -1665,7 +1603,7 @@ void test_ptrace_modifies_pkru(int *ptr, u16 pkey) #endif #if defined(__aarch64__) -void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +static void test_ptrace_modifies_pkru(int *ptr, u16 pkey) { pid_t child; int status, ret; @@ -1742,7 +1680,7 @@ void test_ptrace_modifies_pkru(int *ptr, u16 pkey) } #endif -void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) +static void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) { int size = PAGE_SIZE; int sret; @@ -1756,7 +1694,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) pkey_assert(sret < 0); } -void (*pkey_tests[])(int *ptr, u16 pkey) = { +static void (*pkey_tests[])(int *ptr, u16 pkey) = { test_read_of_write_disabled_region, test_read_of_access_disabled_region, test_read_of_access_disabled_region_with_page_already_mapped, @@ -1782,12 +1720,12 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { #endif }; -void run_tests_once(void) +static void run_tests_once(void) { int *ptr; int prot = PROT_READ|PROT_WRITE; - for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { + for (test_nr = 0; test_nr < (signed int)ARRAY_SIZE(pkey_tests); test_nr++) { int pkey; int orig_pkey_faults = pkey_faults; @@ -1816,7 +1754,7 @@ void run_tests_once(void) iteration_nr++; } -void pkey_setup_shadow(void) +static void pkey_setup_shadow(void) { shadow_pkey_reg = __read_pkey_reg(); } diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 2fc290d9430c..333c468c2699 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -45,6 +45,8 @@ separated by spaces: vmalloc smoke tests - hmm hmm smoke tests +- madv_guard + test madvise(2) MADV_GUARD_INSTALL and MADV_GUARD_REMOVE options - madv_populate test 
memadvise(2) MADV_POPULATE_{READ,WRITE} options - memfd_secret @@ -307,6 +309,7 @@ CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 3 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem-private 20 16 +CATEGORY="userfaultfd" run_test ./uffd-wp-mremap #cleanup echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages @@ -375,6 +378,9 @@ CATEGORY="mremap" run_test ./mremap_dontunmap CATEGORY="hmm" run_test bash ./test_hmm.sh smoke +# MADV_GUARD_INSTALL and MADV_GUARD_REMOVE tests +CATEGORY="madv_guard" run_test ./guard-pages + # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c deleted file mode 100644 index d9f8ba8d5050..000000000000 --- a/tools/testing/selftests/mm/seal_elf.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE -#include <sys/mman.h> -#include <stdint.h> -#include <asm-generic/unistd.h> -#include <string.h> -#include <sys/time.h> -#include <sys/resource.h> -#include <stdbool.h> -#include "../kselftest.h" -#include <syscall.h> -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> -#include <fcntl.h> -#include <sys/ioctl.h> -#include <sys/vfs.h> -#include <sys/stat.h> -#include "mseal_helpers.h" - -/* - * define sys_xyx to call syscall directly. - */ -static int sys_mseal(void *start, size_t len) -{ - int sret; - - errno = 0; - sret = syscall(__NR_mseal, start, len, 0); - return sret; -} - -static inline int sys_mprotect(void *ptr, size_t size, unsigned long prot) -{ - int sret; - - errno = 0; - sret = syscall(__NR_mprotect, ptr, size, prot); - return sret; -} - -static bool seal_support(void) -{ - int ret; - void *ptr; - unsigned long page_size = getpagesize(); - - ptr = mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (ptr == (void *) -1) - return false; - - ret = sys_mseal(ptr, page_size); - if (ret < 0) - return false; - - return true; -} - -const char somestr[4096] = {"READONLY"}; - -static void test_seal_elf(void) -{ - int ret; - FILE *maps; - char line[512]; - uintptr_t addr_start, addr_end; - char prot[5]; - char filename[256]; - unsigned long page_size = getpagesize(); - unsigned long long ptr = (unsigned long long) somestr; - char *somestr2 = (char *)somestr; - - /* - * Modify the protection of readonly somestr - */ - if (((unsigned long long)ptr % page_size) != 0) - ptr = (unsigned long long)ptr & ~(page_size - 1); - - ksft_print_msg("somestr = %s\n", somestr); - ksft_print_msg("change protection to rw\n"); - ret = sys_mprotect((void *)ptr, page_size, PROT_READ|PROT_WRITE); - FAIL_TEST_IF_FALSE(!ret); - *somestr2 = 'A'; - ksft_print_msg("somestr is modified to: %s\n", somestr); - ret = sys_mprotect((void *)ptr, page_size, PROT_READ); - FAIL_TEST_IF_FALSE(!ret); - - maps = fopen("/proc/self/maps", "r"); - FAIL_TEST_IF_FALSE(maps); - - /* - * apply sealing to elf binary - */ - while (fgets(line, sizeof(line), maps)) { - if (sscanf(line, "%lx-%lx %4s %*x %*x:%*x %*u %255[^\n]", - &addr_start, &addr_end, prot, filename) == 4) { - if (strlen(filename)) { - /* - * seal the mapping if read only. 
- */ - if (strstr(prot, "r-")) { - ret = sys_mseal((void *)addr_start, addr_end - addr_start); - FAIL_TEST_IF_FALSE(!ret); - ksft_print_msg("sealed: %lx-%lx %s %s\n", - addr_start, addr_end, prot, filename); - if ((uintptr_t) somestr >= addr_start && - (uintptr_t) somestr <= addr_end) - ksft_print_msg("mapping for somestr found\n"); - } - } - } - } - fclose(maps); - - ret = sys_mprotect((void *)ptr, page_size, PROT_READ | PROT_WRITE); - FAIL_TEST_IF_FALSE(ret < 0); - ksft_print_msg("somestr is sealed, mprotect is rejected\n"); - - REPORT_TEST_PASS(); -} - -int main(int argc, char **argv) -{ - bool test_seal = seal_support(); - - ksft_print_header(); - ksft_print_msg("pid=%d\n", getpid()); - - if (!test_seal) - ksft_exit_skip("sealing not supported, check CONFIG_64BIT\n"); - - ksft_set_plan(1); - - test_seal_elf(); - - ksft_finished(); -} diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index bdfa5d085f00..68edb2475ccd 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -74,10 +74,10 @@ static void test_vma_reuse(int pagemap_fd, int pagesize) munmap(map2, pagesize); } -static void test_hugepage(int pagemap_fd, int pagesize) +static void test_hugepage(int pagemap_fd) { char *map; - int i, ret; + unsigned int i, ret; size_t hpage_len = read_pmd_pagesize(); if (!hpage_len) @@ -128,7 +128,7 @@ static void test_mprotect(int pagemap_fd, int pagesize, bool anon) { const char *type[] = {"file", "anon"}; const char *fname = "./soft-dirty-test-file"; - int test_fd; + int test_fd = 0; char *map; if (anon) { @@ -187,7 +187,7 @@ static void test_mprotect_file(int pagemap_fd, int pagesize) test_mprotect(pagemap_fd, pagesize, false); } -int main(int argc, char **argv) +int main(void) { int pagemap_fd; int pagesize; @@ -203,7 +203,7 @@ int main(int argc, char **argv) test_simple(pagemap_fd, pagesize); test_vma_reuse(pagemap_fd, pagesize); - test_hugepage(pagemap_fd, pagesize); + test_hugepage(pagemap_fd); test_mprotect_anon(pagemap_fd, pagesize); test_mprotect_file(pagemap_fd, pagesize); diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index eb6d1b9fc362..3d3bc40a268b 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -103,43 +103,33 @@ static char *allocate_zero_filled_hugepage(size_t len) return result; } -static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len) +static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, size_t len) { unsigned long rss_anon_before, rss_anon_after; size_t i; - if (!check_huge_anon(one_page, 4, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 4, pmd_pagesize)) + ksft_exit_fail_msg("No THP is allocated\n"); rss_anon_before = rss_anon(); - if (!rss_anon_before) { - printf("No RssAnon is allocated before split\n"); - exit(EXIT_FAILURE); - } + if (!rss_anon_before) + ksft_exit_fail_msg("No RssAnon is allocated before split\n"); /* split all THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, (uint64_t)one_page + len, 0); for (i = 0; i < len; i++) - if (one_page[i] != (char)0) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } + if (one_page[i] != (char)0) + ksft_exit_fail_msg("%ld byte corrupted\n", i); - if (!check_huge_anon(one_page, 0, pmd_pagesize)) { - printf("Still AnonHugePages not split\n"); - 
exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 0, pmd_pagesize)) + ksft_exit_fail_msg("Still AnonHugePages not split\n"); rss_anon_after = rss_anon(); - if (rss_anon_after >= rss_anon_before) { - printf("Incorrect RssAnon value. Before: %ld After: %ld\n", + if (rss_anon_after >= rss_anon_before) + ksft_exit_fail_msg("Incorrect RssAnon value. Before: %ld After: %ld\n", rss_anon_before, rss_anon_after); - exit(EXIT_FAILURE); - } } void split_pmd_zero_pages(void) @@ -149,12 +139,12 @@ void split_pmd_zero_pages(void) size_t len = nr_hpages * pmd_pagesize; one_page = allocate_zero_filled_hugepage(len); - verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); - printf("Split zero filled huge pages successful\n"); + verify_rss_anon_split_huge_page_all_zeroes(one_page, len); + ksft_test_result_pass("Split zero filled huge pages successful\n"); free(one_page); } -void split_pmd_thp(void) +void split_pmd_thp_to_order(int order) { char *one_page; size_t len = 4 * pmd_pagesize; @@ -174,7 +164,7 @@ void split_pmd_thp(void) /* split all THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, - (uint64_t)one_page + len, 0); + (uint64_t)one_page + len, order); for (i = 0; i < len; i++) if (one_page[i] != (char)i) @@ -184,7 +174,7 @@ void split_pmd_thp(void) if (!check_huge_anon(one_page, 0, pmd_pagesize)) ksft_exit_fail_msg("Still AnonHugePages not split\n"); - ksft_test_result_pass("Split huge pages successful\n"); + ksft_test_result_pass("Split huge pages to order %d successful\n", order); free(one_page); } @@ -491,7 +481,7 @@ int main(int argc, char **argv) if (argc > 1) optional_xfs_path = argv[1]; - ksft_set_plan(3+9); + ksft_set_plan(1+9+2+9); pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; @@ -502,7 +492,10 @@ int main(int argc, char **argv) fd_size = 2 * pmd_pagesize; split_pmd_zero_pages(); - split_pmd_thp(); + + for (i = 0; i < 9; i++) + split_pmd_thp_to_order(i); + split_pte_mapped_thp(); split_file_backed_thp(); diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c index 577eaab6266f..ad872af1c81a 100644 --- a/tools/testing/selftests/mm/thp_settings.c +++ b/tools/testing/selftests/mm/thp_settings.c @@ -87,7 +87,7 @@ int write_file(const char *path, const char *buf, size_t buflen) return (unsigned int) numwritten; } -const unsigned long read_num(const char *path) +unsigned long read_num(const char *path) { char buf[21]; @@ -172,7 +172,7 @@ void thp_write_string(const char *name, const char *val) } } -const unsigned long thp_read_num(const char *name) +unsigned long thp_read_num(const char *name) { char path[PATH_MAX]; int ret; diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h index 876235a23460..fc131d23d593 100644 --- a/tools/testing/selftests/mm/thp_settings.h +++ b/tools/testing/selftests/mm/thp_settings.h @@ -64,12 +64,12 @@ struct thp_settings { int read_file(const char *path, char *buf, size_t buflen); int write_file(const char *path, const char *buf, size_t buflen); -const unsigned long read_num(const char *path); +unsigned long read_num(const char *path); void write_num(const char *path, unsigned long num); int thp_read_string(const char *name, const char * const strings[]); void thp_write_string(const char *name, const char *val); -const unsigned long thp_read_num(const char *name); +unsigned long thp_read_num(const char *name); void thp_write_num(const char *name, unsigned long num); void thp_write_settings(struct thp_settings *settings); diff --git 
a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index e4370b79b62f..515b89ac4eb5 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -65,7 +65,7 @@ void show(unsigned long ps) { char buf[100]; - if (ps == getpagesize()) + if ((signed long)ps == getpagesize()) return; ksft_print_msg("%luMB: ", ps >> 20); @@ -106,7 +106,7 @@ unsigned long read_sysfs(int warn, char *fmt, ...) unsigned long read_free(unsigned long ps) { - return read_sysfs(ps != getpagesize(), + return read_sysfs((signed long)ps != getpagesize(), "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", ps >> 10); } @@ -126,7 +126,7 @@ void test_mmap(unsigned long size, unsigned flags) after = read_free(size); show(size); - ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, + ksft_test_result((signed long)size == getpagesize() || (before - after) == NUM_PAGES, "%s mmap\n", __func__); if (munmap(map, size * NUM_PAGES)) @@ -164,7 +164,7 @@ void test_shmget(unsigned long size, unsigned flags) after = read_free(size); show(size); - ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, + ksft_test_result((signed long)size == getpagesize() || (before - after) == NUM_PAGES, "%s: mmap\n", __func__); if (shmdt(map)) ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno)); @@ -173,7 +173,7 @@ void test_shmget(unsigned long size, unsigned flags) void find_pagesizes(void) { unsigned long largest = getpagesize(); - int i; + unsigned int i; glob_t g; glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 717539eddf98..47bdcb47481a 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -46,7 +46,7 @@ static void anon_release_pages(char *rel_area) err("madvise(MADV_DONTNEED) failed"); } -static int anon_allocate_area(void **alloc_area, bool is_src) +static int anon_allocate_area(void **alloc_area, bool __attribute__((unused)) is_src) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); @@ -57,7 +57,9 @@ static int anon_allocate_area(void **alloc_area, bool is_src) return 0; } -static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) +static void noop_alias_mapping(__u64 __attribute__((unused)) *start, + size_t __attribute__((unused)) len, + unsigned long __attribute__((unused)) offset) { } @@ -108,7 +110,8 @@ static int hugetlb_allocate_area(void **alloc_area, bool is_src) return 0; } -static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) +static void hugetlb_alias_mapping(__u64 *start, size_t __attribute__((unused)) len, + unsigned long offset) { if (!map_shared) return; @@ -167,12 +170,13 @@ static int shmem_allocate_area(void **alloc_area, bool is_src) return 0; } -static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) +static void shmem_alias_mapping(__u64 *start, size_t __attribute__((unused)) len, + unsigned long offset) { *start = (unsigned long)area_dst_alias + offset; } -static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +static void shmem_check_pmd_mapping(void __attribute__((unused)) *p, int expect_nr_hpages) { if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, read_pmd_pagesize())) @@ -416,7 +420,7 @@ static void continue_range(int ufd, __u64 start, __u64 len, bool wp) ret, (int64_t) 
req.mapped); } -int uffd_read_msg(int ufd, struct uffd_msg *msg) +int uffd_read_msg(struct uffd_msg *msg) { int ret = read(uffd, msg, sizeof(*msg)); @@ -446,7 +450,7 @@ void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) args->wp_faults++; } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { uint8_t *area; - int b; + unsigned int b; /* * Minor page faults @@ -537,7 +541,7 @@ void *uffd_poll_thread(void *arg) } if (!(pollfd[0].revents & POLLIN)) err("pollfd[0].revents %d", pollfd[0].revents); - if (uffd_read_msg(uffd, &msg)) + if (uffd_read_msg(&msg)) continue; switch (msg.event) { default: @@ -617,7 +621,7 @@ int __copy_page(int ufd, unsigned long offset, bool retry, bool wp) err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); wake_range(ufd, uffdio_copy.dst, page_size); - } else if (uffdio_copy.copy != page_size) { + } else if (uffdio_copy.copy != (signed long)page_size) { err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); } else { if (test_uffdio_copy_eexist && retry) { @@ -651,7 +655,7 @@ int move_page(int ufd, unsigned long offset, unsigned long len) err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move); wake_range(ufd, uffdio_move.dst, len); - } else if (uffdio_move.move != len) { + } else if (uffdio_move.move != (signed long)len) { err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move); } else return 1; diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index a70ae10b5f62..4a5d5b37107c 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -117,7 +117,7 @@ void uffd_stats_report(struct uffd_args *args, int n_cpus); int uffd_test_ctx_init(uint64_t features, const char **errmsg); void uffd_test_ctx_clear(void); int userfaultfd_open(uint64_t *features); -int uffd_read_msg(int ufd, struct uffd_msg *msg); +int uffd_read_msg(struct uffd_msg *msg); void wp_range(int ufd, __u64 start, __u64 len, bool wp); void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args); int __copy_page(int ufd, unsigned long offset, bool retry, bool wp); diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index a4b83280998a..5509ec32c329 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -77,7 +77,7 @@ static void usage(void) static void uffd_stats_reset(struct uffd_args *args, unsigned long n_cpus) { - int i; + unsigned int i; for (i = 0; i < n_cpus; i++) { args[i].cpu = i; @@ -136,7 +136,7 @@ static void *uffd_read_thread(void *arg) /* from here cancellation is ok */ for (;;) { - if (uffd_read_msg(uffd, &msg)) + if (uffd_read_msg(&msg)) continue; uffd_handle_page_fault(&msg, args); } diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index a2e71b1636e7..c3d59ec75404 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -244,7 +244,7 @@ static void *fork_event_consumer(void *data) ready_for_fork = true; /* Read until a full msg received */ - while (uffd_read_msg(args->parent_uffd, &msg)); + while (uffd_read_msg(&msg)); if (msg.event != UFFD_EVENT_FORK) err("wrong message: %u\n", msg.event); @@ -357,7 +357,7 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin) return result; } -static void uffd_wp_unpopulated_test(uffd_test_args_t *args) +static void uffd_wp_unpopulated_test(uffd_test_args_t 
__attribute__((unused)) *args) { uint64_t value; int pagemap_fd; @@ -483,8 +483,7 @@ static void uffd_wp_fork_with_event_test(uffd_test_args_t *args) uffd_wp_fork_test_common(args, true); } -static void uffd_wp_fork_pin_test_common(uffd_test_args_t *args, - bool with_event) +static void uffd_wp_fork_pin_test_common(bool with_event) { int pagemap_fd; pin_args pin_args = {}; @@ -535,14 +534,14 @@ out: close(pagemap_fd); } -static void uffd_wp_fork_pin_test(uffd_test_args_t *args) +static void uffd_wp_fork_pin_test(uffd_test_args_t __attribute__((unused)) *args) { - uffd_wp_fork_pin_test_common(args, false); + uffd_wp_fork_pin_test_common(false); } -static void uffd_wp_fork_pin_with_event_test(uffd_test_args_t *args) +static void uffd_wp_fork_pin_with_event_test(uffd_test_args_t __attribute__((unused)) *args) { - uffd_wp_fork_pin_test_common(args, true); + uffd_wp_fork_pin_test_common(true); } static void check_memory_contents(char *p) @@ -627,24 +626,25 @@ static void uffd_minor_test_common(bool test_collapse, bool test_wp) uffd_test_pass(); } -void uffd_minor_test(uffd_test_args_t *args) +void uffd_minor_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_minor_test_common(false, false); } -void uffd_minor_wp_test(uffd_test_args_t *args) +void uffd_minor_wp_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_minor_test_common(false, true); } -void uffd_minor_collapse_test(uffd_test_args_t *args) +void uffd_minor_collapse_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_minor_test_common(true, false); } static sigjmp_buf jbuf, *sigbuf; -static void sighndl(int sig, siginfo_t *siginfo, void *ptr) +static void sighndl(int sig, siginfo_t __attribute__((unused)) *siginfo, + void __attribute__((unused)) *ptr) { if (sig == SIGBUS) { if (sigbuf) @@ -820,12 +820,12 @@ static void uffd_sigbus_test_common(bool wp) uffd_test_pass(); } -static void uffd_sigbus_test(uffd_test_args_t *args) +static void uffd_sigbus_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_sigbus_test_common(false); } -static void uffd_sigbus_wp_test(uffd_test_args_t *args) +static void uffd_sigbus_wp_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_sigbus_test_common(true); } @@ -873,12 +873,12 @@ static void uffd_events_test_common(bool wp) uffd_test_pass(); } -static void uffd_events_test(uffd_test_args_t *args) +static void uffd_events_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_events_test_common(false); } -static void uffd_events_wp_test(uffd_test_args_t *args) +static void uffd_events_wp_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_events_test_common(true); } @@ -917,7 +917,7 @@ static bool do_uffdio_zeropage(int ufd, bool has_zeropage) else if (res != -EINVAL) err("UFFDIO_ZEROPAGE not -EINVAL"); } else if (has_zeropage) { - if (res != page_size) + if (res != (signed long)page_size) err("UFFDIO_ZEROPAGE unexpected size"); else retry_uffdio_zeropage(ufd, &uffdio_zeropage); @@ -946,10 +946,10 @@ uffd_register_detect_zeropage(int uffd, void *addr, uint64_t len) } /* exercise UFFDIO_ZEROPAGE */ -static void uffd_zeropage_test(uffd_test_args_t *args) +static void uffd_zeropage_test(uffd_test_args_t __attribute__((unused)) *args) { bool has_zeropage; - int i; + unsigned int i; has_zeropage = uffd_register_detect_zeropage(uffd, area_dst, page_size); if (area_dst_alias) @@ -997,12 +997,12 @@ static void do_uffdio_poison(int uffd, unsigned long offset) if (ret) err("UFFDIO_POISON error: %"PRId64, (int64_t)res); - else if (res != page_size) + else if (res 
!= (signed long)page_size) err("UFFDIO_POISON unexpected size: %"PRId64, (int64_t)res); } static void uffd_poison_handle_fault( - struct uffd_msg *msg, struct uffd_args *args) + struct uffd_msg *msg, struct uffd_args __attribute__((unused)) *args) { unsigned long offset; @@ -1023,7 +1023,7 @@ static void uffd_poison_handle_fault( do_uffdio_poison(uffd, offset); } -static void uffd_poison_test(uffd_test_args_t *targs) +static void uffd_poison_test(uffd_test_args_t __attribute__((unused)) *targs) { pthread_t uffd_mon; char c; @@ -1114,7 +1114,7 @@ static void uffd_move_pmd_handle_fault(struct uffd_msg *msg, } static void -uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, +uffd_move_test_common(unsigned long chunk_size, void (*handle_fault)(struct uffd_msg *msg, struct uffd_args *args)) { unsigned long nr; @@ -1122,7 +1122,7 @@ uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, char c; unsigned long long count; struct uffd_args args = { 0 }; - char *orig_area_src, *orig_area_dst; + char *orig_area_src = NULL, *orig_area_dst = NULL; unsigned long step_size, step_count; unsigned long src_offs = 0; unsigned long dst_offs = 0; @@ -1190,7 +1190,7 @@ uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, nr, count, count_verify[src_offs + nr + i]); } } - if (step_size > page_size) { + if (chunk_size > page_size) { area_src = orig_area_src; area_dst = orig_area_dst; } @@ -1206,24 +1206,24 @@ uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, uffd_test_pass(); } -static void uffd_move_test(uffd_test_args_t *targs) +static void uffd_move_test(uffd_test_args_t __attribute__((unused)) *targs) { - uffd_move_test_common(targs, page_size, uffd_move_handle_fault); + uffd_move_test_common(page_size, uffd_move_handle_fault); } -static void uffd_move_pmd_test(uffd_test_args_t *targs) +static void uffd_move_pmd_test(uffd_test_args_t __attribute__((unused)) *targs) { if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) err("madvise(MADV_HUGEPAGE) failure"); - uffd_move_test_common(targs, read_pmd_pagesize(), + uffd_move_test_common(read_pmd_pagesize(), uffd_move_pmd_handle_fault); } -static void uffd_move_pmd_split_test(uffd_test_args_t *targs) +static void uffd_move_pmd_split_test(uffd_test_args_t __attribute__((unused)) *targs) { if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) err("madvise(MADV_NOHUGEPAGE) failure"); - uffd_move_test_common(targs, read_pmd_pagesize(), + uffd_move_test_common(read_pmd_pagesize(), uffd_move_pmd_handle_fault); } diff --git a/tools/testing/selftests/mm/uffd-wp-mremap.c b/tools/testing/selftests/mm/uffd-wp-mremap.c new file mode 100644 index 000000000000..f548b1e1f197 --- /dev/null +++ b/tools/testing/selftests/mm/uffd-wp-mremap.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define _GNU_SOURCE +#include <stdbool.h> +#include <stdint.h> +#include <fcntl.h> +#include <assert.h> +#include <linux/mman.h> +#include <sys/mman.h> +#include "../kselftest.h" +#include "thp_settings.h" +#include "uffd-common.h" + +static int pagemap_fd; +static size_t pagesize; +static int nr_pagesizes = 1; +static int nr_thpsizes; +static size_t thpsizes[20]; +static int nr_hugetlbsizes; +static size_t hugetlbsizes[10]; + +static int sz2ord(size_t size) +{ + return __builtin_ctzll(size / pagesize); +} + +static int detect_thp_sizes(size_t sizes[], int max) +{ + int count = 0; + unsigned long orders; + size_t kb; + int i; + + /* thp not supported at all. 
*/ + if (!read_pmd_pagesize()) + return 0; + + orders = thp_supported_orders(); + + for (i = 0; orders && count < max; i++) { + if (!(orders & (1UL << i))) + continue; + orders &= ~(1UL << i); + kb = (pagesize >> 10) << i; + sizes[count++] = kb * 1024; + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb); + } + + return count; +} + +static void *mmap_aligned(size_t size, int prot, int flags) +{ + size_t mmap_size = size * 2; + char *mmap_mem, *mem; + + mmap_mem = mmap(NULL, mmap_size, prot, flags, -1, 0); + if (mmap_mem == MAP_FAILED) + return mmap_mem; + + mem = (char *)(((uintptr_t)mmap_mem + size - 1) & ~(size - 1)); + munmap(mmap_mem, mem - mmap_mem); + munmap(mem + size, mmap_mem + mmap_size - mem - size); + + return mem; +} + +static void *alloc_one_folio(size_t size, bool private, bool hugetlb) +{ + bool thp = !hugetlb && size > pagesize; + int flags = MAP_ANONYMOUS; + int prot = PROT_READ | PROT_WRITE; + char *mem, *addr; + + assert((size & (size - 1)) == 0); + + if (private) + flags |= MAP_PRIVATE; + else + flags |= MAP_SHARED; + + /* + * For THP, we must explicitly enable the THP size, allocate twice the + * required space then manually align. + */ + if (thp) { + struct thp_settings settings = *thp_current_settings(); + + if (private) + settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS; + else + settings.shmem_hugepages[sz2ord(size)].enabled = SHMEM_ALWAYS; + + thp_push_settings(&settings); + + mem = mmap_aligned(size, prot, flags); + } else { + if (hugetlb) { + flags |= MAP_HUGETLB; + flags |= __builtin_ctzll(size) << MAP_HUGE_SHIFT; + } + + mem = mmap(NULL, size, prot, flags, -1, 0); + } + + if (mem == MAP_FAILED) { + mem = NULL; + goto out; + } + + assert(((uintptr_t)mem & (size - 1)) == 0); + + /* + * Populate the folio by writing the first byte and check that all pages + * are populated. Finally set the whole thing to non-zero data to avoid + * kernel from mapping it back to the zero page. + */ + mem[0] = 1; + for (addr = mem; addr < mem + size; addr += pagesize) { + if (!pagemap_is_populated(pagemap_fd, addr)) { + munmap(mem, size); + mem = NULL; + goto out; + } + } + memset(mem, 1, size); +out: + if (thp) + thp_pop_settings(); + + return mem; +} + +static bool check_uffd_wp_state(void *mem, size_t size, bool expect) +{ + uint64_t pte; + void *addr; + + for (addr = mem; addr < mem + size; addr += pagesize) { + pte = pagemap_get_entry(pagemap_fd, addr); + if (!!(pte & PM_UFFD_WP) != expect) { + ksft_test_result_fail("uffd-wp not %s for pte %lu!\n", + expect ? "set" : "clear", + (addr - mem) / pagesize); + return false; + } + } + + return true; +} + +static bool range_is_swapped(void *addr, size_t size) +{ + for (; size; addr += pagesize, size -= pagesize) + if (!pagemap_is_swapped(pagemap_fd, addr)) + return false; + return true; +} + +static void test_one_folio(size_t size, bool private, bool swapout, bool hugetlb) +{ + struct uffdio_writeprotect wp_prms; + uint64_t features = 0; + void *addr = NULL; + void *mem = NULL; + + assert(!(hugetlb && swapout)); + + ksft_print_msg("[RUN] %s(size=%zu, private=%s, swapout=%s, hugetlb=%s)\n", + __func__, + size, + private ? "true" : "false", + swapout ? "true" : "false", + hugetlb ? "true" : "false"); + + /* Allocate a folio of required size and type. */ + mem = alloc_one_folio(size, private, hugetlb); + if (!mem) { + ksft_test_result_fail("alloc_one_folio() failed\n"); + goto out; + } + + /* Register range for uffd-wp. 
*/ + if (userfaultfd_open(&features)) { + ksft_test_result_fail("userfaultfd_open() failed\n"); + goto out; + } + if (uffd_register(uffd, mem, size, false, true, false)) { + ksft_test_result_fail("uffd_register() failed\n"); + goto out; + } + wp_prms.mode = UFFDIO_WRITEPROTECT_MODE_WP; + wp_prms.range.start = (uintptr_t)mem; + wp_prms.range.len = size; + if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp_prms)) { + ksft_test_result_fail("ioctl(UFFDIO_WRITEPROTECT) failed\n"); + goto out; + } + + if (swapout) { + madvise(mem, size, MADV_PAGEOUT); + if (!range_is_swapped(mem, size)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto out; + } + } + + /* Check that uffd-wp is set for all PTEs in range. */ + if (!check_uffd_wp_state(mem, size, true)) + goto out; + + /* + * Move the mapping to a new, aligned location. Since + * UFFD_FEATURE_EVENT_REMAP is not set, we expect the uffd-wp bit for + * each PTE to be cleared in the new mapping. + */ + addr = mmap_aligned(size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS); + if (addr == MAP_FAILED) { + ksft_test_result_fail("mmap_aligned() failed\n"); + goto out; + } + if (mremap(mem, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, addr) == MAP_FAILED) { + ksft_test_result_fail("mremap() failed\n"); + munmap(addr, size); + goto out; + } + mem = addr; + + /* Check that uffd-wp is cleared for all PTEs in range. */ + if (!check_uffd_wp_state(mem, size, false)) + goto out; + + ksft_test_result_pass("%s(size=%zu, private=%s, swapout=%s, hugetlb=%s)\n", + __func__, + size, + private ? "true" : "false", + swapout ? "true" : "false", + hugetlb ? "true" : "false"); +out: + if (mem) + munmap(mem, size); + if (uffd >= 0) { + close(uffd); + uffd = -1; + } +} + +struct testcase { + size_t *sizes; + int *nr_sizes; + bool private; + bool swapout; + bool hugetlb; +}; + +static const struct testcase testcases[] = { + /* base pages. */ + { + .sizes = &pagesize, + .nr_sizes = &nr_pagesizes, + .private = false, + .swapout = false, + .hugetlb = false, + }, + { + .sizes = &pagesize, + .nr_sizes = &nr_pagesizes, + .private = true, + .swapout = false, + .hugetlb = false, + }, + { + .sizes = &pagesize, + .nr_sizes = &nr_pagesizes, + .private = false, + .swapout = true, + .hugetlb = false, + }, + { + .sizes = &pagesize, + .nr_sizes = &nr_pagesizes, + .private = true, + .swapout = true, + .hugetlb = false, + }, + + /* thp. */ + { + .sizes = thpsizes, + .nr_sizes = &nr_thpsizes, + .private = false, + .swapout = false, + .hugetlb = false, + }, + { + .sizes = thpsizes, + .nr_sizes = &nr_thpsizes, + .private = true, + .swapout = false, + .hugetlb = false, + }, + { + .sizes = thpsizes, + .nr_sizes = &nr_thpsizes, + .private = false, + .swapout = true, + .hugetlb = false, + }, + { + .sizes = thpsizes, + .nr_sizes = &nr_thpsizes, + .private = true, + .swapout = true, + .hugetlb = false, + }, + + /* hugetlb. */ + { + .sizes = hugetlbsizes, + .nr_sizes = &nr_hugetlbsizes, + .private = false, + .swapout = false, + .hugetlb = true, + }, + { + .sizes = hugetlbsizes, + .nr_sizes = &nr_hugetlbsizes, + .private = true, + .swapout = false, + .hugetlb = true, + }, +}; + +int main(void) +{ + struct thp_settings settings; + int i, j, plan = 0; + + pagesize = getpagesize(); + nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes)); + nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, + ARRAY_SIZE(hugetlbsizes)); + + /* If THP is supported, save THP settings and initially disable THP. 
*/ + if (nr_thpsizes) { + thp_save_settings(); + thp_read_settings(&settings); + for (i = 0; i < NR_ORDERS; i++) { + settings.hugepages[i].enabled = THP_NEVER; + settings.shmem_hugepages[i].enabled = SHMEM_NEVER; + } + thp_push_settings(&settings); + } + + for (i = 0; i < ARRAY_SIZE(testcases); i++) + plan += *testcases[i].nr_sizes; + ksft_set_plan(plan); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + + for (i = 0; i < ARRAY_SIZE(testcases); i++) { + const struct testcase *tc = &testcases[i]; + + for (j = 0; j < *tc->nr_sizes; j++) + test_one_folio(tc->sizes[j], tc->private, tc->swapout, + tc->hugetlb); + } + + /* If THP is supported, restore original THP settings. */ + if (nr_thpsizes) + thp_restore_settings(); + + i = ksft_get_fail_cnt(); + if (i) + ksft_exit_fail_msg("%d out of %d tests failed\n", + i, ksft_test_num()); + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index 484f82c7b7c8..6e4269b9b54d 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -160,7 +160,7 @@ static int validate_complete_va_space(void) return 0; } -int main(int argc, char *argv[]) +int main(void) { char *ptr[NR_CHUNKS_LOW]; char **hptr; diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index d8d0cf04bb57..7519c9a892f0 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -138,7 +138,7 @@ void clear_softdirty(void) ksft_exit_fail_msg("opening clear_refs failed\n"); ret = write(fd, ctrl, strlen(ctrl)); close(fd); - if (ret != strlen(ctrl)) + if (ret != (signed int)strlen(ctrl)) ksft_exit_fail_msg("writing clear_refs failed\n"); } diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c index 1289d311efd7..34c91f7e6128 100644 --- a/tools/testing/selftests/mm/write_to_hugetlbfs.c +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) size = atoi(optarg); break; case 'p': - strncpy(path, optarg, sizeof(path)); + strncpy(path, optarg, sizeof(path) - 1); break; case 'm': if (atoi(optarg) >= MAX_METHOD) { diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index 9faa686f90e4..e9728e86b4f2 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -497,7 +497,7 @@ static int child_poll_leader_exit_test(void *args) pthread_create(&t2, NULL, test_pidfd_poll_leader_exit_thread, NULL); /* - * glibc exit calls exit_group syscall, so explicity call exit only + * glibc exit calls exit_group syscall, so explicitly call exit only * so that only the group leader exits, leaving the threads alone. 
*/ *child_exit_secs = time(NULL); diff --git a/tools/testing/vma/linux/atomic.h b/tools/testing/vma/linux/atomic.h index 3e1b6adc027b..788c597c4fde 100644 --- a/tools/testing/vma/linux/atomic.h +++ b/tools/testing/vma/linux/atomic.h @@ -9,4 +9,9 @@ #define atomic_set(x, y) uatomic_set(x, y) #define U8_MAX UCHAR_MAX +#ifndef atomic_cmpxchg_relaxed +#define atomic_cmpxchg_relaxed uatomic_cmpxchg +#define atomic_cmpxchg_release uatomic_cmpxchg +#endif /* atomic_cmpxchg_relaxed */ + #endif /* _LINUX_ATOMIC_H */ diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 8fab5e13c7c3..04ab45e27fb8 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -18,6 +18,12 @@ static bool fail_prealloc; #define vma_iter_prealloc(vmi, vma) \ (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL)) +#define CONFIG_DEFAULT_MMAP_MIN_ADDR 65536 + +unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; + /* * Directly import the VMA implementation here. Our vma_internal.h wrapper * provides userland-equivalent functionality for everything vma.c uses. @@ -47,6 +53,11 @@ struct task_struct *get_current(void) return &__current; } +unsigned long rlimit(unsigned int limit) +{ + return (unsigned long)-1; +} + /* Helper function to simply allocate a VMA. */ static struct vm_area_struct *alloc_vma(struct mm_struct *mm, unsigned long start, @@ -89,7 +100,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, * begun. Linking to the tree will have caused this to be incremented, * which means we will get a false positive otherwise. */ - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return vma; } @@ -214,7 +225,7 @@ static bool vma_write_started(struct vm_area_struct *vma) int seq = vma->vm_lock_seq; /* We reset after each check. */ - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; /* The vma_start_write() stub simply increments this value. */ return seq > -1; @@ -1563,6 +1574,57 @@ static bool test_expand_only_mode(void) return true; } +static bool test_mmap_region_basic(void) +{ + struct mm_struct mm = {}; + unsigned long addr; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, &mm, 0); + + current->mm = &mm; + + /* Map at 0x300000, length 0x3000. */ + addr = __mmap_region(NULL, 0x300000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x300, NULL); + ASSERT_EQ(addr, 0x300000); + + /* Map at 0x250000, length 0x3000. */ + addr = __mmap_region(NULL, 0x250000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x250, NULL); + ASSERT_EQ(addr, 0x250000); + + /* Map at 0x303000, merging to 0x300000 of length 0x6000. */ + addr = __mmap_region(NULL, 0x303000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x303, NULL); + ASSERT_EQ(addr, 0x303000); + + /* Map at 0x24d000, merging to 0x250000 of length 0x6000. 
*/ + addr = __mmap_region(NULL, 0x24d000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x24d, NULL); + ASSERT_EQ(addr, 0x24d000); + + ASSERT_EQ(mm.map_count, 2); + + for_each_vma(vmi, vma) { + if (vma->vm_start == 0x300000) { + ASSERT_EQ(vma->vm_end, 0x306000); + ASSERT_EQ(vma->vm_pgoff, 0x300); + } else if (vma->vm_start == 0x24d000) { + ASSERT_EQ(vma->vm_end, 0x253000); + ASSERT_EQ(vma->vm_pgoff, 0x24d); + } else { + ASSERT_FALSE(true); + } + } + + cleanup_mm(&mm, &vmi); + return true; +} + int main(void) { int num_tests = 0, num_fail = 0; @@ -1596,6 +1658,8 @@ int main(void) TEST(copy_vma); TEST(expand_only_mode); + TEST(mmap_region_basic); + #undef TEST printf("%d tests run, %d passed, %d failed.\n", diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index e76ff579e1fd..49a85ce0d45a 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -25,13 +25,24 @@ #include <linux/maple_tree.h> #include <linux/mm.h> #include <linux/rbtree.h> -#include <linux/rwsem.h> +#include <linux/refcount.h> + +extern unsigned long stack_guard_gap; +#ifdef CONFIG_MMU +extern unsigned long mmap_min_addr; +extern unsigned long dac_mmap_min_addr; +#else +#define mmap_min_addr 0UL +#define dac_mmap_min_addr 0UL +#endif #define VM_WARN_ON(_expr) (WARN_ON(_expr)) #define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) #define VM_BUG_ON(_expr) (BUG_ON(_expr)) #define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) +#define MMF_HAS_MDWE 28 + #define VM_NONE 0x00000000 #define VM_READ 0x00000001 #define VM_WRITE 0x00000002 @@ -39,6 +50,7 @@ #define VM_SHARED 0x00000008 #define VM_MAYREAD 0x00000010 #define VM_MAYWRITE 0x00000020 +#define VM_MAYEXEC 0x00000040 #define VM_GROWSDOWN 0x00000100 #define VM_PFNMAP 0x00000400 #define VM_LOCKED 0x00002000 @@ -51,6 +63,8 @@ #define VM_STACK VM_GROWSDOWN #define VM_SHADOW_STACK VM_NONE #define VM_SOFTDIRTY 0 +#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ +#define VM_GROWSUP VM_NONE #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) @@ -58,6 +72,20 @@ /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) + +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC + +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) + +#define RLIMIT_STACK 3 /* max stack size */ +#define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ + +#define CAP_IPC_LOCK 14 + #ifdef CONFIG_64BIT /* VM is sealed, in vm_flags */ #define VM_SEALED _BITUL(63) @@ -106,10 +134,6 @@ typedef __bitwise unsigned int vm_fault_t; */ #define pr_warn_once pr_err -typedef struct refcount_struct { - atomic_t refs; -} refcount_t; - struct kref { refcount_t refcount; }; @@ -122,10 +146,22 @@ enum { TASK_COMM_LEN = 16, }; +/* + * Flags for bug emulation. + * + * These occupy the top three bytes. 
+ */ +enum { + READ_IMPLIES_EXEC = 0x0400000, +}; + struct task_struct { char comm[TASK_COMM_LEN]; pid_t pid; struct mm_struct *mm; + + /* Used for emulating ABI behavior of previous Linux versions: */ + unsigned int personality; }; struct task_struct *get_current(void); @@ -186,17 +222,18 @@ struct mm_struct { unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ -}; -struct vma_lock { - struct rw_semaphore lock; -}; + unsigned long def_flags; + unsigned long flags; /* Must use atomic bitops to access */ +}; struct file { struct address_space *f_mapping; }; +#define VMA_LOCK_OFFSET 0x40000000 + struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -224,16 +261,13 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* Flag to indicate areas detached from the mm->mm_mt tree */ - bool detached; - /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_lock->lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_lock->lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -241,8 +275,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; - struct vma_lock *vm_lock; + unsigned int vm_lock_seq; #endif /* @@ -295,6 +328,10 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. */ + refcount_t vm_refcnt; +#endif } __randomize_layout; struct vm_fault {}; @@ -373,6 +410,17 @@ struct vm_operations_struct { unsigned long addr); }; +struct vm_unmapped_area_info { +#define VM_UNMAPPED_AREA_TOPDOWN 1 + unsigned long flags; + unsigned long length; + unsigned long low_limit; + unsigned long high_limit; + unsigned long align_mask; + unsigned long align_offset; + unsigned long start_gap; +}; + static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); @@ -408,37 +456,54 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) return mas_find(&vmi->mas, ULONG_MAX); } -static inline bool vma_lock_alloc(struct vm_area_struct *vma) +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. 
+ */ +static inline void vma_assert_attached(struct vm_area_struct *vma) { - vma->vm_lock = calloc(1, sizeof(struct vma_lock)); + VM_BUG_ON_VMA(!refcount_read(&vma->vm_refcnt), vma); +} - if (!vma->vm_lock) - return false; - - init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; - - return true; +static inline void vma_assert_detached(struct vm_area_struct *vma) +{ + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt), vma); } static inline void vma_assert_write_locked(struct vm_area_struct *); -static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +static inline void vma_mark_attached(struct vm_area_struct *vma) { - /* When detaching vma should be write-locked */ - if (detached) - vma_assert_write_locked(vma); - vma->detached = detached; + vma_assert_write_locked(vma); + vma_assert_detached(vma); + refcount_set(&vma->vm_refcnt, 1); +} + +static inline void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* We are the only writer, so no need to use vma_refcount_put(). */ + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + /* + * Reader must have temporarily raised vm_refcnt but it will + * drop it without using the vma since vma is write-locked. + */ + } } extern const struct vm_operations_struct vma_dummy_vm_ops; +extern unsigned long rlimit(unsigned int limit); + static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_mark_detached(vma, false); + vma->vm_lock_seq = UINT_MAX; } static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) @@ -449,10 +514,6 @@ static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) return NULL; vma_init(vma, mm); - if (!vma_lock_alloc(vma)) { - free(vma); - return NULL; - } return vma; } @@ -465,10 +526,8 @@ static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return NULL; memcpy(new, orig, sizeof(*new)); - if (!vma_lock_alloc(new)) { - free(new); - return NULL; - } + refcount_set(&new->vm_refcnt, 0); + new->vm_lock_seq = UINT_MAX; INIT_LIST_HEAD(&new->anon_vma_chain); return new; @@ -638,20 +697,9 @@ static inline void mpol_put(struct mempolicy *) { } -static inline void vma_lock_free(struct vm_area_struct *vma) -{ - free(vma->vm_lock); -} - -static inline void __vm_area_free(struct vm_area_struct *vma) -{ - vma_lock_free(vma); - free(vma); -} - static inline void vm_area_free(struct vm_area_struct *vma) { - __vm_area_free(vma); + free(vma); } static inline void lru_add_drain(void) @@ -853,6 +901,11 @@ static inline void mmap_write_unlock(struct mm_struct *) { } +static inline int mmap_write_lock_killable(struct mm_struct *) +{ + return 0; +} + static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) @@ -938,7 +991,7 @@ static inline bool is_file_hugepages(struct file *) static inline int security_vm_enough_memory_mm(struct mm_struct *, long) { - return true; + return 0; } static inline bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long) @@ -1033,4 +1086,159 @@ static inline int mmap_file(struct file *, struct vm_area_struct *) return 0; } +static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_GROWSDOWN) + return stack_guard_gap; + + /* See reasoning around the VM_SHADOW_STACK definition */ + if (vma->vm_flags & VM_SHADOW_STACK) + return PAGE_SIZE; + + return 
0; +} + +static inline unsigned long vm_start_gap(struct vm_area_struct *vma) +{ + unsigned long gap = stack_guard_start_gap(vma); + unsigned long vm_start = vma->vm_start; + + vm_start -= gap; + if (vm_start > vma->vm_start) + vm_start = 0; + return vm_start; +} + +static inline unsigned long vm_end_gap(struct vm_area_struct *vma) +{ + unsigned long vm_end = vma->vm_end; + + if (vma->vm_flags & VM_GROWSUP) { + vm_end += stack_guard_gap; + if (vm_end < vma->vm_end) + vm_end = -PAGE_SIZE; + } + return vm_end; +} + +static inline int is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, unsigned long len) +{ + return 0; +} + +static inline bool vma_is_accessible(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_ACCESS_FLAGS; +} + +static inline bool capable(int cap) +{ + return true; +} + +static inline bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, + unsigned long bytes) +{ + unsigned long locked_pages, limit_pages; + + if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) + return true; + + locked_pages = bytes >> PAGE_SHIFT; + locked_pages += mm->locked_vm; + + limit_pages = rlimit(RLIMIT_MEMLOCK); + limit_pages >>= PAGE_SHIFT; + + return locked_pages <= limit_pages; +} + +static inline int __anon_vma_prepare(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma)); + + if (!anon_vma) + return -ENOMEM; + + anon_vma->root = anon_vma; + vma->anon_vma = anon_vma; + + return 0; +} + +static inline int anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return __anon_vma_prepare(vma); +} + +static inline void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf) +{ +} + +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + * + * This is only applicable if the user has set the Memory-Deny-Write-Execute + * (MDWE) protection mask for the current process. + * + * @old specifies the VMA flags the VMA originally possessed, and @new the ones + * we propose to set. + * + * Return: false if proposed change is OK, true if not ok and should be denied. + */ +static inline bool map_deny_write_exec(unsigned long old, unsigned long new) +{ + /* If MDWE is disabled, we have nothing to deny. */ + if (!test_bit(MMF_HAS_MDWE, &current->mm->flags)) + return false; + + /* If the new VMA is not executable, we have nothing to deny. */ + if (!(new & VM_EXEC)) + return false; + + /* Under MDWE we do not accept newly writably executable VMAs... */ + if (new & VM_WRITE) + return true; + + /* ...nor previously non-executable VMAs becoming executable. */ + if (!(old & VM_EXEC)) + return true; + + return false; +} + +static inline int mapping_map_writable(struct address_space *mapping) +{ + int c = atomic_read(&mapping->i_mmap_writable); + + /* Derived from the raw_atomic_inc_unless_negative() implementation. */ + do { + if (c < 0) + return -EPERM; + } while (!__sync_bool_compare_and_swap(&mapping->i_mmap_writable, c, c+1)); + + return 0; +} + +#endif /* __MM_VMA_INTERNAL_H */
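
The uffd-wp-mremap selftest added above verifies write-protect state by walking /proc/self/pagemap through the pagemap helpers in vm_util.h. For readers who have not used that interface, here is a minimal standalone sketch of the underlying lookup: one 64-bit entry per page, indexed by virtual page number, with the pte uffd-wp flag in bit 57 as documented in Documentation/admin-guide/mm/pagemap.rst. The helper name and error handling below are illustrative, not taken from the patch.

/* Sketch: read one pagemap entry and test the pte uffd-wp flag (bit 57). */
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define PM_UFFD_WP_BIT	57

static bool page_is_uffd_wp(int pagemap_fd, void *addr, size_t pagesize)
{
	uint64_t entry;
	off_t off = ((uintptr_t)addr / pagesize) * sizeof(entry);

	if (pread(pagemap_fd, &entry, sizeof(entry), off) != sizeof(entry))
		return false;	/* treat a failed read as "not write-protected" */

	return entry & (1ULL << PM_UFFD_WP_BIT);
}

int main(void)
{
	int fd = open("/proc/self/pagemap", O_RDONLY);
	int x = 1;	/* any populated page will do */

	if (fd < 0)
		return 1;
	printf("uffd-wp: %d\n", page_is_uffd_wp(fd, &x, getpagesize()));
	close(fd);
	return 0;
}

check_uffd_wp_state() in the new test performs this kind of lookup for every page in the range, first expecting the bit set after UFFDIO_WRITEPROTECT and then expecting it clear after the MREMAP_FIXED | MREMAP_MAYMOVE move.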
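
detect_thp_sizes() and sz2ord() in the same test convert between a (m)THP order and a size using size = page_size << order, i.e. order = ctz(size / page_size). A worked example follows, with a made-up order bitmap standing in for what the test reads via thp_supported_orders().

/* Order <-> size arithmetic used by sz2ord()/detect_thp_sizes().
 * The 'orders' bitmap below is invented for illustration only.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	size_t page = getpagesize();			/* typically 4096 */
	unsigned long orders = (1UL << 4) | (1UL << 9);	/* pretend orders 4 and 9 are supported */
	int order;

	for (order = 0; order < 8 * (int)sizeof(orders); order++) {
		if (!(orders & (1UL << order)))
			continue;
		/* With 4 KiB pages: order 4 -> 64 KiB, order 9 -> 2048 KiB (PMD size). */
		printf("order %d -> %zu KiB\n", order, (page << order) >> 10);
	}
	return 0;
}

This is also why alloc_one_folio() over-allocates twice the requested size and trims the excess with munmap(): an anonymous mmap() is only page-aligned, but the test asserts that each folio starts on a size-aligned address so the kernel can back it with a single large folio.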
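
In test_mmap_region_basic() above, the mappings at 0x303000 and 0x24d000 are expected to merge with their neighbours because, besides matching flags and adjacency, their page offsets are contiguous, which is one of the conditions the merge logic checks: the pgoff values passed are addr >> 12 (0x300, 0x303, 0x24d, 0x250), so each pair differs by exactly the three pages that separate the start addresses. A small sketch of that arithmetic, assuming the 4 KiB page size the hex values imply:

/* Pgoff-contiguity condition behind the merges asserted in
 * test_mmap_region_basic(); the helper name is illustrative.
 */
#include <assert.h>
#include <stdbool.h>

#define PAGE_SHIFT 12

static bool pgoff_contiguous(unsigned long start1, unsigned long pgoff1,
			     unsigned long start2, unsigned long pgoff2)
{
	return pgoff2 - pgoff1 == (start2 - start1) >> PAGE_SHIFT;
}

int main(void)
{
	assert(pgoff_contiguous(0x300000, 0x300, 0x303000, 0x303)); /* merges upward   */
	assert(pgoff_contiguous(0x24d000, 0x24d, 0x250000, 0x250)); /* merges downward */
	return 0;
}

That is why the test ends with mm.map_count == 2: four __mmap_region() calls collapse into the two VMAs checked in the for_each_vma() loop.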
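
The map_deny_write_exec() stub added to vma_internal.h encodes the scenarios (a)-(d) listed in its comment. The restatement below factors the per-process MMF_HAS_MDWE test out into an explicit parameter so the cases can be exercised without the mm stubs; it is purely illustrative, and the flag values match the definitions earlier in the header.

/* Restatement of the MDWE predicate with the process flag made explicit.
 * The real check reads MMF_HAS_MDWE from current->mm->flags.
 */
#include <assert.h>
#include <stdbool.h>

#define VM_WRITE	0x00000002UL
#define VM_EXEC		0x00000004UL

static bool deny(bool mdwe, unsigned long old, unsigned long new)
{
	if (!mdwe)
		return false;		/* MDWE disabled: nothing to deny */
	if (!(new & VM_EXEC))
		return false;		/* new mapping is not executable */
	if (new & VM_WRITE)
		return true;		/* (a) writable and executable */
	if (!(old & VM_EXEC))
		return true;		/* (b)/(c) gaining exec it never had */
	return false;			/* (d) was already executable */
}

int main(void)
{
	assert(deny(true, VM_WRITE | VM_EXEC, VM_WRITE | VM_EXEC));	/* (a) */
	assert(deny(true, VM_WRITE, VM_EXEC));				/* (b), (c) */
	assert(!deny(true, VM_EXEC, VM_EXEC));				/* (d) */
	assert(!deny(false, VM_WRITE, VM_WRITE | VM_EXEC));		/* MDWE off */
	return 0;
}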