Merge branch 'mm-everything' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

2025-01-15 02:05:33 +00:00 · 2025-01-13 09:34:41 +11:00 · 2025-01-13 09:34:41 +11:00 · ada24f0158
commit ada24f0158
parent 629d79aa0d 972ccc84da
536 changed files with 11136 additions and 10750 deletions
--- a/.mailmap
+++ b/.mailmap
@ -410,6 +410,7 @@ Liam Mark <quic_lmark@quicinc.com> <lmark@codeaurora.org>
 Linas Vepstas <linas@austin.ibm.com>
 Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
 Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
+Linus Lüssing <linus.luessing@c0d3.blue> <ll@simonwunderlich.de>
 <linux-hardening@vger.kernel.org> <kernel-hardening@lists.openwall.com>
 Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
 Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
--- a/2
+++ b/2
@ -4339,7 +4339,7 @@ D: Freescale Highspeed USB device driver
 D: Freescale QE SoC support and Ethernet driver
 S: B-1206 Jingmao Guojigongyu
 S: 16 Baliqiao Nanjie, Beijing 101100
-S: People's Repulic of China
+S: People's Republic of China

 N: Vlad Yasevich
 E: vyasevich@gmail.com
--- a/Documentation/ABI/testing/sysfs-kernel-mm-damon
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon
@ -355,10 +355,15 @@ Description:	If 'target' is written to the 'type' file, writing to or
 What:		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/matching
 Date:		Dec 2022
 Contact:	SeongJae Park <sj@kernel.org>
-Description:	Writing 'Y' or 'N' to this file sets whether to filter out
-		pages that do or do not match to the 'type' and 'memcg_path',
-		respectively.  Filter out means the action of the scheme will
-		not be applied to.
+Description:	Writing 'Y' or 'N' to this file sets whether the filter is for
+		the memory of the 'type', or all except the 'type'.
+
+What:		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/allow
+Date:		Jan 2025
+Contact:	SeongJae Park <sj@kernel.org>
+Description:	Writing 'Y' or 'N' to this file sets whether to allow or reject
+		applying the scheme's action to the memory that satisfies the
+		'type' and the 'matching' of the directory.

 What:		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/nr_tried
 Date:		Mar 2022
@ -384,6 +389,12 @@ Contact:	SeongJae Park <sj@kernel.org>
 Description:	Reading this file returns the total size of regions that the
 		action of the scheme has successfully applied in bytes.

+What:		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/sz_ops_filter_passed
+Date:		Dec 2024
+Contact:	SeongJae Park <sj@kernel.org>
+Description:	Reading this file returns the total size of memory that passed
+		DAMON operations layer-handled filters of the scheme in bytes.
+
 What:		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/qt_exceeds
 Date:		Mar 2022
 Contact:	SeongJae Park <sj@kernel.org>
@ -424,3 +435,10 @@ Contact:	SeongJae Park <sj@kernel.org>
 Description:	Reading this file returns the 'age' of a memory region that
 		corresponding DAMON-based Operation Scheme's action has tried
 		to be applied.
+
+What:		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/sz_filter_passed
+Date:		Dec 2024
+Contact:	SeongJae Park <sj@kernel.org>
+Description:	Reading this file returns the size of the memory in the region
+		that passed DAMON operations layer-handled filters of the
+		scheme in bytes.
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@ -100,29 +100,29 @@ Get delays, since system boot, for pid 10::
 	# ./getdelays -d -p 10
 	(output similar to next case)

-Get sum of delays, since system boot, for all pids with tgid 5::
+Get sum and peak of delays, since system boot, for all pids with tgid 242::

-	# ./getdelays -d -t 5
+	bash-4.4# ./getdelays -d -t 242
 	print delayacct stats ON
-	TGID	5
+	TGID    242


-	CPU             count     real total  virtual total    delay total  delay average
-	                    8        7000000        6872122        3382277          0.423ms
-	IO              count    delay total  delay average
-                   0              0          0.000ms
-	SWAP            count    delay total  delay average
-                       0              0          0.000ms
-	RECLAIM         count    delay total  delay average
-                   0              0          0.000ms
-	THRASHING       count    delay total  delay average
-                       0              0          0.000ms
-	COMPACT         count    delay total  delay average
-                       0              0          0.000ms
-	WPCOPY          count    delay total  delay average
-                       0              0          0.000ms
-	IRQ             count    delay total  delay average
-                       0              0          0.000ms
+	CPU         count     real total  virtual total    delay total  delay average      delay max      delay min
+	               39      156000000      156576579        2111069          0.054ms     0.212296ms     0.031307ms
+	IO          count    delay total  delay average      delay max      delay min
+	                0              0          0.000ms     0.000000ms     0.000000ms
+	SWAP        count    delay total  delay average      delay max      delay min
+	                0              0          0.000ms     0.000000ms     0.000000ms
+	RECLAIM     count    delay total  delay average      delay max      delay min
+	                0              0          0.000ms     0.000000ms     0.000000ms
+	THRASHING   count    delay total  delay average      delay max      delay min
+	                0              0          0.000ms     0.000000ms     0.000000ms
+	COMPACT     count    delay total  delay average      delay max      delay min
+	                0              0          0.000ms     0.000000ms     0.000000ms
+	WPCOPY      count    delay total  delay average      delay max      delay min
+	              156       11215873          0.072ms     0.207403ms     0.033913ms
+	IRQ         count    delay total  delay average      delay max      delay min
+	                0              0          0.000ms     0.000000ms     0.000000ms

 Get IO accounting for pid 1, it works only with -p::

--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@ -3351,8 +3351,8 @@
 			[KNL] Set the initial state for the memory hotplug
 			onlining policy. If not specified, the default value is
 			set according to the
-			CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
-			option.
+			CONFIG_MHP_DEFAULT_ONLINE_TYPE kernel config
+			options.
 			See Documentation/admin-guide/mm/memory-hotplug.rst.

 	memmap=exactmap	[KNL,X86,EARLY] Enable setting of an exact
@ -6992,6 +6992,13 @@
 			See Documentation/admin-guide/mm/transhuge.rst
 			for more details.

+	transparent_hugepage_tmpfs= [KNL]
+			Format: [always|within_size|advise|never]
+			Can be used to control the default hugepage allocation policy
+			for the tmpfs mount.
+			See Documentation/admin-guide/mm/transhuge.rst
+			for more details.
+
 	trusted.source=	[KEYS]
 			Format: <string>
 			This parameter identifies the trust source as a backend
--- a/Documentation/admin-guide/mm/damon/start.rst
+++ b/Documentation/admin-guide/mm/damon/start.rst
@ -42,32 +42,45 @@ the execution. ::

    $ git clone https://github.com/sjp38/masim; cd masim; make
    $ sudo damo start "./masim ./configs/stairs.cfg --quiet"
-    $ sudo ./damo show
-    0   addr [85.541 TiB  , 85.541 TiB ) (57.707 MiB ) access 0 %   age 10.400 s
-    1   addr [85.541 TiB  , 85.542 TiB ) (413.285 MiB) access 0 %   age 11.400 s
-    2   addr [127.649 TiB , 127.649 TiB) (57.500 MiB ) access 0 %   age 1.600 s
-    3   addr [127.649 TiB , 127.649 TiB) (32.500 MiB ) access 0 %   age 500 ms
-    4   addr [127.649 TiB , 127.649 TiB) (9.535 MiB  ) access 100 % age 300 ms
-    5   addr [127.649 TiB , 127.649 TiB) (8.000 KiB  ) access 60 %  age 0 ns
-    6   addr [127.649 TiB , 127.649 TiB) (6.926 MiB  ) access 0 %   age 1 s
-    7   addr [127.998 TiB , 127.998 TiB) (120.000 KiB) access 0 %   age 11.100 s
-    8   addr [127.998 TiB , 127.998 TiB) (8.000 KiB  ) access 40 %  age 100 ms
-    9   addr [127.998 TiB , 127.998 TiB) (4.000 KiB  ) access 0 %   age 11 s
-    total size: 577.590 MiB
-    $ sudo ./damo stop
+    $ sudo damo report access
+    heatmap: 641111111000000000000000000000000000000000000000000000[...]33333333333333335557984444[...]7
+    # min/max temperatures: -1,840,000,000, 370,010,000, column size: 3.925 MiB
+    0   addr 86.182 TiB   size 8.000 KiB   access 0 %   age 14.900 s
+    1   addr 86.182 TiB   size 8.000 KiB   access 60 %  age 0 ns
+    2   addr 86.182 TiB   size 3.422 MiB   access 0 %   age 4.100 s
+    3   addr 86.182 TiB   size 2.004 MiB   access 95 %  age 2.200 s
+    4   addr 86.182 TiB   size 29.688 MiB  access 0 %   age 14.100 s
+    5   addr 86.182 TiB   size 29.516 MiB  access 0 %   age 16.700 s
+    6   addr 86.182 TiB   size 29.633 MiB  access 0 %   age 17.900 s
+    7   addr 86.182 TiB   size 117.652 MiB access 0 %   age 18.400 s
+    8   addr 126.990 TiB  size 62.332 MiB  access 0 %   age 9.500 s
+    9   addr 126.990 TiB  size 13.980 MiB  access 0 %   age 5.200 s
+    10  addr 126.990 TiB  size 9.539 MiB   access 100 % age 3.700 s
+    11  addr 126.990 TiB  size 16.098 MiB  access 0 %   age 6.400 s
+    12  addr 127.987 TiB  size 132.000 KiB access 0 %   age 2.900 s
+    total size: 314.008 MiB
+    $ sudo damo stop

 The first command of the above example downloads and builds an artificial
 memory access generator program called ``masim``.  The second command asks DAMO
-to execute the artificial generator process start via the given command and
-make DAMON monitors the generator process.  The third command retrieves the
-current snapshot of the monitored access pattern of the process from DAMON and
-shows the pattern in a human readable format.
+to start the program via the given command and make DAMON monitors the newly
+started process.  The third command retrieves the current snapshot of the
+monitored access pattern of the process from DAMON and shows the pattern in a
+human readable format.

-Each line of the output shows which virtual address range (``addr [XX, XX)``)
-of the process is how frequently (``access XX %``) accessed for how long time
-(``age XX``).  For example, the fifth region of ~9 MiB size is being most
-frequently accessed for last 300 milliseconds.  Finally, the fourth command
-stops DAMON.
+The first line of the output shows the relative access temperature (hotness) of
+the regions in a single row hetmap format.  Each column on the heatmap
+represents regions of same size on the monitored virtual address space.  The
+position of the colun on the row and the number on the column represents the
+relative location and access temperature of the region.  ``[...]`` means
+unmapped huge regions on the virtual address spaces.  The second line shows
+additional information for better understanding the heatmap.
+
+Each line of the output from the third line shows which virtual address range
+(``addr XX size XX``) of the process is how frequently (``access XX %``)
+accessed for how long time (``age XX``).  For example, the evelenth region of
+~9.5 MiB size is being most frequently accessed for last 3.7 seconds.  Finally,
+the fourth command stops DAMON.

 Note that DAMON can monitor not only virtual address spaces but multiple types
 of address spaces including the physical address space.
@ -95,7 +108,7 @@ Visualizing Recorded Patterns
 You can visualize the pattern in a heatmap, showing which memory region
 (x-axis) got accessed when (y-axis) and how frequently (number).::

-    $ sudo damo report heats --heatmap stdout
+    $ sudo damo report heatmap
    22222222222222222222222222222222222222211111111111111111111111111111111111111100
    44444444444444444444444444444444444444434444444444444444444444444444444444443200
    44444444444444444444444444444444444444433444444444444444444444444444444444444200
@ -160,6 +173,6 @@ Data Access Pattern Aware Memory Management
 Below command makes every memory region of size >=4K that has not accessed for
 >=60 seconds in your workload to be swapped out. ::

-    $ sudo damo schemes --damos_access_rate 0 0 --damos_sz_region 4K max \
-                        --damos_age 60s max --damos_action pageout \
-                        <pid of your workload>
+    $ sudo damo start --damos_access_rate 0 0 --damos_sz_region 4K max \
+                      --damos_age 60s max --damos_action pageout \
+                      <pid of your workload>
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@ -26,12 +26,6 @@ DAMON provides below interfaces for different users.
  writing kernel space DAMON application programs for you.  You can even extend
  DAMON for various address spaces.  For detail, please refer to the interface
  :doc:`document </mm/damon/api>`.
- *debugfs interface. (DEPRECATED!)*
-  :ref:`This <debugfs_interface>` is almost identical to :ref:`sysfs interface
-  <sysfs_interface>`.  This is deprecated, so users should move to the
-  :ref:`sysfs interface <sysfs_interface>`.  If you depend on this and cannot
-  move, please report your usecase to damon@lists.linux.dev and
-  linux-mm@kvack.org.

 .. _sysfs_interface:

@ -89,10 +83,10 @@ comma (",").
    │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value
    │ │ │ │ │ │ │ :ref:`watermarks <sysfs_watermarks>`/metric,interval_us,high,mid,low
    │ │ │ │ │ │ │ :ref:`filters <sysfs_filters>`/nr_filters
-    │ │ │ │ │ │ │ │ 0/type,matching,memcg_id
-    │ │ │ │ │ │ │ :ref:`stats <sysfs_schemes_stats>`/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
+    │ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx
+    │ │ │ │ │ │ │ :ref:`stats <sysfs_schemes_stats>`/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds
    │ │ │ │ │ │ │ :ref:`tried_regions <sysfs_schemes_tried_regions>`/total_bytes
-    │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age
+    │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age,sz_filter_passed
    │ │ │ │ │ │ │ │ ...
    │ │ │ │ │ │ ...
    │ │ │ │ ...
@ -412,59 +406,62 @@ number (``N``) to the file creates the number of child directories named ``0``
 to ``N-1``.  Each directory represents each filter.  The filters are evaluated
 in the numeric order.

-Each filter directory contains six files, namely ``type``, ``matcing``,
-``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``.  To ``type``
-file, you can write one of five special keywords: ``anon`` for anonymous pages,
-``memcg`` for specific memory cgroup, ``young`` for young pages, ``addr`` for
-specific address range (an open-ended interval), or ``target`` for specific
-DAMON monitoring target filtering.  In case of the memory cgroup filtering, you
-can specify the memory cgroup of the interest by writing the path of the memory
-cgroup from the cgroups mount point to ``memcg_path`` file.  In case of the
-address range filtering, you can specify the start and end address of the range
-to ``addr_start`` and ``addr_end`` files, respectively.  For the DAMON
-monitoring target filtering, you can specify the index of the target between
-the list of the DAMON context's monitoring targets list to ``target_idx`` file.
-You can write ``Y`` or ``N`` to ``matching`` file to filter out pages that does
-or does not match to the type, respectively.  Then, the scheme's action will
-not be applied to the pages that specified to be filtered out.
+Each filter directory contains seven files, namely ``type``, ``matching``,
+``allow``, ``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``.
+To ``type`` file, you can write one of five special keywords: ``anon`` for
+anonymous pages, ``memcg`` for specific memory cgroup, ``young`` for young
+pages, ``addr`` for specific address range (an open-ended interval), or
+``target`` for specific DAMON monitoring target filtering.  Meaning of the
+types are same to the description on the :ref:`design doc
+<damon_design_damos_filters>`.
+
+In case of the memory cgroup filtering, you can specify the memory cgroup of
+the interest by writing the path of the memory cgroup from the cgroups mount
+point to ``memcg_path`` file.  In case of the address range filtering, you can
+specify the start and end address of the range to ``addr_start`` and
+``addr_end`` files, respectively.  For the DAMON monitoring target filtering,
+you can specify the index of the target between the list of the DAMON context's
+monitoring targets list to ``target_idx`` file.
+
+You can write ``Y`` or ``N`` to ``matching`` file to specify whether the filter
+is for memory that matches the ``type``.  You can write ``Y`` or ``N`` to
+``allow`` file to specify if applying the action to the memory that satisfies
+the ``type`` and ``matching`` should be allowed or not.

 For example, below restricts a DAMOS action to be applied to only non-anonymous
 pages of all memory cgroups except ``/having_care_already``.::

    # echo 2 > nr_filters
-    # # filter out anonymous pages
+    # # disallow anonymous pages
    echo anon > 0/type
    echo Y > 0/matching
+    echo N > 0/allow
    # # further filter out all cgroups except one at '/having_care_already'
    echo memcg > 1/type
    echo /having_care_already > 1/memcg_path
    echo Y > 1/matching
+    echo N > 1/allow

-Note that ``anon`` and ``memcg`` filters are currently supported only when
-``paddr`` :ref:`implementation <sysfs_context>` is being used.
-
-Also, memory regions that are filtered out by ``addr`` or ``target`` filters
-are not counted as the scheme has tried to those, while regions that filtered
-out by other type filters are counted as the scheme has tried to.  The
-difference is applied to :ref:`stats <damos_stats>` and
-:ref:`tried regions <sysfs_schemes_tried_regions>`.
+Refer to the :ref:`DAMOS filters design documentation
+<damon_design_damos_filters>` for more details including how multiple filters
+of different ``allow`` works, when each of the filters are supported, and
+differences on stats.

 .. _sysfs_schemes_stats:

 schemes/<N>/stats/
 ------------------

-DAMON counts the total number and bytes of regions that each scheme is tried to
-be applied, the two numbers for the regions that each scheme is successfully
-applied, and the total number of the quota limit exceeds.  This statistics can
-be used for online analysis or tuning of the schemes.
+DAMON counts statistics for each scheme.  This statistics can be used for
+online analysis or tuning of the schemes.  Refer to :ref:`design doc
+<damon_design_damos_stat>` for more details about the stats.

 The statistics can be retrieved by reading the files under ``stats`` directory
-(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, and
-``qt_exceeds``), respectively.  The files are not updated in real time, so you
-should ask DAMON sysfs interface to update the content of the files for the
-stats by writing a special keyword, ``update_schemes_stats`` to the relevant
-``kdamonds/<N>/state`` file.
+(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``,
+``sz_ops_filter_passed``, and ``qt_exceeds``), respectively.  The files are not
+updated in real time, so you should ask DAMON sysfs interface to update the
+content of the files for the stats by writing a special keyword,
+``update_schemes_stats`` to the relevant ``kdamonds/<N>/state`` file.

 .. _sysfs_schemes_tried_regions:

@ -501,10 +498,10 @@ set the ``access pattern`` as their interested pattern that they want to query.
 tried_regions/<N>/
 ------------------

-In each region directory, you will find four files (``start``, ``end``,
-``nr_accesses``, and ``age``).  Reading the files will show the start and end
-addresses, ``nr_accesses``, and ``age`` of the region that corresponding
-DAMON-based operation scheme ``action`` has tried to be applied.
+In each region directory, you will find five files (``start``, ``end``,
+``nr_accesses``, ``age``, and ``sz_filter_passed``).  Reading the files will
+show the properties of the region that corresponding DAMON-based operation
+scheme ``action`` has tried to be applied.

 Example
 ~~~~~~~
@ -600,306 +597,3 @@ fields are as usual.  It shows the index of the DAMON context (``ctx_idx=X``)
 of the scheme in the list of the contexts of the context's kdamond, the index
 of the scheme (``scheme_idx=X``) in the list of the schemes of the context, in
 addition to the output of ``damon_aggregated`` tracepoint.
-
-
-.. _debugfs_interface:
-
-debugfs Interface (DEPRECATED!)
-===============================
-
-.. note::
-
-  THIS IS DEPRECATED!
-
-  DAMON debugfs interface is deprecated, so users should move to the
-  :ref:`sysfs interface <sysfs_interface>`.  If you depend on this and cannot
-  move, please report your usecase to damon@lists.linux.dev and
-  linux-mm@kvack.org.
-
-DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``,
-``init_regions``, ``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``,
-``mk_contexts`` and ``rm_contexts`` under its debugfs directory,
-``<debugfs>/damon/``.
-
-
-``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation
-notice.  Reading it returns the deprecation notice, as below::
-
-    # cat DEPRECATED
-    DAMON debugfs interface is deprecated, so users should move to DAMON_SYSFS. If you cannot, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
-
-
-Attributes
----------
-
-Users can get and set the ``sampling interval``, ``aggregation interval``,
-``update interval``, and min/max number of monitoring target regions by
-reading from and writing to the ``attrs`` file.  To know about the monitoring
-attributes in detail, please refer to the :doc:`/mm/damon/design`.  For
-example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and
-1000, and then check it again::
-
-    # cd <debugfs>/damon
-    # echo 5000 100000 1000000 10 1000 > attrs
-    # cat attrs
-    5000 100000 1000000 10 1000
-
-
-Target IDs
----------
-
-Some types of address spaces supports multiple monitoring target.  For example,
-the virtual memory address spaces monitoring can have multiple processes as the
-monitoring targets.  Users can set the targets by writing relevant id values of
-the targets to, and get the ids of the current targets by reading from the
-``target_ids`` file.  In case of the virtual address spaces monitoring, the
-values should be pids of the monitoring target processes.  For example, below
-commands set processes having pids 42 and 4242 as the monitoring targets and
-check it again::
-
-    # cd <debugfs>/damon
-    # echo 42 4242 > target_ids
-    # cat target_ids
-    42 4242
-
-Users can also monitor the physical memory address space of the system by
-writing a special keyword, "``paddr\n``" to the file.  Because physical address
-space monitoring doesn't support multiple targets, reading the file will show a
-fake value, ``42``, as below::
-
-    # cd <debugfs>/damon
-    # echo paddr > target_ids
-    # cat target_ids
-    42
-
-Note that setting the target ids doesn't start the monitoring.
-
-
-Initial Monitoring Target Regions
---------------------------------
-
-In case of the virtual address space monitoring, DAMON automatically sets and
-updates the monitoring target regions so that entire memory mappings of target
-processes can be covered.  However, users can want to limit the monitoring
-region to specific address ranges, such as the heap, the stack, or specific
-file-mapped area.  Or, some users can know the initial access pattern of their
-workloads and therefore want to set optimal initial regions for the 'adaptive
-regions adjustment'.
-
-In contrast, DAMON do not automatically sets and updates the monitoring target
-regions in case of physical memory monitoring.  Therefore, users should set the
-monitoring target regions by themselves.
-
-In such cases, users can explicitly set the initial monitoring target regions
-as they want, by writing proper values to the ``init_regions`` file.  The input
-should be a sequence of three integers separated by white spaces that represent
-one region in below form.::
-
-    <target idx> <start address> <end address>
-
-The ``target idx`` should be the index of the target in ``target_ids`` file,
-starting from ``0``, and the regions should be passed in address order.  For
-example, below commands will set a couple of address ranges, ``1-100`` and
-``100-200`` as the initial monitoring target region of pid 42, which is the
-first one (index ``0``) in ``target_ids``, and another couple of address
-ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one
-(index ``1``) in ``target_ids``.::
-
-    # cd <debugfs>/damon
-    # cat target_ids
-    42 4242
-    # echo "0   1       100 \
-            0   100     200 \
-            1   20      40  \
-            1   50      100" > init_regions
-
-Note that this sets the initial monitoring target regions only.  In case of
-virtual memory monitoring, DAMON will automatically updates the boundary of the
-regions after one ``update interval``.  Therefore, users should set the
-``update interval`` large enough in this case, if they don't want the
-update.
-
-
-Schemes
-------
-
-Users can get and set the DAMON-based operation :ref:`schemes
-<damon_design_damos>` by reading from and writing to ``schemes`` debugfs file.
-Reading the file also shows the statistics of each scheme.  To the file, each
-of the schemes should be represented in each line in below form::
-
-    <target access pattern> <action> <quota> <watermarks>
-
-You can disable schemes by simply writing an empty string to the file.
-
-Target Access Pattern
-~~~~~~~~~~~~~~~~~~~~~
-
-The target access :ref:`pattern <damon_design_damos_access_pattern>` of the
-scheme.  The ``<target access pattern>`` is constructed with three ranges in
-below form::
-
-    min-size max-size min-acc max-acc min-age max-age
-
-Specifically, bytes for the size of regions (``min-size`` and ``max-size``),
-number of monitored accesses per aggregate interval for access frequency
-(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of
-regions (``min-age`` and ``max-age``) are specified.  Note that the ranges are
-closed interval.
-
-Action
-~~~~~~
-
-The ``<action>`` is a predefined integer for memory management :ref:`actions
-<damon_design_damos_action>`.  The mapping between the ``<action>`` values and
-the memory management actions is as below.  For the detailed meaning of the
-action and DAMON operations set supporting each action, please refer to the
-list on :ref:`design doc <damon_design_damos_action>`.
-
- - 0: ``willneed``
- - 1: ``cold``
- - 2: ``pageout``
- - 3: ``hugepage``
- - 4: ``nohugepage``
- - 5: ``stat``
-
-Quota
-~~~~~
-
-Users can set the :ref:`quotas <damon_design_damos_quotas>` of the given scheme
-via the ``<quota>`` in below form::
-
-    <ms> <sz> <reset interval> <priority weights>
-
-This makes DAMON to try to use only up to ``<ms>`` milliseconds for applying
-the action to memory regions of the ``target access pattern`` within the
-``<reset interval>`` milliseconds, and to apply the action to only up to
-``<sz>`` bytes of memory regions within the ``<reset interval>``.  Setting both
-``<ms>`` and ``<sz>`` zero disables the quota limits.
-
-For the :ref:`prioritization <damon_design_damos_quotas_prioritization>`, users
-can set the weights for the three properties in ``<priority weights>`` in below
-form::
-
-    <size weight> <access frequency weight> <age weight>
-
-Watermarks
-~~~~~~~~~~
-
-Users can specify :ref:`watermarks <damon_design_damos_watermarks>` of the
-given scheme via ``<watermarks>`` in below form::
-
-    <metric> <check interval> <high mark> <middle mark> <low mark>
-
-``<metric>`` is a predefined integer for the metric to be checked.  The
-supported numbers and their meanings are as below.
-
- - 0: Ignore the watermarks
- - 1: System's free memory rate (per thousand)
-
-The value of the metric is checked every ``<check interval>`` microseconds.
-
-If the value is higher than ``<high mark>`` or lower than ``<low mark>``, the
-scheme is deactivated.  If the value is lower than ``<mid mark>``, the scheme
-is activated.
-
-.. _damos_stats:
-
-Statistics
-~~~~~~~~~~
-
-It also counts the total number and bytes of regions that each scheme is tried
-to be applied, the two numbers for the regions that each scheme is successfully
-applied, and the total number of the quota limit exceeds.  This statistics can
-be used for online analysis or tuning of the schemes.
-
-The statistics can be shown by reading the ``schemes`` file.  Reading the file
-will show each scheme you entered in each line, and the five numbers for the
-statistics will be added at the end of each line.
-
-Example
-~~~~~~~
-
-Below commands applies a scheme saying "If a memory region of size in [4KiB,
-8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
-interval in [10, 20], page out the region.  For the paging out, use only up to
-10ms per second, and also don't page out more than 1GiB per second.  Under the
-limitation, page out memory regions having longer age first.  Also, check the
-free memory rate of the system every 5 seconds, start the monitoring and paging
-out when the free memory rate becomes lower than 50%, but stop it if the free
-memory rate becomes larger than 60%, or lower than 30%".::
-
-    # cd <debugfs>/damon
-    # scheme="4096 8192  0 5    10 20    2"  # target access pattern and action
-    # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
-    # scheme+=" 0 0 100"                     # prioritization weights
-    # scheme+=" 1 5000000 600 500 300"       # watermarks
-    # echo "$scheme" > schemes
-
-
-Turning On/Off
--------------
-
-Setting the files as described above doesn't incur effect unless you explicitly
-start the monitoring.  You can start, stop, and check the current status of the
-monitoring by writing to and reading from the ``monitor_on_DEPRECATED`` file.
-Writing ``on`` to the file starts the monitoring of the targets with the
-attributes.  Writing ``off`` to the file stops those.  DAMON also stops if
-every target process is terminated.  Below example commands turn on, off, and
-check the status of DAMON::
-
-    # cd <debugfs>/damon
-    # echo on > monitor_on_DEPRECATED
-    # echo off > monitor_on_DEPRECATED
-    # cat monitor_on_DEPRECATED
-    off
-
-Please note that you cannot write to the above-mentioned debugfs files while
-the monitoring is turned on.  If you write to the files while DAMON is running,
-an error code such as ``-EBUSY`` will be returned.
-
-
-Monitoring Thread PID
---------------------
-
-DAMON does requested monitoring with a kernel thread called ``kdamond``.  You
-can get the pid of the thread by reading the ``kdamond_pid`` file.  When the
-monitoring is turned off, reading the file returns ``none``. ::
-
-    # cd <debugfs>/damon
-    # cat monitor_on_DEPRECATED
-    off
-    # cat kdamond_pid
-    none
-    # echo on > monitor_on_DEPRECATED
-    # cat kdamond_pid
-    18594
-
-
-Using Multiple Monitoring Threads
---------------------------------
-
-One ``kdamond`` thread is created for each monitoring context.  You can create
-and remove monitoring contexts for multiple ``kdamond`` required use case using
-the ``mk_contexts`` and ``rm_contexts`` files.
-
-Writing the name of the new context to the ``mk_contexts`` file creates a
-directory of the name on the DAMON debugfs directory.  The directory will have
-DAMON debugfs files for the context. ::
-
-    # cd <debugfs>/damon
-    # ls foo
-    # ls: cannot access 'foo': No such file or directory
-    # echo foo > mk_contexts
-    # ls foo
-    # attrs  init_regions  kdamond_pid  schemes  target_ids
-
-If the context is not needed anymore, you can remove it and the corresponding
-directory by putting the name of the context to the ``rm_contexts`` file. ::
-
-    # echo foo > rm_contexts
-    # ls foo
-    # ls: cannot access 'foo': No such file or directory
-
-Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files
-are in the root directory only.
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@ -280,8 +280,8 @@ The following files are currently defined:
 		       blocks; configure auto-onlining.

 		       The default value depends on the
-		       CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel configuration
-		       option.
+		       CONFIG_MHP_DEFAULT_ONLINE_TYPE kernel configuration
+		       options.

 		       See the ``state`` property of memory blocks for details.
 ``block_size_bytes``   read-only: the size in bytes of a memory block.
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@ -332,6 +332,12 @@ allocation policy for the internal shmem mount by using the kernel parameter
 seven valid policies for shmem (``always``, ``within_size``, ``advise``,
 ``never``, ``deny``, and ``force``).

+Similarly to ``transparent_hugepage_shmem``, you can control the default
+hugepage allocation policy for the tmpfs mount by using the kernel parameter
+``transparent_hugepage_tmpfs=<policy>``, where ``<policy>`` is one of the
+four valid policies for tmpfs (``always``, ``within_size``, ``advise``,
+``never``). The tmpfs mount default policy is ``never``.
+
 In the same manner as ``thp_anon`` controls each supported anonymous THP
 size, ``thp_shmem`` controls each supported shmem THP size. ``thp_shmem``
 has the same format as ``thp_anon``, but also supports the policy
@ -352,8 +358,21 @@ default to ``never``.
 Hugepages in tmpfs/shmem
 ========================

-You can control hugepage allocation policy in tmpfs with mount option
-``huge=``. It can have following values:
+Traditionally, tmpfs only supported a single huge page size ("PMD"). Today,
+it also supports smaller sizes just like anonymous memory, often referred
+to as "multi-size THP" (mTHP). Huge pages of any size are commonly
+represented in the kernel as "large folios".
+
+While there is fine control over the huge page sizes to use for the internal
+shmem mount (see below), ordinary tmpfs mounts will make use of all available
+huge page sizes without any control over the exact sizes, behaving more like
+other file systems.
+
+tmpfs mounts
+------------
+
+The THP allocation policy for tmpfs mounts can be adjusted using the mount
+option: ``huge=``. It can have following values:

 always
    Attempt to allocate huge pages every time we need a new page;
@ -363,24 +382,24 @@ never

 within_size
    Only allocate huge page if it will be fully within i_size.
-    Also respect fadvise()/madvise() hints;
+    Also respect madvise() hints;

 advise
-    Only allocate huge pages if requested with fadvise()/madvise();
+    Only allocate huge pages if requested with madvise();

-The default policy is ``never``.
+Remember, that the kernel may use huge pages of all available sizes, and
+that no fine control as for the internal tmpfs mount is available.
+
+The default policy in the past was ``never``, but it can now be adjusted
+using the kernel parameter ``transparent_hugepage_tmpfs=<policy>``.

 ``mount -o remount,huge= /mountpoint`` works fine after mount: remounting
 ``huge=never`` will not attempt to break up huge pages at all, just stop more
 from being allocated.

-There's also sysfs knob to control hugepage allocation policy for internal
-shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
-is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
-MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
-
-In addition to policies listed above, shmem_enabled allows two further
-values:
+In addition to policies listed above, the sysfs knob
+/sys/kernel/mm/transparent_hugepage/shmem_enabled will affect the
+allocation policy of tmpfs mounts, when set to the following values:

 deny
    For use in emergencies, to force the huge option off from
@ -388,13 +407,24 @@ deny
 force
    Force the huge option on for all - very useful for testing;

-Shmem can also use "multi-size THP" (mTHP) by adding a new sysfs knob to
-control mTHP allocation:
-'/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/shmem_enabled',
-and its value for each mTHP is essentially consistent with the global
-setting.  An 'inherit' option is added to ensure compatibility with these
-global settings.  Conversely, the options 'force' and 'deny' are dropped,
-which are rather testing artifacts from the old ages.
+shmem / internal tmpfs
+----------------------
+The mount internal tmpfs mount is used for SysV SHM, memfds, shared anonymous
+mmaps (of /dev/zero or MAP_ANONYMOUS), GPU drivers' DRM  objects, Ashmem.
+
+To control the THP allocation policy for this internal tmpfs mount, the
+sysfs knob /sys/kernel/mm/transparent_hugepage/shmem_enabled and the knobs
+per THP size in
+'/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/shmem_enabled'
+can be used.
+
+The global knob has the same semantics as the ``huge=`` mount options
+for tmpfs mounts, except that the different huge page sizes can be controlled
+individually, and will only use the setting of the global knob when the
+per-size knob is set to 'inherit'.
+
+The options 'force' and 'deny' are dropped for the individual sizes, which
+are rather testing artifacts from the old ages.

 always
    Attempt to allocate <size> huge pages every time we need a new page;
@ -408,10 +438,10 @@ never

 within_size
    Only allocate <size> huge page if it will be fully within i_size.
-    Also respect fadvise()/madvise() hints;
+    Also respect madvise() hints;

 advise
-    Only allocate <size> huge pages if requested with fadvise()/madvise();
+    Only allocate <size> huge pages if requested with madvise();

 Need of application restart
 ===========================
@ -561,6 +591,16 @@ swpin
 	is incremented every time a huge page is swapped in from a non-zswap
 	swap device in one piece.

+swpin_fallback
+	is incremented if swapin fails to allocate or charge a huge page
+	and instead falls back to using huge pages with lower orders or
+	small pages.
+
+swpin_fallback_charge
+	is incremented if swapin fails to charge a huge page and instead
+	falls back to using  huge pages with lower orders or small pages
+	even though the allocation was successful.
+
 swpout
 	is incremented every time a huge page is swapped out to a non-zswap
 	swap device in one piece without splitting.
--- a/Documentation/core-api/min_heap.rst
+++ b/Documentation/core-api/min_heap.rst
@ -4,6 +4,8 @@
 Min Heap API
 ============

+:Author: Kuan-Wei Chiu <visitorckw@gmail.com>
+
 Introduction
 ============

--- a/Documentation/core-api/xarray.rst
+++ b/Documentation/core-api/xarray.rst
@ -42,8 +42,8 @@ call xa_tag_pointer() to create an entry with a tag, xa_untag_pointer()
 to turn a tagged entry back into an untagged pointer and xa_pointer_tag()
 to retrieve the tag of an entry.  Tagged pointers use the same bits that
 are used to distinguish value entries from normal pointers, so you must
-decide whether they want to store value entries or tagged pointers in
-any particular XArray.
+decide whether you want to store value entries or tagged pointers in any
+particular XArray.

 The XArray does not support storing IS_ERR() pointers as some
 conflict with value entries or internal entries.
@ -52,8 +52,9 @@ An unusual feature of the XArray is the ability to create entries which
 occupy a range of indices.  Once stored to, looking up any index in
 the range will return the same entry as looking up any other index in
 the range.  Storing to any index will store to all of them.  Multi-index
-entries can be explicitly split into smaller entries, or storing ``NULL``
-into any entry will cause the XArray to forget about the range.
+entries can be explicitly split into smaller entries. Unsetting (using
+xa_erase() or xa_store() with ``NULL``) any entry will cause the XArray
+to forget about the range.

 Normal API
 ==========
@ -63,13 +64,14 @@ for statically allocated XArrays or xa_init() for dynamically
 allocated ones.  A freshly-initialised XArray contains a ``NULL``
 pointer at every index.

-You can then set entries using xa_store() and get entries
-using xa_load().  xa_store will overwrite any entry with the
-new entry and return the previous entry stored at that index.  You can
-use xa_erase() instead of calling xa_store() with a
-``NULL`` entry.  There is no difference between an entry that has never
-been stored to, one that has been erased and one that has most recently
-had ``NULL`` stored to it.
+You can then set entries using xa_store() and get entries using
+xa_load().  xa_store() will overwrite any entry with the new entry and
+return the previous entry stored at that index.  You can unset entries
+using xa_erase() or by setting the entry to ``NULL`` using xa_store().
+There is no difference between an entry that has never been stored to
+and one that has been erased with xa_erase(); an entry that has most
+recently had ``NULL`` stored to it is also equivalent except if the
+XArray was initialized with ``XA_FLAGS_ALLOC``.

 You can conditionally replace an entry at an index by using
 xa_cmpxchg().  Like cmpxchg(), it will only succeed if
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@ -48,6 +48,7 @@ fixes/update part 1.1  Stefani Seibold <stefani@seibold.net>    June 9 2009
  3.11	/proc/<pid>/patch_state - Livepatch patch operation state
  3.12	/proc/<pid>/arch_status - Task architecture specific information
  3.13  /proc/<pid>/fd - List of symlinks to open files
+  3.14  /proc/<pid/ksm_stat - Information about the process's ksm status.

  4	Configuring procfs
  4.1	Mount options
@ -484,14 +485,15 @@ Memory Area, or VMA) there is a series of lines such as the following::
    THPeligible:           0
    VmFlags: rd ex mr mw me dw

-The first of these lines shows the same information as is displayed for the
-mapping in /proc/PID/maps.  Following lines show the size of the mapping
-(size); the size of each page allocated when backing a VMA (KernelPageSize),
-which is usually the same as the size in the page table entries; the page size
-used by the MMU when backing a VMA (in most cases, the same as KernelPageSize);
-the amount of the mapping that is currently resident in RAM (RSS); the
-process' proportional share of this mapping (PSS); and the number of clean and
-dirty shared and private pages in the mapping.
+The first of these lines shows the same information as is displayed for
+the mapping in /proc/PID/maps.  Following lines show the size of the
+mapping (size); the size of each page allocated when backing a VMA
+(KernelPageSize), which is usually the same as the size in the page table
+entries; the page size used by the MMU when backing a VMA (in most cases,
+the same as KernelPageSize); the amount of the mapping that is currently
+resident in RAM (RSS); the process's proportional share of this mapping
+(PSS); and the number of clean and dirty shared and private pages in the
+mapping.

 The "proportional set size" (PSS) of a process is the count of pages it has
 in memory, where each page is divided by the number of processes sharing it.
@ -2232,6 +2234,74 @@ The number of open files for the process is stored in 'size' member
 of stat() output for /proc/<pid>/fd for fast access.
 -------------------------------------------------------

+3.14 /proc/<pid/ksm_stat - Information about the process's ksm status
+---------------------------------------------------------------------
+When CONFIG_KSM is enabled, each process has this file which displays
+the information of ksm merging status.
+
+Example
+~~~~~~~
+
+::
+
+    / # cat /proc/self/ksm_stat
+    ksm_rmap_items 0
+    ksm_zero_pages 0
+    ksm_merging_pages 0
+    ksm_process_profit 0
+    ksm_merge_any: no
+    ksm_mergeable: no
+
+Description
+~~~~~~~~~~~
+
+ksm_rmap_items
+^^^^^^^^^^^^^^
+
+The number of ksm_rmap_item structure in use. The structure of
+ksm_rmap_item is to store the reverse mapping information for virtual
+addresses. KSM will generate a ksm_rmap_item for each ksm-scanned page
+of the process.
+
+ksm_zero_pages
+^^^^^^^^^^^^^^
+
+When /sys/kernel/mm/ksm/use_zero_pages is enabled, it represent how many
+empty pages are merged with kernel zero pages by KSM.
+
+ksm_merging_pages
+^^^^^^^^^^^^^^^^^
+
+It represents how many pages of this process are involved in KSM merging
+(not including ksm_zero_pages). It is the same with what
+/proc/<pid>/ksm_merging_pages shows.
+
+ksm_process_profit
+^^^^^^^^^^^^^^^^^^
+
+The profit that KSM brings (Saved bytes). KSM can save memory by merging
+identical pages, but also can consume additional memory, because it needs
+to generate a number of rmap_items to save each scanned page's brief rmap
+information. Some of these pages may be merged, but some may not be abled
+to be merged after being checked several times, which are unprofitable
+memory consumed.
+
+ksm_merge_any
+^^^^^^^^^^^^^
+
+It specifies whether the process's mm is added by prctl() into the
+candidate list of KSM or not, and KSM scanning is fully enabled at process
+level.
+
+ksm_mergeable
+^^^^^^^^^^^^^
+
+It specifies whether any VMAs of the process's mm are currently applicable
+to KSM.
+
+More information about KSM can be found at
+Documentation/admin-guide/mm/ksm.rst.
+

 Chapter 4: Configuring procfs
 =============================
@ -2261,7 +2331,7 @@ arguments are now protected against local eavesdroppers.
 hidepid=invisible or hidepid=2 means hidepid=1 plus all /proc/<pid>/ will be
 fully invisible to other users.  It doesn't mean that it hides a fact whether a
 process with a specific pid value exists (it can be learned by other means, e.g.
-by "kill -0 $PID"), but it hides process' uid and gid, which may be learned by
+by "kill -0 $PID"), but it hides process's uid and gid, which may be learned by
 stat()'ing /proc/<pid>/ otherwise.  It greatly complicates an intruder's task of
 gathering information about running processes, whether some daemon runs with
 elevated privileges, whether other user runs some sensitive program, whether
--- a/Documentation/filesystems/squashfs.rst
+++ b/Documentation/filesystems/squashfs.rst
@ -6,7 +6,7 @@ Squashfs 4.0 Filesystem

 Squashfs is a compressed read-only filesystem for Linux.

-It uses zlib, lz4, lzo, or xz compression to compress files, inodes and
+It uses zlib, lz4, lzo, xz or zstd compression to compress files, inodes and
 directories.  Inodes in the system are very small and all blocks are packed to
 minimise data overhead. Block sizes greater than 4K are supported up to a
 maximum of 1Mbytes (default block size 128K).
@ -16,8 +16,8 @@ use (i.e. in cases where a .tar.gz file may be used), and in constrained
 block device/memory systems (e.g. embedded systems) where low overhead is
 needed.

-Mailing list: squashfs-devel@lists.sourceforge.net
-Web site: www.squashfs.org
+Mailing list (kernel code): linux-fsdevel@vger.kernel.org
+Web site: github.com/plougher/squashfs-tools

 1. Filesystem Features
 ----------------------
@ -58,11 +58,9 @@ inodes have different sizes).

 As squashfs is a read-only filesystem, the mksquashfs program must be used to
 create populated squashfs filesystems.  This and other squashfs utilities
-can be obtained from http://www.squashfs.org.  Usage instructions can be
-obtained from this site also.
-
-The squashfs-tools development tree is now located on kernel.org
-	git://git.kernel.org/pub/scm/fs/squashfs/squashfs-tools.git
+are very likely packaged by your linux distribution (called squashfs-tools).
+The source code can be obtained from github.com/plougher/squashfs-tools.
+Usage instructions can also be obtained from this site.

 2.1 Mount options
 -----------------
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@ -203,6 +203,8 @@ This scheme, however, cannot preserve the quality of the output if the
 assumption is not guaranteed.


+.. _damon_design_adaptive_regions_adjustment:
+
 Adaptive Regions Adjustment
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -264,6 +266,61 @@ tracepoints.  For more details, please refer to the documentations for
 respectively.


+.. _damon_design_monitoring_params_tuning_guide:
+
+Monitoring Parameters Tuning Guide
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In short, set ``aggregation interval`` to capture meaningful amount of accesses
+for the purpose.  The amount of accesses can be measured using ``nr_accesses``
+and ``age`` of regions in the aggregated monitoring results snapshot.  The
+default value of the interval, ``100ms``, turns out to be too short in many
+cases.  Set ``sampling interval`` proportional to ``aggregation interval``.  By
+default, ``1/20`` is recommended as the ratio.
+
+``Aggregation interval`` should be set as the time interval that the workload
+can make an amount of accesses for the monitoring purpose, within the interval.
+If the interval is too short, only small number of accesses are captured.  As a
+result, the monitoring results look everything is samely accessed only rarely.
+For many purposes, that would be useless.  If it is too long, however, the time
+to converge regions with the :ref:`regions adjustment mechanism
+<damon_design_adaptive_regions_adjustment>` can be too long, depending on the
+time scale of the given purpose.  This could happen if the workload is actually
+making only rare accesses but the user thinks the amount of accesses for the
+monitoring purpose too high.  For such cases, the target amount of access to
+capture per ``aggregation interval`` should carefully reconsidered.  Also, note
+that the captured amount of accesses is represented with not only
+``nr_accesses``, but also ``age``.  For example, even if every region on the
+monitoring results show zero ``nr_accesses``, regions could still be
+distinguished using ``age`` values as the recency information.
+
+Hence the optimum value of ``aggregation interval`` depends on the access
+intensiveness of the workload.  The user should tune the interval based on the
+amount of access that captured on each aggregated snapshot of the monitoring
+results.
+
+Note that the default value of the interval is 100 milliseconds, which is too
+short in many cases, especially on large systems.
+
+``Sampling interval`` defines the resolution of each aggregation.  If it is set
+too large, monitoring results will look like every region was samely rarely
+accessed, or samely frequently accessed.  That is, regions become
+undistinguishable based on access pattern, and therefore the results will be
+useless in many use cases.  If ``sampling interval`` is too small, it will not
+degrade the resolution, but will increase the monitoring overhead.  If it is
+appropriate enough to provide a resolution of the monitoring results that
+sufficient for the given purpose, it shouldn't be unnecessarily further
+lowered.  It is recommended to be set proportional to ``aggregation interval``.
+By default, the ratio is set as ``1/20``, and it is still recommended.
+
+Refer to below documents for an example tuning based on the above guide.
+
+.. toctree::
+   :maxdepth: 1
+
+   monitoring_intervals_tuning_example
+
+
 .. _damon_design_damos:

 Operation Schemes
@ -504,9 +561,34 @@ have a list of latency-critical processes.

 To let users optimize DAMOS schemes with such special knowledge, DAMOS provides
 a feature called DAMOS filters.  The feature allows users to set an arbitrary
-number of filters for each scheme.  Each filter specifies the type of target
-memory, and whether it should exclude the memory of the type (filter-out), or
-all except the memory of the type (filter-in).
+number of filters for each scheme.  Each filter specifies
+
+- a type of memory (``type``),
+- whether it is for the memory of the type or all except the type
+  (``matching``), and
+- whether it is to allow (include) or reject (exclude) applying
+  the scheme's action to the memory (``allow``).
+
+When multiple filters are installed, each filter is evaluated in the installed
+order.  If a part of memory is matched to one of the filter, next filters are
+ignored.  If the memory passes through the filters evaluation stage because it
+is not matched to any of the filters, applying the scheme's action to it is
+allowed, same to the behavior when no filter exists.
+
+For example, let's assume 1) a filter for allowing anonymous pages and 2)
+another filter for rejecting young pages are installed in the order.  If a page
+of a region that eligible to apply the scheme's action is an anonymous page,
+the scheme's action will be applied to the page regardless of whether it is
+young or not, since it matches with the first allow-filter.  If the page is
+not anonymous but young, the scheme's action will not be applied, since the
+second reject-filter blocks it.  If the page is neither anonymous nor young,
+the page will pass through the filters evaluation stage since there is no
+matching filter, and the action will be applied to the page.
+
+Note that the action can equally be applied to memory that either explicitly
+filter-allowed or filters evaluation stage passed.  It means that installing
+allow-filters at the end of the list makes no practical change but only
+filters-checking overhead.

 For efficient handling of filters, some types of filters are handled by the
 core layer, while others are handled by operations set.  In the latter case,
@ -516,7 +598,7 @@ filter are not counted as the scheme has tried to the region.  In contrast, if
 a memory regions is filtered by an operations set layer-handled filter, it is
 counted as the scheme has tried.  This difference affects the statistics.

-Below types of filters are currently supported.
+Below ``type`` of filters are currently supported.

 - anonymous page
    - Applied to pages that containing data that not stored in files.
@ -539,6 +621,60 @@ To know how user-space can set the watermarks via :ref:`DAMON sysfs interface
 <sysfs_interface>`, refer to :ref:`filters <sysfs_filters>` part of the
 documentation.

+.. _damon_design_damos_stat:
+
+Statistics
+~~~~~~~~~~
+
+The statistics of DAMOS behaviors that designed to help monitoring, tuning and
+debugging of DAMOS.
+
+DAMOS accounts below statistics for each scheme, from the beginning of the
+scheme's execution.
+
+- ``nr_tried``: Total number of regions that the scheme is tried to be applied.
+- ``sz_trtied``: Total size of regions that the scheme is tried to be applied.
+- ``sz_ops_filter_passed``: Total bytes that passed operations set
+  layer-handled DAMOS filters.
+- ``nr_applied``: Total number of regions that the scheme is applied.
+- ``sz_applied``: Total size of regions that the scheme is applied.
+- ``qt_exceeds``: Total number of times the quota of the scheme has exceeded.
+
+"A scheme is tried to be applied to a region" means DAMOS core logic determined
+the region is eligible to apply the scheme's :ref:`action
+<damon_design_damos_action>`.  The :ref:`access pattern
+<damon_design_damos_access_pattern>`, :ref:`quotas
+<damon_design_damos_quotas>`, :ref:`watermarks
+<damon_design_damos_watermarks>`, and :ref:`filters
+<damon_design_damos_filters>` that handled on core logic could affect this.
+The core logic will only ask the underlying :ref:`operation set
+<damon_operations_set>` to do apply the action to the region, so whether the
+action is really applied or not is unclear.  That's why it is called "tried".
+
+"A scheme is applied to a region" means the :ref:`operation set
+<damon_operations_set>` has applied the action to at least a part of the
+region.  The :ref:`filters <damon_design_damos_filters>` that handled by the
+operation set, and the types of the :ref:`action <damon_design_damos_action>`
+and the pages of the region can affect this.  For example, if a filter is set
+to exclude anonymous pages and the region has only anonymous pages, or if the
+action is ``pageout`` while all pages of the region are unreclaimable, applying
+the action to the region will fail.
+
+To know how user-space can read the stats via :ref:`DAMON sysfs interface
+<sysfs_interface>`, refer to :ref:s`stats <sysfs_stats>` part of the
+documentation.
+
+Regions Walking
+~~~~~~~~~~~~~~~
+
+DAMOS feature allowing users access each region that a DAMOS action has just
+applied.  Using this feature, DAMON :ref:`API <damon_design_api>` allows users
+access full properties of the regions including the access monitoring results
+and amount of the region's internal memory that passed the DAMOS filters.
+:ref:`DAMON sysfs interface <sysfs_interface>` also allows users read the data
+via special :ref:`files <sysfs_schemes_tried_regions>`.
+
+.. _damon_design_api:

 Application Programming Interface
 ---------------------------------
@ -573,15 +709,11 @@ General Purpose User Interface Modules
 DAMON modules that provide user space ABIs for general purpose DAMON usage in
 runtime.

-DAMON user interface modules, namely 'DAMON sysfs interface' and 'DAMON debugfs
-interface' are DAMON API user kernel modules that provide ABIs to the
-user-space.  Please note that DAMON debugfs interface is currently deprecated.
-
-Like many other ABIs, the modules create files on sysfs and debugfs, allow
-users to specify their requests to and get the answers from DAMON by writing to
-and reading from the files.  As a response to such I/O, DAMON user interface
-modules control DAMON and retrieve the results as user requested via the DAMON
-API, and return the results to the user-space.
+Like many other ABIs, the modules create files on pseudo file systems like
+'sysfs', allow users to specify their requests to and get the answers from
+DAMON by writing to and reading from the files.  As a response to such I/O,
+DAMON user interface modules control DAMON and retrieve the results as user
+requested via the DAMON API, and return the results to the user-space.

 The ABIs are designed to be used for user space applications development,
 rather than human beings' fingers.  Human users are recommended to use such
@ -590,8 +722,9 @@ Github (https://github.com/damonitor/damo), Pypi
 (https://pypistats.org/packages/damo), and Fedora
 (https://packages.fedoraproject.org/pkgs/python-damo/damo/).

-Please refer to the ABI :doc:`document </admin-guide/mm/damon/usage>` for
-details of the interfaces.
+Currently, one module for this type, namely 'DAMON sysfs interface' is
+available.  Please refer to the ABI :ref:`doc <sysfs_interface>` for details of
+the interfaces.


 Special-Purpose Access-aware Kernel Modules
@ -599,8 +732,8 @@ Special-Purpose Access-aware Kernel Modules

 DAMON modules that provide user space ABI for specific purpose DAMON usage.

-DAMON sysfs/debugfs user interfaces are for full control of all DAMON features
-in runtime.  For each special-purpose system-wide data access-aware system
+DAMON user interface modules are for full control of all DAMON features in
+runtime.  For each special-purpose system-wide data access-aware system
 operations such as proactive reclamation or LRU lists balancing, the interfaces
 could be simplified by removing unnecessary knobs for the specific purpose, and
 extended for boot-time and even compile time control.  Default values of DAMON
--- a/Documentation/mm/damon/monitoring_intervals_tuning_example.rst
+++ b/Documentation/mm/damon/monitoring_intervals_tuning_example.rst
@ -0,0 +1,247 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================================
+DAMON Moniting Interval Parameters Tuning Example
+=================================================
+
+DAMON's monitoring parameters need tuning based on given workload and the
+monitoring purpose.  There is a :ref:`tuning guide
+<damon_design_monitoring_params_tuning_guide>` for that.  This document
+provides an example tuning based on the guide.
+
+Setup
+=====
+
+For below example, DAMON of Linux kernel v6.11 and `damo
+<https://github.com/damonitor/damo>`_ (DAMON user-space tool) v2.5.9 was used to
+monitor and visualize access patterns on the physical address space of a system
+running a real-world server workload.
+
+5ms/100ms intervals: Too Short Interval
+=======================================
+
+Let's start by capturing the access pattern snapshot on the physical address
+space of the system using DAMON, with the default interval parameters (5
+milliseconds and 100 milliseconds for the sampling and the aggregation
+intervals, respectively).  Wait ten minutes between the start of DAMON and
+the capturing of the snapshot, to show a meaningful time-wise access patterns.
+::
+
+    # damo start
+    # sleep 600
+    # damo record --snapshot 0 1
+    # damo stop
+
+Then, list the DAMON-found regions of different access patterns, sorted by the
+"access temperature".  "Access temperature" is a metric representing the
+access-hotness of a region.  It is calculated as a weighted sum of the access
+frequency and the age of the region.  If the access frequency is 0 %, the
+temperature is multipled by minus one.  That is, if a region is not accessed,
+it gets minus temperature and it gets lower as not accessed for longer time.
+The sorting is in temperature-ascendint order, so the region at the top of the
+list is the coldest, and the one at the bottom is the hottest one. ::
+
+    # damo report access --sort_regions_by temperature
+    0   addr 16.052 GiB   size 5.985 GiB   access 0 %   age 5.900 s    # coldest
+    1   addr 22.037 GiB   size 6.029 GiB   access 0 %   age 5.300 s
+    2   addr 28.065 GiB   size 6.045 GiB   access 0 %   age 5.200 s
+    3   addr 10.069 GiB   size 5.983 GiB   access 0 %   age 4.500 s
+    4   addr 4.000 GiB    size 6.069 GiB   access 0 %   age 4.400 s
+    5   addr 62.008 GiB   size 3.992 GiB   access 0 %   age 3.700 s
+    6   addr 56.795 GiB   size 5.213 GiB   access 0 %   age 3.300 s
+    7   addr 39.393 GiB   size 6.096 GiB   access 0 %   age 2.800 s
+    8   addr 50.782 GiB   size 6.012 GiB   access 0 %   age 2.800 s
+    9   addr 34.111 GiB   size 5.282 GiB   access 0 %   age 2.300 s
+    10  addr 45.489 GiB   size 5.293 GiB   access 0 %   age 1.800 s    # hottest
+    total size: 62.000 GiB
+
+The list shows not seemingly hot regions, and only minimum access pattern
+diversity.  Every region has zero access frequency.  The number of region is
+10, which is the default ``min_nr_regions value``.  Size of each region is also
+nearly idential.  We can suspect this is because “adaptive regions adjustment”
+mechanism was not well working.  As the guide suggested, we can get relative
+hotness of regions using ``age`` as the recency information.  That would be
+better than nothing, but given the fact that the longest age is only about 6
+seconds while we waited about ten minuts, it is unclear how useful this will
+be.
+
+The temperature ranges to total size of regions of each range histogram
+visualization of the results also shows no interesting distribution pattern. ::
+
+    # damo report access --style temperature-sz-hist
+    <temperature> <total size>
+    [-,590,000,000, -,549,000,000) 5.985 GiB  |**********          |
+    [-,549,000,000, -,508,000,000) 12.074 GiB |********************|
+    [-,508,000,000, -,467,000,000) 0 B        |                    |
+    [-,467,000,000, -,426,000,000) 12.052 GiB |********************|
+    [-,426,000,000, -,385,000,000) 0 B        |                    |
+    [-,385,000,000, -,344,000,000) 3.992 GiB  |*******             |
+    [-,344,000,000, -,303,000,000) 5.213 GiB  |*********           |
+    [-,303,000,000, -,262,000,000) 12.109 GiB |********************|
+    [-,262,000,000, -,221,000,000) 5.282 GiB  |*********           |
+    [-,221,000,000, -,180,000,000) 0 B        |                    |
+    [-,180,000,000, -,139,000,000) 5.293 GiB  |*********           |
+    total size: 62.000 GiB
+
+In short, the parameters provide poor quality monitoring results for hot
+regions detection. According to the :ref:`guide
+<damon_design_monitoring_params_tuning_guide>`, this is due to the too short
+aggregation interval.
+
+100ms/2s intervals: Starts Showing Small Hot Regions
+====================================================
+
+Following the guide, increase the interval 20 times (100 milliseocnds and 2
+seconds for sampling and aggregation intervals, respectively). ::
+
+    # damo start -s 100ms -a 2s
+    # sleep 600
+    # damo record --snapshot 0 1
+    # damo stop
+    # damo report access --sort_regions_by temperature
+    0   addr 10.180 GiB   size 6.117 GiB   access 0 %   age 7 m 8 s    # coldest
+    1   addr 49.275 GiB   size 6.195 GiB   access 0 %   age 6 m 14 s
+    2   addr 62.421 GiB   size 3.579 GiB   access 0 %   age 6 m 4 s
+    3   addr 40.154 GiB   size 6.127 GiB   access 0 %   age 5 m 40 s
+    4   addr 16.296 GiB   size 6.182 GiB   access 0 %   age 5 m 32 s
+    5   addr 34.254 GiB   size 5.899 GiB   access 0 %   age 5 m 24 s
+    6   addr 46.281 GiB   size 2.995 GiB   access 0 %   age 5 m 20 s
+    7   addr 28.420 GiB   size 5.835 GiB   access 0 %   age 5 m 6 s
+    8   addr 4.000 GiB    size 6.180 GiB   access 0 %   age 4 m 16 s
+    9   addr 22.478 GiB   size 5.942 GiB   access 0 %   age 3 m 58 s
+    10  addr 55.470 GiB   size 915.645 MiB access 0 %   age 3 m 6 s
+    11  addr 56.364 GiB   size 6.056 GiB   access 0 %   age 2 m 8 s
+    12  addr 56.364 GiB   size 4.000 KiB   access 95 %  age 16 s
+    13  addr 49.275 GiB   size 4.000 KiB   access 100 % age 8 m 24 s   # hottest
+    total size: 62.000 GiB
+    # damo report access --style temperature-sz-hist
+    <temperature> <total size>
+    [-42,800,000,000, -33,479,999,000) 22.018 GiB |*****************   |
+    [-33,479,999,000, -24,159,998,000) 27.090 GiB |********************|
+    [-24,159,998,000, -14,839,997,000) 6.836 GiB  |******              |
+    [-14,839,997,000, -5,519,996,000)  6.056 GiB  |*****               |
+    [-5,519,996,000, 3,800,005,000)    4.000 KiB  |*                   |
+    [3,800,005,000, 13,120,006,000)    0 B        |                    |
+    [13,120,006,000, 22,440,007,000)   0 B        |                    |
+    [22,440,007,000, 31,760,008,000)   0 B        |                    |
+    [31,760,008,000, 41,080,009,000)   0 B        |                    |
+    [41,080,009,000, 50,400,010,000)   0 B        |                    |
+    [50,400,010,000, 59,720,011,000)   4.000 KiB  |*                   |
+    total size: 62.000 GiB
+
+DAMON found two distinct 4 KiB regions that pretty hot.  The regions are also
+well aged.  The hottest 4 KiB region was keeping the access frequency for about
+8 minutes, and the coldest region was keeping no access for about 7 minutes.
+The distribution on the histogram also looks like having a pattern.
+
+Especially, the finding of the 4 KiB regions among the 62 GiB total memory
+shows DAMON’s adaptive regions adjustment is working as designed.
+
+Still the number of regions is close to the ``min_nr_regions``, and sizes of
+cold regions are similar, though.  Apparently it is improved, but it still has
+rooms to improve.
+
+400ms/8s intervals: Pretty Improved Results
+===========================================
+
+Increase the intervals four times (400 milliseconds and 8 seconds
+for sampling and aggregation intervals, respectively). ::
+
+    # damo start -s 400ms -a 8s
+    # sleep 600
+    # damo record --snapshot 0 1
+    # damo stop
+    # damo report access --sort_regions_by temperature
+    0   addr 64.492 GiB   size 1.508 GiB   access 0 %   age 6 m 48 s    # coldest
+    1   addr 21.749 GiB   size 5.674 GiB   access 0 %   age 6 m 8 s
+    2   addr 27.422 GiB   size 5.801 GiB   access 0 %   age 6 m
+    3   addr 49.431 GiB   size 8.675 GiB   access 0 %   age 5 m 28 s
+    4   addr 33.223 GiB   size 5.645 GiB   access 0 %   age 5 m 12 s
+    5   addr 58.321 GiB   size 6.170 GiB   access 0 %   age 5 m 4 s
+    [...]
+    25  addr 6.615 GiB    size 297.531 MiB access 15 %  age 0 ns
+    26  addr 9.513 GiB    size 12.000 KiB  access 20 %  age 0 ns
+    27  addr 9.511 GiB    size 108.000 KiB access 25 %  age 0 ns
+    28  addr 9.513 GiB    size 20.000 KiB  access 25 %  age 0 ns
+    29  addr 9.511 GiB    size 12.000 KiB  access 30 %  age 0 ns
+    30  addr 9.520 GiB    size 4.000 KiB   access 40 %  age 0 ns
+    [...]
+    41  addr 9.520 GiB    size 4.000 KiB   access 80 %  age 56 s
+    42  addr 9.511 GiB    size 12.000 KiB  access 100 % age 6 m 16 s
+    43  addr 58.321 GiB   size 4.000 KiB   access 100 % age 6 m 24 s
+    44  addr 9.512 GiB    size 4.000 KiB   access 100 % age 6 m 48 s
+    45  addr 58.106 GiB   size 4.000 KiB   access 100 % age 6 m 48 s    # hottest
+    total size: 62.000 GiB
+    # damo report access --style temperature-sz-hist
+    <temperature> <total size>
+    [-40,800,000,000, -32,639,999,000) 21.657 GiB  |********************|
+    [-32,639,999,000, -24,479,998,000) 17.938 GiB  |*****************   |
+    [-24,479,998,000, -16,319,997,000) 16.885 GiB  |****************    |
+    [-16,319,997,000, -8,159,996,000)  586.879 MiB |*                   |
+    [-8,159,996,000, 5,000)            4.946 GiB   |*****               |
+    [5,000, 8,160,006,000)             260.000 KiB |*                   |
+    [8,160,006,000, 16,320,007,000)    0 B         |                    |
+    [16,320,007,000, 24,480,008,000)   0 B         |                    |
+    [24,480,008,000, 32,640,009,000)   0 B         |                    |
+    [32,640,009,000, 40,800,010,000)   16.000 KiB  |*                   |
+    [40,800,010,000, 48,960,011,000)   8.000 KiB   |*                   |
+    total size: 62.000 GiB
+
+The number of regions having different access patterns has significantly
+increased.  Size of each region is also more varied. Total size of non-zero
+access frequency regions is also significantly increased. Maybe this is already
+good enough to make some meaningful memory management efficieny changes.
+
+800ms/16s intervals: Another bias
+=================================
+
+Further double the intervals (800 milliseconds and 16 seconds for sampling
+and aggregation intervals, respectively).  The results is more improved for the
+hot regions detection, but starts looking degrading cold regions detection. ::
+
+    # damo start -s 800ms -a 16s
+    # sleep 600
+    # damo record --snapshot 0 1
+    # damo stop
+    # damo report access --sort_regions_by temperature
+    0   addr 64.781 GiB   size 1.219 GiB   access 0 %   age 4 m 48 s
+    1   addr 24.505 GiB   size 2.475 GiB   access 0 %   age 4 m 16 s
+    2   addr 26.980 GiB   size 504.273 MiB access 0 %   age 4 m
+    3   addr 29.443 GiB   size 2.462 GiB   access 0 %   age 4 m
+    4   addr 37.264 GiB   size 5.645 GiB   access 0 %   age 4 m
+    5   addr 31.905 GiB   size 5.359 GiB   access 0 %   age 3 m 44 s
+    [...]
+    20  addr 8.711 GiB    size 40.000 KiB  access 5 %   age 2 m 40 s
+    21  addr 27.473 GiB   size 1.970 GiB   access 5 %   age 4 m
+    22  addr 48.185 GiB   size 4.625 GiB   access 5 %   age 4 m
+    23  addr 47.304 GiB   size 902.117 MiB access 10 %  age 4 m
+    24  addr 8.711 GiB    size 4.000 KiB   access 100 % age 4 m
+    25  addr 20.793 GiB   size 3.713 GiB   access 5 %   age 4 m 16 s
+    26  addr 8.773 GiB    size 4.000 KiB   access 100 % age 4 m 16 s
+    total size: 62.000 GiB
+    # damo report access --style temperature-sz-hist
+    <temperature> <total size>
+    [-28,800,000,000, -23,359,999,000) 12.294 GiB  |*****************   |
+    [-23,359,999,000, -17,919,998,000) 9.753 GiB   |*************       |
+    [-17,919,998,000, -12,479,997,000) 15.131 GiB  |********************|
+    [-12,479,997,000, -7,039,996,000)  0 B         |                    |
+    [-7,039,996,000, -1,599,995,000)   7.506 GiB   |**********          |
+    [-1,599,995,000, 3,840,006,000)    6.127 GiB   |*********           |
+    [3,840,006,000, 9,280,007,000)     0 B         |                    |
+    [9,280,007,000, 14,720,008,000)    136.000 KiB |*                   |
+    [14,720,008,000, 20,160,009,000)   40.000 KiB  |*                   |
+    [20,160,009,000, 25,600,010,000)   11.188 GiB  |***************     |
+    [25,600,010,000, 31,040,011,000)   4.000 KiB   |*                   |
+    total size: 62.000 GiB
+
+It found more non-zero access frequency regions. The number of regions is still
+much higher than the ``min_nr_regions``, but it is reduced from that of the
+previous setup. And apparently the distribution seems bit biased to hot
+regions.
+
+Conclusion
+==========
+
+With the above experimental tuning results, we can conclude the theory and the
+guide makes sense to at least this workload, and could be applied to similar
+cases.
--- a/Documentation/mm/process_addrs.rst
+++ b/Documentation/mm/process_addrs.rst
@ -531,6 +531,10 @@ are extra requirements for accessing them:
  new page table has been installed in the same location and filled with
  entries. Writers normally need to take the PTE lock and revalidate that the
  PMD entry still refers to the same PTE-level page table.
+  If the writer does not care whether it is the same PTE-level page table, it
+  can take the PMD lock and revalidate that the contents of pmd entry still meet
+  the requirements. In particular, this also happens in :c:func:`!retract_page_tables`
+  when handling :c:macro:`!MADV_COLLAPSE`.

 To access PTE-level page tables, a helper like :c:func:`!pte_offset_map_lock` or
 :c:func:`!pte_offset_map` can be used depending on stability requirements.
@ -712,9 +716,14 @@ calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU
 critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`,
 before releasing the RCU lock via :c:func:`!rcu_read_unlock`.

-VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for
-their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it
-via :c:func:`!vma_end_read`.
+In cases when the user already holds mmap read lock, :c:func:`!vma_start_read_locked`
+and :c:func:`!vma_start_read_locked_nested` can be used. These functions do not
+fail due to lock contention but the caller should still check their return values
+in case they fail for other reasons.
+
+VMA read locks increment :c:member:`!vma.vm_refcnt` reference counter for their
+duration and the caller of :c:func:`!lock_vma_under_rcu` must drop it via
+:c:func:`!vma_end_read`.

 VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a
 VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always
@ -722,9 +731,9 @@ acquired. An mmap write lock **must** be held for the duration of the VMA write
 lock, releasing or downgrading the mmap write lock also releases the VMA write
 lock so there is no :c:func:`!vma_end_write` function.

-Note that a semaphore write lock is not held across a VMA lock. Rather, a
-sequence number is used for serialisation, and the write semaphore is only
-acquired at the point of write lock to update this.
+Note that when write-locking a VMA lock, the :c:member:`!vma.vm_refcnt` is temporarily
+modified so that readers can detect the presense of a writer. The reference counter is
+restored once the vma sequence number used for serialisation is updated.

 This ensures the semantics we require - VMA write locks provide exclusive write
 access to the VMA.
@ -734,7 +743,7 @@ Implementation details

 The VMA lock mechanism is designed to be a lightweight means of avoiding the use
 of the heavily contended mmap lock. It is implemented using a combination of a
-read/write semaphore and sequence numbers belonging to the containing
+reference counter and sequence numbers belonging to the containing
 :c:struct:`!struct mm_struct` and the VMA.

 Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic
@ -775,28 +784,31 @@ release of any VMA locks on its release makes sense, as you would never want to
 keep VMAs locked across entirely separate write operations. It also maintains
 correct lock ordering.

-Each time a VMA read lock is acquired, we acquire a read lock on the
-:c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that
-the sequence count of the VMA does not match that of the mm.
+Each time a VMA read lock is acquired, we increment :c:member:`!vma.vm_refcnt`
+reference counter and check that the sequence count of the VMA does not match
+that of the mm.

-If it does, the read lock fails. If it does not, we hold the lock, excluding
-writers, but permitting other readers, who will also obtain this lock under RCU.
+If it does, the read lock fails and :c:member:`!vma.vm_refcnt` is dropped.
+If it does not, we keep the reference counter raised, excluding writers, but
+permitting other readers, who can also obtain this lock under RCU.

 Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu`
 are also RCU safe, so the whole read lock operation is guaranteed to function
 correctly.

-On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock`
-read/write semaphore, before setting the VMA's sequence number under this lock,
-also simultaneously holding the mmap write lock.
+On the write side, we set a bit in :c:member:`!vma.vm_refcnt` which can't be
+modified by readers and wait for all readers to drop their reference count.
+Once there are no readers, VMA's sequence number is set to match that of the
+mm. During this entire operation mmap write lock is held.

 This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep
 until these are finished and mutual exclusion is achieved.

-After setting the VMA's sequence number, the lock is released, avoiding
-complexity with a long-term held write lock.
+After setting the VMA's sequence number, the bit in :c:member:`!vma.vm_refcnt`
+indicating a writer is cleared. From this point on, VMA's sequence number will
+indicate VMA's write-locked state until mmap write lock is dropped or downgraded.

-This clever combination of a read/write semaphore and sequence count allows for
+This clever combination of a reference counter and sequence count allows for
 fast RCU-based per-VMA lock acquisition (especially on page fault, though
 utilised elsewhere) with minimal complexity around lock ordering.

--- a/Documentation/mm/split_page_table_lock.rst
+++ b/Documentation/mm/split_page_table_lock.rst
@ -62,7 +62,7 @@ Support of split page table lock by an architecture
 ===================================================

 There's no need in special enabling of PTE split page table lock: everything
-required is done by pagetable_pte_ctor() and pagetable_pte_dtor(), which
+required is done by pagetable_pte_ctor() and pagetable_dtor(), which
 must be called on PTE table allocation / freeing.

 Make sure the architecture doesn't use slab allocator for page table
@ -73,7 +73,7 @@ PMD split lock only makes sense if you have more than two page table
 levels.

 PMD split lock enabling requires pagetable_pmd_ctor() call on PMD table
-allocation and pagetable_pmd_dtor() on freeing.
+allocation and pagetable_dtor() on freeing.

 Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
 pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
@ -26,12 +26,7 @@ DAMON 为不同的用户提供了下面这些接口。
  使用它，用户可以通过读取和写入特殊的sysfs文件来使用DAMON的主要功能。因此，你可以编写和使
  用你个性化的DAMON sysfs包装程序，代替你读/写sysfs文件。  `DAMON用户空间工具
  <https://github.com/damonitor/damo>`_ 就是这种程序的一个例子  它同时支持虚拟和物理地址
-  空间的监测。注意，这个界面只提供简单的监测结果 :ref:`统计 <damos_stats>`。对于详细的监测
-  结果，DAMON提供了一个:ref:`跟踪点 <tracepoint>`。
- *debugfs interface.*
-  :ref:`这 <debugfs_interface>` 几乎与:ref:`sysfs interface <sysfs_interface>` 接
-  口相同。这将在下一个LTS内核发布后被移除，所以用户应该转移到
-  :ref:`sysfs interface <sysfs_interface>`。
+  空间的监测。
 - *内核空间编程接口。*
  :doc:`这 </mm/damon/api>` 这是为内核空间程序员准备的。使用它，用户可以通过为你编写内
  核空间的DAMON应用程序，最灵活有效地利用DAMON的每一个功能。你甚至可以为各种地址空间扩展DAMON。
@ -335,247 +330,6 @@ tried_regions/<N>/
 请注意，我们强烈建议使用用户空间的工具，如 `damo <https://github.com/damonitor/damo>`_ ，
 而不是像上面那样手动读写文件。以上只是一个例子。

-debugfs接口
-===========
-
-.. note::
-
-  DAMON debugfs接口将在下一个LTS内核发布后被移除，所以用户应该转移到
-  :ref:`sysfs接口<sysfs_interface>`。
-
-DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``,
-``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和
-``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
-
-
-属性
----
-
-用户可以通过读取和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 、 ``聚集间隔`` 、 ``更新间隔``
-以及监测目标区域的最小/最大数量。要详细了解监测属性，请参考 `:doc:/mm/damon/design` 。例如，
-下面的命令将这些值设置为5ms、100ms、1000ms、10和1000，然后再次检查::
-
-    # cd <debugfs>/damon
-    # echo 5000 100000 1000000 10 1000 > attrs
-    # cat attrs
-    5000 100000 1000000 10 1000
-
-
-目标ID
------
-
-一些类型的地址空间支持多个监测目标。例如，虚拟内存地址空间的监测可以有多个进程作为监测目标。用户
-可以通过写入目标的相关id值来设置目标，并通过读取 ``target_ids`` 文件来获得当前目标的id。在监
-测虚拟地址空间的情况下，这些值应该是监测目标进程的pid。例如，下面的命令将pid为42和4242的进程设
-为监测目标，并再次检查::
-
-    # cd <debugfs>/damon
-    # echo 42 4242 > target_ids
-    # cat target_ids
-    42 4242
-
-用户还可以通过在文件中写入一个特殊的关键字 "paddr\n" 来监测系统的物理内存地址空间。因为物理地
-址空间监测不支持多个目标，读取文件会显示一个假值，即 ``42`` ，如下图所示::
-
-    # cd <debugfs>/damon
-    # echo paddr > target_ids
-    # cat target_ids
-    42
-
-请注意，设置目标ID并不启动监测。
-
-
-初始监测目标区域
----------------
-
-在虚拟地址空间监测的情况下，DAMON自动设置和更新监测的目标区域，这样就可以覆盖目标进程的整个
-内存映射。然而，用户可能希望将监测区域限制在特定的地址范围内，如堆、栈或特定的文件映射区域。
-或者，一些用户可以知道他们工作负载的初始访问模式，因此希望为“自适应区域调整”设置最佳初始区域。
-
-相比之下，DAMON在物理内存监测的情况下不会自动设置和更新监测目标区域。因此，用户应该自己设置
-监测目标区域。
-
-在这种情况下，用户可以通过在 ``init_regions`` 文件中写入适当的值，明确地设置他们想要的初
-始监测目标区域。输入应该是一个由三个整数组成的队列，用空格隔开，代表一个区域的形式如下::
-
-    <target idx> <start address> <end address>
-
-目标idx应该是 ``target_ids`` 文件中目标的索引，从 ``0`` 开始，区域应该按照地址顺序传递。
-例如，下面的命令将设置几个地址范围， ``1-100`` 和 ``100-200`` 作为pid 42的初始监测目标
-区域，这是 ``target_ids`` 中的第一个（索引 ``0`` ），另外几个地址范围， ``20-40`` 和
-``50-100`` 作为pid 4242的地址，这是 ``target_ids`` 中的第二个（索引 ``1`` ）::
-
-    # cd <debugfs>/damon
-    # cat target_ids
-    42 4242
-    # echo "0   1       100 \
-            0   100     200 \
-            1   20      40  \
-            1   50      100" > init_regions
-
-请注意，这只是设置了初始的监测目标区域。在虚拟内存监测的情况下，DAMON会在一个 ``更新间隔``
-后自动更新区域的边界。因此，在这种情况下，如果用户不希望更新的话，应该把 ``更新间隔`` 设
-置得足够大。
-
-
-方案
----
-
-对于通常的基于DAMON的数据访问感知的内存管理优化，用户只是希望系统对特定访问模式的内存区域应用内
-存管理操作。DAMON从用户那里接收这种形式化的操作方案，并将这些方案应用到目标进程中。
-
-用户可以通过读取和写入 ``scheme`` debugfs文件来获得和设置这些方案。读取该文件还可以显示每个
-方案的统计数据。在文件中，每一个方案都应该在每一行中以下列形式表示出来::
-
-    <target access pattern> <action> <quota> <watermarks>
-
-你可以通过简单地在文件中写入一个空字符串来禁用方案。
-
-目标访问模式
-~~~~~~~~~~~~
-
-``<目标访问模式>`` 是由三个范围构成的，形式如下::
-
-    min-size max-size min-acc max-acc min-age max-age
-
-具体来说，区域大小的字节数（ `min-size` 和 `max-size` ），访问频率的每聚合区间的监测访问次
-数（ `min-acc` 和 `max-acc` ），区域年龄的聚合区间数（ `min-age` 和 `max-age` ）都被指定。
-请注意，这些范围是封闭区间。
-
-动作
-~~~~
-
-``<action>`` 是一个预定义的内存管理动作的整数，DAMON将应用于具有目标访问模式的区域。支持
-的数字和它们的含义如下::
-
- - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
- - 1: Call ``madvise()`` for the region with ``MADV_COLD``
- - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
- - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
- - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
- - 5: Do nothing but count the statistics
-
-配额
-~~~~
-
-每个 ``动作`` 的最佳 ``目标访问模式`` 取决于工作负载，所以不容易找到。更糟糕的是，将某个
-动作的方案设置得过于激进会导致严重的开销。为了避免这种开销，用户可以通过下面表格中的 ``<quota>``
-来限制方案的时间和大小配额::
-
-    <ms> <sz> <reset interval> <priority weights>
-
-这使得DAMON在 ``<reset interval>`` 毫秒内，尽量只用 ``<ms>`` 毫秒的时间对 ``目标访
-问模式`` 的内存区域应用动作，并在 ``<reset interval>`` 内只对最多<sz>字节的内存区域应
-用动作。将 ``<ms>`` 和 ``<sz>`` 都设置为零，可以禁用配额限制。
-
-当预计超过配额限制时，DAMON会根据 ``目标访问模式`` 的大小、访问频率和年龄，对发现的内存
-区域进行优先排序。为了实现个性化的优先级，用户可以在 ``<优先级权重>`` 中设置这三个属性的
-权重，具体形式如下::
-
-    <size weight> <access frequency weight> <age weight>
-
-水位
-~~~~
-
-有些方案需要根据系统特定指标的当前值来运行，如自由内存比率。对于这种情况，用户可以为该条
-件指定水位。::
-
-    <metric> <check interval> <high mark> <middle mark> <low mark>
-
-``<metric>`` 是一个预定义的整数，用于要检查的度量。支持的数字和它们的含义如下。
-
- - 0: 忽视水位
- - 1: 系统空闲内存率 (千分比)
-
-每隔 ``<检查间隔>`` 微秒检查一次公制的值。
-
-如果该值高于 ``<高标>`` 或低于 ``<低标>`` ，该方案被停用。如果该值低于 ``<中标>`` ，
-该方案将被激活。
-
-统计数据
-~~~~~~~~
-
-它还统计每个方案被尝试应用的区域的总数量和字节数，每个方案被成功应用的区域的两个数量，以
-及超过配额限制的总数量。这些统计数据可用于在线分析或调整方案。
-
-统计数据可以通过读取方案文件来显示。读取该文件将显示你在每一行中输入的每个 ``方案`` ，
-统计的五个数字将被加在每一行的末尾。
-
-例子
-~~~~
-
-下面的命令应用了一个方案：”如果一个大小为[4KiB, 8KiB]的内存区域在[10, 20]的聚合时间
-间隔内显示出每一个聚合时间间隔[0, 5]的访问量，请分页出该区域。对于分页，每秒最多只能使
-用10ms，而且每秒分页不能超过1GiB。在这一限制下，首先分页出具有较长年龄的内存区域。另外，
-每5秒钟检查一次系统的可用内存率，当可用内存率低于50%时开始监测和分页，但如果可用内存率
-大于60%，或低于30%，则停止监测“::
-
-    # cd <debugfs>/damon
-    # scheme="4096 8192  0 5    10 20    2"  # target access pattern and action
-    # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
-    # scheme+=" 0 0 100"                     # prioritization weights
-    # scheme+=" 1 5000000 600 500 300"       # watermarks
-    # echo "$scheme" > schemes
-
-
-开关
----
-
-除非你明确地启动监测，否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on_DEPRECATED``
-文件来启动、停止和检查监测的当前状态。写入 ``on`` 该文件可以启动对有属性的目标的监测。写入
-``off`` 该文件则停止这些目标。如果每个目标进程被终止，DAMON也会停止。下面的示例命令开启、关
-闭和检查DAMON的状态::
-
-    # cd <debugfs>/damon
-    # echo on > monitor_on_DEPRECATED
-    # echo off > monitor_on_DEPRECATED
-    # cat monitor_on_DEPRECATED
-    off
-
-请注意，当监测开启时，你不能写到上述的debugfs文件。如果你在DAMON运行时写到这些文件，将会返
-回一个错误代码，如 ``-EBUSY`` 。
-
-
-监测线程PID
-----------
-
-DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以通过读取 ``kdamond_pid`` 文件获
-得该线程的 ``pid`` 。当监测被 ``关闭`` 时，读取该文件不会返回任何信息::
-
-    # cd <debugfs>/damon
-    # cat monitor_on_DEPRECATED
-    off
-    # cat kdamond_pid
-    none
-    # echo on > monitor_on_DEPRECATED
-    # cat kdamond_pid
-    18594
-
-
-使用多个监测线程
----------------
-
-每个监测上下文都会创建一个 ``kdamond`` 线程。你可以使用 ``mk_contexts`` 和 ``rm_contexts``
-文件为多个 ``kdamond`` 需要的用例创建和删除监测上下文。
-
-将新上下文的名称写入 ``mk_contexts`` 文件，在 ``DAMON debugfs`` 目录上创建一个该名称的目录。
-该目录将有该上下文的 ``DAMON debugfs`` 文件::
-
-    # cd <debugfs>/damon
-    # ls foo
-    # ls: cannot access 'foo': No such file or directory
-    # echo foo > mk_contexts
-    # ls foo
-    # attrs  init_regions  kdamond_pid  schemes  target_ids
-
-如果不再需要上下文，你可以通过把上下文的名字放到 ``rm_contexts`` 文件中来删除它和相应的目录::
-
-    # echo foo > rm_contexts
-    # ls foo
-    # ls: cannot access 'foo': No such file or directory
-
-注意， ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目录下。
-

 监测结果的监测点
 ================
--- a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst
+++ b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst
@ -26,12 +26,7 @@ DAMON 爲不同的用戶提供了下面這些接口。
  使用它，用戶可以通過讀取和寫入特殊的sysfs文件來使用DAMON的主要功能。因此，你可以編寫和使
  用你個性化的DAMON sysfs包裝程序，代替你讀/寫sysfs文件。  `DAMON用戶空間工具
  <https://github.com/damonitor/damo>`_ 就是這種程序的一個例子  它同時支持虛擬和物理地址
-  空間的監測。注意，這個界面只提供簡單的監測結果 :ref:`統計 <damos_stats>`。對於詳細的監測
-  結果，DAMON提供了一個:ref:`跟蹤點 <tracepoint>`。
- *debugfs interface.*
-  :ref:`這 <debugfs_interface>` 幾乎與:ref:`sysfs interface <sysfs_interface>` 接
-  口相同。這將在下一個LTS內核發佈後被移除，所以用戶應該轉移到
-  :ref:`sysfs interface <sysfs_interface>`。
+  空間的監測。
 - *內核空間編程接口。*
  :doc:`這 </mm/damon/api>` 這是爲內核空間程序員準備的。使用它，用戶可以通過爲你編寫內
  核空間的DAMON應用程序，最靈活有效地利用DAMON的每一個功能。你甚至可以爲各種地址空間擴展DAMON。
@ -335,247 +330,6 @@ tried_regions/<N>/
 請注意，我們強烈建議使用用戶空間的工具，如 `damo <https://github.com/damonitor/damo>`_ ，
 而不是像上面那樣手動讀寫文件。以上只是一個例子。

-debugfs接口
-===========
-
-.. note::
-
-  DAMON debugfs接口將在下一個LTS內核發佈後被移除，所以用戶應該轉移到
-  :ref:`sysfs接口<sysfs_interface>`。
-
-DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``,
-``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和
-``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
-
-
-屬性
----
-
-用戶可以通過讀取和寫入 ``attrs`` 文件獲得和設置 ``採樣間隔`` 、 ``聚集間隔`` 、 ``更新間隔``
-以及監測目標區域的最小/最大數量。要詳細瞭解監測屬性，請參考 `:doc:/mm/damon/design` 。例如，
-下面的命令將這些值設置爲5ms、100ms、1000ms、10和1000，然後再次檢查::
-
-    # cd <debugfs>/damon
-    # echo 5000 100000 1000000 10 1000 > attrs
-    # cat attrs
-    5000 100000 1000000 10 1000
-
-
-目標ID
------
-
-一些類型的地址空間支持多個監測目標。例如，虛擬內存地址空間的監測可以有多個進程作爲監測目標。用戶
-可以通過寫入目標的相關id值來設置目標，並通過讀取 ``target_ids`` 文件來獲得當前目標的id。在監
-測虛擬地址空間的情況下，這些值應該是監測目標進程的pid。例如，下面的命令將pid爲42和4242的進程設
-爲監測目標，並再次檢查::
-
-    # cd <debugfs>/damon
-    # echo 42 4242 > target_ids
-    # cat target_ids
-    42 4242
-
-用戶還可以通過在文件中寫入一個特殊的關鍵字 "paddr\n" 來監測系統的物理內存地址空間。因爲物理地
-址空間監測不支持多個目標，讀取文件會顯示一個假值，即 ``42`` ，如下圖所示::
-
-    # cd <debugfs>/damon
-    # echo paddr > target_ids
-    # cat target_ids
-    42
-
-請注意，設置目標ID並不啓動監測。
-
-
-初始監測目標區域
----------------
-
-在虛擬地址空間監測的情況下，DAMON自動設置和更新監測的目標區域，這樣就可以覆蓋目標進程的整個
-內存映射。然而，用戶可能希望將監測區域限制在特定的地址範圍內，如堆、棧或特定的文件映射區域。
-或者，一些用戶可以知道他們工作負載的初始訪問模式，因此希望爲“自適應區域調整”設置最佳初始區域。
-
-相比之下，DAMON在物理內存監測的情況下不會自動設置和更新監測目標區域。因此，用戶應該自己設置
-監測目標區域。
-
-在這種情況下，用戶可以通過在 ``init_regions`` 文件中寫入適當的值，明確地設置他們想要的初
-始監測目標區域。輸入應該是一個由三個整數組成的隊列，用空格隔開，代表一個區域的形式如下::
-
-    <target idx> <start address> <end address>
-
-目標idx應該是 ``target_ids`` 文件中目標的索引，從 ``0`` 開始，區域應該按照地址順序傳遞。
-例如，下面的命令將設置幾個地址範圍， ``1-100`` 和 ``100-200`` 作爲pid 42的初始監測目標
-區域，這是 ``target_ids`` 中的第一個（索引 ``0`` ），另外幾個地址範圍， ``20-40`` 和
-``50-100`` 作爲pid 4242的地址，這是 ``target_ids`` 中的第二個（索引 ``1`` ）::
-
-    # cd <debugfs>/damon
-    # cat target_ids
-    42 4242
-    # echo "0   1       100 \
-            0   100     200 \
-            1   20      40  \
-            1   50      100" > init_regions
-
-請注意，這只是設置了初始的監測目標區域。在虛擬內存監測的情況下，DAMON會在一個 ``更新間隔``
-後自動更新區域的邊界。因此，在這種情況下，如果用戶不希望更新的話，應該把 ``更新間隔`` 設
-置得足夠大。
-
-
-方案
----
-
-對於通常的基於DAMON的數據訪問感知的內存管理優化，用戶只是希望系統對特定訪問模式的內存區域應用內
-存管理操作。DAMON從用戶那裏接收這種形式化的操作方案，並將這些方案應用到目標進程中。
-
-用戶可以通過讀取和寫入 ``scheme`` debugfs文件來獲得和設置這些方案。讀取該文件還可以顯示每個
-方案的統計數據。在文件中，每一個方案都應該在每一行中以下列形式表示出來::
-
-    <target access pattern> <action> <quota> <watermarks>
-
-你可以通過簡單地在文件中寫入一個空字符串來禁用方案。
-
-目標訪問模式
-~~~~~~~~~~~~
-
-``<目標訪問模式>`` 是由三個範圍構成的，形式如下::
-
-    min-size max-size min-acc max-acc min-age max-age
-
-具體來說，區域大小的字節數（ `min-size` 和 `max-size` ），訪問頻率的每聚合區間的監測訪問次
-數（ `min-acc` 和 `max-acc` ），區域年齡的聚合區間數（ `min-age` 和 `max-age` ）都被指定。
-請注意，這些範圍是封閉區間。
-
-動作
-~~~~
-
-``<action>`` 是一個預定義的內存管理動作的整數，DAMON將應用於具有目標訪問模式的區域。支持
-的數字和它們的含義如下::
-
- - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
- - 1: Call ``madvise()`` for the region with ``MADV_COLD``
- - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
- - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
- - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
- - 5: Do nothing but count the statistics
-
-配額
-~~~~
-
-每個 ``動作`` 的最佳 ``目標訪問模式`` 取決於工作負載，所以不容易找到。更糟糕的是，將某個
-動作的方案設置得過於激進會導致嚴重的開銷。爲了避免這種開銷，用戶可以通過下面表格中的 ``<quota>``
-來限制方案的時間和大小配額::
-
-    <ms> <sz> <reset interval> <priority weights>
-
-這使得DAMON在 ``<reset interval>`` 毫秒內，儘量只用 ``<ms>`` 毫秒的時間對 ``目標訪
-問模式`` 的內存區域應用動作，並在 ``<reset interval>`` 內只對最多<sz>字節的內存區域應
-用動作。將 ``<ms>`` 和 ``<sz>`` 都設置爲零，可以禁用配額限制。
-
-當預計超過配額限制時，DAMON會根據 ``目標訪問模式`` 的大小、訪問頻率和年齡，對發現的內存
-區域進行優先排序。爲了實現個性化的優先級，用戶可以在 ``<優先級權重>`` 中設置這三個屬性的
-權重，具體形式如下::
-
-    <size weight> <access frequency weight> <age weight>
-
-水位
-~~~~
-
-有些方案需要根據系統特定指標的當前值來運行，如自由內存比率。對於這種情況，用戶可以爲該條
-件指定水位。::
-
-    <metric> <check interval> <high mark> <middle mark> <low mark>
-
-``<metric>`` 是一個預定義的整數，用於要檢查的度量。支持的數字和它們的含義如下。
-
- - 0: 忽視水位
- - 1: 系統空閒內存率 (千分比)
-
-每隔 ``<檢查間隔>`` 微秒檢查一次公制的值。
-
-如果該值高於 ``<高標>`` 或低於 ``<低標>`` ，該方案被停用。如果該值低於 ``<中標>`` ，
-該方案將被激活。
-
-統計數據
-~~~~~~~~
-
-它還統計每個方案被嘗試應用的區域的總數量和字節數，每個方案被成功應用的區域的兩個數量，以
-及超過配額限制的總數量。這些統計數據可用於在線分析或調整方案。
-
-統計數據可以通過讀取方案文件來顯示。讀取該文件將顯示你在每一行中輸入的每個 ``方案`` ，
-統計的五個數字將被加在每一行的末尾。
-
-例子
-~~~~
-
-下面的命令應用了一個方案：”如果一個大小爲[4KiB, 8KiB]的內存區域在[10, 20]的聚合時間
-間隔內顯示出每一個聚合時間間隔[0, 5]的訪問量，請分頁出該區域。對於分頁，每秒最多隻能使
-用10ms，而且每秒分頁不能超過1GiB。在這一限制下，首先分頁出具有較長年齡的內存區域。另外，
-每5秒鐘檢查一次系統的可用內存率，當可用內存率低於50%時開始監測和分頁，但如果可用內存率
-大於60%，或低於30%，則停止監測“::
-
-    # cd <debugfs>/damon
-    # scheme="4096 8192  0 5    10 20    2"  # target access pattern and action
-    # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
-    # scheme+=" 0 0 100"                     # prioritization weights
-    # scheme+=" 1 5000000 600 500 300"       # watermarks
-    # echo "$scheme" > schemes
-
-
-開關
----
-
-除非你明確地啓動監測，否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on_DEPRECATED``
-文件來啓動、停止和檢查監測的當前狀態。寫入 ``on`` 該文件可以啓動對有屬性的目標的監測。寫入
-``off`` 該文件則停止這些目標。如果每個目標進程被終止，DAMON也會停止。下面的示例命令開啓、關
-閉和檢查DAMON的狀態::
-
-    # cd <debugfs>/damon
-    # echo on > monitor_on_DEPRECATED
-    # echo off > monitor_on_DEPRECATED
-    # cat monitor_on_DEPRECATED
-    off
-
-請注意，當監測開啓時，你不能寫到上述的debugfs文件。如果你在DAMON運行時寫到這些文件，將會返
-回一個錯誤代碼，如 ``-EBUSY`` 。
-
-
-監測線程PID
-----------
-
-DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以通過讀取 ``kdamond_pid`` 文件獲
-得該線程的 ``pid`` 。當監測被 ``關閉`` 時，讀取該文件不會返回任何信息::
-
-    # cd <debugfs>/damon
-    # cat monitor_on_DEPRECATED
-    off
-    # cat kdamond_pid
-    none
-    # echo on > monitor_on_DEPRECATED
-    # cat kdamond_pid
-    18594
-
-
-使用多個監測線程
----------------
-
-每個監測上下文都會創建一個 ``kdamond`` 線程。你可以使用 ``mk_contexts`` 和 ``rm_contexts``
-文件爲多個 ``kdamond`` 需要的用例創建和刪除監測上下文。
-
-將新上下文的名稱寫入 ``mk_contexts`` 文件，在 ``DAMON debugfs`` 目錄上創建一個該名稱的目錄。
-該目錄將有該上下文的 ``DAMON debugfs`` 文件::
-
-    # cd <debugfs>/damon
-    # ls foo
-    # ls: cannot access 'foo': No such file or directory
-    # echo foo > mk_contexts
-    # ls foo
-    # attrs  init_regions  kdamond_pid  schemes  target_ids
-
-如果不再需要上下文，你可以通過把上下文的名字放到 ``rm_contexts`` 文件中來刪除它和相應的目錄::
-
-    # echo foo > rm_contexts
-    # ls foo
-    # ls: cannot access 'foo': No such file or directory
-
-注意， ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目錄下。
-

 監測結果的監測點
 ================
--- a/34
+++ b/34
@ -2827,7 +2827,7 @@ ARM/NXP S32G ARCHITECTURE
 R:	Chester Lin <chester62515@gmail.com>
 R:	Matthias Brugger <mbrugger@suse.com>
 R:	Ghennadi Procopciuc <ghennadi.procopciuc@oss.nxp.com>
-L:	NXP S32 Linux Team <s32@nxp.com>
+R:	NXP S32 Linux Team <s32@nxp.com>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
 F:	arch/arm64/boot/dts/freescale/s32g*.dts*
@ -6327,6 +6327,7 @@ F:	Documentation/mm/damon/
 F:	include/linux/damon.h
 F:	include/trace/events/damon.h
 F:	mm/damon/
+F:	samples/damon/
 F:	tools/testing/selftests/damon/

 DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER
@ -15068,7 +15069,15 @@ L:	linux-mm@kvack.org
 S:	Maintained
 W:	http://www.linux-mm.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+F:	mm/mlock.c
 F:	mm/mmap.c
+F:	mm/mprotect.c
+F:	mm/mremap.c
+F:	mm/mseal.c
+F:	mm/vma.c
+F:	mm/vma.h
+F:	mm/vma_internal.h
+F:	tools/testing/vma/

 MEMORY TECHNOLOGY DEVICES (MTD)
 M:	Miquel Raynal <miquel.raynal@bootlin.com>
@ -16600,8 +16609,8 @@ F:	arch/nios2/

 NITRO ENCLAVES (NE)
 M:	Alexandru Ciobotaru <alcioa@amazon.com>
+R:	The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
 L:	linux-kernel@vger.kernel.org
-L:	The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
 S:	Supported
 W:	https://aws.amazon.com/ec2/nitro/nitro-enclaves/
 F:	Documentation/virt/ne_overview.rst
@ -16612,8 +16621,8 @@ F:	samples/nitro_enclaves/

 NITRO SECURE MODULE (NSM)
 M:	Alexander Graf <graf@amazon.com>
+R:	The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
 L:	linux-kernel@vger.kernel.org
-L:	The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
 S:	Supported
 W:	https://aws.amazon.com/ec2/nitro/nitro-enclaves/
 F:	drivers/misc/nsm.c
@ -18425,8 +18434,8 @@ M:	Fabio Estevam <festevam@gmail.com>
 M:	Shawn Guo <shawnguo@kernel.org>
 M:	Jacky Bai <ping.bai@nxp.com>
 R:	Pengutronix Kernel Team <kernel@pengutronix.de>
+R:	NXP S32 Linux Team <s32@nxp.com>
 L:	linux-gpio@vger.kernel.org
-L:	NXP S32 Linux Team <s32@nxp.com>
 S:	Maintained
 F:	Documentation/devicetree/bindings/pinctrl/fsl,*
 F:	Documentation/devicetree/bindings/pinctrl/nxp,s32*
@ -19561,7 +19570,7 @@ F:	drivers/ras/amd/fmpm.c

 RASPBERRY PI PISP BACK END
 M:	Jacopo Mondi <jacopo.mondi@ideasonboard.com>
-L:	Raspberry Pi Kernel Maintenance <kernel-list@raspberrypi.com>
+R:	Raspberry Pi Kernel Maintenance <kernel-list@raspberrypi.com>
 L:	linux-media@vger.kernel.org
 S:	Maintained
 F:	Documentation/devicetree/bindings/media/raspberrypi,pispbe.yaml
@ -25018,21 +25027,6 @@ F:	include/uapi/linux/vsockmon.h
 F:	net/vmw_vsock/
 F:	tools/testing/vsock/

-VMA
-M:	Andrew Morton <akpm@linux-foundation.org>
-M:	Liam R. Howlett <Liam.Howlett@oracle.com>
-M:	Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
-R:	Vlastimil Babka <vbabka@suse.cz>
-R:	Jann Horn <jannh@google.com>
-L:	linux-mm@kvack.org
-S:	Maintained
-W:	https://www.linux-mm.org
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
-F:	mm/vma.c
-F:	mm/vma.h
-F:	mm/vma_internal.h
-F:	tools/testing/vma/
-
 VMALLOC
 M:	Andrew Morton <akpm@linux-foundation.org>
 R:	Uladzislau Rezki <urezki@gmail.com>
--- a/arch/alpha/kernel/core_cia.c
+++ b/arch/alpha/kernel/core_cia.c
@ -331,10 +331,7 @@ cia_prepare_tbia_workaround(int window)
 	long i;

 	/* Use minimal 1K map. */
-	ppte = memblock_alloc(CIA_BROKEN_TBIA_SIZE, 32768);
-	if (!ppte)
-		panic("%s: Failed to allocate %u bytes align=0x%x\n",
-		      __func__, CIA_BROKEN_TBIA_SIZE, 32768);
+	ppte = memblock_alloc_or_panic(CIA_BROKEN_TBIA_SIZE, 32768);
 	pte = (virt_to_phys(ppte) >> (PAGE_SHIFT - 1)) | 1;

 	for (i = 0; i < CIA_BROKEN_TBIA_SIZE / sizeof(unsigned long); ++i)
--- a/arch/alpha/kernel/core_marvel.c
+++ b/arch/alpha/kernel/core_marvel.c
@ -81,10 +81,7 @@ mk_resource_name(int pe, int port, char *str)
 	char *name;
 	
 	sprintf(tmp, "PCI %s PE %d PORT %d", str, pe, port);
-	name = memblock_alloc(strlen(tmp) + 1, SMP_CACHE_BYTES);
-	if (!name)
-		panic("%s: Failed to allocate %zu bytes\n", __func__,
-		      strlen(tmp) + 1);
+	name = memblock_alloc_or_panic(strlen(tmp) + 1, SMP_CACHE_BYTES);
 	strcpy(name, tmp);

 	return name;
@ -119,10 +116,7 @@ alloc_io7(unsigned int pe)
 		return NULL;
 	}

-	io7 = memblock_alloc(sizeof(*io7), SMP_CACHE_BYTES);
-	if (!io7)
-		panic("%s: Failed to allocate %zu bytes\n", __func__,
-		      sizeof(*io7));
+	io7 = memblock_alloc_or_panic(sizeof(*io7), SMP_CACHE_BYTES);
 	io7->pe = pe;
 	raw_spin_lock_init(&io7->irq_lock);

--- a/arch/alpha/kernel/pci.c
+++ b/arch/alpha/kernel/pci.c
@ -391,10 +391,7 @@ alloc_pci_controller(void)
 {
 	struct pci_controller *hose;

-	hose = memblock_alloc(sizeof(*hose), SMP_CACHE_BYTES);
-	if (!hose)
-		panic("%s: Failed to allocate %zu bytes\n", __func__,
-		      sizeof(*hose));
+	hose = memblock_alloc_or_panic(sizeof(*hose), SMP_CACHE_BYTES);

 	*hose_tail = hose;
 	hose_tail = &hose->next;
@ -405,13 +402,7 @@ alloc_pci_controller(void)
 struct resource * __init
 alloc_resource(void)
 {
-	void *ptr = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
-
-	if (!ptr)
-		panic("%s: Failed to allocate %zu bytes\n", __func__,
-		      sizeof(struct resource));
-
-	return ptr;
+	return memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES);
 }


--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@ -71,14 +71,8 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base,
 	if (align < mem_size)
 		align = mem_size;

-	arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES);
-	if (!arena)
-		panic("%s: Failed to allocate %zu bytes\n", __func__,
-		      sizeof(*arena));
-	arena->ptes = memblock_alloc(mem_size, align);
-	if (!arena->ptes)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, mem_size, align);
+	arena = memblock_alloc_or_panic(sizeof(*arena), SMP_CACHE_BYTES);
+	arena->ptes = memblock_alloc_or_panic(mem_size, align);

 	spin_lock_init(&arena->lock);
 	arena->hose = hose;
--- a/arch/alpha/lib/fpreg.c
+++ b/arch/alpha/lib/fpreg.c
@ -10,7 +10,6 @@
 #include <linux/preempt.h>
 #include <asm/fpu.h>
 #include <asm/thread_info.h>
-#include <asm/fpu.h>

 #if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67)
 #define STT(reg,val)  asm volatile ("ftoit $f"#reg",%0" : "=r"(val));
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@ -42,7 +42,7 @@ pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *ret, *init;

-	ret = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	ret = __pgd_alloc(mm, 0);
 	init = pgd_offset(&init_mm, 0UL);
 	if (ret) {
 #ifdef CONFIG_ALPHA_LARGE_VMALLOC
--- a/arch/arc/include/asm/pgalloc.h
+++ b/arch/arc/include/asm/pgalloc.h
@ -53,19 +53,14 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_

 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	pgd_t *ret = (pgd_t *) __get_free_page(GFP_KERNEL);
+	pgd_t *ret = __pgd_alloc(mm, 0);

 	if (ret) {
 		int num, num2;
-		num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE;
-		memzero(ret, num * sizeof(pgd_t));

+		num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE;
 		num2 = VMALLOC_SIZE / PGDIR_SIZE;
 		memcpy(ret + num, swapper_pg_dir + num, num2 * sizeof(pgd_t));
-
-		memzero(ret + num + num2,
-			       (PTRS_PER_PGD - num - num2) * sizeof(pgd_t));
-
 	}
 	return ret;
 }
--- a/arch/arc/kernel/unaligned.c
+++ b/arch/arc/kernel/unaligned.c
@ -200,7 +200,6 @@ int misaligned_fixup(unsigned long address, struct pt_regs *regs,
 		     struct callee_regs *cregs)
 {
 	struct disasm_state state;
-	char buf[TASK_COMM_LEN];

 	/* handle user mode only and only if enabled by sysadmin */
 	if (!user_mode(regs) || !unaligned_enabled)
@ -212,11 +211,11 @@ int misaligned_fixup(unsigned long address, struct pt_regs *regs,
 			     " performance significantly\n. To enable further"
 			     " logging of such instances, please \n"
 			     " echo 0 > /proc/sys/kernel/ignore-unaligned-usertrap\n",
-			     get_task_comm(buf, current), task_pid_nr(current));
+			     current->comm, task_pid_nr(current));
 	} else {
 		/* Add rate limiting if it gets down to it */
 		pr_warn("%s(%d): unaligned access to/from 0x%lx by PC: 0x%lx\n",
-			get_task_comm(buf, current), task_pid_nr(current),
+			current->comm, task_pid_nr(current),
 			address, regs->ret);

 	}
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@ -26,14 +26,7 @@

 #else /* !CONFIG_MMU */

-#include <linux/swap.h>
 #include <asm/tlbflush.h>
-
-static inline void __tlb_remove_table(void *_table)
-{
-	free_page_and_swap_cache((struct page *)_table);
-}
-
 #include <asm-generic/tlb.h>

 static inline void
@ -41,8 +34,6 @@ __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
 {
 	struct ptdesc *ptdesc = page_ptdesc(pte);

-	pagetable_pte_dtor(ptdesc);
-
 #ifndef CONFIG_ARM_LPAE
 	/*
 	 * With the classic ARM MMU, a pte page has two corresponding pmd
@ -61,7 +52,6 @@ __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr)
 #ifdef CONFIG_ARM_LPAE
 	struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);

-	pagetable_pmd_dtor(ptdesc);
 	tlb_remove_ptdesc(tlb, ptdesc);
 #endif
 }
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@ -880,10 +880,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
 		 */
 		boot_alias_start = phys_to_idmap(start);
 		if (arm_has_idmap_alias() && boot_alias_start != IDMAP_INVALID_ADDR) {
-			res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES);
-			if (!res)
-				panic("%s: Failed to allocate %zu bytes\n",
-				      __func__, sizeof(*res));
+			res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES);
 			res->name = "System RAM (boot alias)";
 			res->start = boot_alias_start;
 			res->end = phys_to_idmap(res_end);
@ -891,10 +888,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
 			request_resource(&iomem_resource, res);
 		}

-		res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES);
-		if (!res)
-			panic("%s: Failed to allocate %zu bytes\n", __func__,
-			      sizeof(*res));
+		res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES);
 		res->name  = "System RAM";
 		res->start = start;
 		res->end = res_end;
--- a/arch/arm/mach-pxa/sharpsl_pm.c
+++ b/arch/arm/mach-pxa/sharpsl_pm.c
@ -31,10 +31,10 @@
 /*
 * Constants
 */
-#define SHARPSL_CHARGE_ON_TIME_INTERVAL        (msecs_to_jiffies(1*60*1000))  /* 1 min */
-#define SHARPSL_CHARGE_FINISH_TIME             (msecs_to_jiffies(10*60*1000)) /* 10 min */
-#define SHARPSL_BATCHK_TIME                    (msecs_to_jiffies(15*1000))    /* 15 sec */
-#define SHARPSL_BATCHK_TIME_SUSPEND            (60*10)                        /* 10 min */
+#define SHARPSL_CHARGE_ON_TIME_INTERVAL        (secs_to_jiffies(60))
+#define SHARPSL_CHARGE_FINISH_TIME             (secs_to_jiffies(10*60))
+#define SHARPSL_BATCHK_TIME                    (secs_to_jiffies(15))
+#define SHARPSL_BATCHK_TIME_SUSPEND            (60*10) /* 10 min */

 #define SHARPSL_WAIT_CO_TIME                   15  /* 15 sec */
 #define SHARPSL_WAIT_DISCHARGE_ON              100 /* 100 msec */
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@ -726,13 +726,8 @@ EXPORT_SYMBOL(phys_mem_access_prot);

 static void __init *early_alloc(unsigned long sz)
 {
-	void *ptr = memblock_alloc(sz, sz);
+	return memblock_alloc_or_panic(sz, sz);

-	if (!ptr)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, sz, sz);
-
-	return ptr;
 }

 static void *__init late_alloc(unsigned long sz)
@ -1027,10 +1022,7 @@ void __init iotable_init(struct map_desc *io_desc, int nr)
 	if (!nr)
 		return;

-	svm = memblock_alloc(sizeof(*svm) * nr, __alignof__(*svm));
-	if (!svm)
-		panic("%s: Failed to allocate %zu bytes align=0x%zx\n",
-		      __func__, sizeof(*svm) * nr, __alignof__(*svm));
+	svm = memblock_alloc_or_panic(sizeof(*svm) * nr, __alignof__(*svm));

 	for (md = io_desc; nr; md++, nr--) {
 		create_mapping(md);
@ -1052,10 +1044,7 @@ void __init vm_reserve_area_early(unsigned long addr, unsigned long size,
 	struct vm_struct *vm;
 	struct static_vm *svm;

-	svm = memblock_alloc(sizeof(*svm), __alignof__(*svm));
-	if (!svm)
-		panic("%s: Failed to allocate %zu bytes align=0x%zx\n",
-		      __func__, sizeof(*svm), __alignof__(*svm));
+	svm = memblock_alloc_or_panic(sizeof(*svm), __alignof__(*svm));

 	vm = &svm->vm;
 	vm->addr = (void *)addr;
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@ -162,10 +162,7 @@ void __init paging_init(const struct machine_desc *mdesc)
 	mpu_setup();

 	/* allocate the zero page. */
-	zero_page = (void *)memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-	if (!zero_page)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, PAGE_SIZE, PAGE_SIZE);
+	zero_page = (void *)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);

 	bootmem_init();

--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@ -17,11 +17,11 @@
 #include "mm.h"

 #ifdef CONFIG_ARM_LPAE
-#define __pgd_alloc()	kmalloc_array(PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL)
-#define __pgd_free(pgd)	kfree(pgd)
+#define _pgd_alloc(mm)		kmalloc_array(PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL | __GFP_ZERO)
+#define _pgd_free(mm, pgd)	kfree(pgd)
 #else
-#define __pgd_alloc()	(pgd_t *)__get_free_pages(GFP_KERNEL, 2)
-#define __pgd_free(pgd)	free_pages((unsigned long)pgd, 2)
+#define _pgd_alloc(mm)		__pgd_alloc(mm, 2)
+#define _pgd_free(mm, pgd)	__pgd_free(mm, pgd)
 #endif

 /*
@ -35,12 +35,10 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	pmd_t *new_pmd, *init_pmd;
 	pte_t *new_pte, *init_pte;

-	new_pgd = __pgd_alloc();
+	new_pgd = _pgd_alloc(mm);
 	if (!new_pgd)
 		goto no_pgd;

-	memset(new_pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
-
 	/*
 	 * Copy over the kernel and IO PGD entries
 	 */
@ -134,7 +132,7 @@ no_pmd:
 no_pud:
 	p4d_free(mm, new_p4d);
 no_p4d:
-	__pgd_free(new_pgd);
+	_pgd_free(mm, new_pgd);
 no_pgd:
 	return NULL;
 }
@ -207,5 +205,5 @@ no_pgd:
 		p4d_free(mm, p4d);
 	}
 #endif
-	__pgd_free(pgd_base);
+	_pgd_free(mm, pgd_base);
 }
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@ -85,24 +85,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp)
 	__pgd_populate(pgdp, __pa(p4dp), pgdval);
 }

-static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-	gfp_t gfp = GFP_PGTABLE_USER;
-
-	if (mm == &init_mm)
-		gfp = GFP_PGTABLE_KERNEL;
-	return (p4d_t *)get_zeroed_page(gfp);
-}
-
-static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
-{
-	if (!pgtable_l5_enabled())
-		return;
-	BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
-	free_page((unsigned long)p4d);
-}
-
-#define __p4d_free_tlb(tlb, p4d, addr)  p4d_free((tlb)->mm, p4d)
 #else
 static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot)
 {
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@ -9,12 +9,7 @@
 #define __ASM_TLB_H

 #include <linux/pagemap.h>
-#include <linux/swap.h>

-static inline void __tlb_remove_table(void *_table)
-{
-	free_page_and_swap_cache((struct page *)_table);
-}

 #define tlb_flush tlb_flush
 static void tlb_flush(struct mmu_gather *tlb);
@ -82,7 +77,6 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 {
 	struct ptdesc *ptdesc = page_ptdesc(pte);

-	pagetable_pte_dtor(ptdesc);
 	tlb_remove_ptdesc(tlb, ptdesc);
 }

@ -92,7 +86,6 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
 {
 	struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);

-	pagetable_pmd_dtor(ptdesc);
 	tlb_remove_ptdesc(tlb, ptdesc);
 }
 #endif
@ -106,7 +99,19 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
 	if (!pgtable_l4_enabled())
 		return;

-	pagetable_pud_dtor(ptdesc);
+	tlb_remove_ptdesc(tlb, ptdesc);
+}
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 4
+static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4dp,
+				  unsigned long addr)
+{
+	struct ptdesc *ptdesc = virt_to_ptdesc(p4dp);
+
+	if (!pgtable_l5_enabled())
+		return;
+
 	tlb_remove_ptdesc(tlb, ptdesc);
 }
 #endif
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@ -223,9 +223,7 @@ static void __init request_standard_resources(void)

 	num_standard_resources = memblock.memory.cnt;
 	res_size = num_standard_resources * sizeof(*standard_resources);
-	standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES);
-	if (!standard_resources)
-		panic("%s: Failed to allocate %zu bytes\n", __func__, res_size);
+	standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES);

 	for_each_mem_region(region) {
 		res = &standard_resources[i++];
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@ -33,7 +33,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	gfp_t gfp = GFP_PGTABLE_USER;

 	if (pgdir_is_page_size())
-		return (pgd_t *)__get_free_page(gfp);
+		return __pgd_alloc(mm, 0);
 	else
 		return kmem_cache_alloc(pgd_cache, gfp);
 }
@ -41,7 +41,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	if (pgdir_is_page_size())
-		free_page((unsigned long)pgd);
+		__pgd_free(mm, pgd);
 	else
 		kmem_cache_free(pgd_cache, pgd);
 }
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@ -44,7 +44,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 	pgd_t *ret;
 	pgd_t *init;

-	ret = (pgd_t *) __get_free_page(GFP_KERNEL);
+	ret = __pgd_alloc(mm, 0);
 	if (ret) {
 		init = pgd_offset(&init_mm, 0UL);
 		pgd_init((unsigned long *)ret);
@ -63,7 +63,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)

 #define __pte_free_tlb(tlb, pte, address)		\
 do {							\
-	pagetable_pte_dtor(page_ptdesc(pte));		\
+	pagetable_dtor(page_ptdesc(pte));		\
 	tlb_remove_page_ptdesc(tlb, page_ptdesc(pte));	\
 } while (0)

--- a/arch/hexagon/include/asm/pgalloc.h
+++ b/arch/hexagon/include/asm/pgalloc.h
@ -22,7 +22,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;

-	pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	pgd = __pgd_alloc(mm, 0);

 	/*
 	 * There may be better ways to do this, but to ensure
@ -89,7 +89,7 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,

 #define __pte_free_tlb(tlb, pte, addr)				\
 do {								\
-	pagetable_pte_dtor((page_ptdesc(pte)));			\
+	pagetable_dtor((page_ptdesc(pte)));			\
 	tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));	\
 } while (0)

--- a/arch/loongarch/configs/loongson3_defconfig
+++ b/arch/loongarch/configs/loongson3_defconfig
@ -113,7 +113,10 @@ CONFIG_ZBUD=y
 CONFIG_ZSMALLOC=m
 # CONFIG_COMPAT_BRK is not set
 CONFIG_MEMORY_HOTPLUG=y
-CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
+# CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE is not set
+CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO=y
+# CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL is not set
+# CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE is not set
 CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_KSM=y
 CONFIG_TRANSPARENT_HUGEPAGE=y
--- a/arch/loongarch/include/asm/pgalloc.h
+++ b/arch/loongarch/include/asm/pgalloc.h
@ -57,7 +57,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)

 #define __pte_free_tlb(tlb, pte, address)			\
 do {								\
-	pagetable_pte_dtor(page_ptdesc(pte));			\
+	pagetable_dtor(page_ptdesc(pte));			\
 	tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));	\
 } while (0)

--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@ -431,7 +431,7 @@ static void __init resource_init(void)

 	num_standard_resources = memblock.memory.cnt;
 	res_size = num_standard_resources * sizeof(*standard_resources);
-	standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES);
+	standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES);

 	for_each_mem_region(region) {
 		res = &standard_resources[i++];
--- a/arch/loongarch/mm/init.c
+++ b/arch/loongarch/mm/init.c
@ -174,9 +174,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr)
 	pmd_t *pmd;

 	if (p4d_none(p4dp_get(p4d))) {
-		pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-		if (!pud)
-			panic("%s: Failed to allocate memory\n", __func__);
+		pud = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
 		p4d_populate(&init_mm, p4d, pud);
 #ifndef __PAGETABLE_PUD_FOLDED
 		pud_init(pud);
@ -185,9 +183,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr)

 	pud = pud_offset(p4d, addr);
 	if (pud_none(pudp_get(pud))) {
-		pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-		if (!pmd)
-			panic("%s: Failed to allocate memory\n", __func__);
+		pmd = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
 		pud_populate(&init_mm, pud, pmd);
 #ifndef __PAGETABLE_PMD_FOLDED
 		pmd_init(pmd);
@ -198,10 +194,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr)
 	if (!pmd_present(pmdp_get(pmd))) {
 		pte_t *pte;

-		pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-		if (!pte)
-			panic("%s: Failed to allocate memory\n", __func__);
-
+		pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
 		pmd_populate_kernel(&init_mm, pmd, pte);
 		kernel_pte_init(pte);
 	}
--- a/arch/loongarch/mm/pgtable.c
+++ b/arch/loongarch/mm/pgtable.c
@ -23,11 +23,10 @@ EXPORT_SYMBOL(tlb_virt_to_page);

 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	pgd_t *init, *ret = NULL;
-	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
+	pgd_t *init, *ret;

-	if (ptdesc) {
-		ret = (pgd_t *)ptdesc_address(ptdesc);
+	ret = __pgd_alloc(mm, 0);
+	if (ret) {
 		init = pgd_offset(&init_mm, 0UL);
 		pgd_init(ret);
 		memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@ -629,7 +629,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@ -586,7 +586,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@ -606,7 +606,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@ -578,7 +578,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@ -588,7 +588,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@ -605,7 +605,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@ -692,7 +692,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@ -578,7 +578,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@ -579,7 +579,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@ -595,7 +595,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@ -575,7 +575,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@ -576,7 +576,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/m68k/include/asm/mcf_pgalloc.h
+++ b/arch/m68k/include/asm/mcf_pgalloc.h
@ -37,7 +37,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable,
 {
 	struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);

-	pagetable_pte_dtor(ptdesc);
+	pagetable_dtor(ptdesc);
 	pagetable_free(ptdesc);
 }

@ -61,7 +61,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable)
 {
 	struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);

-	pagetable_pte_dtor(ptdesc);
+	pagetable_dtor(ptdesc);
 	pagetable_free(ptdesc);
 }

@ -73,7 +73,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable)

 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	pagetable_free(virt_to_ptdesc(pgd));
+	pagetable_dtor_free(virt_to_ptdesc(pgd));
 }

 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
@ -84,6 +84,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)

 	if (!ptdesc)
 		return NULL;
+	pagetable_pgd_ctor(ptdesc);
 	new_pgd = ptdesc_address(ptdesc);

 	memcpy(new_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t));
--- a/arch/m68k/include/asm/motorola_pgalloc.h
+++ b/arch/m68k/include/asm/motorola_pgalloc.h
@ -9,9 +9,9 @@ extern void mmu_page_ctor(void *page);
 extern void mmu_page_dtor(void *page);

 enum m68k_table_types {
-	TABLE_PGD = 0,
-	TABLE_PMD = 0, /* same size as PGD */
-	TABLE_PTE = 1,
+	TABLE_PGD,
+	TABLE_PMD,
+	TABLE_PTE,
 };

 extern void init_pointer_table(void *table, int type);
--- a/arch/m68k/include/asm/sun3_pgalloc.h
+++ b/arch/m68k/include/asm/sun3_pgalloc.h
@ -19,7 +19,7 @@ extern const char bad_pmd_string[];

 #define __pte_free_tlb(tlb, pte, addr)				\
 do {								\
-	pagetable_pte_dtor(page_ptdesc(pte));			\
+	pagetable_dtor(page_ptdesc(pte));			\
 	tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));	\
 } while (0)

@ -43,7 +43,7 @@ static inline pgd_t * pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *new_pgd;

-	new_pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+	new_pgd = __pgd_alloc(mm, 0);
 	memcpy(new_pgd, swapper_pg_dir, PAGE_SIZE);
 	memset(new_pgd, 0, (PAGE_OFFSET >> PGDIR_SHIFT));
 	return new_pgd;
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@ -68,10 +68,7 @@ void __init paging_init(void)

 	high_memory = (void *) end_mem;

-	empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-	if (!empty_zero_page)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, PAGE_SIZE, PAGE_SIZE);
+	empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
 	max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT;
 	free_area_init(max_zone_pfn);
 }
--- a/arch/m68k/mm/mcfmmu.c
+++ b/arch/m68k/mm/mcfmmu.c
@ -42,20 +42,14 @@ void __init paging_init(void)
 	unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
 	int i;

-	empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-	if (!empty_zero_page)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, PAGE_SIZE, PAGE_SIZE);
+	empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);

 	pg_dir = swapper_pg_dir;
 	memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir));

 	size = num_pages * sizeof(pte_t);
 	size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1);
-	next_pgtable = (unsigned long) memblock_alloc(size, PAGE_SIZE);
-	if (!next_pgtable)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, size, PAGE_SIZE);
+	next_pgtable = (unsigned long) memblock_alloc_or_panic(size, PAGE_SIZE);

 	pg_dir += PAGE_OFFSET >> PGDIR_SHIFT;

--- a/arch/m68k/mm/motorola.c
+++ b/arch/m68k/mm/motorola.c
@ -97,17 +97,19 @@ void mmu_page_dtor(void *page)

 typedef struct list_head ptable_desc;

-static struct list_head ptable_list[2] = {
+static struct list_head ptable_list[3] = {
 	LIST_HEAD_INIT(ptable_list[0]),
 	LIST_HEAD_INIT(ptable_list[1]),
+	LIST_HEAD_INIT(ptable_list[2]),
 };

 #define PD_PTABLE(page) ((ptable_desc *)&(virt_to_page((void *)(page))->lru))
 #define PD_PAGE(ptable) (list_entry(ptable, struct page, lru))
 #define PD_MARKBITS(dp) (*(unsigned int *)&PD_PAGE(dp)->index)

-static const int ptable_shift[2] = {
-	7+2, /* PGD, PMD */
+static const int ptable_shift[3] = {
+	7+2, /* PGD */
+	7+2, /* PMD */
 	6+2, /* PTE */
 };

@ -156,12 +158,20 @@ void *get_pointer_table(int type)
 		if (!(page = (void *)get_zeroed_page(GFP_KERNEL)))
 			return NULL;

-		if (type == TABLE_PTE) {
+		switch (type) {
+		case TABLE_PTE:
 			/*
 			 * m68k doesn't have SPLIT_PTE_PTLOCKS for not having
 			 * SMP.
 			 */
 			pagetable_pte_ctor(virt_to_ptdesc(page));
+			break;
+		case TABLE_PMD:
+			pagetable_pmd_ctor(virt_to_ptdesc(page));
+			break;
+		case TABLE_PGD:
+			pagetable_pgd_ctor(virt_to_ptdesc(page));
+			break;
 		}

 		mmu_page_ctor(page);
@ -200,8 +210,7 @@ int free_pointer_table(void *table, int type)
 		/* all tables in page are free, free page */
 		list_del(dp);
 		mmu_page_dtor((void *)page);
-		if (type == TABLE_PTE)
-			pagetable_pte_dtor(virt_to_ptdesc((void *)page));
+		pagetable_dtor(virt_to_ptdesc((void *)page));
 		free_page (page);
 		return 1;
 	} else if (ptable_list[type].next != dp) {
@ -491,10 +500,7 @@ void __init paging_init(void)
 	 * initialize the bad page table and bad page to point
 	 * to a couple of allocated pages
 	 */
-	empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-	if (!empty_zero_page)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, PAGE_SIZE, PAGE_SIZE);
+	empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);

 	/*
 	 * Set up SFC/DFC registers
--- a/arch/m68k/mm/sun3mmu.c
+++ b/arch/m68k/mm/sun3mmu.c
@ -44,10 +44,7 @@ void __init paging_init(void)
 	unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, };
 	unsigned long size;

-	empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-	if (!empty_zero_page)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, PAGE_SIZE, PAGE_SIZE);
+	empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);

 	address = PAGE_OFFSET;
 	pg_dir = swapper_pg_dir;
@ -57,10 +54,7 @@ void __init paging_init(void)
 	size = num_pages * sizeof(pte_t);
 	size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1);

-	next_pgtable = (unsigned long)memblock_alloc(size, PAGE_SIZE);
-	if (!next_pgtable)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, size, PAGE_SIZE);
+	next_pgtable = (unsigned long)memblock_alloc_or_panic(size, PAGE_SIZE);
 	bootmem_end = (next_pgtable + size + PAGE_SIZE) & PAGE_MASK;

 	/* Map whole memory from PAGE_OFFSET (0x0E000000) */
--- a/arch/m68k/sun3/sun3dvma.c
+++ b/arch/m68k/sun3/sun3dvma.c
@ -252,12 +252,8 @@ void __init dvma_init(void)

 	list_add(&(hole->list), &hole_list);

-	iommu_use = memblock_alloc(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long),
+	iommu_use = memblock_alloc_or_panic(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long),
 				   SMP_CACHE_BYTES);
-	if (!iommu_use)
-		panic("%s: Failed to allocate %zu bytes\n", __func__,
-		      IOMMU_TOTAL_ENTRIES * sizeof(unsigned long));
-
 	dvma_unmap_iommu(DVMA_START, DVMA_SIZE);

 	sun3_dvma_init();
--- a/arch/microblaze/include/asm/pgalloc.h
+++ b/arch/microblaze/include/asm/pgalloc.h
@ -21,12 +21,7 @@

 extern void __bad_pte(pmd_t *pmd);

-static inline pgd_t *get_pgd(void)
-{
-	return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
-}
-
-#define pgd_alloc(mm)		get_pgd()
+#define pgd_alloc(mm)		__pgd_alloc(mm, 0)

 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);

--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@ -15,7 +15,6 @@

 #define __HAVE_ARCH_PMD_ALLOC_ONE
 #define __HAVE_ARCH_PUD_ALLOC_ONE
-#define __HAVE_ARCH_PGD_FREE
 #include <asm-generic/pgalloc.h>

 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
@ -49,14 +48,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 extern void pgd_init(void *addr);
 extern pgd_t *pgd_alloc(struct mm_struct *mm);

-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-	pagetable_free(virt_to_ptdesc(pgd));
-}
-
 #define __pte_free_tlb(tlb, pte, address)			\
 do {								\
-	pagetable_pte_dtor(page_ptdesc(pte));			\
+	pagetable_dtor(page_ptdesc(pte));			\
 	tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));	\
 } while (0)

--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@ -704,10 +704,7 @@ static void __init resource_init(void)
 	for_each_mem_range(i, &start, &end) {
 		struct resource *res;

-		res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
-		if (!res)
-			panic("%s: Failed to allocate %zu bytes\n", __func__,
-			      sizeof(struct resource));
+		res = memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES);

 		res->start = start;
 		/*
--- a/arch/mips/kernel/vdso.c
+++ b/arch/mips/kernel/vdso.c
@ -11,6 +11,7 @@
 #include <linux/ioport.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/mman.h>
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@ -97,11 +98,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		return -EINTR;

 	if (IS_ENABLED(CONFIG_MIPS_FP_SUPPORT)) {
+		unsigned long unused;
+
 		/* Map delay slot emulation page */
-		base = mmap_region(NULL, STACK_TOP, PAGE_SIZE,
-				VM_READ | VM_EXEC |
-				VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
-				0, NULL);
+		base = do_mmap(NULL, STACK_TOP, PAGE_SIZE, PROT_READ | PROT_EXEC,
+			       MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, 0, 0, &unused,
+			       NULL);
 		if (IS_ERR_VALUE(base)) {
 			ret = base;
 			goto out;
--- a/arch/mips/mm/pgtable.c
+++ b/arch/mips/mm/pgtable.c
@ -10,12 +10,10 @@

 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	pgd_t *init, *ret = NULL;
-	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM,
-			PGD_TABLE_ORDER);
+	pgd_t *init, *ret;

-	if (ptdesc) {
-		ret = ptdesc_address(ptdesc);
+	ret = __pgd_alloc(mm, PGD_TABLE_ORDER);
+	if (ret) {
 		init = pgd_offset(&init_mm, 0UL);
 		pgd_init(ret);
 		memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@ -30,7 +30,7 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm);

 #define __pte_free_tlb(tlb, pte, addr)					\
 	do {								\
-		pagetable_pte_dtor(page_ptdesc(pte));			\
+		pagetable_dtor(page_ptdesc(pte));			\
 		tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));	\
 	} while (0)

--- a/arch/nios2/mm/pgtable.c
+++ b/arch/nios2/mm/pgtable.c
@ -11,6 +11,7 @@
 #include <linux/sched.h>

 #include <asm/cpuinfo.h>
+#include <asm/pgalloc.h>

 /* pteaddr:
 *   ptbase | vpn* | zero
@ -54,7 +55,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *ret, *init;

-	ret = (pgd_t *) __get_free_page(GFP_KERNEL);
+	ret = __pgd_alloc(mm, 0);
 	if (ret) {
 		init = pgd_offset(&init_mm, 0UL);
 		pgd_init(ret);
--- a/arch/openrisc/include/asm/pgalloc.h
+++ b/arch/openrisc/include/asm/pgalloc.h
@ -41,15 +41,13 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 */
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	pgd_t *ret = (pgd_t *)__get_free_page(GFP_KERNEL);
+	pgd_t *ret = __pgd_alloc(mm, 0);

-	if (ret) {
-		memset(ret, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
+	if (ret)
 		memcpy(ret + USER_PTRS_PER_PGD,
 		       swapper_pg_dir + USER_PTRS_PER_PGD,
 		       (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));

-	}
 	return ret;
 }

@ -68,7 +66,7 @@ extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);

 #define __pte_free_tlb(tlb, pte, addr)				\
 do {								\
-	pagetable_pte_dtor(page_ptdesc(pte));			\
+	pagetable_dtor(page_ptdesc(pte));			\
 	tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));	\
 } while (0)

--- a/arch/openrisc/mm/ioremap.c
+++ b/arch/openrisc/mm/ioremap.c
@ -38,10 +38,7 @@ pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm)
 	if (likely(mem_init_done)) {
 		pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
 	} else {
-		pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-		if (!pte)
-			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-			      __func__, PAGE_SIZE, PAGE_SIZE);
+		pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
 	}

 	return pte;
--- a/arch/parisc/include/asm/pgalloc.h
+++ b/arch/parisc/include/asm/pgalloc.h
@ -11,27 +11,12 @@
 #include <asm/cache.h>

 #define __HAVE_ARCH_PMD_ALLOC_ONE
-#define __HAVE_ARCH_PMD_FREE
-#define __HAVE_ARCH_PGD_FREE
 #include <asm-generic/pgalloc.h>

 /* Allocate the top level pgd (page directory) */
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	pgd_t *pgd;
-
-	pgd = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER);
-	if (unlikely(pgd == NULL))
-		return NULL;
-
-	memset(pgd, 0, PAGE_SIZE << PGD_TABLE_ORDER);
-
-	return pgd;
-}
-
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-	free_pages((unsigned long)pgd, PGD_TABLE_ORDER);
+	return __pgd_alloc(mm, PGD_TABLE_ORDER);
 }

 #if CONFIG_PGTABLE_LEVELS == 3
@ -46,17 +31,19 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)

 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-	pmd_t *pmd;
+	struct ptdesc *ptdesc;
+	gfp_t gfp = GFP_PGTABLE_USER;

-	pmd = (pmd_t *)__get_free_pages(GFP_PGTABLE_KERNEL, PMD_TABLE_ORDER);
-	if (likely(pmd))
-		memset ((void *)pmd, 0, PAGE_SIZE << PMD_TABLE_ORDER);
-	return pmd;
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
-	free_pages((unsigned long)pmd, PMD_TABLE_ORDER);
+	if (mm == &init_mm)
+		gfp = GFP_PGTABLE_KERNEL;
+	ptdesc = pagetable_alloc(gfp, PMD_TABLE_ORDER);
+	if (!ptdesc)
+		return NULL;
+	if (!pagetable_pmd_ctor(ptdesc)) {
+		pagetable_free(ptdesc);
+		return NULL;
+	}
+	return ptdesc_address(ptdesc);
 }
 #endif

--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@ -377,10 +377,8 @@ static void __ref map_pages(unsigned long start_vaddr,

 #if CONFIG_PGTABLE_LEVELS == 3
 		if (pud_none(*pud)) {
-			pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER,
+			pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER,
 					     PAGE_SIZE << PMD_TABLE_ORDER);
-			if (!pmd)
-				panic("pmd allocation failed.\n");
 			pud_populate(NULL, pud, pmd);
 		}
 #endif
@ -388,9 +386,7 @@ static void __ref map_pages(unsigned long start_vaddr,
 		pmd = pmd_offset(pud, vaddr);
 		for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++, pmd++) {
 			if (pmd_none(*pmd)) {
-				pg_table = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-				if (!pg_table)
-					panic("page table allocation failed\n");
+				pg_table = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
 				pmd_populate_kernel(NULL, pmd, pg_table);
 			}

@ -648,9 +644,7 @@ static void __init pagetable_init(void)
 	}
 #endif

-	empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-	if (!empty_zero_page)
-		panic("zero page allocation failed.\n");
+	empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);

 }

@ -687,19 +681,15 @@ static void __init fixmap_init(void)

 #if CONFIG_PGTABLE_LEVELS == 3
 	if (pud_none(*pud)) {
-		pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER,
+		pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER,
 				     PAGE_SIZE << PMD_TABLE_ORDER);
-		if (!pmd)
-			panic("fixmap: pmd allocation failed.\n");
 		pud_populate(NULL, pud, pmd);
 	}
 #endif

 	pmd = pmd_offset(pud, addr);
 	do {
-		pte_t *pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-		if (!pte)
-			panic("fixmap: pte allocation failed.\n");
+		pte_t *pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);

 		pmd_populate_kernel(&init_mm, pmd, pte);

--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@ -451,7 +451,6 @@ CONFIG_TEST_PRINTF=m
 CONFIG_TEST_SCANF=m
 CONFIG_TEST_BITMAP=m
 CONFIG_TEST_UUID=m
-CONFIG_TEST_XARRAY=m
 CONFIG_TEST_MAPLE_TREE=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_IDA=m
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@ -37,6 +37,7 @@ extern void tlb_flush(struct mmu_gather *tlb);
 */
 #define tlb_needs_table_invalidate()	radix_enabled()

+#define __HAVE_ARCH_TLB_REMOVE_TABLE
 /* Get the generic bits... */
 #include <asm-generic/tlb.h>

--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@ -1087,12 +1087,10 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char
 	/* Count and allocate space for cpu features */
 	of_scan_flat_dt_subnodes(node, count_cpufeatures_subnodes,
 						&nr_dt_cpu_features);
-	dt_cpu_features = memblock_alloc(sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, PAGE_SIZE);
-	if (!dt_cpu_features)
-		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
-		      __func__,
-		      sizeof(struct dt_cpu_feature) * nr_dt_cpu_features,
-		      PAGE_SIZE);
+	dt_cpu_features =
+		memblock_alloc_or_panic(
+			sizeof(struct dt_cpu_feature) * nr_dt_cpu_features,
+			PAGE_SIZE);

 	cpufeatures_setup_start(isa);

--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@ -213,11 +213,8 @@ pci_create_OF_bus_map(void)
 	struct property* of_prop;
 	struct device_node *dn;

-	of_prop = memblock_alloc(sizeof(struct property) + 256,
+	of_prop = memblock_alloc_or_panic(sizeof(struct property) + 256,
 				 SMP_CACHE_BYTES);
-	if (!of_prop)
-		panic("%s: Failed to allocate %zu bytes\n", __func__,
-		      sizeof(struct property) + 256);
 	dn = of_find_node_by_path("/");
 	if (dn) {
 		memset(of_prop, -1, sizeof(struct property) + 256);
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@ -458,11 +458,8 @@ void __init smp_setup_cpu_maps(void)

 	DBG("smp_setup_cpu_maps()\n");

-	cpu_to_phys_id = memblock_alloc(nr_cpu_ids * sizeof(u32),
+	cpu_to_phys_id = memblock_alloc_or_panic(nr_cpu_ids * sizeof(u32),
 					__alignof__(u32));
-	if (!cpu_to_phys_id)
-		panic("%s: Failed to allocate %zu bytes align=0x%zx\n",
-		      __func__, nr_cpu_ids * sizeof(u32), __alignof__(u32));

 	for_each_node_by_type(dn, "cpu") {
 		const __be32 *intserv;
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@ -140,13 +140,7 @@ arch_initcall(ppc_init);

 static void *__init alloc_stack(void)
 {
-	void *ptr = memblock_alloc(THREAD_SIZE, THREAD_ALIGN);
-
-	if (!ptr)
-		panic("cannot allocate %d bytes for stack at %pS\n",
-		      THREAD_SIZE, (void *)_RET_IP_);
-
-	return ptr;
+	return memblock_alloc_or_panic(THREAD_SIZE, THREAD_ALIGN);
 }

 void __init irqstack_early_init(void)
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@ -4957,7 +4957,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
 			 * states are synchronized from L0 to L1. L1 needs to inform L0 about
 			 * MER=1 only when there are pending external interrupts.
 			 * In the above if check, MER bit is set if there are pending
-			 * external interrupts. Hence, explicity mask off MER bit
+			 * external interrupts. Hence, explicitly mask off MER bit
 			 * here as otherwise it may generate spurious interrupts in L2 KVM
 			 * causing an endless loop, which results in L2 guest getting hung.
 			 */
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@ -377,10 +377,7 @@ void __init MMU_init_hw(void)
 	 * Find some memory for the hash table.
 	 */
 	if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
-	Hash = memblock_alloc(Hash_size, Hash_size);
-	if (!Hash)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, Hash_size, Hash_size);
+	Hash = memblock_alloc_or_panic(Hash_size, Hash_size);
 	_SDR1 = __pa(Hash) | SDR1_LOW_BITS;

 	pr_info("Total memory = %lldMB; using %ldkB for hash table\n",
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@ -253,7 +253,7 @@ static void pmd_frag_destroy(void *pmd_frag)
 	count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
 	/* We allow PTE_FRAG_NR fragments from a PTE page */
 	if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
-		pagetable_pmd_dtor(ptdesc);
+		pagetable_dtor(ptdesc);
 		pagetable_free(ptdesc);
 	}
 }
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@ -330,11 +330,7 @@ void __init mmu_partition_table_init(void)
 	unsigned long ptcr;

 	/* Initialize the Partition Table with no entries */
-	partition_tb = memblock_alloc(patb_size, patb_size);
-	if (!partition_tb)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, patb_size, patb_size);
-
+	partition_tb = memblock_alloc_or_panic(patb_size, patb_size);
 	ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
 	set_ptcr_when_no_uv(ptcr);
 	powernv_set_nmmu_ptcr(ptcr);
@ -477,7 +473,7 @@ void pmd_fragment_free(unsigned long *pmd)

 	BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
 	if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
-		pagetable_pmd_dtor(ptdesc);
+		pagetable_dtor(ptdesc);
 		pagetable_free(ptdesc);
 	}
 }
--- a/arch/powerpc/mm/kasan/init_book3e_64.c
+++ b/arch/powerpc/mm/kasan/init_book3e_64.c
@ -40,19 +40,19 @@ static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgpr
 	pgdp = pgd_offset_k(ea);
 	p4dp = p4d_offset(pgdp, ea);
 	if (kasan_pud_table(*p4dp)) {
-		pudp = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+		pudp = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
 		memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE);
 		p4d_populate(&init_mm, p4dp, pudp);
 	}
 	pudp = pud_offset(p4dp, ea);
 	if (kasan_pmd_table(*pudp)) {
-		pmdp = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
+		pmdp = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
 		memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE);
 		pud_populate(&init_mm, pudp, pmdp);
 	}
 	pmdp = pmd_offset(pudp, ea);
 	if (kasan_pte_table(*pmdp)) {
-		ptep = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
+		ptep = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
 		memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE);
 		pmd_populate_kernel(&init_mm, pmdp, ptep);
 	}
@ -74,7 +74,7 @@ static void __init kasan_init_phys_region(void *start, void *end)
 	k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE);
 	k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE);

-	va = memblock_alloc(k_end - k_start, PAGE_SIZE);
+	va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE);
 	for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE)
 		kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL);
 }
--- a/arch/powerpc/mm/kasan/init_book3s_64.c
+++ b/arch/powerpc/mm/kasan/init_book3s_64.c
@ -32,7 +32,7 @@ static void __init kasan_init_phys_region(void *start, void *end)
 	k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE);
 	k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE);

-	va = memblock_alloc(k_end - k_start, PAGE_SIZE);
+	va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE);
 	for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE)
 		map_kernel_page(k_cur, __pa(va), PAGE_KERNEL);
 }
--- a/arch/powerpc/mm/nohash/mmu_context.c
+++ b/arch/powerpc/mm/nohash/mmu_context.c
@ -385,21 +385,11 @@ void __init mmu_context_init(void)
 	/*
 	 * Allocate the maps used by context management
 	 */
-	context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
-	if (!context_map)
-		panic("%s: Failed to allocate %zu bytes\n", __func__,
-		      CTX_MAP_SIZE);
-	context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1),
+	context_map = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+	context_mm = memblock_alloc_or_panic(sizeof(void *) * (LAST_CONTEXT + 1),
 				    SMP_CACHE_BYTES);
-	if (!context_mm)
-		panic("%s: Failed to allocate %zu bytes\n", __func__,
-		      sizeof(void *) * (LAST_CONTEXT + 1));
 	if (IS_ENABLED(CONFIG_SMP)) {
-		stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
-		if (!stale_map[boot_cpuid])
-			panic("%s: Failed to allocate %zu bytes\n", __func__,
-			      CTX_MAP_SIZE);
-
+		stale_map[boot_cpuid] = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
 		cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
 					  "powerpc/mmu/ctx:prepare",
 					  mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead);
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@ -25,7 +25,7 @@ void pte_frag_destroy(void *pte_frag)
 	count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
 	/* We allow PTE_FRAG_NR fragments from a PTE page */
 	if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
-		pagetable_pte_dtor(ptdesc);
+		pagetable_dtor(ptdesc);
 		pagetable_free(ptdesc);
 	}
 }
@ -111,7 +111,7 @@ static void pte_free_now(struct rcu_head *head)
 	struct ptdesc *ptdesc;

 	ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
-	pagetable_pte_dtor(ptdesc);
+	pagetable_dtor(ptdesc);
 	pagetable_free(ptdesc);
 }

--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@ -50,13 +50,8 @@ notrace void __init early_ioremap_init(void)

 void __init *early_alloc_pgtable(unsigned long size)
 {
-	void *ptr = memblock_alloc(size, size);
+	return memblock_alloc_or_panic(size, size);

-	if (!ptr)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, size, size);
-
-	return ptr;
 }

 pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va)
--- a/arch/powerpc/platforms/powermac/nvram.c
+++ b/arch/powerpc/platforms/powermac/nvram.c
@ -514,10 +514,7 @@ static int __init core99_nvram_setup(struct device_node *dp, unsigned long addr)
 		printk(KERN_ERR "nvram: no address\n");
 		return -EINVAL;
 	}
-	nvram_image = memblock_alloc(NVRAM_SIZE, SMP_CACHE_BYTES);
-	if (!nvram_image)
-		panic("%s: Failed to allocate %u bytes\n", __func__,
-		      NVRAM_SIZE);
+	nvram_image = memblock_alloc_or_panic(NVRAM_SIZE, SMP_CACHE_BYTES);
 	nvram_data = ioremap(addr, NVRAM_SIZE*2);
 	nvram_naddrs = 1; /* Make sure we get the correct case */

--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@ -88,26 +88,6 @@ static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
 	}
 }

-static void memtrace_clear_range(unsigned long start_pfn,
-				 unsigned long nr_pages)
-{
-	unsigned long pfn;
-
-	/* As HIGHMEM does not apply, use clear_page() directly. */
-	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
-		if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
-			cond_resched();
-		clear_page(__va(PFN_PHYS(pfn)));
-	}
-	/*
-	 * Before we go ahead and use this range as cache inhibited range
-	 * flush the cache.
-	 */
-	flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn),
-				   (unsigned long)pfn_to_kaddr(start_pfn + nr_pages),
-				   FLUSH_CHUNK_SIZE);
-}
-
 static u64 memtrace_alloc_node(u32 nid, u64 size)
 {
 	const unsigned long nr_pages = PHYS_PFN(size);
@ -119,17 +99,18 @@ static u64 memtrace_alloc_node(u32 nid, u64 size)
 	 * by alloc_contig_pages().
 	 */
 	page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE |
-				  __GFP_NOWARN, nid, NULL);
+				  __GFP_NOWARN | __GFP_ZERO, nid, NULL);
 	if (!page)
 		return 0;
 	start_pfn = page_to_pfn(page);

 	/*
-	 * Clear the range while we still have a linear mapping.
-	 *
-	 * TODO: use __GFP_ZERO with alloc_contig_pages() once supported.
+	 * Before we go ahead and use this range as cache inhibited range
+	 * flush the cache.
 	 */
-	memtrace_clear_range(start_pfn, nr_pages);
+	flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn),
+				   (unsigned long)pfn_to_kaddr(start_pfn + nr_pages),
+				   FLUSH_CHUNK_SIZE);

 	/*
 	 * Set pages PageOffline(), to indicate that nobody (e.g., hibernation,
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@ -180,10 +180,7 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
 	/*
 	 * Allocate a buffer to hold the MC recoverable ranges.
 	 */
-	mc_recoverable_range = memblock_alloc(size, __alignof__(u64));
-	if (!mc_recoverable_range)
-		panic("%s: Failed to allocate %u bytes align=0x%lx\n",
-		      __func__, size, __alignof__(u64));
+	mc_recoverable_range = memblock_alloc_or_panic(size, __alignof__(u64));

 	for (i = 0; i < mc_recoverable_range_len; i++) {
 		mc_recoverable_range[i].start_addr =
--- a/arch/powerpc/platforms/ps3/setup.c
+++ b/arch/powerpc/platforms/ps3/setup.c
@ -115,10 +115,7 @@ static void __init prealloc(struct ps3_prealloc *p)
 	if (!p->size)
 		return;

-	p->address = memblock_alloc(p->size, p->align);
-	if (!p->address)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, p->size, p->align);
+	p->address = memblock_alloc_or_panic(p->size, p->align);

 	printk(KERN_INFO "%s: %lu bytes at %p\n", p->name, p->size,
 	       p->address);
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@ -544,7 +544,7 @@ static int drc_pmem_query_health(struct papr_scm_priv *p)

 	/* Jiffies offset for which the health data is assumed to be same */
 	cache_timeout = p->lasthealth_jiffies +
-		msecs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL * 1000);
+		secs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL);

 	/* Fetch new health info is its older than MIN_HEALTH_QUERY_INTERVAL */
 	if (time_after(jiffies, cache_timeout))
--- a/arch/powerpc/sysdev/msi_bitmap.c
+++ b/arch/powerpc/sysdev/msi_bitmap.c
@ -124,10 +124,7 @@ int __ref msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count,
 	if (bmp->bitmap_from_slab)
 		bmp->bitmap = kzalloc(size, GFP_KERNEL);
 	else {
-		bmp->bitmap = memblock_alloc(size, SMP_CACHE_BYTES);
-		if (!bmp->bitmap)
-			panic("%s: Failed to allocate %u bytes\n", __func__,
-			      size);
+		bmp->bitmap = memblock_alloc_or_panic(size, SMP_CACHE_BYTES);
 		/* the bitmap won't be freed from memblock allocator */
 		kmemleak_not_leak(bmp->bitmap);
 	}
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@ -12,16 +12,25 @@
 #include <asm/tlb.h>

 #ifdef CONFIG_MMU
-#define __HAVE_ARCH_PUD_ALLOC_ONE
 #define __HAVE_ARCH_PUD_FREE
 #include <asm-generic/pgalloc.h>

+/*
+ * While riscv platforms with riscv_ipi_for_rfence as true require an IPI to
+ * perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use
+ * SBI to perform TLB shootdown. To keep software pagetable walkers safe in this
+ * case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the
+ * comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
+ * for more details.
+ */
 static inline void riscv_tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt)
 {
-	if (riscv_use_sbi_for_rfence())
+	if (riscv_use_sbi_for_rfence()) {
 		tlb_remove_ptdesc(tlb, pt);
-	else
+	} else {
+		pagetable_dtor(pt);
 		tlb_remove_page_ptdesc(tlb, pt);
+	}
 }

 static inline void pmd_populate_kernel(struct mm_struct *mm,
@ -88,15 +97,6 @@ static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd,
 	}
 }

-#define pud_alloc_one pud_alloc_one
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-	if (pgtable_l4_enabled)
-		return __pud_alloc_one(mm, addr);
-
-	return NULL;
-}
-
 #define pud_free pud_free
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
@ -107,39 +107,8 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 				  unsigned long addr)
 {
-	if (pgtable_l4_enabled) {
-		struct ptdesc *ptdesc = virt_to_ptdesc(pud);
-
-		pagetable_pud_dtor(ptdesc);
-		riscv_tlb_remove_ptdesc(tlb, ptdesc);
-	}
-}
-
-#define p4d_alloc_one p4d_alloc_one
-static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-	if (pgtable_l5_enabled) {
-		gfp_t gfp = GFP_PGTABLE_USER;
-
-		if (mm == &init_mm)
-			gfp = GFP_PGTABLE_KERNEL;
-		return (p4d_t *)get_zeroed_page(gfp);
-	}
-
-	return NULL;
-}
-
-static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d)
-{
-	BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
-	free_page((unsigned long)p4d);
-}
-
-#define p4d_free p4d_free
-static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
-{
-	if (pgtable_l5_enabled)
-		__p4d_free(mm, p4d);
+	if (pgtable_l4_enabled)
+		riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
 }

 static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
@ -161,9 +130,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;

-	pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
+	pgd = __pgd_alloc(mm, 0);
 	if (likely(pgd != NULL)) {
-		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
 		/* Copy kernel mappings */
 		sync_kernel_mappings(pgd);
 	}
@ -175,10 +143,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
 				  unsigned long addr)
 {
-	struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
-
-	pagetable_pmd_dtor(ptdesc);
-	riscv_tlb_remove_ptdesc(tlb, ptdesc);
+	riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
 }

 #endif /* __PAGETABLE_PMD_FOLDED */
@ -186,10 +151,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 				  unsigned long addr)
 {
-	struct ptdesc *ptdesc = page_ptdesc(pte);
-
-	pagetable_pte_dtor(ptdesc);
-	riscv_tlb_remove_ptdesc(tlb, ptdesc);
+	riscv_tlb_remove_ptdesc(tlb, page_ptdesc(pte));
 }
 #endif /* CONFIG_MMU */

--- a/arch/riscv/include/asm/tlb.h
+++ b/arch/riscv/include/asm/tlb.h
@ -10,24 +10,6 @@ struct mmu_gather;

 static void tlb_flush(struct mmu_gather *tlb);

-#ifdef CONFIG_MMU
-#include <linux/swap.h>
-
-/*
- * While riscv platforms with riscv_ipi_for_rfence as true require an IPI to
- * perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use
- * SBI to perform TLB shootdown. To keep software pagetable walkers safe in this
- * case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the
- * comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
- * for more details.
- */
-static inline void __tlb_remove_table(void *table)
-{
-	free_page_and_swap_cache(table);
-}
-
-#endif /* CONFIG_MMU */
-
 #define tlb_flush tlb_flush
 #include <asm-generic/tlb.h>

--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@ -147,9 +147,7 @@ static void __init init_resources(void)
 	res_idx = num_resources - 1;

 	mem_res_sz = num_resources * sizeof(*mem_res);
-	mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES);
-	if (!mem_res)
-		panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz);
+	mem_res = memblock_alloc_or_panic(mem_res_sz, SMP_CACHE_BYTES);

 	/*
 	 * Start by adding the reserved regions, if they overlap
--- a/Show More
+++ b/Show More