Merge branch 'mm-everything' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

This commit is contained in:
Stephen Rothwell 2025-01-13 09:34:41 +11:00
commit ada24f0158
536 changed files with 11136 additions and 10750 deletions

View File

@ -410,6 +410,7 @@ Liam Mark <quic_lmark@quicinc.com> <lmark@codeaurora.org>
Linas Vepstas <linas@austin.ibm.com>
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
Linus Lüssing <linus.luessing@c0d3.blue> <ll@simonwunderlich.de>
<linux-hardening@vger.kernel.org> <kernel-hardening@lists.openwall.com>
Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>

View File

@ -4339,7 +4339,7 @@ D: Freescale Highspeed USB device driver
D: Freescale QE SoC support and Ethernet driver
S: B-1206 Jingmao Guojigongyu
S: 16 Baliqiao Nanjie, Beijing 101100
S: People's Repulic of China
S: People's Republic of China
N: Vlad Yasevich
E: vyasevich@gmail.com

View File

@ -355,10 +355,15 @@ Description: If 'target' is written to the 'type' file, writing to or
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/matching
Date: Dec 2022
Contact: SeongJae Park <sj@kernel.org>
Description: Writing 'Y' or 'N' to this file sets whether to filter out
pages that do or do not match to the 'type' and 'memcg_path',
respectively. Filter out means the action of the scheme will
not be applied to.
Description: Writing 'Y' or 'N' to this file sets whether the filter is for
the memory of the 'type', or all except the 'type'.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/allow
Date: Jan 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Writing 'Y' or 'N' to this file sets whether to allow or reject
applying the scheme's action to the memory that satisfies the
'type' and the 'matching' of the directory.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/nr_tried
Date: Mar 2022
@ -384,6 +389,12 @@ Contact: SeongJae Park <sj@kernel.org>
Description: Reading this file returns the total size of regions that the
action of the scheme has successfully applied in bytes.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/sz_ops_filter_passed
Date: Dec 2024
Contact: SeongJae Park <sj@kernel.org>
Description: Reading this file returns the total size of memory that passed
DAMON operations layer-handled filters of the scheme in bytes.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/qt_exceeds
Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
@ -424,3 +435,10 @@ Contact: SeongJae Park <sj@kernel.org>
Description: Reading this file returns the 'age' of a memory region that
corresponding DAMON-based Operation Scheme's action has tried
to be applied.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/sz_filter_passed
Date: Dec 2024
Contact: SeongJae Park <sj@kernel.org>
Description: Reading this file returns the size of the memory in the region
that passed DAMON operations layer-handled filters of the
scheme in bytes.

View File

@ -100,29 +100,29 @@ Get delays, since system boot, for pid 10::
# ./getdelays -d -p 10
(output similar to next case)
Get sum of delays, since system boot, for all pids with tgid 5::
Get sum and peak of delays, since system boot, for all pids with tgid 242::
# ./getdelays -d -t 5
bash-4.4# ./getdelays -d -t 242
print delayacct stats ON
TGID 5
TGID 242
CPU             count     real total  virtual total    delay total  delay average
                    8        7000000        6872122        3382277        0.423ms
IO              count    delay total  delay average
                    0              0        0.000ms
SWAP            count    delay total  delay average
                    0              0        0.000ms
RECLAIM         count    delay total  delay average
                    0              0        0.000ms
THRASHING       count    delay total  delay average
                    0              0        0.000ms
COMPACT         count    delay total  delay average
                    0              0        0.000ms
WPCOPY          count    delay total  delay average
                    0              0        0.000ms
IRQ             count    delay total  delay average
                    0              0        0.000ms
CPU             count     real total  virtual total    delay total  delay average      delay max      delay min
                   39      156000000      156576579        2111069        0.054ms     0.212296ms     0.031307ms
IO              count    delay total  delay average      delay max      delay min
                    0              0        0.000ms     0.000000ms     0.000000ms
SWAP            count    delay total  delay average      delay max      delay min
                    0              0        0.000ms     0.000000ms     0.000000ms
RECLAIM         count    delay total  delay average      delay max      delay min
                    0              0        0.000ms     0.000000ms     0.000000ms
THRASHING       count    delay total  delay average      delay max      delay min
                    0              0        0.000ms     0.000000ms     0.000000ms
COMPACT         count    delay total  delay average      delay max      delay min
                    0              0        0.000ms     0.000000ms     0.000000ms
WPCOPY          count    delay total  delay average      delay max      delay min
                  156       11215873        0.072ms     0.207403ms     0.033913ms
IRQ             count    delay total  delay average      delay max      delay min
                    0              0        0.000ms     0.000000ms     0.000000ms
Get IO accounting for pid 1, it works only with -p::

View File

@ -3351,8 +3351,8 @@
[KNL] Set the initial state for the memory hotplug
onlining policy. If not specified, the default value is
set according to the
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
option.
CONFIG_MHP_DEFAULT_ONLINE_TYPE kernel config
options.
See Documentation/admin-guide/mm/memory-hotplug.rst.
memmap=exactmap [KNL,X86,EARLY] Enable setting of an exact
@ -6992,6 +6992,13 @@
See Documentation/admin-guide/mm/transhuge.rst
for more details.
transparent_hugepage_tmpfs= [KNL]
Format: [always|within_size|advise|never]
Can be used to control the default hugepage allocation policy
for the tmpfs mount.
See Documentation/admin-guide/mm/transhuge.rst
for more details.
trusted.source= [KEYS]
Format: <string>
This parameter identifies the trust source as a backend

View File

@ -42,32 +42,45 @@ the execution. ::
$ git clone https://github.com/sjp38/masim; cd masim; make
$ sudo damo start "./masim ./configs/stairs.cfg --quiet"
$ sudo ./damo show
0 addr [85.541 TiB , 85.541 TiB ) (57.707 MiB ) access 0 % age 10.400 s
1 addr [85.541 TiB , 85.542 TiB ) (413.285 MiB) access 0 % age 11.400 s
2 addr [127.649 TiB , 127.649 TiB) (57.500 MiB ) access 0 % age 1.600 s
3 addr [127.649 TiB , 127.649 TiB) (32.500 MiB ) access 0 % age 500 ms
4 addr [127.649 TiB , 127.649 TiB) (9.535 MiB ) access 100 % age 300 ms
5 addr [127.649 TiB , 127.649 TiB) (8.000 KiB ) access 60 % age 0 ns
6 addr [127.649 TiB , 127.649 TiB) (6.926 MiB ) access 0 % age 1 s
7 addr [127.998 TiB , 127.998 TiB) (120.000 KiB) access 0 % age 11.100 s
8 addr [127.998 TiB , 127.998 TiB) (8.000 KiB ) access 40 % age 100 ms
9 addr [127.998 TiB , 127.998 TiB) (4.000 KiB ) access 0 % age 11 s
total size: 577.590 MiB
$ sudo ./damo stop
$ sudo damo report access
heatmap: 641111111000000000000000000000000000000000000000000000[...]33333333333333335557984444[...]7
# min/max temperatures: -1,840,000,000, 370,010,000, column size: 3.925 MiB
0 addr 86.182 TiB size 8.000 KiB access 0 % age 14.900 s
1 addr 86.182 TiB size 8.000 KiB access 60 % age 0 ns
2 addr 86.182 TiB size 3.422 MiB access 0 % age 4.100 s
3 addr 86.182 TiB size 2.004 MiB access 95 % age 2.200 s
4 addr 86.182 TiB size 29.688 MiB access 0 % age 14.100 s
5 addr 86.182 TiB size 29.516 MiB access 0 % age 16.700 s
6 addr 86.182 TiB size 29.633 MiB access 0 % age 17.900 s
7 addr 86.182 TiB size 117.652 MiB access 0 % age 18.400 s
8 addr 126.990 TiB size 62.332 MiB access 0 % age 9.500 s
9 addr 126.990 TiB size 13.980 MiB access 0 % age 5.200 s
10 addr 126.990 TiB size 9.539 MiB access 100 % age 3.700 s
11 addr 126.990 TiB size 16.098 MiB access 0 % age 6.400 s
12 addr 127.987 TiB size 132.000 KiB access 0 % age 2.900 s
total size: 314.008 MiB
$ sudo damo stop
The first command of the above example downloads and builds an artificial
memory access generator program called ``masim``. The second command asks DAMO
to execute the artificial generator process start via the given command and
make DAMON monitors the generator process. The third command retrieves the
current snapshot of the monitored access pattern of the process from DAMON and
shows the pattern in a human readable format.
to start the program via the given command and make DAMON monitor the newly
started process. The third command retrieves the current snapshot of the
monitored access pattern of the process from DAMON and shows the pattern in a
human readable format.
Each line of the output shows which virtual address range (``addr [XX, XX)``)
of the process is how frequently (``access XX %``) accessed for how long time
(``age XX``). For example, the fifth region of ~9 MiB size is being most
frequently accessed for last 300 milliseconds. Finally, the fourth command
stops DAMON.
The first line of the output shows the relative access temperature (hotness) of
the regions in a single-row heatmap format. Each column on the heatmap
represents regions of the same size on the monitored virtual address space.
The position of the column on the row and the number on the column represent
the relative location and access temperature of the region. ``[...]`` means
unmapped huge regions on the virtual address spaces. The second line shows
additional information for better understanding of the heatmap.
Each line of the output from the third line shows which virtual address range
(``addr XX size XX``) of the process is how frequently (``access XX %``)
accessed for how long (``age XX``). For example, the eleventh region of ~9.5
MiB size is being most frequently accessed for the last 3.7 seconds. Finally,
the fourth command stops DAMON.
Note that DAMON can monitor not only virtual address spaces but multiple types
of address spaces including the physical address space.
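For example, the below sketch monitors the physical address space. Note that
the flagless invocation is an assumption about the tool's behavior, namely
that recent ``damo`` versions default to the physical address space when no
target command is given, as the tuning example document does::
$ sudo damo start
$ sudo damo report access
$ sudo damo stop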
@ -95,7 +108,7 @@ Visualizing Recorded Patterns
You can visualize the pattern in a heatmap, showing which memory region
(x-axis) got accessed when (y-axis) and how frequently (number).::
$ sudo damo report heats --heatmap stdout
$ sudo damo report heatmap
22222222222222222222222222222222222222211111111111111111111111111111111111111100
44444444444444444444444444444444444444434444444444444444444444444444444444443200
44444444444444444444444444444444444444433444444444444444444444444444444444444200
@ -160,6 +173,6 @@ Data Access Pattern Aware Memory Management
Below command makes every memory region of size >=4K that has not accessed for
>=60 seconds in your workload to be swapped out. ::
$ sudo damo schemes --damos_access_rate 0 0 --damos_sz_region 4K max \
--damos_age 60s max --damos_action pageout \
<pid of your workload>
$ sudo damo start --damos_access_rate 0 0 --damos_sz_region 4K max \
--damos_age 60s max --damos_action pageout \
<pid of your workload>

View File

@ -26,12 +26,6 @@ DAMON provides below interfaces for different users.
writing kernel space DAMON application programs for you. You can even extend
DAMON for various address spaces. For detail, please refer to the interface
:doc:`document </mm/damon/api>`.
- *debugfs interface. (DEPRECATED!)*
:ref:`This <debugfs_interface>` is almost identical to :ref:`sysfs interface
<sysfs_interface>`. This is deprecated, so users should move to the
:ref:`sysfs interface <sysfs_interface>`. If you depend on this and cannot
move, please report your usecase to damon@lists.linux.dev and
linux-mm@kvack.org.
.. _sysfs_interface:
@ -89,10 +83,10 @@ comma (",").
│ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value
│ │ │ │ │ │ │ :ref:`watermarks <sysfs_watermarks>`/metric,interval_us,high,mid,low
│ │ │ │ │ │ │ :ref:`filters <sysfs_filters>`/nr_filters
│ │ │ │ │ │ │ │ 0/type,matching,memcg_id
│ │ │ │ │ │ │ :ref:`stats <sysfs_schemes_stats>`/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
│ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx
│ │ │ │ │ │ │ :ref:`stats <sysfs_schemes_stats>`/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds
│ │ │ │ │ │ │ :ref:`tried_regions <sysfs_schemes_tried_regions>`/total_bytes
│ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age
│ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age,sz_filter_passed
│ │ │ │ │ │ │ │ ...
│ │ │ │ │ │ ...
│ │ │ │ ...
@ -412,59 +406,62 @@ number (``N``) to the file creates the number of child directories named ``0``
to ``N-1``. Each directory represents each filter. The filters are evaluated
in the numeric order.
Each filter directory contains six files, namely ``type``, ``matcing``,
``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``. To ``type``
file, you can write one of five special keywords: ``anon`` for anonymous pages,
``memcg`` for specific memory cgroup, ``young`` for young pages, ``addr`` for
specific address range (an open-ended interval), or ``target`` for specific
DAMON monitoring target filtering. In case of the memory cgroup filtering, you
can specify the memory cgroup of the interest by writing the path of the memory
cgroup from the cgroups mount point to ``memcg_path`` file. In case of the
address range filtering, you can specify the start and end address of the range
to ``addr_start`` and ``addr_end`` files, respectively. For the DAMON
monitoring target filtering, you can specify the index of the target between
the list of the DAMON context's monitoring targets list to ``target_idx`` file.
You can write ``Y`` or ``N`` to ``matching`` file to filter out pages that does
or does not match to the type, respectively. Then, the scheme's action will
not be applied to the pages that specified to be filtered out.
Each filter directory contains seven files, namely ``type``, ``matching``,
``allow``, ``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``.
To ``type`` file, you can write one of five special keywords: ``anon`` for
anonymous pages, ``memcg`` for specific memory cgroup, ``young`` for young
pages, ``addr`` for specific address range (an open-ended interval), or
``target`` for specific DAMON monitoring target filtering. The meanings of
the types are the same as described in the :ref:`design doc
<damon_design_damos_filters>`.
In case of the memory cgroup filtering, you can specify the memory cgroup of
interest by writing the path of the memory cgroup from the cgroups mount
point to the ``memcg_path`` file. In case of the address range filtering, you
can specify the start and end address of the range to the ``addr_start`` and
``addr_end`` files, respectively. For the DAMON monitoring target filtering,
you can specify the index of the target in the DAMON context's monitoring
targets list to the ``target_idx`` file.
You can write ``Y`` or ``N`` to ``matching`` file to specify whether the filter
is for memory that matches the ``type``. You can write ``Y`` or ``N`` to
``allow`` file to specify if applying the action to the memory that satisfies
the ``type`` and ``matching`` should be allowed or not.
For example, below restricts a DAMOS action to be applied to only non-anonymous
pages of all memory cgroups except ``/having_care_already``.::
# echo 2 > nr_filters
# # filter out anonymous pages
# # disallow anonymous pages
echo anon > 0/type
echo Y > 0/matching
echo N > 0/allow
# # further filter out all cgroups except one at '/having_care_already'
echo memcg > 1/type
echo /having_care_already > 1/memcg_path
echo Y > 1/matching
echo N > 1/allow
Note that ``anon`` and ``memcg`` filters are currently supported only when
``paddr`` :ref:`implementation <sysfs_context>` is being used.
Also, memory regions that are filtered out by ``addr`` or ``target`` filters
are not counted as the scheme has tried to those, while regions that filtered
out by other type filters are counted as the scheme has tried to. The
difference is applied to :ref:`stats <damos_stats>` and
:ref:`tried regions <sysfs_schemes_tried_regions>`.
Refer to the :ref:`DAMOS filters design documentation
<damon_design_damos_filters>` for more details, including how multiple filters
with different ``allow`` values work, when each of the filters is supported,
and the differences in stats.
.. _sysfs_schemes_stats:
schemes/<N>/stats/
------------------
DAMON counts the total number and bytes of regions that each scheme is tried to
be applied, the two numbers for the regions that each scheme is successfully
applied, and the total number of the quota limit exceeds. This statistics can
be used for online analysis or tuning of the schemes.
DAMON counts statistics for each scheme. These statistics can be used for
online analysis or tuning of the schemes. Refer to the :ref:`design doc
<damon_design_damos_stat>` for more details about the stats.
The statistics can be retrieved by reading the files under ``stats`` directory
(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, and
``qt_exceeds``), respectively. The files are not updated in real time, so you
should ask DAMON sysfs interface to update the content of the files for the
stats by writing a special keyword, ``update_schemes_stats`` to the relevant
``kdamonds/<N>/state`` file.
(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``,
``sz_ops_filter_passed``, and ``qt_exceeds``). The files are not updated in
real time, so you should ask the DAMON sysfs interface to update the content
of the files for the stats by writing a special keyword,
``update_schemes_stats``, to the relevant ``kdamonds/<N>/state`` file.
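For example, the stats of the first scheme of the first context of the first
kdamond could be updated and read as below. This is only an illustrative
sketch; the directory indices depend on your setup::
# cd /sys/kernel/mm/damon/admin/kdamonds/0
# echo update_schemes_stats > state
# cat contexts/0/schemes/0/stats/sz_ops_filter_passed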
.. _sysfs_schemes_tried_regions:
@ -501,10 +498,10 @@ set the ``access pattern`` as their interested pattern that they want to query.
tried_regions/<N>/
------------------
In each region directory, you will find four files (``start``, ``end``,
``nr_accesses``, and ``age``). Reading the files will show the start and end
addresses, ``nr_accesses``, and ``age`` of the region that corresponding
DAMON-based operation scheme ``action`` has tried to be applied.
In each region directory, you will find five files (``start``, ``end``,
``nr_accesses``, ``age``, and ``sz_filter_passed``). Reading the files will
show the properties of the region that the corresponding DAMON-based operation
scheme ``action`` has tried to be applied to.
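For example, below is an illustrative sketch reading ``sz_filter_passed`` of
the first tried region of the first scheme, assuming the
``update_schemes_tried_regions`` keyword of the ``state`` file is used to
populate the directories::
# echo update_schemes_tried_regions > /sys/kernel/mm/damon/admin/kdamonds/0/state
# cat /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/tried_regions/0/sz_filter_passed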
Example
~~~~~~~
@ -600,306 +597,3 @@ fields are as usual. It shows the index of the DAMON context (``ctx_idx=X``)
of the scheme in the list of the contexts of the context's kdamond, the index
of the scheme (``scheme_idx=X``) in the list of the schemes of the context, in
addition to the output of ``damon_aggregated`` tracepoint.
.. _debugfs_interface:
debugfs Interface (DEPRECATED!)
===============================
.. note::
THIS IS DEPRECATED!
DAMON debugfs interface is deprecated, so users should move to the
:ref:`sysfs interface <sysfs_interface>`. If you depend on this and cannot
move, please report your usecase to damon@lists.linux.dev and
linux-mm@kvack.org.
DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``,
``init_regions``, ``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``,
``mk_contexts`` and ``rm_contexts`` under its debugfs directory,
``<debugfs>/damon/``.
``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation
notice. Reading it returns the deprecation notice, as below::
# cat DEPRECATED
DAMON debugfs interface is deprecated, so users should move to DAMON_SYSFS. If you cannot, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
Attributes
----------
Users can get and set the ``sampling interval``, ``aggregation interval``,
``update interval``, and min/max number of monitoring target regions by
reading from and writing to the ``attrs`` file. To know about the monitoring
attributes in detail, please refer to the :doc:`/mm/damon/design`. For
example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and
1000, and then check it again::
# cd <debugfs>/damon
# echo 5000 100000 1000000 10 1000 > attrs
# cat attrs
5000 100000 1000000 10 1000
Target IDs
----------
Some types of address spaces supports multiple monitoring target. For example,
the virtual memory address spaces monitoring can have multiple processes as the
monitoring targets. Users can set the targets by writing relevant id values of
the targets to, and get the ids of the current targets by reading from the
``target_ids`` file. In case of the virtual address spaces monitoring, the
values should be pids of the monitoring target processes. For example, below
commands set processes having pids 42 and 4242 as the monitoring targets and
check it again::
# cd <debugfs>/damon
# echo 42 4242 > target_ids
# cat target_ids
42 4242
Users can also monitor the physical memory address space of the system by
writing a special keyword, "``paddr\n``" to the file. Because physical address
space monitoring doesn't support multiple targets, reading the file will show a
fake value, ``42``, as below::
# cd <debugfs>/damon
# echo paddr > target_ids
# cat target_ids
42
Note that setting the target ids doesn't start the monitoring.
Initial Monitoring Target Regions
---------------------------------
In case of the virtual address space monitoring, DAMON automatically sets and
updates the monitoring target regions so that entire memory mappings of target
processes can be covered. However, users can want to limit the monitoring
region to specific address ranges, such as the heap, the stack, or specific
file-mapped area. Or, some users can know the initial access pattern of their
workloads and therefore want to set optimal initial regions for the 'adaptive
regions adjustment'.
In contrast, DAMON do not automatically sets and updates the monitoring target
regions in case of physical memory monitoring. Therefore, users should set the
monitoring target regions by themselves.
In such cases, users can explicitly set the initial monitoring target regions
as they want, by writing proper values to the ``init_regions`` file. The input
should be a sequence of three integers separated by white spaces that represent
one region in below form.::
<target idx> <start address> <end address>
The ``target idx`` should be the index of the target in ``target_ids`` file,
starting from ``0``, and the regions should be passed in address order. For
example, below commands will set a couple of address ranges, ``1-100`` and
``100-200`` as the initial monitoring target region of pid 42, which is the
first one (index ``0``) in ``target_ids``, and another couple of address
ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one
(index ``1``) in ``target_ids``.::
# cd <debugfs>/damon
# cat target_ids
42 4242
# echo "0 1 100 \
0 100 200 \
1 20 40 \
1 50 100" > init_regions
Note that this sets the initial monitoring target regions only. In case of
virtual memory monitoring, DAMON will automatically updates the boundary of the
regions after one ``update interval``. Therefore, users should set the
``update interval`` large enough in this case, if they don't want the
update.
Schemes
-------
Users can get and set the DAMON-based operation :ref:`schemes
<damon_design_damos>` by reading from and writing to ``schemes`` debugfs file.
Reading the file also shows the statistics of each scheme. To the file, each
of the schemes should be represented in each line in below form::
<target access pattern> <action> <quota> <watermarks>
You can disable schemes by simply writing an empty string to the file.
Target Access Pattern
~~~~~~~~~~~~~~~~~~~~~
The target access :ref:`pattern <damon_design_damos_access_pattern>` of the
scheme. The ``<target access pattern>`` is constructed with three ranges in
below form::
min-size max-size min-acc max-acc min-age max-age
Specifically, bytes for the size of regions (``min-size`` and ``max-size``),
number of monitored accesses per aggregate interval for access frequency
(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of
regions (``min-age`` and ``max-age``) are specified. Note that the ranges are
closed interval.
Action
~~~~~~
The ``<action>`` is a predefined integer for memory management :ref:`actions
<damon_design_damos_action>`. The mapping between the ``<action>`` values and
the memory management actions is as below. For the detailed meaning of the
action and DAMON operations set supporting each action, please refer to the
list on :ref:`design doc <damon_design_damos_action>`.
- 0: ``willneed``
- 1: ``cold``
- 2: ``pageout``
- 3: ``hugepage``
- 4: ``nohugepage``
- 5: ``stat``
Quota
~~~~~
Users can set the :ref:`quotas <damon_design_damos_quotas>` of the given scheme
via the ``<quota>`` in below form::
<ms> <sz> <reset interval> <priority weights>
This makes DAMON to try to use only up to ``<ms>`` milliseconds for applying
the action to memory regions of the ``target access pattern`` within the
``<reset interval>`` milliseconds, and to apply the action to only up to
``<sz>`` bytes of memory regions within the ``<reset interval>``. Setting both
``<ms>`` and ``<sz>`` zero disables the quota limits.
For the :ref:`prioritization <damon_design_damos_quotas_prioritization>`, users
can set the weights for the three properties in ``<priority weights>`` in below
form::
<size weight> <access frequency weight> <age weight>
Watermarks
~~~~~~~~~~
Users can specify :ref:`watermarks <damon_design_damos_watermarks>` of the
given scheme via ``<watermarks>`` in below form::
<metric> <check interval> <high mark> <middle mark> <low mark>
``<metric>`` is a predefined integer for the metric to be checked. The
supported numbers and their meanings are as below.
- 0: Ignore the watermarks
- 1: System's free memory rate (per thousand)
The value of the metric is checked every ``<check interval>`` microseconds.
If the value is higher than ``<high mark>`` or lower than ``<low mark>``, the
scheme is deactivated. If the value is lower than ``<mid mark>``, the scheme
is activated.
.. _damos_stats:
Statistics
~~~~~~~~~~
It also counts the total number and bytes of regions that each scheme is tried
to be applied, the two numbers for the regions that each scheme is successfully
applied, and the total number of the quota limit exceeds. This statistics can
be used for online analysis or tuning of the schemes.
The statistics can be shown by reading the ``schemes`` file. Reading the file
will show each scheme you entered in each line, and the five numbers for the
statistics will be added at the end of each line.
Example
~~~~~~~
Below commands applies a scheme saying "If a memory region of size in [4KiB,
8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
interval in [10, 20], page out the region. For the paging out, use only up to
10ms per second, and also don't page out more than 1GiB per second. Under the
limitation, page out memory regions having longer age first. Also, check the
free memory rate of the system every 5 seconds, start the monitoring and paging
out when the free memory rate becomes lower than 50%, but stop it if the free
memory rate becomes larger than 60%, or lower than 30%".::
# cd <debugfs>/damon
# scheme="4096 8192 0 5 10 20 2" # target access pattern and action
# scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
# scheme+=" 0 0 100" # prioritization weights
# scheme+=" 1 5000000 600 500 300" # watermarks
# echo "$scheme" > schemes
Turning On/Off
--------------
Setting the files as described above doesn't incur effect unless you explicitly
start the monitoring. You can start, stop, and check the current status of the
monitoring by writing to and reading from the ``monitor_on_DEPRECATED`` file.
Writing ``on`` to the file starts the monitoring of the targets with the
attributes. Writing ``off`` to the file stops those. DAMON also stops if
every target process is terminated. Below example commands turn on, off, and
check the status of DAMON::
# cd <debugfs>/damon
# echo on > monitor_on_DEPRECATED
# echo off > monitor_on_DEPRECATED
# cat monitor_on_DEPRECATED
off
Please note that you cannot write to the above-mentioned debugfs files while
the monitoring is turned on. If you write to the files while DAMON is running,
an error code such as ``-EBUSY`` will be returned.
Monitoring Thread PID
---------------------
DAMON does requested monitoring with a kernel thread called ``kdamond``. You
can get the pid of the thread by reading the ``kdamond_pid`` file. When the
monitoring is turned off, reading the file returns ``none``. ::
# cd <debugfs>/damon
# cat monitor_on_DEPRECATED
off
# cat kdamond_pid
none
# echo on > monitor_on_DEPRECATED
# cat kdamond_pid
18594
Using Multiple Monitoring Threads
---------------------------------
One ``kdamond`` thread is created for each monitoring context. You can create
and remove monitoring contexts for multiple ``kdamond`` required use case using
the ``mk_contexts`` and ``rm_contexts`` files.
Writing the name of the new context to the ``mk_contexts`` file creates a
directory of the name on the DAMON debugfs directory. The directory will have
DAMON debugfs files for the context. ::
# cd <debugfs>/damon
# ls foo
# ls: cannot access 'foo': No such file or directory
# echo foo > mk_contexts
# ls foo
# attrs init_regions kdamond_pid schemes target_ids
If the context is not needed anymore, you can remove it and the corresponding
directory by putting the name of the context to the ``rm_contexts`` file. ::
# echo foo > rm_contexts
# ls foo
# ls: cannot access 'foo': No such file or directory
Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files
are in the root directory only.

View File

@ -280,8 +280,8 @@ The following files are currently defined:
blocks; configure auto-onlining.
The default value depends on the
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel configuration
option.
CONFIG_MHP_DEFAULT_ONLINE_TYPE kernel configuration
options.
See the ``state`` property of memory blocks for details.
``block_size_bytes`` read-only: the size in bytes of a memory block.

View File

@ -332,6 +332,12 @@ allocation policy for the internal shmem mount by using the kernel parameter
six valid policies for shmem (``always``, ``within_size``, ``advise``,
``never``, ``deny``, and ``force``).
Similarly to ``transparent_hugepage_shmem``, you can control the default
hugepage allocation policy for the tmpfs mount by using the kernel parameter
``transparent_hugepage_tmpfs=<policy>``, where ``<policy>`` is one of the
four valid policies for tmpfs (``always``, ``within_size``, ``advise``,
``never``). The tmpfs mount default policy is ``never``.
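For example, a kernel command line could include the below fragment to make
huge pages for tmpfs mounts be allocated only when they fit within i_size (a
sketch; other boot parameters are omitted)::
transparent_hugepage_tmpfs=within_size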
In the same manner as ``thp_anon`` controls each supported anonymous THP
size, ``thp_shmem`` controls each supported shmem THP size. ``thp_shmem``
has the same format as ``thp_anon``, but also supports the policy
@ -352,8 +358,21 @@ default to ``never``.
Hugepages in tmpfs/shmem
========================
You can control hugepage allocation policy in tmpfs with mount option
``huge=``. It can have following values:
Traditionally, tmpfs only supported a single huge page size ("PMD"). Today,
it also supports smaller sizes just like anonymous memory, often referred
to as "multi-size THP" (mTHP). Huge pages of any size are commonly
represented in the kernel as "large folios".
While there is fine control over the huge page sizes to use for the internal
shmem mount (see below), ordinary tmpfs mounts will make use of all available
huge page sizes without any control over the exact sizes, behaving more like
other file systems.
tmpfs mounts
------------
The THP allocation policy for tmpfs mounts can be adjusted using the mount
option: ``huge=``. It can have following values:
always
Attempt to allocate huge pages every time we need a new page;
@ -363,24 +382,24 @@ never
within_size
Only allocate huge page if it will be fully within i_size.
Also respect fadvise()/madvise() hints;
Also respect madvise() hints;
advise
Only allocate huge pages if requested with fadvise()/madvise();
Only allocate huge pages if requested with madvise();
The default policy is ``never``.
Remember that the kernel may use huge pages of all available sizes, and that
no fine control as for the internal tmpfs mount is available.
The default policy in the past was ``never``, but it can now be adjusted
using the kernel parameter ``transparent_hugepage_tmpfs=<policy>``.
``mount -o remount,huge= /mountpoint`` works fine after mount: remounting
``huge=never`` will not attempt to break up huge pages at all, just stop more
from being allocated.
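For example, below is a minimal sketch of mounting and remounting with the
option; ``/mnt/tmp`` is a hypothetical mount point::
# mount -t tmpfs -o huge=within_size tmpfs /mnt/tmp
# mount -o remount,huge=never /mnt/tmp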
There's also sysfs knob to control hugepage allocation policy for internal
shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
In addition to policies listed above, shmem_enabled allows two further
values:
In addition to policies listed above, the sysfs knob
/sys/kernel/mm/transparent_hugepage/shmem_enabled will affect the
allocation policy of tmpfs mounts, when set to the following values:
deny
For use in emergencies, to force the huge option off from
@ -388,13 +407,24 @@ deny
force
Force the huge option on for all - very useful for testing;
Shmem can also use "multi-size THP" (mTHP) by adding a new sysfs knob to
control mTHP allocation:
'/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/shmem_enabled',
and its value for each mTHP is essentially consistent with the global
setting. An 'inherit' option is added to ensure compatibility with these
global settings. Conversely, the options 'force' and 'deny' are dropped,
which are rather testing artifacts from the old ages.
shmem / internal tmpfs
----------------------
The internal tmpfs mount is used for SysV SHM, memfds, shared anonymous
mmaps (of /dev/zero or MAP_ANONYMOUS), GPU drivers' DRM objects, and Ashmem.
To control the THP allocation policy for this internal tmpfs mount, the
sysfs knob /sys/kernel/mm/transparent_hugepage/shmem_enabled and the knobs
per THP size in
'/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/shmem_enabled'
can be used.
The global knob has the same semantics as the ``huge=`` mount option for
tmpfs mounts, except that the different huge page sizes can be controlled
individually, and the setting of the global knob is used only when the
per-size knob is set to 'inherit'.
The options 'force' and 'deny' are dropped for the individual sizes; they
are rather testing artifacts from the old days.
always
Attempt to allocate <size> huge pages every time we need a new page;
@ -408,10 +438,10 @@ never
within_size
Only allocate <size> huge page if it will be fully within i_size.
Also respect fadvise()/madvise() hints;
Also respect madvise() hints;
advise
Only allocate <size> huge pages if requested with fadvise()/madvise();
Only allocate <size> huge pages if requested with madvise();
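For example, the global and a per-size knob could be set as below (a sketch;
the ``64kB`` size is an assumption, as the available sizes depend on the
architecture and kernel configuration)::
# echo advise > /sys/kernel/mm/transparent_hugepage/shmem_enabled
# echo inherit > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/shmem_enabled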
Need of application restart
===========================
@ -561,6 +591,16 @@ swpin
is incremented every time a huge page is swapped in from a non-zswap
swap device in one piece.
swpin_fallback
is incremented if swapin fails to allocate or charge a huge page
and instead falls back to using huge pages with lower orders or
small pages.
swpin_fallback_charge
is incremented if swapin fails to charge a huge page and instead
falls back to using huge pages with lower orders or small pages
even though the allocation was successful.
swpout
is incremented every time a huge page is swapped out to a non-zswap
swap device in one piece without splitting.

View File

@ -4,6 +4,8 @@
Min Heap API
============
:Author: Kuan-Wei Chiu <visitorckw@gmail.com>
Introduction
============

View File

@ -42,8 +42,8 @@ call xa_tag_pointer() to create an entry with a tag, xa_untag_pointer()
to turn a tagged entry back into an untagged pointer and xa_pointer_tag()
to retrieve the tag of an entry. Tagged pointers use the same bits that
are used to distinguish value entries from normal pointers, so you must
decide whether they want to store value entries or tagged pointers in
any particular XArray.
decide whether you want to store value entries or tagged pointers in any
particular XArray.
The XArray does not support storing IS_ERR() pointers as some
conflict with value entries or internal entries.
@ -52,8 +52,9 @@ An unusual feature of the XArray is the ability to create entries which
occupy a range of indices. Once stored to, looking up any index in
the range will return the same entry as looking up any other index in
the range. Storing to any index will store to all of them. Multi-index
entries can be explicitly split into smaller entries, or storing ``NULL``
into any entry will cause the XArray to forget about the range.
entries can be explicitly split into smaller entries. Unsetting (using
xa_erase() or xa_store() with ``NULL``) any entry will cause the XArray
to forget about the range.
Normal API
==========
@ -63,13 +64,14 @@ for statically allocated XArrays or xa_init() for dynamically
allocated ones. A freshly-initialised XArray contains a ``NULL``
pointer at every index.
You can then set entries using xa_store() and get entries
using xa_load(). xa_store will overwrite any entry with the
new entry and return the previous entry stored at that index. You can
use xa_erase() instead of calling xa_store() with a
``NULL`` entry. There is no difference between an entry that has never
been stored to, one that has been erased and one that has most recently
had ``NULL`` stored to it.
You can then set entries using xa_store() and get entries using
xa_load(). xa_store() will overwrite any entry with the new entry and
return the previous entry stored at that index. You can unset entries
using xa_erase() or by setting the entry to ``NULL`` using xa_store().
There is no difference between an entry that has never been stored to
and one that has been erased with xa_erase(); an entry that has most
recently had ``NULL`` stored to it is also equivalent except if the
XArray was initialized with ``XA_FLAGS_ALLOC``.
You can conditionally replace an entry at an index by using
xa_cmpxchg(). Like cmpxchg(), it will only succeed if

View File

@ -48,6 +48,7 @@ fixes/update part 1.1 Stefani Seibold <stefani@seibold.net> June 9 2009
3.11 /proc/<pid>/patch_state - Livepatch patch operation state
3.12 /proc/<pid>/arch_status - Task architecture specific information
3.13 /proc/<pid>/fd - List of symlinks to open files
3.14 /proc/<pid>/ksm_stat - Information about the process's ksm status
4 Configuring procfs
4.1 Mount options
@ -484,14 +485,15 @@ Memory Area, or VMA) there is a series of lines such as the following::
THPeligible: 0
VmFlags: rd ex mr mw me dw
The first of these lines shows the same information as is displayed for the
mapping in /proc/PID/maps. Following lines show the size of the mapping
(size); the size of each page allocated when backing a VMA (KernelPageSize),
which is usually the same as the size in the page table entries; the page size
used by the MMU when backing a VMA (in most cases, the same as KernelPageSize);
the amount of the mapping that is currently resident in RAM (RSS); the
process' proportional share of this mapping (PSS); and the number of clean and
dirty shared and private pages in the mapping.
The first of these lines shows the same information as is displayed for
the mapping in /proc/PID/maps. Following lines show the size of the
mapping (size); the size of each page allocated when backing a VMA
(KernelPageSize), which is usually the same as the size in the page table
entries; the page size used by the MMU when backing a VMA (in most cases,
the same as KernelPageSize); the amount of the mapping that is currently
resident in RAM (RSS); the process's proportional share of this mapping
(PSS); and the number of clean and dirty shared and private pages in the
mapping.
The "proportional set size" (PSS) of a process is the count of pages it has
in memory, where each page is divided by the number of processes sharing it.
@ -2232,6 +2234,74 @@ The number of open files for the process is stored in 'size' member
of stat() output for /proc/<pid>/fd for fast access.
-------------------------------------------------------
3.14 /proc/<pid>/ksm_stat - Information about the process's ksm status
----------------------------------------------------------------------
When CONFIG_KSM is enabled, each process has this file, which displays
information about its KSM merging status.
Example
~~~~~~~
::
/ # cat /proc/self/ksm_stat
ksm_rmap_items 0
ksm_zero_pages 0
ksm_merging_pages 0
ksm_process_profit 0
ksm_merge_any: no
ksm_mergeable: no
Description
~~~~~~~~~~~
ksm_rmap_items
^^^^^^^^^^^^^^
The number of ksm_rmap_item structures in use. The ksm_rmap_item structure
stores the reverse mapping information for virtual addresses. KSM will
generate a ksm_rmap_item for each ksm-scanned page of the process.
ksm_zero_pages
^^^^^^^^^^^^^^
When /sys/kernel/mm/ksm/use_zero_pages is enabled, it represents how many
empty pages are merged with kernel zero pages by KSM.
ksm_merging_pages
^^^^^^^^^^^^^^^^^
It represents how many pages of this process are involved in KSM merging
(not including ksm_zero_pages). It is the same as what
/proc/<pid>/ksm_merging_pages shows.
ksm_process_profit
^^^^^^^^^^^^^^^^^^
The profit that KSM brings (saved bytes). KSM can save memory by merging
identical pages, but it can also consume additional memory, because it needs
to generate a number of rmap_items to save each scanned page's brief rmap
information. Some of these pages may be merged, but some may not be able to
be merged even after being checked several times, which is unprofitable
memory consumption.
ksm_merge_any
^^^^^^^^^^^^^
It specifies whether the process's mm has been added by prctl() into the
candidate list of KSM, meaning that KSM scanning is fully enabled at the
process level.
ksm_mergeable
^^^^^^^^^^^^^
It specifies whether any VMAs of the process's mm are currently applicable
to KSM.
More information about KSM can be found at
Documentation/admin-guide/mm/ksm.rst.
Chapter 4: Configuring procfs
=============================
@ -2261,7 +2331,7 @@ arguments are now protected against local eavesdroppers.
hidepid=invisible or hidepid=2 means hidepid=1 plus all /proc/<pid>/ will be
fully invisible to other users. It doesn't mean that it hides a fact whether a
process with a specific pid value exists (it can be learned by other means, e.g.
by "kill -0 $PID"), but it hides process' uid and gid, which may be learned by
by "kill -0 $PID"), but it hides process's uid and gid, which may be learned by
stat()'ing /proc/<pid>/ otherwise. It greatly complicates an intruder's task of
gathering information about running processes, whether some daemon runs with
elevated privileges, whether other user runs some sensitive program, whether

View File

@ -6,7 +6,7 @@ Squashfs 4.0 Filesystem
Squashfs is a compressed read-only filesystem for Linux.
It uses zlib, lz4, lzo, or xz compression to compress files, inodes and
It uses zlib, lz4, lzo, xz, or zstd compression to compress files, inodes and
directories. Inodes in the system are very small and all blocks are packed to
minimise data overhead. Block sizes greater than 4K are supported up to a
maximum of 1Mbytes (default block size 128K).
@ -16,8 +16,8 @@ use (i.e. in cases where a .tar.gz file may be used), and in constrained
block device/memory systems (e.g. embedded systems) where low overhead is
needed.
Mailing list: squashfs-devel@lists.sourceforge.net
Web site: www.squashfs.org
Mailing list (kernel code): linux-fsdevel@vger.kernel.org
Web site: github.com/plougher/squashfs-tools
1. Filesystem Features
----------------------
@ -58,11 +58,9 @@ inodes have different sizes).
As squashfs is a read-only filesystem, the mksquashfs program must be used to
create populated squashfs filesystems. This and other squashfs utilities
can be obtained from http://www.squashfs.org. Usage instructions can be
obtained from this site also.
The squashfs-tools development tree is now located on kernel.org
git://git.kernel.org/pub/scm/fs/squashfs/squashfs-tools.git
are very likely packaged by your Linux distribution (called squashfs-tools).
The source code can be obtained from github.com/plougher/squashfs-tools.
Usage instructions can also be obtained from this site.
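For example, an image could be created and its contents listed as below (a
sketch; the paths are hypothetical)::
$ mksquashfs /some/dir image.squashfs -comp zstd
$ unsquashfs -l image.squashfs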
2.1 Mount options
-----------------

View File

@ -203,6 +203,8 @@ This scheme, however, cannot preserve the quality of the output if the
assumption is not guaranteed.
.. _damon_design_adaptive_regions_adjustment:
Adaptive Regions Adjustment
~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -264,6 +266,61 @@ tracepoints. For more details, please refer to the documentations for
respectively.
.. _damon_design_monitoring_params_tuning_guide:
Monitoring Parameters Tuning Guide
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In short, set ``aggregation interval`` to capture a meaningful amount of accesses
for the purpose. The amount of accesses can be measured using ``nr_accesses``
and ``age`` of regions in the aggregated monitoring results snapshot. The
default value of the interval, ``100ms``, turns out to be too short in many
cases. Set ``sampling interval`` proportional to ``aggregation interval``. By
default, ``1/20`` is recommended as the ratio.
``Aggregation interval`` should be set as the time interval within which the
workload can make an amount of accesses that is meaningful for the monitoring
purpose. If the interval is too short, only a small number of accesses are
captured. As a result, the monitoring results look as if everything is
accessed only rarely and uniformly. For many purposes, that would be useless.
If it is too long, however, the time to converge regions with the
:ref:`regions adjustment mechanism
<damon_design_adaptive_regions_adjustment>` can be too long, depending on the
time scale of the given purpose. This could happen if the workload is
actually making only rare accesses but the user assumes too high an amount of
accesses for the monitoring purpose. For such cases, the target amount of
accesses to capture per ``aggregation interval`` should be carefully
reconsidered. Also, note that the captured amount of accesses is represented
with not only ``nr_accesses``, but also ``age``. For example, even if every
region in the monitoring results shows zero ``nr_accesses``, regions could
still be distinguished using ``age`` values as the recency information.
Hence the optimum value of ``aggregation interval`` depends on the access
intensiveness of the workload. The user should tune the interval based on the
amount of accesses captured in each aggregated snapshot of the monitoring
results.
Note that the default value of the interval is 100 milliseconds, which is too
short in many cases, especially on large systems.
``Sampling interval`` defines the resolution of each aggregation. If it is
set too large, monitoring results will look like every region was uniformly
rarely accessed, or uniformly frequently accessed. That is, regions become
indistinguishable based on the access pattern, and therefore the results will
be useless for many use cases. If ``sampling interval`` is too small, it will
not degrade the resolution, but will increase the monitoring overhead. If it
already provides a resolution of the monitoring results that is sufficient
for the given purpose, it shouldn't be unnecessarily lowered further. It is
recommended to be set proportional to ``aggregation interval``. By default,
the ratio is set as ``1/20``, and it is still recommended.
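For example, with the ``damo`` user-space tool that the tuning example
document below uses, intervals following the recommended ``1/20`` ratio could
be set as in this sketch::
# damo start -s 100ms -a 2s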
Refer to below documents for an example tuning based on the above guide.
.. toctree::
:maxdepth: 1
monitoring_intervals_tuning_example
.. _damon_design_damos:
Operation Schemes
@ -504,9 +561,34 @@ have a list of latency-critical processes.
To let users optimize DAMOS schemes with such special knowledge, DAMOS provides
a feature called DAMOS filters. The feature allows users to set an arbitrary
number of filters for each scheme. Each filter specifies the type of target
memory, and whether it should exclude the memory of the type (filter-out), or
all except the memory of the type (filter-in).
number of filters for each scheme. Each filter specifies
- a type of memory (``type``),
- whether it is for the memory of the type or all except the type
(``matching``), and
- whether it is to allow (include) or reject (exclude) applying
the scheme's action to the memory (``allow``).
When multiple filters are installed, each filter is evaluated in the installed
order. If a part of memory is matched by one of the filters, the next filters
are ignored. If the memory passes through the filters evaluation stage because
it is not matched by any of the filters, applying the scheme's action to it is
allowed, the same as the behavior when no filter exists.
For example, let's assume 1) a filter for allowing anonymous pages and 2)
another filter for rejecting young pages are installed in that order. If a
page of a region that is eligible for applying the scheme's action is an
anonymous page, the scheme's action will be applied to the page regardless of
whether it is young or not, since it matches the first allow-filter. If the
page is not anonymous but young, the scheme's action will not be applied,
since the second reject-filter blocks it. If the page is neither anonymous
nor young, the page will pass through the filters evaluation stage since
there is no matching filter, and the action will be applied to the page.
Note that the action can equally be applied to memory that is either
explicitly filter-allowed or has passed the filters evaluation stage. It
means that installing allow-filters at the end of the list makes no practical
change, but only adds filter-checking overhead.
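For example, the allow-then-reject setup above could be sketched with the
:ref:`sysfs interface <sysfs_interface>` as below, assuming the first
kdamond, context, and scheme::
# cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/filters
# echo 2 > nr_filters
# echo anon > 0/type; echo Y > 0/matching; echo Y > 0/allow
# echo young > 1/type; echo Y > 1/matching; echo N > 1/allow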
For efficient handling of filters, some types of filters are handled by the
core layer, while others are handled by operations set. In the latter case,
@ -516,7 +598,7 @@ filter are not counted as the scheme has tried to the region. In contrast, if
a memory regions is filtered by an operations set layer-handled filter, it is
counted as the scheme has tried. This difference affects the statistics.
Below types of filters are currently supported.
Below ``type`` values of filters are currently supported.
- anonymous page
- Applied to pages that containing data that not stored in files.
@ -539,6 +621,60 @@ To know how user-space can set the watermarks via :ref:`DAMON sysfs interface
<sysfs_interface>`, refer to :ref:`filters <sysfs_filters>` part of the
documentation.
.. _damon_design_damos_stat:
Statistics
~~~~~~~~~~
The statistics of DAMOS behaviors, designed to help with monitoring, tuning
and debugging of DAMOS.
DAMOS accounts the below statistics for each scheme, from the beginning of the
scheme's execution.
- ``nr_tried``: Total number of regions that the scheme is tried to be applied.
- ``sz_tried``: Total size of regions that the scheme is tried to be applied.
- ``sz_ops_filter_passed``: Total bytes that passed operations set
layer-handled DAMOS filters.
- ``nr_applied``: Total number of regions that the scheme is applied.
- ``sz_applied``: Total size of regions that the scheme is applied.
- ``qt_exceeds``: Total number of times the quota of the scheme has been exceeded.
"A scheme is tried to be applied to a region" means DAMOS core logic determined
the region is eligible to apply the scheme's :ref:`action
<damon_design_damos_action>`. The :ref:`access pattern
<damon_design_damos_access_pattern>`, :ref:`quotas
<damon_design_damos_quotas>`, :ref:`watermarks
<damon_design_damos_watermarks>`, and :ref:`filters
<damon_design_damos_filters>` that are handled by the core logic could affect this.
The core logic will only ask the underlying :ref:`operation set
<damon_operations_set>` to apply the action to the region, so whether the
action is really applied or not is unclear. That's why it is called "tried".
"A scheme is applied to a region" means the :ref:`operation set
<damon_operations_set>` has applied the action to at least a part of the
region. The :ref:`filters <damon_design_damos_filters>` that are handled by
the operation set, and the types of the :ref:`action <damon_design_damos_action>`
and the pages of the region, can affect this. For example, if a filter is set
to exclude anonymous pages and the region has only anonymous pages, or if the
action is ``pageout`` while all pages of the region are unreclaimable, applying
the action to the region will fail.
To know how user-space can read the stats via the :ref:`DAMON sysfs interface
<sysfs_interface>`, refer to the :ref:`stats <sysfs_schemes_stats>` part of the
documentation.
Regions Walking
~~~~~~~~~~~~~~~
A DAMOS feature allowing users to access each region that a DAMOS action has
just been applied to. Using this feature, the DAMON :ref:`API
<damon_design_api>` allows users to access the full properties of the regions,
including the access monitoring results and the amount of the region's
internal memory that passed the DAMOS filters. The :ref:`DAMON sysfs
interface <sysfs_interface>` also allows users to read the data via special
:ref:`files <sysfs_schemes_tried_regions>`.
.. _damon_design_api:
Application Programming Interface
---------------------------------
@ -573,15 +709,11 @@ General Purpose User Interface Modules
DAMON modules that provide user space ABIs for general purpose DAMON usage in
runtime.
DAMON user interface modules, namely 'DAMON sysfs interface' and 'DAMON debugfs
interface' are DAMON API user kernel modules that provide ABIs to the
user-space. Please note that DAMON debugfs interface is currently deprecated.
Like many other ABIs, the modules create files on sysfs and debugfs, allow
users to specify their requests to and get the answers from DAMON by writing to
and reading from the files. As a response to such I/O, DAMON user interface
modules control DAMON and retrieve the results as user requested via the DAMON
API, and return the results to the user-space.
Like many other ABIs, the modules create files on pseudo file systems like
'sysfs', allow users to specify their requests to and get the answers from
DAMON by writing to and reading from the files. As a response to such I/O,
DAMON user interface modules control DAMON and retrieve the results as user
requested via the DAMON API, and return the results to the user-space.
The ABIs are designed to be used for user space applications development,
rather than human beings' fingers. Human users are recommended to use such
@ -590,8 +722,9 @@ Github (https://github.com/damonitor/damo), Pypi
(https://pypistats.org/packages/damo), and Fedora
(https://packages.fedoraproject.org/pkgs/python-damo/damo/).
Please refer to the ABI :doc:`document </admin-guide/mm/damon/usage>` for
details of the interfaces.
Currently, one module for this type, namely 'DAMON sysfs interface' is
available. Please refer to the ABI :ref:`doc <sysfs_interface>` for details of
the interfaces.
Special-Purpose Access-aware Kernel Modules
@ -599,8 +732,8 @@ Special-Purpose Access-aware Kernel Modules
DAMON modules that provide user space ABI for specific purpose DAMON usage.
DAMON sysfs/debugfs user interfaces are for full control of all DAMON features
in runtime. For each special-purpose system-wide data access-aware system
DAMON user interface modules are for full control of all DAMON features in
runtime. For each special-purpose system-wide data access-aware system
operations such as proactive reclamation or LRU lists balancing, the interfaces
could be simplified by removing unnecessary knobs for the specific purpose, and
extended for boot-time and even compile time control. Default values of DAMON

View File

@ -0,0 +1,247 @@
.. SPDX-License-Identifier: GPL-2.0
===================================================
DAMON Monitoring Interval Parameters Tuning Example
===================================================
DAMON's monitoring parameters need tuning based on the given workload and the
monitoring purpose. There is a :ref:`tuning guide
<damon_design_monitoring_params_tuning_guide>` for that. This document
provides an example tuning based on the guide.
Setup
=====
For the below example, DAMON of Linux kernel v6.11 and `damo
<https://github.com/damonitor/damo>`_ (DAMON user-space tool) v2.5.9 were used to
monitor and visualize access patterns on the physical address space of a system
running a real-world server workload.
5ms/100ms intervals: Too Short Interval
=======================================
Let's start by capturing the access pattern snapshot on the physical address
space of the system using DAMON, with the default interval parameters (5
milliseconds and 100 milliseconds for the sampling and the aggregation
intervals, respectively). Wait ten minutes between the start of DAMON and
the capturing of the snapshot, to show meaningful time-wise access patterns.
::
# damo start
# sleep 600
# damo record --snapshot 0 1
# damo stop
Then, list the DAMON-found regions of different access patterns, sorted by the
"access temperature". "Access temperature" is a metric representing the
access-hotness of a region. It is calculated as a weighted sum of the access
frequency and the age of the region. If the access frequency is 0 %, the
temperature is multiplied by minus one. That is, if a region is not accessed,
it gets a minus temperature, which gets lower the longer the region stays
unaccessed. The sorting is in temperature-ascending order, so the region at
the top of the list is the coldest, and the one at the bottom is the hottest one. ::
# damo report access --sort_regions_by temperature
0 addr 16.052 GiB size 5.985 GiB access 0 % age 5.900 s # coldest
1 addr 22.037 GiB size 6.029 GiB access 0 % age 5.300 s
2 addr 28.065 GiB size 6.045 GiB access 0 % age 5.200 s
3 addr 10.069 GiB size 5.983 GiB access 0 % age 4.500 s
4 addr 4.000 GiB size 6.069 GiB access 0 % age 4.400 s
5 addr 62.008 GiB size 3.992 GiB access 0 % age 3.700 s
6 addr 56.795 GiB size 5.213 GiB access 0 % age 3.300 s
7 addr 39.393 GiB size 6.096 GiB access 0 % age 2.800 s
8 addr 50.782 GiB size 6.012 GiB access 0 % age 2.800 s
9 addr 34.111 GiB size 5.282 GiB access 0 % age 2.300 s
10 addr 45.489 GiB size 5.293 GiB access 0 % age 1.800 s # hottest
total size: 62.000 GiB
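The temperature computation described above can be sketched in code. The
following is a minimal illustration of the idea only; the weights and the
function name are made-up placeholders, not what damo actually uses. ::

    /* Illustrative sketch of "access temperature"; weights are made up. */
    static long region_temperature(unsigned int access_pct, unsigned long age,
                                   long freq_weight, long age_weight)
    {
            long temperature = freq_weight * access_pct + age_weight * age;

            /*
             * An unaccessed region gets a negative temperature whose
             * magnitude keeps growing the longer the region stays idle.
             */
            if (access_pct == 0)
                    temperature *= -1;
            return temperature;
    }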
The list shows no seemingly hot regions, and only minimal access pattern
diversity. Every region has zero access frequency. The number of regions is
10, which is the default ``min_nr_regions`` value. The sizes of the regions
are also nearly identical. We can suspect this is because the "adaptive
regions adjustment" mechanism was not working well. As the guide suggests, we
can get the relative hotness of regions using ``age`` as the recency
information. That would be better than nothing, but given that the longest age
is only about 6 seconds while we waited about ten minutes, it is unclear how
useful this will be.
A histogram visualization of the results, showing the total size of regions
falling into each temperature range, reveals no interesting distribution
pattern either. ::
# damo report access --style temperature-sz-hist
<temperature> <total size>
[-,590,000,000, -,549,000,000) 5.985 GiB |********** |
[-,549,000,000, -,508,000,000) 12.074 GiB |********************|
[-,508,000,000, -,467,000,000) 0 B | |
[-,467,000,000, -,426,000,000) 12.052 GiB |********************|
[-,426,000,000, -,385,000,000) 0 B | |
[-,385,000,000, -,344,000,000) 3.992 GiB |******* |
[-,344,000,000, -,303,000,000) 5.213 GiB |********* |
[-,303,000,000, -,262,000,000) 12.109 GiB |********************|
[-,262,000,000, -,221,000,000) 5.282 GiB |********* |
[-,221,000,000, -,180,000,000) 0 B | |
[-,180,000,000, -,139,000,000) 5.293 GiB |********* |
total size: 62.000 GiB
In short, these parameters provide poor-quality monitoring results for hot
regions detection. According to the :ref:`guide
<damon_design_monitoring_params_tuning_guide>`, this is due to the too short
aggregation interval.
100ms/2s intervals: Starts Showing Small Hot Regions
====================================================
Following the guide, increase the intervals 20 times (100 milliseconds and 2
seconds for the sampling and aggregation intervals, respectively). ::
# damo start -s 100ms -a 2s
# sleep 600
# damo record --snapshot 0 1
# damo stop
# damo report access --sort_regions_by temperature
0 addr 10.180 GiB size 6.117 GiB access 0 % age 7 m 8 s # coldest
1 addr 49.275 GiB size 6.195 GiB access 0 % age 6 m 14 s
2 addr 62.421 GiB size 3.579 GiB access 0 % age 6 m 4 s
3 addr 40.154 GiB size 6.127 GiB access 0 % age 5 m 40 s
4 addr 16.296 GiB size 6.182 GiB access 0 % age 5 m 32 s
5 addr 34.254 GiB size 5.899 GiB access 0 % age 5 m 24 s
6 addr 46.281 GiB size 2.995 GiB access 0 % age 5 m 20 s
7 addr 28.420 GiB size 5.835 GiB access 0 % age 5 m 6 s
8 addr 4.000 GiB size 6.180 GiB access 0 % age 4 m 16 s
9 addr 22.478 GiB size 5.942 GiB access 0 % age 3 m 58 s
10 addr 55.470 GiB size 915.645 MiB access 0 % age 3 m 6 s
11 addr 56.364 GiB size 6.056 GiB access 0 % age 2 m 8 s
12 addr 56.364 GiB size 4.000 KiB access 95 % age 16 s
13 addr 49.275 GiB size 4.000 KiB access 100 % age 8 m 24 s # hottest
total size: 62.000 GiB
# damo report access --style temperature-sz-hist
<temperature> <total size>
[-42,800,000,000, -33,479,999,000) 22.018 GiB |***************** |
[-33,479,999,000, -24,159,998,000) 27.090 GiB |********************|
[-24,159,998,000, -14,839,997,000) 6.836 GiB |****** |
[-14,839,997,000, -5,519,996,000) 6.056 GiB |***** |
[-5,519,996,000, 3,800,005,000) 4.000 KiB |* |
[3,800,005,000, 13,120,006,000) 0 B | |
[13,120,006,000, 22,440,007,000) 0 B | |
[22,440,007,000, 31,760,008,000) 0 B | |
[31,760,008,000, 41,080,009,000) 0 B | |
[41,080,009,000, 50,400,010,000) 0 B | |
[50,400,010,000, 59,720,011,000) 4.000 KiB |* |
total size: 62.000 GiB
DAMON found two distinct 4 KiB regions that are pretty hot. The regions are
also well aged. The hottest 4 KiB region kept its access frequency for about
8 minutes, and the coldest region received no access for about 7 minutes.
The distribution on the histogram also appears to have a pattern. In
particular, finding the 4 KiB regions within the 62 GiB of total memory shows
that DAMON's adaptive regions adjustment is working as designed.
Still, the number of regions is close to the ``min_nr_regions``, and the sizes
of the cold regions are similar. The results are apparently improved, but
there is still room for improvement.
400ms/8s intervals: Pretty Improved Results
===========================================
Increase the intervals four times (400 milliseconds and 8 seconds
for sampling and aggregation intervals, respectively). ::
# damo start -s 400ms -a 8s
# sleep 600
# damo record --snapshot 0 1
# damo stop
# damo report access --sort_regions_by temperature
0 addr 64.492 GiB size 1.508 GiB access 0 % age 6 m 48 s # coldest
1 addr 21.749 GiB size 5.674 GiB access 0 % age 6 m 8 s
2 addr 27.422 GiB size 5.801 GiB access 0 % age 6 m
3 addr 49.431 GiB size 8.675 GiB access 0 % age 5 m 28 s
4 addr 33.223 GiB size 5.645 GiB access 0 % age 5 m 12 s
5 addr 58.321 GiB size 6.170 GiB access 0 % age 5 m 4 s
[...]
25 addr 6.615 GiB size 297.531 MiB access 15 % age 0 ns
26 addr 9.513 GiB size 12.000 KiB access 20 % age 0 ns
27 addr 9.511 GiB size 108.000 KiB access 25 % age 0 ns
28 addr 9.513 GiB size 20.000 KiB access 25 % age 0 ns
29 addr 9.511 GiB size 12.000 KiB access 30 % age 0 ns
30 addr 9.520 GiB size 4.000 KiB access 40 % age 0 ns
[...]
41 addr 9.520 GiB size 4.000 KiB access 80 % age 56 s
42 addr 9.511 GiB size 12.000 KiB access 100 % age 6 m 16 s
43 addr 58.321 GiB size 4.000 KiB access 100 % age 6 m 24 s
44 addr 9.512 GiB size 4.000 KiB access 100 % age 6 m 48 s
45 addr 58.106 GiB size 4.000 KiB access 100 % age 6 m 48 s # hottest
total size: 62.000 GiB
# damo report access --style temperature-sz-hist
<temperature> <total size>
[-40,800,000,000, -32,639,999,000) 21.657 GiB |********************|
[-32,639,999,000, -24,479,998,000) 17.938 GiB |***************** |
[-24,479,998,000, -16,319,997,000) 16.885 GiB |**************** |
[-16,319,997,000, -8,159,996,000) 586.879 MiB |* |
[-8,159,996,000, 5,000) 4.946 GiB |***** |
[5,000, 8,160,006,000) 260.000 KiB |* |
[8,160,006,000, 16,320,007,000) 0 B | |
[16,320,007,000, 24,480,008,000) 0 B | |
[24,480,008,000, 32,640,009,000) 0 B | |
[32,640,009,000, 40,800,010,000) 16.000 KiB |* |
[40,800,010,000, 48,960,011,000) 8.000 KiB |* |
total size: 62.000 GiB
The number of regions having different access patterns has significantly
increased. The sizes of the regions are also more varied, and the total size
of regions with non-zero access frequency has grown significantly. This may
already be good enough to drive some meaningful memory management efficiency
changes.
800ms/16s intervals: Another Bias
=================================
Further double the intervals (800 milliseconds and 16 seconds for the sampling
and aggregation intervals, respectively). The results are further improved for
hot regions detection, but cold regions detection starts to look degraded. ::
# damo start -s 800ms -a 16s
# sleep 600
# damo record --snapshot 0 1
# damo stop
# damo report access --sort_regions_by temperature
0 addr 64.781 GiB size 1.219 GiB access 0 % age 4 m 48 s
1 addr 24.505 GiB size 2.475 GiB access 0 % age 4 m 16 s
2 addr 26.980 GiB size 504.273 MiB access 0 % age 4 m
3 addr 29.443 GiB size 2.462 GiB access 0 % age 4 m
4 addr 37.264 GiB size 5.645 GiB access 0 % age 4 m
5 addr 31.905 GiB size 5.359 GiB access 0 % age 3 m 44 s
[...]
20 addr 8.711 GiB size 40.000 KiB access 5 % age 2 m 40 s
21 addr 27.473 GiB size 1.970 GiB access 5 % age 4 m
22 addr 48.185 GiB size 4.625 GiB access 5 % age 4 m
23 addr 47.304 GiB size 902.117 MiB access 10 % age 4 m
24 addr 8.711 GiB size 4.000 KiB access 100 % age 4 m
25 addr 20.793 GiB size 3.713 GiB access 5 % age 4 m 16 s
26 addr 8.773 GiB size 4.000 KiB access 100 % age 4 m 16 s
total size: 62.000 GiB
# damo report access --style temperature-sz-hist
<temperature> <total size>
[-28,800,000,000, -23,359,999,000) 12.294 GiB |***************** |
[-23,359,999,000, -17,919,998,000) 9.753 GiB |************* |
[-17,919,998,000, -12,479,997,000) 15.131 GiB |********************|
[-12,479,997,000, -7,039,996,000) 0 B | |
[-7,039,996,000, -1,599,995,000) 7.506 GiB |********** |
[-1,599,995,000, 3,840,006,000) 6.127 GiB |********* |
[3,840,006,000, 9,280,007,000) 0 B | |
[9,280,007,000, 14,720,008,000) 136.000 KiB |* |
[14,720,008,000, 20,160,009,000) 40.000 KiB |* |
[20,160,009,000, 25,600,010,000) 11.188 GiB |*************** |
[25,600,010,000, 31,040,011,000) 4.000 KiB |* |
total size: 62.000 GiB
It found more regions with non-zero access frequency. The number of regions is
still much higher than ``min_nr_regions``, but it is reduced from that of the
previous setup. And the distribution apparently looks a bit biased toward hot
regions.
Conclusion
==========
Given the above experimental tuning results, we can conclude that the theory
and the guide make sense for at least this workload, and could be applied to
similar cases.

View File

@ -531,6 +531,10 @@ are extra requirements for accessing them:
new page table has been installed in the same location and filled with
entries. Writers normally need to take the PTE lock and revalidate that the
PMD entry still refers to the same PTE-level page table.
If the writer does not care whether it is the same PTE-level page table, it
can take the PMD lock and revalidate that the contents of the PMD entry still
meet the requirements. In particular, this also happens in
:c:func:`!retract_page_tables` when handling :c:macro:`!MADV_COLLAPSE`.
To access PTE-level page tables, a helper like :c:func:`!pte_offset_map_lock` or
:c:func:`!pte_offset_map` can be used depending on stability requirements.
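For illustration, below is a minimal sketch (not kernel source) of the
stable-access pattern described above, assuming a caller that wants to clear a
single PTE; the retry policy is left to the caller. ::

    static int clear_one_pte(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long addr)
    {
            spinlock_t *ptl;
            pte_t *pte = pte_offset_map_lock(mm, pmd, addr, &ptl);

            if (!pte)
                    return -EAGAIN; /* PMD entry changed; caller retries */
            pte_clear(mm, addr, pte);       /* table is stable while locked */
            pte_unmap_unlock(pte, ptl);
            return 0;
    }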
@ -712,9 +716,14 @@ calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU
critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`,
before releasing the RCU lock via :c:func:`!rcu_read_unlock`.
VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for
their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it
via :c:func:`!vma_end_read`.
In cases when the user already holds the mmap read lock,
:c:func:`!vma_start_read_locked` and :c:func:`!vma_start_read_locked_nested`
can be used. These functions do not fail due to lock contention, but the
caller should still check their return values in case they fail for other
reasons.
VMA read locks increment :c:member:`!vma.vm_refcnt` reference counter for their
duration and the caller of :c:func:`!lock_vma_under_rcu` must drop it via
:c:func:`!vma_end_read`.
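As an example of this pattern, below is a minimal sketch (not kernel source)
of a reader that inspects a VMA under the per-VMA lock and reports failure so
the caller can fall back to the mmap lock. ::

    static bool vma_flags_under_rcu(struct mm_struct *mm, unsigned long addr,
                                    vm_flags_t *flags)
    {
            struct vm_area_struct *vma = lock_vma_under_rcu(mm, addr);

            if (!vma)
                    return false;   /* caller falls back to mmap_read_lock() */
            *flags = vma->vm_flags;
            vma_end_read(vma);      /* drop the vma.vm_refcnt reference */
            return true;
    }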
VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a
VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always
@ -722,9 +731,9 @@ acquired. An mmap write lock **must** be held for the duration of the VMA write
lock, releasing or downgrading the mmap write lock also releases the VMA write
lock so there is no :c:func:`!vma_end_write` function.
Note that a semaphore write lock is not held across a VMA lock. Rather, a
sequence number is used for serialisation, and the write semaphore is only
acquired at the point of write lock to update this.
Note that when write-locking a VMA lock, the :c:member:`!vma.vm_refcnt` is
temporarily modified so that readers can detect the presence of a writer. The
reference counter is restored once the VMA sequence number used for
serialisation is updated.
This ensures the semantics we require - VMA write locks provide exclusive write
access to the VMA.
@ -734,7 +743,7 @@ Implementation details
The VMA lock mechanism is designed to be a lightweight means of avoiding the use
of the heavily contended mmap lock. It is implemented using a combination of a
read/write semaphore and sequence numbers belonging to the containing
reference counter and sequence numbers belonging to the containing
:c:struct:`!struct mm_struct` and the VMA.
Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic
@ -775,28 +784,31 @@ release of any VMA locks on its release makes sense, as you would never want to
keep VMAs locked across entirely separate write operations. It also maintains
correct lock ordering.
Each time a VMA read lock is acquired, we acquire a read lock on the
:c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that
the sequence count of the VMA does not match that of the mm.
Each time a VMA read lock is acquired, we increment the
:c:member:`!vma.vm_refcnt` reference counter and check that the sequence count
of the VMA does not match that of the mm.
If it does, the read lock fails. If it does not, we hold the lock, excluding
writers, but permitting other readers, who will also obtain this lock under RCU.
If it does, the read lock fails and :c:member:`!vma.vm_refcnt` is dropped.
If it does not, we keep the reference counter raised, excluding writers, but
permitting other readers, who can also obtain this lock under RCU.
Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu`
are also RCU safe, so the whole read lock operation is guaranteed to function
correctly.
On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock`
read/write semaphore, before setting the VMA's sequence number under this lock,
also simultaneously holding the mmap write lock.
On the write side, we set a bit in :c:member:`!vma.vm_refcnt` which can't be
modified by readers, and wait for all readers to drop their reference count.
Once there are no readers, the VMA's sequence number is set to match that of
the mm. The mmap write lock is held during this entire operation.
This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep
until these are finished and mutual exclusion is achieved.
After setting the VMA's sequence number, the lock is released, avoiding
complexity with a long-term held write lock.
After setting the VMA's sequence number, the bit in :c:member:`!vma.vm_refcnt`
indicating a writer is cleared. From this point on, the VMA's sequence number
indicates the VMA's write-locked state until the mmap write lock is dropped or
downgraded.
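The write-side sequence can be summarised in heavily simplified pseudocode.
Note that :c:func:`!set_writer_bit`, :c:func:`!wait_for_readers` and
:c:func:`!clear_writer_bit` are hypothetical helpers standing in for the real
atomics, not actual kernel functions. ::

    static void vma_write_lock_sketch(struct mm_struct *mm,
                                      struct vm_area_struct *vma)
    {
            mmap_assert_write_locked(mm);
            set_writer_bit(&vma->vm_refcnt);    /* hypothetical helper */
            wait_for_readers(&vma->vm_refcnt);  /* hypothetical helper */
            vma->vm_lock_seq = mm->mm_lock_seq; /* VMA now write-locked */
            clear_writer_bit(&vma->vm_refcnt);  /* hypothetical helper */
    }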
This clever combination of a read/write semaphore and sequence count allows for
This clever combination of a reference counter and sequence count allows for
fast RCU-based per-VMA lock acquisition (especially on page fault, though
utilised elsewhere) with minimal complexity around lock ordering.

View File

@ -62,7 +62,7 @@ Support of split page table lock by an architecture
===================================================
There's no need in special enabling of PTE split page table lock: everything
required is done by pagetable_pte_ctor() and pagetable_pte_dtor(), which
required is done by pagetable_pte_ctor() and pagetable_dtor(), which
must be called on PTE table allocation / freeing.
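As a sketch of the expected ctor/dtor pairing (loosely modelled on the
asm-generic helpers; a simplified illustration, not a drop-in
implementation): ::

    pgtable_t pte_alloc_one_sketch(struct mm_struct *mm)
    {
            struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_USER, 0);

            if (!ptdesc)
                    return NULL;
            if (!pagetable_pte_ctor(ptdesc)) {      /* sets up the split PTL */
                    pagetable_free(ptdesc);
                    return NULL;
            }
            return ptdesc_page(ptdesc);
    }

    void pte_free_sketch(struct mm_struct *mm, pgtable_t pte)
    {
            struct ptdesc *ptdesc = page_ptdesc(pte);

            pagetable_dtor(ptdesc);                 /* releases the split PTL */
            pagetable_free(ptdesc);
    }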
Make sure the architecture doesn't use slab allocator for page table
@ -73,7 +73,7 @@ PMD split lock only makes sense if you have more than two page table
levels.
PMD split lock enabling requires pagetable_pmd_ctor() call on PMD table
allocation and pagetable_pmd_dtor() on freeing.
allocation and pagetable_dtor() on freeing.
Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing

View File

@ -26,12 +26,7 @@ DAMON 为不同的用户提供了下面这些接口。
使用它用户可以通过读取和写入特殊的sysfs文件来使用DAMON的主要功能。因此你可以编写和使
用你个性化的DAMON sysfs包装程序代替你读/写sysfs文件。 `DAMON用户空间工具
<https://github.com/damonitor/damo>`_ 就是这种程序的一个例子 它同时支持虚拟和物理地址
空间的监测。注意,这个界面只提供简单的监测结果 :ref:`统计 <damos_stats>`。对于详细的监测
结果DAMON提供了一个:ref:`跟踪点 <tracepoint>`
- *debugfs interface.*
:ref:`这 <debugfs_interface>` 几乎与:ref:`sysfs interface <sysfs_interface>`
口相同。这将在下一个LTS内核发布后被移除所以用户应该转移到
:ref:`sysfs interface <sysfs_interface>`
空间的监测。
- *内核空间编程接口。*
:doc:`这 </mm/damon/api>` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内
核空间的DAMON应用程序最灵活有效地利用DAMON的每一个功能。你甚至可以为各种地址空间扩展DAMON。
@ -335,247 +330,6 @@ tried_regions/<N>/
请注意,我们强烈建议使用用户空间的工具,如 `damo <https://github.com/damonitor/damo>`_
而不是像上面那样手动读写文件。以上只是一个例子。
debugfs接口
===========
.. note::
DAMON debugfs接口将在下一个LTS内核发布后被移除所以用户应该转移到
:ref:`sysfs接口<sysfs_interface>`
DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``,
``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts``
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
属性
----
用户可以通过读取和写入 ``attrs`` 文件获得和设置 ``采样间隔````聚集间隔````更新间隔``
以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/mm/damon/design` 。例如,
下面的命令将这些值设置为5ms、100ms、1000ms、10和1000然后再次检查::
# cd <debugfs>/damon
# echo 5000 100000 1000000 10 1000 > attrs
# cat attrs
5000 100000 1000000 10 1000
目标ID
------
一些类型的地址空间支持多个监测目标。例如,虚拟内存地址空间的监测可以有多个进程作为监测目标。用户
可以通过写入目标的相关id值来设置目标并通过读取 ``target_ids`` 文件来获得当前目标的id。在监
测虚拟地址空间的情况下这些值应该是监测目标进程的pid。例如下面的命令将pid为42和4242的进程设
为监测目标,并再次检查::
# cd <debugfs>/damon
# echo 42 4242 > target_ids
# cat target_ids
42 4242
用户还可以通过在文件中写入一个特殊的关键字 "paddr\n" 来监测系统的物理内存地址空间。因为物理地
址空间监测不支持多个目标,读取文件会显示一个假值,即 ``42`` ,如下图所示::
# cd <debugfs>/damon
# echo paddr > target_ids
# cat target_ids
42
请注意设置目标ID并不启动监测。
初始监测目标区域
----------------
在虚拟地址空间监测的情况下DAMON自动设置和更新监测的目标区域这样就可以覆盖目标进程的整个
内存映射。然而,用户可能希望将监测区域限制在特定的地址范围内,如堆、栈或特定的文件映射区域。
或者,一些用户可以知道他们工作负载的初始访问模式,因此希望为“自适应区域调整”设置最佳初始区域。
相比之下DAMON在物理内存监测的情况下不会自动设置和更新监测目标区域。因此用户应该自己设置
监测目标区域。
在这种情况下,用户可以通过在 ``init_regions`` 文件中写入适当的值,明确地设置他们想要的初
始监测目标区域。输入应该是一个由三个整数组成的队列,用空格隔开,代表一个区域的形式如下::
<target idx> <start address> <end address>
目标idx应该是 ``target_ids`` 文件中目标的索引,从 ``0`` 开始,区域应该按照地址顺序传递。
例如,下面的命令将设置几个地址范围, ``1-100````100-200`` 作为pid 42的初始监测目标
区域,这是 ``target_ids`` 中的第一个(索引 ``0`` ),另外几个地址范围, ``20-40``
``50-100`` 作为pid 4242的地址这是 ``target_ids`` 中的第二个(索引 ``1`` ::
# cd <debugfs>/damon
# cat target_ids
42 4242
# echo "0 1 100 \
0 100 200 \
1 20 40 \
1 50 100" > init_regions
请注意这只是设置了初始的监测目标区域。在虚拟内存监测的情况下DAMON会在一个 ``更新间隔``
后自动更新区域的边界。因此,在这种情况下,如果用户不希望更新的话,应该把 ``更新间隔``
置得足够大。
方案
----
对于通常的基于DAMON的数据访问感知的内存管理优化用户只是希望系统对特定访问模式的内存区域应用内
存管理操作。DAMON从用户那里接收这种形式化的操作方案并将这些方案应用到目标进程中。
用户可以通过读取和写入 ``scheme`` debugfs文件来获得和设置这些方案。读取该文件还可以显示每个
方案的统计数据。在文件中,每一个方案都应该在每一行中以下列形式表示出来::
<target access pattern> <action> <quota> <watermarks>
你可以通过简单地在文件中写入一个空字符串来禁用方案。
目标访问模式
~~~~~~~~~~~~
``<目标访问模式>`` 是由三个范围构成的,形式如下::
min-size max-size min-acc max-acc min-age max-age
具体来说,区域大小的字节数( `min-size``max-size` ),访问频率的每聚合区间的监测访问次
数( `min-acc``max-acc` ),区域年龄的聚合区间数( `min-age``max-age` )都被指定。
请注意,这些范围是封闭区间。
动作
~~~~
``<action>`` 是一个预定义的内存管理动作的整数DAMON将应用于具有目标访问模式的区域。支持
的数字和它们的含义如下::
- 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
- 1: Call ``madvise()`` for the region with ``MADV_COLD``
- 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
- 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
- 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
- 5: Do nothing but count the statistics
配额
~~~~
每个 ``动作`` 的最佳 ``目标访问模式`` 取决于工作负载,所以不容易找到。更糟糕的是,将某个
动作的方案设置得过于激进会导致严重的开销。为了避免这种开销,用户可以通过下面表格中的 ``<quota>``
来限制方案的时间和大小配额::
<ms> <sz> <reset interval> <priority weights>
这使得DAMON在 ``<reset interval>`` 毫秒内,尽量只用 ``<ms>`` 毫秒的时间对 ``目标访
问模式`` 的内存区域应用动作,并在 ``<reset interval>`` 内只对最多<sz>字节的内存区域应
用动作。将 ``<ms>````<sz>`` 都设置为零,可以禁用配额限制。
当预计超过配额限制时DAMON会根据 ``目标访问模式`` 的大小、访问频率和年龄,对发现的内存
区域进行优先排序。为了实现个性化的优先级,用户可以在 ``<优先级权重>`` 中设置这三个属性的
权重,具体形式如下::
<size weight> <access frequency weight> <age weight>
水位
~~~~
有些方案需要根据系统特定指标的当前值来运行,如自由内存比率。对于这种情况,用户可以为该条
件指定水位。::
<metric> <check interval> <high mark> <middle mark> <low mark>
``<metric>`` 是一个预定义的整数,用于要检查的度量。支持的数字和它们的含义如下。
- 0: 忽视水位
- 1: 系统空闲内存率 (千分比)
每隔 ``<检查间隔>`` 微秒检查一次公制的值。
如果该值高于 ``<高标>`` 或低于 ``<低标>`` ,该方案被停用。如果该值低于 ``<中标>``
该方案将被激活。
统计数据
~~~~~~~~
它还统计每个方案被尝试应用的区域的总数量和字节数,每个方案被成功应用的区域的两个数量,以
及超过配额限制的总数量。这些统计数据可用于在线分析或调整方案。
统计数据可以通过读取方案文件来显示。读取该文件将显示你在每一行中输入的每个 ``方案``
统计的五个数字将被加在每一行的末尾。
例子
~~~~
下面的命令应用了一个方案:”如果一个大小为[4KiB, 8KiB]的内存区域在[10, 20]的聚合时间
间隔内显示出每一个聚合时间间隔[0, 5]的访问量,请分页出该区域。对于分页,每秒最多只能使
用10ms而且每秒分页不能超过1GiB。在这一限制下首先分页出具有较长年龄的内存区域。另外
每5秒钟检查一次系统的可用内存率当可用内存率低于50%时开始监测和分页,但如果可用内存率
大于60%或低于30%,则停止监测“::
# cd <debugfs>/damon
# scheme="4096 8192 0 5 10 20 2" # target access pattern and action
# scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
# scheme+=" 0 0 100" # prioritization weights
# scheme+=" 1 5000000 600 500 300" # watermarks
# echo "$scheme" > schemes
开关
----
除非你明确地启动监测,否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on_DEPRECATED``
文件来启动、停止和检查监测的当前状态。写入 ``on`` 该文件可以启动对有属性的目标的监测。写入
``off`` 该文件则停止这些目标。如果每个目标进程被终止DAMON也会停止。下面的示例命令开启、关
闭和检查DAMON的状态::
# cd <debugfs>/damon
# echo on > monitor_on_DEPRECATED
# echo off > monitor_on_DEPRECATED
# cat monitor_on_DEPRECATED
off
请注意当监测开启时你不能写到上述的debugfs文件。如果你在DAMON运行时写到这些文件将会返
回一个错误代码,如 ``-EBUSY``
监测线程PID
-----------
DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以通过读取 ``kdamond_pid`` 文件获
得该线程的 ``pid`` 。当监测被 ``关闭`` 时,读取该文件不会返回任何信息::
# cd <debugfs>/damon
# cat monitor_on_DEPRECATED
off
# cat kdamond_pid
none
# echo on > monitor_on_DEPRECATED
# cat kdamond_pid
18594
使用多个监测线程
----------------
每个监测上下文都会创建一个 ``kdamond`` 线程。你可以使用 ``mk_contexts````rm_contexts``
文件为多个 ``kdamond`` 需要的用例创建和删除监测上下文。
将新上下文的名称写入 ``mk_contexts`` 文件,在 ``DAMON debugfs`` 目录上创建一个该名称的目录。
该目录将有该上下文的 ``DAMON debugfs`` 文件::
# cd <debugfs>/damon
# ls foo
# ls: cannot access 'foo': No such file or directory
# echo foo > mk_contexts
# ls foo
# attrs init_regions kdamond_pid schemes target_ids
如果不再需要上下文,你可以通过把上下文的名字放到 ``rm_contexts`` 文件中来删除它和相应的目录::
# echo foo > rm_contexts
# ls foo
# ls: cannot access 'foo': No such file or directory
注意, ``mk_contexts````rm_contexts````monitor_on_DEPRECATED`` 文件只在根目录下。
监测结果的监测点
================

View File

@ -26,12 +26,7 @@ DAMON 爲不同的用戶提供了下面這些接口。
使用它用戶可以通過讀取和寫入特殊的sysfs文件來使用DAMON的主要功能。因此你可以編寫和使
用你個性化的DAMON sysfs包裝程序代替你讀/寫sysfs文件。 `DAMON用戶空間工具
<https://github.com/damonitor/damo>`_ 就是這種程序的一個例子 它同時支持虛擬和物理地址
空間的監測。注意,這個界面只提供簡單的監測結果 :ref:`統計 <damos_stats>`。對於詳細的監測
結果DAMON提供了一個:ref:`跟蹤點 <tracepoint>`
- *debugfs interface.*
:ref:`這 <debugfs_interface>` 幾乎與:ref:`sysfs interface <sysfs_interface>`
口相同。這將在下一個LTS內核發佈後被移除所以用戶應該轉移到
:ref:`sysfs interface <sysfs_interface>`
空間的監測。
- *內核空間編程接口。*
:doc:`這 </mm/damon/api>` 這是爲內核空間程序員準備的。使用它,用戶可以通過爲你編寫內
核空間的DAMON應用程序最靈活有效地利用DAMON的每一個功能。你甚至可以爲各種地址空間擴展DAMON。
@ -335,247 +330,6 @@ tried_regions/<N>/
請注意,我們強烈建議使用用戶空間的工具,如 `damo <https://github.com/damonitor/damo>`_
而不是像上面那樣手動讀寫文件。以上只是一個例子。
debugfs接口
===========
.. note::
DAMON debugfs接口將在下一個LTS內核發佈後被移除所以用戶應該轉移到
:ref:`sysfs接口<sysfs_interface>`
DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``,
``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts``
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
屬性
----
用戶可以通過讀取和寫入 ``attrs`` 文件獲得和設置 ``採樣間隔````聚集間隔````更新間隔``
以及監測目標區域的最小/最大數量。要詳細瞭解監測屬性,請參考 `:doc:/mm/damon/design` 。例如,
下面的命令將這些值設置爲5ms、100ms、1000ms、10和1000然後再次檢查::
# cd <debugfs>/damon
# echo 5000 100000 1000000 10 1000 > attrs
# cat attrs
5000 100000 1000000 10 1000
目標ID
------
一些類型的地址空間支持多個監測目標。例如,虛擬內存地址空間的監測可以有多個進程作爲監測目標。用戶
可以通過寫入目標的相關id值來設置目標並通過讀取 ``target_ids`` 文件來獲得當前目標的id。在監
測虛擬地址空間的情況下這些值應該是監測目標進程的pid。例如下面的命令將pid爲42和4242的進程設
爲監測目標,並再次檢查::
# cd <debugfs>/damon
# echo 42 4242 > target_ids
# cat target_ids
42 4242
用戶還可以通過在文件中寫入一個特殊的關鍵字 "paddr\n" 來監測系統的物理內存地址空間。因爲物理地
址空間監測不支持多個目標,讀取文件會顯示一個假值,即 ``42`` ,如下圖所示::
# cd <debugfs>/damon
# echo paddr > target_ids
# cat target_ids
42
請注意設置目標ID並不啓動監測。
初始監測目標區域
----------------
在虛擬地址空間監測的情況下DAMON自動設置和更新監測的目標區域這樣就可以覆蓋目標進程的整個
內存映射。然而,用戶可能希望將監測區域限制在特定的地址範圍內,如堆、棧或特定的文件映射區域。
或者,一些用戶可以知道他們工作負載的初始訪問模式,因此希望爲“自適應區域調整”設置最佳初始區域。
相比之下DAMON在物理內存監測的情況下不會自動設置和更新監測目標區域。因此用戶應該自己設置
監測目標區域。
在這種情況下,用戶可以通過在 ``init_regions`` 文件中寫入適當的值,明確地設置他們想要的初
始監測目標區域。輸入應該是一個由三個整數組成的隊列,用空格隔開,代表一個區域的形式如下::
<target idx> <start address> <end address>
目標idx應該是 ``target_ids`` 文件中目標的索引,從 ``0`` 開始,區域應該按照地址順序傳遞。
例如,下面的命令將設置幾個地址範圍, ``1-100````100-200`` 作爲pid 42的初始監測目標
區域,這是 ``target_ids`` 中的第一個(索引 ``0`` ),另外幾個地址範圍, ``20-40``
``50-100`` 作爲pid 4242的地址這是 ``target_ids`` 中的第二個(索引 ``1`` ::
# cd <debugfs>/damon
# cat target_ids
42 4242
# echo "0 1 100 \
0 100 200 \
1 20 40 \
1 50 100" > init_regions
請注意這只是設置了初始的監測目標區域。在虛擬內存監測的情況下DAMON會在一個 ``更新間隔``
後自動更新區域的邊界。因此,在這種情況下,如果用戶不希望更新的話,應該把 ``更新間隔``
置得足夠大。
方案
----
對於通常的基於DAMON的數據訪問感知的內存管理優化用戶只是希望系統對特定訪問模式的內存區域應用內
存管理操作。DAMON從用戶那裏接收這種形式化的操作方案並將這些方案應用到目標進程中。
用戶可以通過讀取和寫入 ``scheme`` debugfs文件來獲得和設置這些方案。讀取該文件還可以顯示每個
方案的統計數據。在文件中,每一個方案都應該在每一行中以下列形式表示出來::
<target access pattern> <action> <quota> <watermarks>
你可以通過簡單地在文件中寫入一個空字符串來禁用方案。
目標訪問模式
~~~~~~~~~~~~
``<目標訪問模式>`` 是由三個範圍構成的,形式如下::
min-size max-size min-acc max-acc min-age max-age
具體來說,區域大小的字節數( `min-size``max-size` ),訪問頻率的每聚合區間的監測訪問次
數( `min-acc``max-acc` ),區域年齡的聚合區間數( `min-age``max-age` )都被指定。
請注意,這些範圍是封閉區間。
動作
~~~~
``<action>`` 是一個預定義的內存管理動作的整數DAMON將應用於具有目標訪問模式的區域。支持
的數字和它們的含義如下::
- 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
- 1: Call ``madvise()`` for the region with ``MADV_COLD``
- 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
- 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
- 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
- 5: Do nothing but count the statistics
配額
~~~~
每個 ``動作`` 的最佳 ``目標訪問模式`` 取決於工作負載,所以不容易找到。更糟糕的是,將某個
動作的方案設置得過於激進會導致嚴重的開銷。爲了避免這種開銷,用戶可以通過下面表格中的 ``<quota>``
來限制方案的時間和大小配額::
<ms> <sz> <reset interval> <priority weights>
這使得DAMON在 ``<reset interval>`` 毫秒內,儘量只用 ``<ms>`` 毫秒的時間對 ``目標訪
問模式`` 的內存區域應用動作,並在 ``<reset interval>`` 內只對最多<sz>字節的內存區域應
用動作。將 ``<ms>````<sz>`` 都設置爲零,可以禁用配額限制。
當預計超過配額限制時DAMON會根據 ``目標訪問模式`` 的大小、訪問頻率和年齡,對發現的內存
區域進行優先排序。爲了實現個性化的優先級,用戶可以在 ``<優先級權重>`` 中設置這三個屬性的
權重,具體形式如下::
<size weight> <access frequency weight> <age weight>
水位
~~~~
有些方案需要根據系統特定指標的當前值來運行,如自由內存比率。對於這種情況,用戶可以爲該條
件指定水位。::
<metric> <check interval> <high mark> <middle mark> <low mark>
``<metric>`` 是一個預定義的整數,用於要檢查的度量。支持的數字和它們的含義如下。
- 0: 忽視水位
- 1: 系統空閒內存率 (千分比)
每隔 ``<檢查間隔>`` 微秒檢查一次公制的值。
如果該值高於 ``<高標>`` 或低於 ``<低標>`` ,該方案被停用。如果該值低於 ``<中標>``
該方案將被激活。
統計數據
~~~~~~~~
它還統計每個方案被嘗試應用的區域的總數量和字節數,每個方案被成功應用的區域的兩個數量,以
及超過配額限制的總數量。這些統計數據可用於在線分析或調整方案。
統計數據可以通過讀取方案文件來顯示。讀取該文件將顯示你在每一行中輸入的每個 ``方案``
統計的五個數字將被加在每一行的末尾。
例子
~~~~
下面的命令應用了一個方案:”如果一個大小爲[4KiB, 8KiB]的內存區域在[10, 20]的聚合時間
間隔內顯示出每一個聚合時間間隔[0, 5]的訪問量,請分頁出該區域。對於分頁,每秒最多隻能使
用10ms而且每秒分頁不能超過1GiB。在這一限制下首先分頁出具有較長年齡的內存區域。另外
每5秒鐘檢查一次系統的可用內存率當可用內存率低於50%時開始監測和分頁,但如果可用內存率
大於60%或低於30%,則停止監測“::
# cd <debugfs>/damon
# scheme="4096 8192 0 5 10 20 2" # target access pattern and action
# scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
# scheme+=" 0 0 100" # prioritization weights
# scheme+=" 1 5000000 600 500 300" # watermarks
# echo "$scheme" > schemes
開關
----
除非你明確地啓動監測,否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on_DEPRECATED``
文件來啓動、停止和檢查監測的當前狀態。寫入 ``on`` 該文件可以啓動對有屬性的目標的監測。寫入
``off`` 該文件則停止這些目標。如果每個目標進程被終止DAMON也會停止。下面的示例命令開啓、關
閉和檢查DAMON的狀態::
# cd <debugfs>/damon
# echo on > monitor_on_DEPRECATED
# echo off > monitor_on_DEPRECATED
# cat monitor_on_DEPRECATED
off
請注意當監測開啓時你不能寫到上述的debugfs文件。如果你在DAMON運行時寫到這些文件將會返
回一個錯誤代碼,如 ``-EBUSY``
監測線程PID
-----------
DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以通過讀取 ``kdamond_pid`` 文件獲
得該線程的 ``pid`` 。當監測被 ``關閉`` 時,讀取該文件不會返回任何信息::
# cd <debugfs>/damon
# cat monitor_on_DEPRECATED
off
# cat kdamond_pid
none
# echo on > monitor_on_DEPRECATED
# cat kdamond_pid
18594
使用多個監測線程
----------------
每個監測上下文都會創建一個 ``kdamond`` 線程。你可以使用 ``mk_contexts````rm_contexts``
文件爲多個 ``kdamond`` 需要的用例創建和刪除監測上下文。
將新上下文的名稱寫入 ``mk_contexts`` 文件,在 ``DAMON debugfs`` 目錄上創建一個該名稱的目錄。
該目錄將有該上下文的 ``DAMON debugfs`` 文件::
# cd <debugfs>/damon
# ls foo
# ls: cannot access 'foo': No such file or directory
# echo foo > mk_contexts
# ls foo
# attrs init_regions kdamond_pid schemes target_ids
如果不再需要上下文,你可以通過把上下文的名字放到 ``rm_contexts`` 文件中來刪除它和相應的目錄::
# echo foo > rm_contexts
# ls foo
# ls: cannot access 'foo': No such file or directory
注意, ``mk_contexts````rm_contexts````monitor_on_DEPRECATED`` 文件只在根目錄下。
監測結果的監測點
================

View File

@ -2827,7 +2827,7 @@ ARM/NXP S32G ARCHITECTURE
R: Chester Lin <chester62515@gmail.com>
R: Matthias Brugger <mbrugger@suse.com>
R: Ghennadi Procopciuc <ghennadi.procopciuc@oss.nxp.com>
L: NXP S32 Linux Team <s32@nxp.com>
R: NXP S32 Linux Team <s32@nxp.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
F: arch/arm64/boot/dts/freescale/s32g*.dts*
@ -6327,6 +6327,7 @@ F: Documentation/mm/damon/
F: include/linux/damon.h
F: include/trace/events/damon.h
F: mm/damon/
F: samples/damon/
F: tools/testing/selftests/damon/
DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER
@ -15068,7 +15069,15 @@ L: linux-mm@kvack.org
S: Maintained
W: http://www.linux-mm.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
F: mm/mlock.c
F: mm/mmap.c
F: mm/mprotect.c
F: mm/mremap.c
F: mm/mseal.c
F: mm/vma.c
F: mm/vma.h
F: mm/vma_internal.h
F: tools/testing/vma/
MEMORY TECHNOLOGY DEVICES (MTD)
M: Miquel Raynal <miquel.raynal@bootlin.com>
@ -16600,8 +16609,8 @@ F: arch/nios2/
NITRO ENCLAVES (NE)
M: Alexandru Ciobotaru <alcioa@amazon.com>
R: The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
L: linux-kernel@vger.kernel.org
L: The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
S: Supported
W: https://aws.amazon.com/ec2/nitro/nitro-enclaves/
F: Documentation/virt/ne_overview.rst
@ -16612,8 +16621,8 @@ F: samples/nitro_enclaves/
NITRO SECURE MODULE (NSM)
M: Alexander Graf <graf@amazon.com>
R: The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
L: linux-kernel@vger.kernel.org
L: The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
S: Supported
W: https://aws.amazon.com/ec2/nitro/nitro-enclaves/
F: drivers/misc/nsm.c
@ -18425,8 +18434,8 @@ M: Fabio Estevam <festevam@gmail.com>
M: Shawn Guo <shawnguo@kernel.org>
M: Jacky Bai <ping.bai@nxp.com>
R: Pengutronix Kernel Team <kernel@pengutronix.de>
R: NXP S32 Linux Team <s32@nxp.com>
L: linux-gpio@vger.kernel.org
L: NXP S32 Linux Team <s32@nxp.com>
S: Maintained
F: Documentation/devicetree/bindings/pinctrl/fsl,*
F: Documentation/devicetree/bindings/pinctrl/nxp,s32*
@ -19561,7 +19570,7 @@ F: drivers/ras/amd/fmpm.c
RASPBERRY PI PISP BACK END
M: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
L: Raspberry Pi Kernel Maintenance <kernel-list@raspberrypi.com>
R: Raspberry Pi Kernel Maintenance <kernel-list@raspberrypi.com>
L: linux-media@vger.kernel.org
S: Maintained
F: Documentation/devicetree/bindings/media/raspberrypi,pispbe.yaml
@ -25018,21 +25027,6 @@ F: include/uapi/linux/vsockmon.h
F: net/vmw_vsock/
F: tools/testing/vsock/
VMA
M: Andrew Morton <akpm@linux-foundation.org>
M: Liam R. Howlett <Liam.Howlett@oracle.com>
M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
R: Vlastimil Babka <vbabka@suse.cz>
R: Jann Horn <jannh@google.com>
L: linux-mm@kvack.org
S: Maintained
W: https://www.linux-mm.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
F: mm/vma.c
F: mm/vma.h
F: mm/vma_internal.h
F: tools/testing/vma/
VMALLOC
M: Andrew Morton <akpm@linux-foundation.org>
R: Uladzislau Rezki <urezki@gmail.com>

View File

@ -331,10 +331,7 @@ cia_prepare_tbia_workaround(int window)
long i;
/* Use minimal 1K map. */
ppte = memblock_alloc(CIA_BROKEN_TBIA_SIZE, 32768);
if (!ppte)
panic("%s: Failed to allocate %u bytes align=0x%x\n",
__func__, CIA_BROKEN_TBIA_SIZE, 32768);
ppte = memblock_alloc_or_panic(CIA_BROKEN_TBIA_SIZE, 32768);
pte = (virt_to_phys(ppte) >> (PAGE_SHIFT - 1)) | 1;
for (i = 0; i < CIA_BROKEN_TBIA_SIZE / sizeof(unsigned long); ++i)
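/*
 * For reference, the new helper presumably wraps memblock_alloc() and panics
 * on failure, roughly like the sketch below (an assumption based on the
 * conversions in this series, not copied from the actual source):
 *
 *	static inline void *memblock_alloc_or_panic(phys_addr_t size,
 *						    phys_addr_t align)
 *	{
 *		void *addr = memblock_alloc(size, align);
 *
 *		if (unlikely(!addr))
 *			panic("%s: Failed to allocate %llu bytes\n",
 *			      __func__, (unsigned long long)size);
 *		return addr;
 *	}
 */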

View File

@ -81,10 +81,7 @@ mk_resource_name(int pe, int port, char *str)
char *name;
sprintf(tmp, "PCI %s PE %d PORT %d", str, pe, port);
name = memblock_alloc(strlen(tmp) + 1, SMP_CACHE_BYTES);
if (!name)
panic("%s: Failed to allocate %zu bytes\n", __func__,
strlen(tmp) + 1);
name = memblock_alloc_or_panic(strlen(tmp) + 1, SMP_CACHE_BYTES);
strcpy(name, tmp);
return name;
@ -119,10 +116,7 @@ alloc_io7(unsigned int pe)
return NULL;
}
io7 = memblock_alloc(sizeof(*io7), SMP_CACHE_BYTES);
if (!io7)
panic("%s: Failed to allocate %zu bytes\n", __func__,
sizeof(*io7));
io7 = memblock_alloc_or_panic(sizeof(*io7), SMP_CACHE_BYTES);
io7->pe = pe;
raw_spin_lock_init(&io7->irq_lock);

View File

@ -391,10 +391,7 @@ alloc_pci_controller(void)
{
struct pci_controller *hose;
hose = memblock_alloc(sizeof(*hose), SMP_CACHE_BYTES);
if (!hose)
panic("%s: Failed to allocate %zu bytes\n", __func__,
sizeof(*hose));
hose = memblock_alloc_or_panic(sizeof(*hose), SMP_CACHE_BYTES);
*hose_tail = hose;
hose_tail = &hose->next;
@ -405,13 +402,7 @@ alloc_pci_controller(void)
struct resource * __init
alloc_resource(void)
{
void *ptr = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
if (!ptr)
panic("%s: Failed to allocate %zu bytes\n", __func__,
sizeof(struct resource));
return ptr;
return memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES);
}

View File

@ -71,14 +71,8 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base,
if (align < mem_size)
align = mem_size;
arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES);
if (!arena)
panic("%s: Failed to allocate %zu bytes\n", __func__,
sizeof(*arena));
arena->ptes = memblock_alloc(mem_size, align);
if (!arena->ptes)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, mem_size, align);
arena = memblock_alloc_or_panic(sizeof(*arena), SMP_CACHE_BYTES);
arena->ptes = memblock_alloc_or_panic(mem_size, align);
spin_lock_init(&arena->lock);
arena->hose = hose;

View File

@ -10,7 +10,6 @@
#include <linux/preempt.h>
#include <asm/fpu.h>
#include <asm/thread_info.h>
#include <asm/fpu.h>
#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67)
#define STT(reg,val) asm volatile ("ftoit $f"#reg",%0" : "=r"(val));

View File

@ -42,7 +42,7 @@ pgd_alloc(struct mm_struct *mm)
{
pgd_t *ret, *init;
ret = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
ret = __pgd_alloc(mm, 0);
init = pgd_offset(&init_mm, 0UL);
if (ret) {
#ifdef CONFIG_ALPHA_LARGE_VMALLOC

View File

@ -53,19 +53,14 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *ret = (pgd_t *) __get_free_page(GFP_KERNEL);
pgd_t *ret = __pgd_alloc(mm, 0);
if (ret) {
int num, num2;
num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE;
memzero(ret, num * sizeof(pgd_t));
num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE;
num2 = VMALLOC_SIZE / PGDIR_SIZE;
memcpy(ret + num, swapper_pg_dir + num, num2 * sizeof(pgd_t));
memzero(ret + num + num2,
(PTRS_PER_PGD - num - num2) * sizeof(pgd_t));
}
return ret;
}

View File

@ -200,7 +200,6 @@ int misaligned_fixup(unsigned long address, struct pt_regs *regs,
struct callee_regs *cregs)
{
struct disasm_state state;
char buf[TASK_COMM_LEN];
/* handle user mode only and only if enabled by sysadmin */
if (!user_mode(regs) || !unaligned_enabled)
@ -212,11 +211,11 @@ int misaligned_fixup(unsigned long address, struct pt_regs *regs,
" performance significantly\n. To enable further"
" logging of such instances, please \n"
" echo 0 > /proc/sys/kernel/ignore-unaligned-usertrap\n",
get_task_comm(buf, current), task_pid_nr(current));
current->comm, task_pid_nr(current));
} else {
/* Add rate limiting if it gets down to it */
pr_warn("%s(%d): unaligned access to/from 0x%lx by PC: 0x%lx\n",
get_task_comm(buf, current), task_pid_nr(current),
current->comm, task_pid_nr(current),
address, regs->ret);
}

View File

@ -26,14 +26,7 @@
#else /* !CONFIG_MMU */
#include <linux/swap.h>
#include <asm/tlbflush.h>
static inline void __tlb_remove_table(void *_table)
{
free_page_and_swap_cache((struct page *)_table);
}
#include <asm-generic/tlb.h>
static inline void
@ -41,8 +34,6 @@ __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
{
struct ptdesc *ptdesc = page_ptdesc(pte);
pagetable_pte_dtor(ptdesc);
#ifndef CONFIG_ARM_LPAE
/*
* With the classic ARM MMU, a pte page has two corresponding pmd
@ -61,7 +52,6 @@ __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr)
#ifdef CONFIG_ARM_LPAE
struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);
pagetable_pmd_dtor(ptdesc);
tlb_remove_ptdesc(tlb, ptdesc);
#endif
}

View File

@ -880,10 +880,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
*/
boot_alias_start = phys_to_idmap(start);
if (arm_has_idmap_alias() && boot_alias_start != IDMAP_INVALID_ADDR) {
res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES);
if (!res)
panic("%s: Failed to allocate %zu bytes\n",
__func__, sizeof(*res));
res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES);
res->name = "System RAM (boot alias)";
res->start = boot_alias_start;
res->end = phys_to_idmap(res_end);
@ -891,10 +888,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
request_resource(&iomem_resource, res);
}
res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES);
if (!res)
panic("%s: Failed to allocate %zu bytes\n", __func__,
sizeof(*res));
res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES);
res->name = "System RAM";
res->start = start;
res->end = res_end;

View File

@ -31,10 +31,10 @@
/*
* Constants
*/
#define SHARPSL_CHARGE_ON_TIME_INTERVAL (msecs_to_jiffies(1*60*1000)) /* 1 min */
#define SHARPSL_CHARGE_FINISH_TIME (msecs_to_jiffies(10*60*1000)) /* 10 min */
#define SHARPSL_BATCHK_TIME (msecs_to_jiffies(15*1000)) /* 15 sec */
#define SHARPSL_BATCHK_TIME_SUSPEND (60*10) /* 10 min */
#define SHARPSL_CHARGE_ON_TIME_INTERVAL (secs_to_jiffies(60))
#define SHARPSL_CHARGE_FINISH_TIME (secs_to_jiffies(10*60))
#define SHARPSL_BATCHK_TIME (secs_to_jiffies(15))
#define SHARPSL_BATCHK_TIME_SUSPEND (60*10) /* 10 min */
#define SHARPSL_WAIT_CO_TIME 15 /* 15 sec */
#define SHARPSL_WAIT_DISCHARGE_ON 100 /* 100 msec */
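/*
 * The conversion above assumes secs_to_jiffies() reduces to a plain HZ
 * multiplication, roughly as below; the real definition may differ in
 * casting details.
 *
 *	#define secs_to_jiffies(_secs) ((unsigned long)(_secs) * HZ)
 */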

View File

@ -726,13 +726,8 @@ EXPORT_SYMBOL(phys_mem_access_prot);
static void __init *early_alloc(unsigned long sz)
{
void *ptr = memblock_alloc(sz, sz);
return memblock_alloc_or_panic(sz, sz);
if (!ptr)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, sz, sz);
return ptr;
}
static void *__init late_alloc(unsigned long sz)
@ -1027,10 +1022,7 @@ void __init iotable_init(struct map_desc *io_desc, int nr)
if (!nr)
return;
svm = memblock_alloc(sizeof(*svm) * nr, __alignof__(*svm));
if (!svm)
panic("%s: Failed to allocate %zu bytes align=0x%zx\n",
__func__, sizeof(*svm) * nr, __alignof__(*svm));
svm = memblock_alloc_or_panic(sizeof(*svm) * nr, __alignof__(*svm));
for (md = io_desc; nr; md++, nr--) {
create_mapping(md);
@ -1052,10 +1044,7 @@ void __init vm_reserve_area_early(unsigned long addr, unsigned long size,
struct vm_struct *vm;
struct static_vm *svm;
svm = memblock_alloc(sizeof(*svm), __alignof__(*svm));
if (!svm)
panic("%s: Failed to allocate %zu bytes align=0x%zx\n",
__func__, sizeof(*svm), __alignof__(*svm));
svm = memblock_alloc_or_panic(sizeof(*svm), __alignof__(*svm));
vm = &svm->vm;
vm->addr = (void *)addr;

View File

@ -162,10 +162,7 @@ void __init paging_init(const struct machine_desc *mdesc)
mpu_setup();
/* allocate the zero page. */
zero_page = (void *)memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!zero_page)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, PAGE_SIZE, PAGE_SIZE);
zero_page = (void *)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
bootmem_init();

View File

@ -17,11 +17,11 @@
#include "mm.h"
#ifdef CONFIG_ARM_LPAE
#define __pgd_alloc() kmalloc_array(PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL)
#define __pgd_free(pgd) kfree(pgd)
#define _pgd_alloc(mm) kmalloc_array(PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL | __GFP_ZERO)
#define _pgd_free(mm, pgd) kfree(pgd)
#else
#define __pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL, 2)
#define __pgd_free(pgd) free_pages((unsigned long)pgd, 2)
#define _pgd_alloc(mm) __pgd_alloc(mm, 2)
#define _pgd_free(mm, pgd) __pgd_free(mm, pgd)
#endif
/*
@ -35,12 +35,10 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
pmd_t *new_pmd, *init_pmd;
pte_t *new_pte, *init_pte;
new_pgd = __pgd_alloc();
new_pgd = _pgd_alloc(mm);
if (!new_pgd)
goto no_pgd;
memset(new_pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
/*
* Copy over the kernel and IO PGD entries
*/
@ -134,7 +132,7 @@ no_pmd:
no_pud:
p4d_free(mm, new_p4d);
no_p4d:
__pgd_free(new_pgd);
_pgd_free(mm, new_pgd);
no_pgd:
return NULL;
}
@ -207,5 +205,5 @@ no_pgd:
p4d_free(mm, p4d);
}
#endif
__pgd_free(pgd_base);
_pgd_free(mm, pgd_base);
}

View File

@ -85,24 +85,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp)
__pgd_populate(pgdp, __pa(p4dp), pgdval);
}
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
{
gfp_t gfp = GFP_PGTABLE_USER;
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
return (p4d_t *)get_zeroed_page(gfp);
}
static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
if (!pgtable_l5_enabled())
return;
BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
free_page((unsigned long)p4d);
}
#define __p4d_free_tlb(tlb, p4d, addr) p4d_free((tlb)->mm, p4d)
#else
static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot)
{

View File

@ -9,12 +9,7 @@
#define __ASM_TLB_H
#include <linux/pagemap.h>
#include <linux/swap.h>
static inline void __tlb_remove_table(void *_table)
{
free_page_and_swap_cache((struct page *)_table);
}
#define tlb_flush tlb_flush
static void tlb_flush(struct mmu_gather *tlb);
@ -82,7 +77,6 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
{
struct ptdesc *ptdesc = page_ptdesc(pte);
pagetable_pte_dtor(ptdesc);
tlb_remove_ptdesc(tlb, ptdesc);
}
@ -92,7 +86,6 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
{
struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);
pagetable_pmd_dtor(ptdesc);
tlb_remove_ptdesc(tlb, ptdesc);
}
#endif
@ -106,7 +99,19 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
if (!pgtable_l4_enabled())
return;
pagetable_pud_dtor(ptdesc);
tlb_remove_ptdesc(tlb, ptdesc);
}
#endif
#if CONFIG_PGTABLE_LEVELS > 4
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4dp,
unsigned long addr)
{
struct ptdesc *ptdesc = virt_to_ptdesc(p4dp);
if (!pgtable_l5_enabled())
return;
tlb_remove_ptdesc(tlb, ptdesc);
}
#endif

View File

@ -223,9 +223,7 @@ static void __init request_standard_resources(void)
num_standard_resources = memblock.memory.cnt;
res_size = num_standard_resources * sizeof(*standard_resources);
standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES);
if (!standard_resources)
panic("%s: Failed to allocate %zu bytes\n", __func__, res_size);
standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES);
for_each_mem_region(region) {
res = &standard_resources[i++];

View File

@ -33,7 +33,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
gfp_t gfp = GFP_PGTABLE_USER;
if (pgdir_is_page_size())
return (pgd_t *)__get_free_page(gfp);
return __pgd_alloc(mm, 0);
else
return kmem_cache_alloc(pgd_cache, gfp);
}
@ -41,7 +41,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
if (pgdir_is_page_size())
free_page((unsigned long)pgd);
__pgd_free(mm, pgd);
else
kmem_cache_free(pgd_cache, pgd);
}

View File

@ -44,7 +44,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
pgd_t *ret;
pgd_t *init;
ret = (pgd_t *) __get_free_page(GFP_KERNEL);
ret = __pgd_alloc(mm, 0);
if (ret) {
init = pgd_offset(&init_mm, 0UL);
pgd_init((unsigned long *)ret);
@ -63,7 +63,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
#define __pte_free_tlb(tlb, pte, address) \
do { \
pagetable_pte_dtor(page_ptdesc(pte)); \
pagetable_dtor(page_ptdesc(pte)); \
tlb_remove_page_ptdesc(tlb, page_ptdesc(pte)); \
} while (0)

View File

@ -22,7 +22,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
pgd = __pgd_alloc(mm, 0);
/*
* There may be better ways to do this, but to ensure
@ -89,7 +89,7 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
#define __pte_free_tlb(tlb, pte, addr) \
do { \
pagetable_pte_dtor((page_ptdesc(pte))); \
pagetable_dtor((page_ptdesc(pte))); \
tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
} while (0)

View File

@ -113,7 +113,10 @@ CONFIG_ZBUD=y
CONFIG_ZSMALLOC=m
# CONFIG_COMPAT_BRK is not set
CONFIG_MEMORY_HOTPLUG=y
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
# CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE is not set
CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO=y
# CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL is not set
# CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE is not set
CONFIG_MEMORY_HOTREMOVE=y
CONFIG_KSM=y
CONFIG_TRANSPARENT_HUGEPAGE=y

View File

@ -57,7 +57,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
#define __pte_free_tlb(tlb, pte, address) \
do { \
pagetable_pte_dtor(page_ptdesc(pte)); \
pagetable_dtor(page_ptdesc(pte)); \
tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
} while (0)

View File

@ -431,7 +431,7 @@ static void __init resource_init(void)
num_standard_resources = memblock.memory.cnt;
res_size = num_standard_resources * sizeof(*standard_resources);
standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES);
standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES);
for_each_mem_region(region) {
res = &standard_resources[i++];

View File

@ -174,9 +174,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr)
pmd_t *pmd;
if (p4d_none(p4dp_get(p4d))) {
pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!pud)
panic("%s: Failed to allocate memory\n", __func__);
pud = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
p4d_populate(&init_mm, p4d, pud);
#ifndef __PAGETABLE_PUD_FOLDED
pud_init(pud);
@ -185,9 +183,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr)
pud = pud_offset(p4d, addr);
if (pud_none(pudp_get(pud))) {
pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!pmd)
panic("%s: Failed to allocate memory\n", __func__);
pmd = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
pud_populate(&init_mm, pud, pmd);
#ifndef __PAGETABLE_PMD_FOLDED
pmd_init(pmd);
@ -198,10 +194,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr)
if (!pmd_present(pmdp_get(pmd))) {
pte_t *pte;
pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!pte)
panic("%s: Failed to allocate memory\n", __func__);
pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
pmd_populate_kernel(&init_mm, pmd, pte);
kernel_pte_init(pte);
}

View File

@ -23,11 +23,10 @@ EXPORT_SYMBOL(tlb_virt_to_page);
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *init, *ret = NULL;
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
pgd_t *init, *ret;
if (ptdesc) {
ret = (pgd_t *)ptdesc_address(ptdesc);
ret = __pgd_alloc(mm, 0);
if (ret) {
init = pgd_offset(&init_mm, 0UL);
pgd_init(ret);
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,

View File

@ -629,7 +629,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -586,7 +586,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -606,7 +606,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -578,7 +578,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -588,7 +588,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -605,7 +605,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -692,7 +692,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -578,7 +578,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -579,7 +579,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -595,7 +595,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -575,7 +575,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -576,7 +576,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -37,7 +37,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable,
{
struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
pagetable_pte_dtor(ptdesc);
pagetable_dtor(ptdesc);
pagetable_free(ptdesc);
}
@ -61,7 +61,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable)
{
struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
pagetable_pte_dtor(ptdesc);
pagetable_dtor(ptdesc);
pagetable_free(ptdesc);
}
@ -73,7 +73,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable)
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
pagetable_free(virt_to_ptdesc(pgd));
pagetable_dtor_free(virt_to_ptdesc(pgd));
}
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
@ -84,6 +84,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
if (!ptdesc)
return NULL;
pagetable_pgd_ctor(ptdesc);
new_pgd = ptdesc_address(ptdesc);
memcpy(new_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t));

View File

@ -9,9 +9,9 @@ extern void mmu_page_ctor(void *page);
extern void mmu_page_dtor(void *page);
enum m68k_table_types {
TABLE_PGD = 0,
TABLE_PMD = 0, /* same size as PGD */
TABLE_PTE = 1,
TABLE_PGD,
TABLE_PMD,
TABLE_PTE,
};
extern void init_pointer_table(void *table, int type);

View File

@ -19,7 +19,7 @@ extern const char bad_pmd_string[];
#define __pte_free_tlb(tlb, pte, addr) \
do { \
pagetable_pte_dtor(page_ptdesc(pte)); \
pagetable_dtor(page_ptdesc(pte)); \
tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
} while (0)
@ -43,7 +43,7 @@ static inline pgd_t * pgd_alloc(struct mm_struct *mm)
{
pgd_t *new_pgd;
new_pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
new_pgd = __pgd_alloc(mm, 0);
memcpy(new_pgd, swapper_pg_dir, PAGE_SIZE);
memset(new_pgd, 0, (PAGE_OFFSET >> PGDIR_SHIFT));
return new_pgd;

View File

@ -68,10 +68,7 @@ void __init paging_init(void)
high_memory = (void *) end_mem;
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!empty_zero_page)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, PAGE_SIZE, PAGE_SIZE);
empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT;
free_area_init(max_zone_pfn);
}

View File

@ -42,20 +42,14 @@ void __init paging_init(void)
unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
int i;
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!empty_zero_page)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, PAGE_SIZE, PAGE_SIZE);
empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
pg_dir = swapper_pg_dir;
memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir));
size = num_pages * sizeof(pte_t);
size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1);
next_pgtable = (unsigned long) memblock_alloc(size, PAGE_SIZE);
if (!next_pgtable)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, size, PAGE_SIZE);
next_pgtable = (unsigned long) memblock_alloc_or_panic(size, PAGE_SIZE);
pg_dir += PAGE_OFFSET >> PGDIR_SHIFT;

View File

@ -97,17 +97,19 @@ void mmu_page_dtor(void *page)
typedef struct list_head ptable_desc;
static struct list_head ptable_list[2] = {
static struct list_head ptable_list[3] = {
LIST_HEAD_INIT(ptable_list[0]),
LIST_HEAD_INIT(ptable_list[1]),
LIST_HEAD_INIT(ptable_list[2]),
};
#define PD_PTABLE(page) ((ptable_desc *)&(virt_to_page((void *)(page))->lru))
#define PD_PAGE(ptable) (list_entry(ptable, struct page, lru))
#define PD_MARKBITS(dp) (*(unsigned int *)&PD_PAGE(dp)->index)
static const int ptable_shift[2] = {
7+2, /* PGD, PMD */
static const int ptable_shift[3] = {
7+2, /* PGD */
7+2, /* PMD */
6+2, /* PTE */
};
@ -156,12 +158,20 @@ void *get_pointer_table(int type)
if (!(page = (void *)get_zeroed_page(GFP_KERNEL)))
return NULL;
if (type == TABLE_PTE) {
switch (type) {
case TABLE_PTE:
/*
* m68k doesn't have SPLIT_PTE_PTLOCKS for not having
* SMP.
*/
pagetable_pte_ctor(virt_to_ptdesc(page));
break;
case TABLE_PMD:
pagetable_pmd_ctor(virt_to_ptdesc(page));
break;
case TABLE_PGD:
pagetable_pgd_ctor(virt_to_ptdesc(page));
break;
}
mmu_page_ctor(page);
@ -200,8 +210,7 @@ int free_pointer_table(void *table, int type)
/* all tables in page are free, free page */
list_del(dp);
mmu_page_dtor((void *)page);
if (type == TABLE_PTE)
pagetable_pte_dtor(virt_to_ptdesc((void *)page));
pagetable_dtor(virt_to_ptdesc((void *)page));
free_page (page);
return 1;
} else if (ptable_list[type].next != dp) {
@ -491,10 +500,7 @@ void __init paging_init(void)
* initialize the bad page table and bad page to point
* to a couple of allocated pages
*/
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!empty_zero_page)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, PAGE_SIZE, PAGE_SIZE);
empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
/*
* Set up SFC/DFC registers

View File

@ -44,10 +44,7 @@ void __init paging_init(void)
unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, };
unsigned long size;
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!empty_zero_page)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, PAGE_SIZE, PAGE_SIZE);
empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
address = PAGE_OFFSET;
pg_dir = swapper_pg_dir;
@ -57,10 +54,7 @@ void __init paging_init(void)
size = num_pages * sizeof(pte_t);
size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1);
next_pgtable = (unsigned long)memblock_alloc(size, PAGE_SIZE);
if (!next_pgtable)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, size, PAGE_SIZE);
next_pgtable = (unsigned long)memblock_alloc_or_panic(size, PAGE_SIZE);
bootmem_end = (next_pgtable + size + PAGE_SIZE) & PAGE_MASK;
/* Map whole memory from PAGE_OFFSET (0x0E000000) */

View File

@ -252,12 +252,8 @@ void __init dvma_init(void)
list_add(&(hole->list), &hole_list);
iommu_use = memblock_alloc(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long),
iommu_use = memblock_alloc_or_panic(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long),
SMP_CACHE_BYTES);
if (!iommu_use)
panic("%s: Failed to allocate %zu bytes\n", __func__,
IOMMU_TOTAL_ENTRIES * sizeof(unsigned long));
dvma_unmap_iommu(DVMA_START, DVMA_SIZE);
sun3_dvma_init();

View File

@ -21,12 +21,7 @@
extern void __bad_pte(pmd_t *pmd);
static inline pgd_t *get_pgd(void)
{
return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
}
#define pgd_alloc(mm) get_pgd()
#define pgd_alloc(mm) __pgd_alloc(mm, 0)
extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);

View File

@ -15,7 +15,6 @@
#define __HAVE_ARCH_PMD_ALLOC_ONE
#define __HAVE_ARCH_PUD_ALLOC_ONE
#define __HAVE_ARCH_PGD_FREE
#include <asm-generic/pgalloc.h>
static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
@ -49,14 +48,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
extern void pgd_init(void *addr);
extern pgd_t *pgd_alloc(struct mm_struct *mm);
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
pagetable_free(virt_to_ptdesc(pgd));
}
#define __pte_free_tlb(tlb, pte, address) \
do { \
pagetable_pte_dtor(page_ptdesc(pte)); \
pagetable_dtor(page_ptdesc(pte)); \
tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
} while (0)

View File

@ -704,10 +704,7 @@ static void __init resource_init(void)
for_each_mem_range(i, &start, &end) {
struct resource *res;
res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
if (!res)
panic("%s: Failed to allocate %zu bytes\n", __func__,
sizeof(struct resource));
res = memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES);
res->start = start;
/*

View File

@ -11,6 +11,7 @@
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
@ -97,11 +98,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
return -EINTR;
if (IS_ENABLED(CONFIG_MIPS_FP_SUPPORT)) {
unsigned long unused;
/* Map delay slot emulation page */
base = mmap_region(NULL, STACK_TOP, PAGE_SIZE,
VM_READ | VM_EXEC |
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
0, NULL);
base = do_mmap(NULL, STACK_TOP, PAGE_SIZE, PROT_READ | PROT_EXEC,
MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, 0, 0, &unused,
NULL);
if (IS_ERR_VALUE(base)) {
ret = base;
goto out;
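
mmap_region() is a low-level internal that bypasses parts of the mmap path, so the VDSO setup now calls do_mmap(), which takes userspace-style PROT_/MAP_ flags and reports how many pages would need populating. Restating the converted call as a self-contained sketch; the caller must hold the mmap write lock, as the code above does, and the function name is hypothetical:

    #include <linux/mm.h>
    #include <linux/mman.h>

    static unsigned long example_map_emulation_page(void)
    {
        unsigned long populate;

        /* One anonymous read+exec page at a fixed address. */
        return do_mmap(NULL, STACK_TOP, PAGE_SIZE,
                       PROT_READ | PROT_EXEC,
                       MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED,
                       0, 0, &populate, NULL);
    }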

View File

@ -10,12 +10,10 @@
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *init, *ret = NULL;
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM,
PGD_TABLE_ORDER);
pgd_t *init, *ret;
if (ptdesc) {
ret = ptdesc_address(ptdesc);
ret = __pgd_alloc(mm, PGD_TABLE_ORDER);
if (ret) {
init = pgd_offset(&init_mm, 0UL);
pgd_init(ret);
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,

View File

@ -30,7 +30,7 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm);
#define __pte_free_tlb(tlb, pte, addr) \
do { \
pagetable_pte_dtor(page_ptdesc(pte)); \
pagetable_dtor(page_ptdesc(pte)); \
tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
} while (0)

View File

@ -11,6 +11,7 @@
#include <linux/sched.h>
#include <asm/cpuinfo.h>
#include <asm/pgalloc.h>
/* pteaddr:
* ptbase | vpn* | zero
@ -54,7 +55,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *ret, *init;
ret = (pgd_t *) __get_free_page(GFP_KERNEL);
ret = __pgd_alloc(mm, 0);
if (ret) {
init = pgd_offset(&init_mm, 0UL);
pgd_init(ret);

View File

@ -41,15 +41,13 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
*/
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *ret = (pgd_t *)__get_free_page(GFP_KERNEL);
pgd_t *ret = __pgd_alloc(mm, 0);
if (ret) {
memset(ret, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
if (ret)
memcpy(ret + USER_PTRS_PER_PGD,
swapper_pg_dir + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
}
return ret;
}
@ -68,7 +66,7 @@ extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
#define __pte_free_tlb(tlb, pte, addr) \
do { \
pagetable_pte_dtor(page_ptdesc(pte)); \
pagetable_dtor(page_ptdesc(pte)); \
tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
} while (0)

View File

@ -38,10 +38,7 @@ pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm)
if (likely(mem_init_done)) {
pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
} else {
pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!pte)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, PAGE_SIZE, PAGE_SIZE);
pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
}
return pte;
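
The early/late split above survives the conversion: once mem_init_done (the arch's flag) signals that the page allocator is up, get_zeroed_page() serves requests; before that, the memblock path now panics via the helper rather than by hand. Schematically, under the same assumption about memblock_alloc_or_panic():

    static void *example_boot_or_runtime_page(void)
    {
        if (likely(mem_init_done))
            return (void *)get_zeroed_page(GFP_KERNEL);
        /* Buddy allocator not up yet: boot-time memblock path. */
        return memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
    }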

View File

@ -11,27 +11,12 @@
#include <asm/cache.h>
#define __HAVE_ARCH_PMD_ALLOC_ONE
#define __HAVE_ARCH_PMD_FREE
#define __HAVE_ARCH_PGD_FREE
#include <asm-generic/pgalloc.h>
/* Allocate the top level pgd (page directory) */
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pgd = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER);
if (unlikely(pgd == NULL))
return NULL;
memset(pgd, 0, PAGE_SIZE << PGD_TABLE_ORDER);
return pgd;
}
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
free_pages((unsigned long)pgd, PGD_TABLE_ORDER);
return __pgd_alloc(mm, PGD_TABLE_ORDER);
}
#if CONFIG_PGTABLE_LEVELS == 3
@ -46,17 +31,19 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
{
pmd_t *pmd;
struct ptdesc *ptdesc;
gfp_t gfp = GFP_PGTABLE_USER;
pmd = (pmd_t *)__get_free_pages(GFP_PGTABLE_KERNEL, PMD_TABLE_ORDER);
if (likely(pmd))
memset ((void *)pmd, 0, PAGE_SIZE << PMD_TABLE_ORDER);
return pmd;
}
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
free_pages((unsigned long)pmd, PMD_TABLE_ORDER);
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
ptdesc = pagetable_alloc(gfp, PMD_TABLE_ORDER);
if (!ptdesc)
return NULL;
if (!pagetable_pmd_ctor(ptdesc)) {
pagetable_free(ptdesc);
return NULL;
}
return ptdesc_address(ptdesc);
}
#endif
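
With pmd_alloc_one() routed through pagetable_alloc() and pagetable_pmd_ctor(), the free side must pair with the unified destructor so constructor and destructor stay balanced. A sketch of the matching free, under the same assumptions as the hunk above:

    static inline void example_pmd_free(struct mm_struct *mm, pmd_t *pmd)
    {
        struct ptdesc *ptdesc = virt_to_ptdesc(pmd);

        pagetable_dtor(ptdesc);    /* undoes pagetable_pmd_ctor() */
        pagetable_free(ptdesc);
    }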

View File

@ -377,10 +377,8 @@ static void __ref map_pages(unsigned long start_vaddr,
#if CONFIG_PGTABLE_LEVELS == 3
if (pud_none(*pud)) {
pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER,
pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER,
PAGE_SIZE << PMD_TABLE_ORDER);
if (!pmd)
panic("pmd allocation failed.\n");
pud_populate(NULL, pud, pmd);
}
#endif
@ -388,9 +386,7 @@ static void __ref map_pages(unsigned long start_vaddr,
pmd = pmd_offset(pud, vaddr);
for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++, pmd++) {
if (pmd_none(*pmd)) {
pg_table = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!pg_table)
panic("page table allocation failed\n");
pg_table = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
pmd_populate_kernel(NULL, pmd, pg_table);
}
@ -648,9 +644,7 @@ static void __init pagetable_init(void)
}
#endif
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!empty_zero_page)
panic("zero page allocation failed.\n");
empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
}
@ -687,19 +681,15 @@ static void __init fixmap_init(void)
#if CONFIG_PGTABLE_LEVELS == 3
if (pud_none(*pud)) {
pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER,
pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER,
PAGE_SIZE << PMD_TABLE_ORDER);
if (!pmd)
panic("fixmap: pmd allocation failed.\n");
pud_populate(NULL, pud, pmd);
}
#endif
pmd = pmd_offset(pud, addr);
do {
pte_t *pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!pte)
panic("fixmap: pte allocation failed.\n");
pte_t *pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
pmd_populate_kernel(&init_mm, pmd, pte);

View File

@ -451,7 +451,6 @@ CONFIG_TEST_PRINTF=m
CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m

View File

@ -37,6 +37,7 @@ extern void tlb_flush(struct mmu_gather *tlb);
*/
#define tlb_needs_table_invalidate() radix_enabled()
#define __HAVE_ARCH_TLB_REMOVE_TABLE
/* Get the generic bits... */
#include <asm-generic/tlb.h>

View File

@ -1087,12 +1087,10 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char
/* Count and allocate space for cpu features */
of_scan_flat_dt_subnodes(node, count_cpufeatures_subnodes,
&nr_dt_cpu_features);
dt_cpu_features = memblock_alloc(sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, PAGE_SIZE);
if (!dt_cpu_features)
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
__func__,
sizeof(struct dt_cpu_feature) * nr_dt_cpu_features,
PAGE_SIZE);
dt_cpu_features =
memblock_alloc_or_panic(
sizeof(struct dt_cpu_feature) * nr_dt_cpu_features,
PAGE_SIZE);
cpufeatures_setup_start(isa);

View File

@ -213,11 +213,8 @@ pci_create_OF_bus_map(void)
struct property* of_prop;
struct device_node *dn;
of_prop = memblock_alloc(sizeof(struct property) + 256,
of_prop = memblock_alloc_or_panic(sizeof(struct property) + 256,
SMP_CACHE_BYTES);
if (!of_prop)
panic("%s: Failed to allocate %zu bytes\n", __func__,
sizeof(struct property) + 256);
dn = of_find_node_by_path("/");
if (dn) {
memset(of_prop, -1, sizeof(struct property) + 256);

View File

@ -458,11 +458,8 @@ void __init smp_setup_cpu_maps(void)
DBG("smp_setup_cpu_maps()\n");
cpu_to_phys_id = memblock_alloc(nr_cpu_ids * sizeof(u32),
cpu_to_phys_id = memblock_alloc_or_panic(nr_cpu_ids * sizeof(u32),
__alignof__(u32));
if (!cpu_to_phys_id)
panic("%s: Failed to allocate %zu bytes align=0x%zx\n",
__func__, nr_cpu_ids * sizeof(u32), __alignof__(u32));
for_each_node_by_type(dn, "cpu") {
const __be32 *intserv;

View File

@ -140,13 +140,7 @@ arch_initcall(ppc_init);
static void *__init alloc_stack(void)
{
void *ptr = memblock_alloc(THREAD_SIZE, THREAD_ALIGN);
if (!ptr)
panic("cannot allocate %d bytes for stack at %pS\n",
THREAD_SIZE, (void *)_RET_IP_);
return ptr;
return memblock_alloc_or_panic(THREAD_SIZE, THREAD_ALIGN);
}
void __init irqstack_early_init(void)

View File

@ -4957,7 +4957,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
* states are synchronized from L0 to L1. L1 needs to inform L0 about
* MER=1 only when there are pending external interrupts.
* In the above if check, MER bit is set if there are pending
* external interrupts. Hence, explicity mask off MER bit
* external interrupts. Hence, explicitly mask off MER bit
* here as otherwise it may generate spurious interrupts in L2 KVM
* causing an endless loop, which results in L2 guest getting hung.
*/

View File

@ -377,10 +377,7 @@ void __init MMU_init_hw(void)
* Find some memory for the hash table.
*/
if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
Hash = memblock_alloc(Hash_size, Hash_size);
if (!Hash)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, Hash_size, Hash_size);
Hash = memblock_alloc_or_panic(Hash_size, Hash_size);
_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
pr_info("Total memory = %lldMB; using %ldkB for hash table\n",

View File

@ -253,7 +253,7 @@ static void pmd_frag_destroy(void *pmd_frag)
count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
pagetable_pmd_dtor(ptdesc);
pagetable_dtor(ptdesc);
pagetable_free(ptdesc);
}
}
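
powerpc carves several page-table fragments out of one backing page, so the destructor runs only when the last fragment reference drops; that is why pagetable_dtor() sits behind the refcount rather than in each caller. The put side of that pattern, sketched with a hypothetical helper name:

    /* Drop one fragment reference; tear the page down on the last one. */
    static void example_frag_put(void *frag)
    {
        struct ptdesc *ptdesc = virt_to_ptdesc(frag);

        BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
        if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
            pagetable_dtor(ptdesc);
            pagetable_free(ptdesc);
        }
    }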

View File

@ -330,11 +330,7 @@ void __init mmu_partition_table_init(void)
unsigned long ptcr;
/* Initialize the Partition Table with no entries */
partition_tb = memblock_alloc(patb_size, patb_size);
if (!partition_tb)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, patb_size, patb_size);
partition_tb = memblock_alloc_or_panic(patb_size, patb_size);
ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
set_ptcr_when_no_uv(ptcr);
powernv_set_nmmu_ptcr(ptcr);
@ -477,7 +473,7 @@ void pmd_fragment_free(unsigned long *pmd)
BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
pagetable_pmd_dtor(ptdesc);
pagetable_dtor(ptdesc);
pagetable_free(ptdesc);
}
}

View File

@ -40,19 +40,19 @@ static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgpr
pgdp = pgd_offset_k(ea);
p4dp = p4d_offset(pgdp, ea);
if (kasan_pud_table(*p4dp)) {
pudp = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
pudp = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE);
p4d_populate(&init_mm, p4dp, pudp);
}
pudp = pud_offset(p4dp, ea);
if (kasan_pmd_table(*pudp)) {
pmdp = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
pmdp = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE);
pud_populate(&init_mm, pudp, pmdp);
}
pmdp = pmd_offset(pudp, ea);
if (kasan_pte_table(*pmdp)) {
ptep = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
ptep = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE);
pmd_populate_kernel(&init_mm, pmdp, ptep);
}
@ -74,7 +74,7 @@ static void __init kasan_init_phys_region(void *start, void *end)
k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE);
k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE);
va = memblock_alloc(k_end - k_start, PAGE_SIZE);
va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE);
for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE)
kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL);
}

View File

@ -32,7 +32,7 @@ static void __init kasan_init_phys_region(void *start, void *end)
k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE);
k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE);
va = memblock_alloc(k_end - k_start, PAGE_SIZE);
va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE);
for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE)
map_kernel_page(k_cur, __pa(va), PAGE_KERNEL);
}

View File

@ -385,21 +385,11 @@ void __init mmu_context_init(void)
/*
* Allocate the maps used by context management
*/
context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
if (!context_map)
panic("%s: Failed to allocate %zu bytes\n", __func__,
CTX_MAP_SIZE);
context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1),
context_map = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
context_mm = memblock_alloc_or_panic(sizeof(void *) * (LAST_CONTEXT + 1),
SMP_CACHE_BYTES);
if (!context_mm)
panic("%s: Failed to allocate %zu bytes\n", __func__,
sizeof(void *) * (LAST_CONTEXT + 1));
if (IS_ENABLED(CONFIG_SMP)) {
stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
if (!stale_map[boot_cpuid])
panic("%s: Failed to allocate %zu bytes\n", __func__,
CTX_MAP_SIZE);
stale_map[boot_cpuid] = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
"powerpc/mmu/ctx:prepare",
mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead);

View File

@ -25,7 +25,7 @@ void pte_frag_destroy(void *pte_frag)
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
pagetable_pte_dtor(ptdesc);
pagetable_dtor(ptdesc);
pagetable_free(ptdesc);
}
}
@ -111,7 +111,7 @@ static void pte_free_now(struct rcu_head *head)
struct ptdesc *ptdesc;
ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
pagetable_pte_dtor(ptdesc);
pagetable_dtor(ptdesc);
pagetable_free(ptdesc);
}

View File

@ -50,13 +50,8 @@ notrace void __init early_ioremap_init(void)
void __init *early_alloc_pgtable(unsigned long size)
{
void *ptr = memblock_alloc(size, size);
if (!ptr)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, size, size);
return ptr;
return memblock_alloc_or_panic(size, size);
}
pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va)

View File

@ -514,10 +514,7 @@ static int __init core99_nvram_setup(struct device_node *dp, unsigned long addr)
printk(KERN_ERR "nvram: no address\n");
return -EINVAL;
}
nvram_image = memblock_alloc(NVRAM_SIZE, SMP_CACHE_BYTES);
if (!nvram_image)
panic("%s: Failed to allocate %u bytes\n", __func__,
NVRAM_SIZE);
nvram_image = memblock_alloc_or_panic(NVRAM_SIZE, SMP_CACHE_BYTES);
nvram_data = ioremap(addr, NVRAM_SIZE*2);
nvram_naddrs = 1; /* Make sure we get the correct case */

View File

@ -88,26 +88,6 @@ static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
}
}
static void memtrace_clear_range(unsigned long start_pfn,
unsigned long nr_pages)
{
unsigned long pfn;
/* As HIGHMEM does not apply, use clear_page() directly. */
for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
cond_resched();
clear_page(__va(PFN_PHYS(pfn)));
}
/*
* Before we go ahead and use this range as cache inhibited range
* flush the cache.
*/
flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn),
(unsigned long)pfn_to_kaddr(start_pfn + nr_pages),
FLUSH_CHUNK_SIZE);
}
static u64 memtrace_alloc_node(u32 nid, u64 size)
{
const unsigned long nr_pages = PHYS_PFN(size);
@ -119,17 +99,18 @@ static u64 memtrace_alloc_node(u32 nid, u64 size)
* by alloc_contig_pages().
*/
page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE |
__GFP_NOWARN, nid, NULL);
__GFP_NOWARN | __GFP_ZERO, nid, NULL);
if (!page)
return 0;
start_pfn = page_to_pfn(page);
/*
* Clear the range while we still have a linear mapping.
*
* TODO: use __GFP_ZERO with alloc_contig_pages() once supported.
* Before we go ahead and use this range as cache inhibited range
* flush the cache.
*/
memtrace_clear_range(start_pfn, nr_pages);
flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn),
(unsigned long)pfn_to_kaddr(start_pfn + nr_pages),
FLUSH_CHUNK_SIZE);
/*
* Set pages PageOffline(), to indicate that nobody (e.g., hibernation,

View File

@ -180,10 +180,7 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
/*
* Allocate a buffer to hold the MC recoverable ranges.
*/
mc_recoverable_range = memblock_alloc(size, __alignof__(u64));
if (!mc_recoverable_range)
panic("%s: Failed to allocate %u bytes align=0x%lx\n",
__func__, size, __alignof__(u64));
mc_recoverable_range = memblock_alloc_or_panic(size, __alignof__(u64));
for (i = 0; i < mc_recoverable_range_len; i++) {
mc_recoverable_range[i].start_addr =

View File

@ -115,10 +115,7 @@ static void __init prealloc(struct ps3_prealloc *p)
if (!p->size)
return;
p->address = memblock_alloc(p->size, p->align);
if (!p->address)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, p->size, p->align);
p->address = memblock_alloc_or_panic(p->size, p->align);
printk(KERN_INFO "%s: %lu bytes at %p\n", p->name, p->size,
p->address);

View File

@ -544,7 +544,7 @@ static int drc_pmem_query_health(struct papr_scm_priv *p)
/* Jiffies offset for which the health data is assumed to be same */
cache_timeout = p->lasthealth_jiffies +
msecs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL * 1000);
secs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL);
/* Fetch new health info if it's older than MIN_HEALTH_QUERY_INTERVAL */
if (time_after(jiffies, cache_timeout))
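
secs_to_jiffies() expresses second-granularity timeouts directly instead of scaling seconds to milliseconds first; both forms yield the same jiffies value. A quick equivalence sketch (MIN_HEALTH_QUERY_INTERVAL is the driver's constant, in seconds; the function name is hypothetical):

    #include <linux/jiffies.h>

    static unsigned long example_interval(void)
    {
        /* Before: seconds hand-scaled to milliseconds. */
        unsigned long t_old = msecs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL * 1000);
        /* After: the helper takes seconds directly. */
        unsigned long t_new = secs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL);

        WARN_ON(t_old != t_new);    /* identical by construction */
        return t_new;
    }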

View File

@ -124,10 +124,7 @@ int __ref msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count,
if (bmp->bitmap_from_slab)
bmp->bitmap = kzalloc(size, GFP_KERNEL);
else {
bmp->bitmap = memblock_alloc(size, SMP_CACHE_BYTES);
if (!bmp->bitmap)
panic("%s: Failed to allocate %u bytes\n", __func__,
size);
bmp->bitmap = memblock_alloc_or_panic(size, SMP_CACHE_BYTES);
/* the bitmap won't be freed from memblock allocator */
kmemleak_not_leak(bmp->bitmap);
}

View File

@ -12,16 +12,25 @@
#include <asm/tlb.h>
#ifdef CONFIG_MMU
#define __HAVE_ARCH_PUD_ALLOC_ONE
#define __HAVE_ARCH_PUD_FREE
#include <asm-generic/pgalloc.h>
/*
* While riscv platforms with riscv_ipi_for_rfence as true require an IPI to
* perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use
* SBI to perform TLB shootdown. To keep software pagetable walkers safe in this
* case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the
* comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
* for more details.
*/
static inline void riscv_tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt)
{
if (riscv_use_sbi_for_rfence())
if (riscv_use_sbi_for_rfence()) {
tlb_remove_ptdesc(tlb, pt);
else
} else {
pagetable_dtor(pt);
tlb_remove_page_ptdesc(tlb, pt);
}
}
static inline void pmd_populate_kernel(struct mm_struct *mm,
@ -88,15 +97,6 @@ static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd,
}
}
#define pud_alloc_one pud_alloc_one
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
if (pgtable_l4_enabled)
return __pud_alloc_one(mm, addr);
return NULL;
}
#define pud_free pud_free
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
@ -107,39 +107,8 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr)
{
if (pgtable_l4_enabled) {
struct ptdesc *ptdesc = virt_to_ptdesc(pud);
pagetable_pud_dtor(ptdesc);
riscv_tlb_remove_ptdesc(tlb, ptdesc);
}
}
#define p4d_alloc_one p4d_alloc_one
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
{
if (pgtable_l5_enabled) {
gfp_t gfp = GFP_PGTABLE_USER;
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
return (p4d_t *)get_zeroed_page(gfp);
}
return NULL;
}
static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
free_page((unsigned long)p4d);
}
#define p4d_free p4d_free
static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
if (pgtable_l5_enabled)
__p4d_free(mm, p4d);
if (pgtable_l4_enabled)
riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
}
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
@ -161,9 +130,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
pgd = __pgd_alloc(mm, 0);
if (likely(pgd != NULL)) {
memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
/* Copy kernel mappings */
sync_kernel_mappings(pgd);
}
@ -175,10 +143,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
unsigned long addr)
{
struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
pagetable_pmd_dtor(ptdesc);
riscv_tlb_remove_ptdesc(tlb, ptdesc);
riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
}
#endif /* __PAGETABLE_PMD_FOLDED */
@ -186,10 +151,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
unsigned long addr)
{
struct ptdesc *ptdesc = page_ptdesc(pte);
pagetable_pte_dtor(ptdesc);
riscv_tlb_remove_ptdesc(tlb, ptdesc);
riscv_tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}
#endif /* CONFIG_MMU */
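
Folding pagetable_dtor() into riscv_tlb_remove_ptdesc() lets every level's __*_free_tlb() collapse to one call, as the hunks above show. An annotated restatement of the helper's two paths, with comments added; the logic is the same as in the diff:

    static inline void example_remove_table(struct mmu_gather *tlb, void *pt)
    {
        if (riscv_use_sbi_for_rfence()) {
            /* SBI shootdown sends no IPI, so nothing serializes
             * against lockless walkers: free the table via RCU. */
            tlb_remove_ptdesc(tlb, pt);
        } else {
            /* IPI shootdown already serializes; destruct now and
             * queue the page for batched freeing. */
            pagetable_dtor(pt);
            tlb_remove_page_ptdesc(tlb, pt);
        }
    }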

View File

@ -10,24 +10,6 @@ struct mmu_gather;
static void tlb_flush(struct mmu_gather *tlb);
#ifdef CONFIG_MMU
#include <linux/swap.h>
/*
* While riscv platforms with riscv_ipi_for_rfence as true require an IPI to
* perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use
* SBI to perform TLB shootdown. To keep software pagetable walkers safe in this
* case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the
* comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
* for more details.
*/
static inline void __tlb_remove_table(void *table)
{
free_page_and_swap_cache(table);
}
#endif /* CONFIG_MMU */
#define tlb_flush tlb_flush
#include <asm-generic/tlb.h>

View File

@ -147,9 +147,7 @@ static void __init init_resources(void)
res_idx = num_resources - 1;
mem_res_sz = num_resources * sizeof(*mem_res);
mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES);
if (!mem_res)
panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz);
mem_res = memblock_alloc_or_panic(mem_res_sz, SMP_CACHE_BYTES);
/*
* Start by adding the reserved regions, if they overlap

Some files were not shown because too many files have changed in this diff Show More