mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-15 02:05:33 +00:00
Merge branch 'mm-everything' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
This commit is contained in:
commit
ada24f0158
1
.mailmap
1
.mailmap
@ -410,6 +410,7 @@ Liam Mark <quic_lmark@quicinc.com> <lmark@codeaurora.org>
|
||||
Linas Vepstas <linas@austin.ibm.com>
|
||||
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
|
||||
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
|
||||
Linus Lüssing <linus.luessing@c0d3.blue> <ll@simonwunderlich.de>
|
||||
<linux-hardening@vger.kernel.org> <kernel-hardening@lists.openwall.com>
|
||||
Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
|
||||
Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
|
||||
|
2
CREDITS
2
CREDITS
@ -4339,7 +4339,7 @@ D: Freescale Highspeed USB device driver
|
||||
D: Freescale QE SoC support and Ethernet driver
|
||||
S: B-1206 Jingmao Guojigongyu
|
||||
S: 16 Baliqiao Nanjie, Beijing 101100
|
||||
S: People's Repulic of China
|
||||
S: People's Republic of China
|
||||
|
||||
N: Vlad Yasevich
|
||||
E: vyasevich@gmail.com
|
||||
|
@ -355,10 +355,15 @@ Description: If 'target' is written to the 'type' file, writing to or
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/matching
|
||||
Date: Dec 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing 'Y' or 'N' to this file sets whether to filter out
|
||||
pages that do or do not match to the 'type' and 'memcg_path',
|
||||
respectively. Filter out means the action of the scheme will
|
||||
not be applied to.
|
||||
Description: Writing 'Y' or 'N' to this file sets whether the filter is for
|
||||
the memory of the 'type', or all except the 'type'.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/allow
|
||||
Date: Jan 2025
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing 'Y' or 'N' to this file sets whether to allow or reject
|
||||
applying the scheme's action to the memory that satisfies the
|
||||
'type' and the 'matching' of the directory.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/nr_tried
|
||||
Date: Mar 2022
|
||||
@ -384,6 +389,12 @@ Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the total size of regions that the
|
||||
action of the scheme has successfully applied in bytes.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/sz_ops_filter_passed
|
||||
Date: Dec 2024
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the total size of memory that passed
|
||||
DAMON operations layer-handled filters of the scheme in bytes.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/qt_exceeds
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
@ -424,3 +435,10 @@ Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the 'age' of a memory region that
|
||||
corresponding DAMON-based Operation Scheme's action has tried
|
||||
to be applied.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/sz_filter_passed
|
||||
Date: Dec 2024
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the size of the memory in the region
|
||||
that passed DAMON operations layer-handled filters of the
|
||||
scheme in bytes.
|
||||
|
@ -100,29 +100,29 @@ Get delays, since system boot, for pid 10::
|
||||
# ./getdelays -d -p 10
|
||||
(output similar to next case)
|
||||
|
||||
Get sum of delays, since system boot, for all pids with tgid 5::
|
||||
Get sum and peak of delays, since system boot, for all pids with tgid 242::
|
||||
|
||||
# ./getdelays -d -t 5
|
||||
bash-4.4# ./getdelays -d -t 242
|
||||
print delayacct stats ON
|
||||
TGID 5
|
||||
TGID 242
|
||||
|
||||
|
||||
CPU count real total virtual total delay total delay average
|
||||
8 7000000 6872122 3382277 0.423ms
|
||||
IO count delay total delay average
|
||||
0 0 0.000ms
|
||||
SWAP count delay total delay average
|
||||
0 0 0.000ms
|
||||
RECLAIM count delay total delay average
|
||||
0 0 0.000ms
|
||||
THRASHING count delay total delay average
|
||||
0 0 0.000ms
|
||||
COMPACT count delay total delay average
|
||||
0 0 0.000ms
|
||||
WPCOPY count delay total delay average
|
||||
0 0 0.000ms
|
||||
IRQ count delay total delay average
|
||||
0 0 0.000ms
|
||||
CPU count real total virtual total delay total delay average delay max delay min
|
||||
39 156000000 156576579 2111069 0.054ms 0.212296ms 0.031307ms
|
||||
IO count delay total delay average delay max delay min
|
||||
0 0 0.000ms 0.000000ms 0.000000ms
|
||||
SWAP count delay total delay average delay max delay min
|
||||
0 0 0.000ms 0.000000ms 0.000000ms
|
||||
RECLAIM count delay total delay average delay max delay min
|
||||
0 0 0.000ms 0.000000ms 0.000000ms
|
||||
THRASHING count delay total delay average delay max delay min
|
||||
0 0 0.000ms 0.000000ms 0.000000ms
|
||||
COMPACT count delay total delay average delay max delay min
|
||||
0 0 0.000ms 0.000000ms 0.000000ms
|
||||
WPCOPY count delay total delay average delay max delay min
|
||||
156 11215873 0.072ms 0.207403ms 0.033913ms
|
||||
IRQ count delay total delay average delay max delay min
|
||||
0 0 0.000ms 0.000000ms 0.000000ms
|
||||
|
||||
Get IO accounting for pid 1, it works only with -p::
|
||||
|
||||
|
@ -3351,8 +3351,8 @@
|
||||
[KNL] Set the initial state for the memory hotplug
|
||||
onlining policy. If not specified, the default value is
|
||||
set according to the
|
||||
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
|
||||
option.
|
||||
CONFIG_MHP_DEFAULT_ONLINE_TYPE kernel config
|
||||
options.
|
||||
See Documentation/admin-guide/mm/memory-hotplug.rst.
|
||||
|
||||
memmap=exactmap [KNL,X86,EARLY] Enable setting of an exact
|
||||
@ -6992,6 +6992,13 @@
|
||||
See Documentation/admin-guide/mm/transhuge.rst
|
||||
for more details.
|
||||
|
||||
transparent_hugepage_tmpfs= [KNL]
|
||||
Format: [always|within_size|advise|never]
|
||||
Can be used to control the default hugepage allocation policy
|
||||
for the tmpfs mount.
|
||||
See Documentation/admin-guide/mm/transhuge.rst
|
||||
for more details.
|
||||
|
||||
trusted.source= [KEYS]
|
||||
Format: <string>
|
||||
This parameter identifies the trust source as a backend
|
||||
|
@ -42,32 +42,45 @@ the execution. ::
|
||||
|
||||
$ git clone https://github.com/sjp38/masim; cd masim; make
|
||||
$ sudo damo start "./masim ./configs/stairs.cfg --quiet"
|
||||
$ sudo ./damo show
|
||||
0 addr [85.541 TiB , 85.541 TiB ) (57.707 MiB ) access 0 % age 10.400 s
|
||||
1 addr [85.541 TiB , 85.542 TiB ) (413.285 MiB) access 0 % age 11.400 s
|
||||
2 addr [127.649 TiB , 127.649 TiB) (57.500 MiB ) access 0 % age 1.600 s
|
||||
3 addr [127.649 TiB , 127.649 TiB) (32.500 MiB ) access 0 % age 500 ms
|
||||
4 addr [127.649 TiB , 127.649 TiB) (9.535 MiB ) access 100 % age 300 ms
|
||||
5 addr [127.649 TiB , 127.649 TiB) (8.000 KiB ) access 60 % age 0 ns
|
||||
6 addr [127.649 TiB , 127.649 TiB) (6.926 MiB ) access 0 % age 1 s
|
||||
7 addr [127.998 TiB , 127.998 TiB) (120.000 KiB) access 0 % age 11.100 s
|
||||
8 addr [127.998 TiB , 127.998 TiB) (8.000 KiB ) access 40 % age 100 ms
|
||||
9 addr [127.998 TiB , 127.998 TiB) (4.000 KiB ) access 0 % age 11 s
|
||||
total size: 577.590 MiB
|
||||
$ sudo ./damo stop
|
||||
$ sudo damo report access
|
||||
heatmap: 641111111000000000000000000000000000000000000000000000[...]33333333333333335557984444[...]7
|
||||
# min/max temperatures: -1,840,000,000, 370,010,000, column size: 3.925 MiB
|
||||
0 addr 86.182 TiB size 8.000 KiB access 0 % age 14.900 s
|
||||
1 addr 86.182 TiB size 8.000 KiB access 60 % age 0 ns
|
||||
2 addr 86.182 TiB size 3.422 MiB access 0 % age 4.100 s
|
||||
3 addr 86.182 TiB size 2.004 MiB access 95 % age 2.200 s
|
||||
4 addr 86.182 TiB size 29.688 MiB access 0 % age 14.100 s
|
||||
5 addr 86.182 TiB size 29.516 MiB access 0 % age 16.700 s
|
||||
6 addr 86.182 TiB size 29.633 MiB access 0 % age 17.900 s
|
||||
7 addr 86.182 TiB size 117.652 MiB access 0 % age 18.400 s
|
||||
8 addr 126.990 TiB size 62.332 MiB access 0 % age 9.500 s
|
||||
9 addr 126.990 TiB size 13.980 MiB access 0 % age 5.200 s
|
||||
10 addr 126.990 TiB size 9.539 MiB access 100 % age 3.700 s
|
||||
11 addr 126.990 TiB size 16.098 MiB access 0 % age 6.400 s
|
||||
12 addr 127.987 TiB size 132.000 KiB access 0 % age 2.900 s
|
||||
total size: 314.008 MiB
|
||||
$ sudo damo stop
|
||||
|
||||
The first command of the above example downloads and builds an artificial
|
||||
memory access generator program called ``masim``. The second command asks DAMO
|
||||
to execute the artificial generator process start via the given command and
|
||||
make DAMON monitors the generator process. The third command retrieves the
|
||||
current snapshot of the monitored access pattern of the process from DAMON and
|
||||
shows the pattern in a human readable format.
|
||||
to start the program via the given command and make DAMON monitor the newly
|
||||
started process. The third command retrieves the current snapshot of the
|
||||
monitored access pattern of the process from DAMON and shows the pattern in a
|
||||
human readable format.
|
||||
|
||||
Each line of the output shows which virtual address range (``addr [XX, XX)``)
|
||||
of the process is how frequently (``access XX %``) accessed for how long time
|
||||
(``age XX``). For example, the fifth region of ~9 MiB size is being most
|
||||
frequently accessed for last 300 milliseconds. Finally, the fourth command
|
||||
stops DAMON.
|
||||
The first line of the output shows the relative access temperature (hotness) of
|
||||
the regions in a single row heatmap format.  Each column on the heatmap
|
||||
represents regions of same size on the monitored virtual address space. The
|
||||
position of the column on the row and the number on the column represent the
|
||||
relative location and access temperature of the region. ``[...]`` means
|
||||
unmapped huge regions on the virtual address spaces. The second line shows
|
||||
additional information for better understanding the heatmap.
|
||||
|
||||
Each line of the output from the third line shows which virtual address range
|
||||
(``addr XX size XX``) of the process is how frequently (``access XX %``)
|
||||
accessed for how long time (``age XX``).  For example, the eleventh region of
|
||||
~9.5 MiB size is being most frequently accessed for last 3.7 seconds. Finally,
|
||||
the fourth command stops DAMON.
|
||||
|
||||
Note that DAMON can monitor not only virtual address spaces but multiple types
|
||||
of address spaces including the physical address space.
|
||||
@ -95,7 +108,7 @@ Visualizing Recorded Patterns
|
||||
You can visualize the pattern in a heatmap, showing which memory region
|
||||
(x-axis) got accessed when (y-axis) and how frequently (number).::
|
||||
|
||||
$ sudo damo report heats --heatmap stdout
|
||||
$ sudo damo report heatmap
|
||||
22222222222222222222222222222222222222211111111111111111111111111111111111111100
|
||||
44444444444444444444444444444444444444434444444444444444444444444444444444443200
|
||||
44444444444444444444444444444444444444433444444444444444444444444444444444444200
|
||||
@ -160,6 +173,6 @@ Data Access Pattern Aware Memory Management
|
||||
Below command makes every memory region of size >=4K that has not been accessed for
|
||||
>=60 seconds in your workload to be swapped out. ::
|
||||
|
||||
$ sudo damo schemes --damos_access_rate 0 0 --damos_sz_region 4K max \
|
||||
$ sudo damo start --damos_access_rate 0 0 --damos_sz_region 4K max \
|
||||
--damos_age 60s max --damos_action pageout \
|
||||
<pid of your workload>
|
||||
|
@ -26,12 +26,6 @@ DAMON provides below interfaces for different users.
|
||||
writing kernel space DAMON application programs for you. You can even extend
|
||||
DAMON for various address spaces. For detail, please refer to the interface
|
||||
:doc:`document </mm/damon/api>`.
|
||||
- *debugfs interface. (DEPRECATED!)*
|
||||
:ref:`This <debugfs_interface>` is almost identical to :ref:`sysfs interface
|
||||
<sysfs_interface>`. This is deprecated, so users should move to the
|
||||
:ref:`sysfs interface <sysfs_interface>`. If you depend on this and cannot
|
||||
move, please report your usecase to damon@lists.linux.dev and
|
||||
linux-mm@kvack.org.
|
||||
|
||||
.. _sysfs_interface:
|
||||
|
||||
@ -89,10 +83,10 @@ comma (",").
|
||||
│ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value
|
||||
│ │ │ │ │ │ │ :ref:`watermarks <sysfs_watermarks>`/metric,interval_us,high,mid,low
|
||||
│ │ │ │ │ │ │ :ref:`filters <sysfs_filters>`/nr_filters
|
||||
│ │ │ │ │ │ │ │ 0/type,matching,memcg_id
|
||||
│ │ │ │ │ │ │ :ref:`stats <sysfs_schemes_stats>`/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
|
||||
│ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx
|
||||
│ │ │ │ │ │ │ :ref:`stats <sysfs_schemes_stats>`/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds
|
||||
│ │ │ │ │ │ │ :ref:`tried_regions <sysfs_schemes_tried_regions>`/total_bytes
|
||||
│ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age
|
||||
│ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age,sz_filter_passed
|
||||
│ │ │ │ │ │ │ │ ...
|
||||
│ │ │ │ │ │ ...
|
||||
│ │ │ │ ...
|
||||
@ -412,59 +406,62 @@ number (``N``) to the file creates the number of child directories named ``0``
|
||||
to ``N-1``. Each directory represents each filter. The filters are evaluated
|
||||
in the numeric order.
|
||||
|
||||
Each filter directory contains six files, namely ``type``, ``matcing``,
|
||||
``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``. To ``type``
|
||||
file, you can write one of five special keywords: ``anon`` for anonymous pages,
|
||||
``memcg`` for specific memory cgroup, ``young`` for young pages, ``addr`` for
|
||||
specific address range (an open-ended interval), or ``target`` for specific
|
||||
DAMON monitoring target filtering. In case of the memory cgroup filtering, you
|
||||
can specify the memory cgroup of the interest by writing the path of the memory
|
||||
cgroup from the cgroups mount point to ``memcg_path`` file. In case of the
|
||||
address range filtering, you can specify the start and end address of the range
|
||||
to ``addr_start`` and ``addr_end`` files, respectively. For the DAMON
|
||||
monitoring target filtering, you can specify the index of the target between
|
||||
the list of the DAMON context's monitoring targets list to ``target_idx`` file.
|
||||
You can write ``Y`` or ``N`` to ``matching`` file to filter out pages that does
|
||||
or does not match to the type, respectively. Then, the scheme's action will
|
||||
not be applied to the pages that specified to be filtered out.
|
||||
Each filter directory contains seven files, namely ``type``, ``matching``,
|
||||
``allow``, ``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``.
|
||||
To ``type`` file, you can write one of five special keywords: ``anon`` for
|
||||
anonymous pages, ``memcg`` for specific memory cgroup, ``young`` for young
|
||||
pages, ``addr`` for specific address range (an open-ended interval), or
|
||||
``target`` for specific DAMON monitoring target filtering.  The meanings of the
|
||||
types are the same as the description in the :ref:`design doc
|
||||
<damon_design_damos_filters>`.
|
||||
|
||||
In case of the memory cgroup filtering, you can specify the memory cgroup of
|
||||
the interest by writing the path of the memory cgroup from the cgroups mount
|
||||
point to ``memcg_path`` file. In case of the address range filtering, you can
|
||||
specify the start and end address of the range to ``addr_start`` and
|
||||
``addr_end`` files, respectively. For the DAMON monitoring target filtering,
|
||||
you can specify the index of the target between the list of the DAMON context's
|
||||
monitoring targets list to ``target_idx`` file.
|
||||
|
||||
You can write ``Y`` or ``N`` to ``matching`` file to specify whether the filter
|
||||
is for memory that matches the ``type``. You can write ``Y`` or ``N`` to
|
||||
``allow`` file to specify if applying the action to the memory that satisfies
|
||||
the ``type`` and ``matching`` should be allowed or not.
|
||||
|
||||
For example, below restricts a DAMOS action to be applied to only non-anonymous
|
||||
pages of all memory cgroups except ``/having_care_already``.::
|
||||
|
||||
# echo 2 > nr_filters
|
||||
# # filter out anonymous pages
|
||||
# # disallow anonymous pages
|
||||
echo anon > 0/type
|
||||
echo Y > 0/matching
|
||||
echo N > 0/allow
|
||||
# # further filter out all cgroups except one at '/having_care_already'
|
||||
echo memcg > 1/type
|
||||
echo /having_care_already > 1/memcg_path
|
||||
echo Y > 1/matching
|
||||
echo N > 1/allow
|
||||
|
||||
Note that ``anon`` and ``memcg`` filters are currently supported only when
|
||||
``paddr`` :ref:`implementation <sysfs_context>` is being used.
|
||||
|
||||
Also, memory regions that are filtered out by ``addr`` or ``target`` filters
|
||||
are not counted as the scheme has tried to those, while regions that filtered
|
||||
out by other type filters are counted as the scheme has tried to. The
|
||||
difference is applied to :ref:`stats <damos_stats>` and
|
||||
:ref:`tried regions <sysfs_schemes_tried_regions>`.
|
||||
Refer to the :ref:`DAMOS filters design documentation
|
||||
<damon_design_damos_filters>` for more details including how multiple filters
|
||||
of different ``allow`` work, when each of the filters is supported, and
|
||||
differences on stats.
|
||||
|
||||
.. _sysfs_schemes_stats:
|
||||
|
||||
schemes/<N>/stats/
|
||||
------------------
|
||||
|
||||
DAMON counts the total number and bytes of regions that each scheme is tried to
|
||||
be applied, the two numbers for the regions that each scheme is successfully
|
||||
applied, and the total number of the quota limit exceeds. This statistics can
|
||||
be used for online analysis or tuning of the schemes.
|
||||
DAMON counts statistics for each scheme.  These statistics can be used for
|
||||
online analysis or tuning of the schemes. Refer to :ref:`design doc
|
||||
<damon_design_damos_stat>` for more details about the stats.
|
||||
|
||||
The statistics can be retrieved by reading the files under ``stats`` directory
|
||||
(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, and
|
||||
``qt_exceeds``), respectively. The files are not updated in real time, so you
|
||||
should ask DAMON sysfs interface to update the content of the files for the
|
||||
stats by writing a special keyword, ``update_schemes_stats`` to the relevant
|
||||
``kdamonds/<N>/state`` file.
|
||||
(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``,
|
||||
``sz_ops_filter_passed``, and ``qt_exceeds``), respectively. The files are not
|
||||
updated in real time, so you should ask DAMON sysfs interface to update the
|
||||
content of the files for the stats by writing a special keyword,
|
||||
``update_schemes_stats`` to the relevant ``kdamonds/<N>/state`` file.
|
||||
|
||||
.. _sysfs_schemes_tried_regions:
|
||||
|
||||
@ -501,10 +498,10 @@ set the ``access pattern`` as their interested pattern that they want to query.
|
||||
tried_regions/<N>/
|
||||
------------------
|
||||
|
||||
In each region directory, you will find four files (``start``, ``end``,
|
||||
``nr_accesses``, and ``age``). Reading the files will show the start and end
|
||||
addresses, ``nr_accesses``, and ``age`` of the region that corresponding
|
||||
DAMON-based operation scheme ``action`` has tried to be applied.
|
||||
In each region directory, you will find five files (``start``, ``end``,
|
||||
``nr_accesses``, ``age``, and ``sz_filter_passed``). Reading the files will
|
||||
show the properties of the region that corresponding DAMON-based operation
|
||||
scheme ``action`` has tried to be applied.
|
||||
|
||||
Example
|
||||
~~~~~~~
|
||||
@ -600,306 +597,3 @@ fields are as usual. It shows the index of the DAMON context (``ctx_idx=X``)
|
||||
of the scheme in the list of the contexts of the context's kdamond, the index
|
||||
of the scheme (``scheme_idx=X``) in the list of the schemes of the context, in
|
||||
addition to the output of ``damon_aggregated`` tracepoint.
|
||||
|
||||
|
||||
.. _debugfs_interface:
|
||||
|
||||
debugfs Interface (DEPRECATED!)
|
||||
===============================
|
||||
|
||||
.. note::
|
||||
|
||||
THIS IS DEPRECATED!
|
||||
|
||||
DAMON debugfs interface is deprecated, so users should move to the
|
||||
:ref:`sysfs interface <sysfs_interface>`. If you depend on this and cannot
|
||||
move, please report your usecase to damon@lists.linux.dev and
|
||||
linux-mm@kvack.org.
|
||||
|
||||
DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``,
|
||||
``init_regions``, ``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``,
|
||||
``mk_contexts`` and ``rm_contexts`` under its debugfs directory,
|
||||
``<debugfs>/damon/``.
|
||||
|
||||
|
||||
``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation
|
||||
notice. Reading it returns the deprecation notice, as below::
|
||||
|
||||
# cat DEPRECATED
|
||||
DAMON debugfs interface is deprecated, so users should move to DAMON_SYSFS. If you cannot, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
|
||||
|
||||
|
||||
Attributes
|
||||
----------
|
||||
|
||||
Users can get and set the ``sampling interval``, ``aggregation interval``,
|
||||
``update interval``, and min/max number of monitoring target regions by
|
||||
reading from and writing to the ``attrs`` file. To know about the monitoring
|
||||
attributes in detail, please refer to the :doc:`/mm/damon/design`. For
|
||||
example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and
|
||||
1000, and then check it again::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo 5000 100000 1000000 10 1000 > attrs
|
||||
# cat attrs
|
||||
5000 100000 1000000 10 1000
|
||||
|
||||
|
||||
Target IDs
|
||||
----------
|
||||
|
||||
Some types of address spaces supports multiple monitoring target. For example,
|
||||
the virtual memory address spaces monitoring can have multiple processes as the
|
||||
monitoring targets. Users can set the targets by writing relevant id values of
|
||||
the targets to, and get the ids of the current targets by reading from the
|
||||
``target_ids`` file. In case of the virtual address spaces monitoring, the
|
||||
values should be pids of the monitoring target processes. For example, below
|
||||
commands set processes having pids 42 and 4242 as the monitoring targets and
|
||||
check it again::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo 42 4242 > target_ids
|
||||
# cat target_ids
|
||||
42 4242
|
||||
|
||||
Users can also monitor the physical memory address space of the system by
|
||||
writing a special keyword, "``paddr\n``" to the file. Because physical address
|
||||
space monitoring doesn't support multiple targets, reading the file will show a
|
||||
fake value, ``42``, as below::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo paddr > target_ids
|
||||
# cat target_ids
|
||||
42
|
||||
|
||||
Note that setting the target ids doesn't start the monitoring.
|
||||
|
||||
|
||||
Initial Monitoring Target Regions
|
||||
---------------------------------
|
||||
|
||||
In case of the virtual address space monitoring, DAMON automatically sets and
|
||||
updates the monitoring target regions so that entire memory mappings of target
|
||||
processes can be covered. However, users can want to limit the monitoring
|
||||
region to specific address ranges, such as the heap, the stack, or specific
|
||||
file-mapped area. Or, some users can know the initial access pattern of their
|
||||
workloads and therefore want to set optimal initial regions for the 'adaptive
|
||||
regions adjustment'.
|
||||
|
||||
In contrast, DAMON do not automatically sets and updates the monitoring target
|
||||
regions in case of physical memory monitoring. Therefore, users should set the
|
||||
monitoring target regions by themselves.
|
||||
|
||||
In such cases, users can explicitly set the initial monitoring target regions
|
||||
as they want, by writing proper values to the ``init_regions`` file. The input
|
||||
should be a sequence of three integers separated by white spaces that represent
|
||||
one region in below form.::
|
||||
|
||||
<target idx> <start address> <end address>
|
||||
|
||||
The ``target idx`` should be the index of the target in ``target_ids`` file,
|
||||
starting from ``0``, and the regions should be passed in address order. For
|
||||
example, below commands will set a couple of address ranges, ``1-100`` and
|
||||
``100-200`` as the initial monitoring target region of pid 42, which is the
|
||||
first one (index ``0``) in ``target_ids``, and another couple of address
|
||||
ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one
|
||||
(index ``1``) in ``target_ids``.::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# cat target_ids
|
||||
42 4242
|
||||
# echo "0 1 100 \
|
||||
0 100 200 \
|
||||
1 20 40 \
|
||||
1 50 100" > init_regions
|
||||
|
||||
Note that this sets the initial monitoring target regions only. In case of
|
||||
virtual memory monitoring, DAMON will automatically updates the boundary of the
|
||||
regions after one ``update interval``. Therefore, users should set the
|
||||
``update interval`` large enough in this case, if they don't want the
|
||||
update.
|
||||
|
||||
|
||||
Schemes
|
||||
-------
|
||||
|
||||
Users can get and set the DAMON-based operation :ref:`schemes
|
||||
<damon_design_damos>` by reading from and writing to ``schemes`` debugfs file.
|
||||
Reading the file also shows the statistics of each scheme. To the file, each
|
||||
of the schemes should be represented in each line in below form::
|
||||
|
||||
<target access pattern> <action> <quota> <watermarks>
|
||||
|
||||
You can disable schemes by simply writing an empty string to the file.
|
||||
|
||||
Target Access Pattern
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The target access :ref:`pattern <damon_design_damos_access_pattern>` of the
|
||||
scheme. The ``<target access pattern>`` is constructed with three ranges in
|
||||
below form::
|
||||
|
||||
min-size max-size min-acc max-acc min-age max-age
|
||||
|
||||
Specifically, bytes for the size of regions (``min-size`` and ``max-size``),
|
||||
number of monitored accesses per aggregate interval for access frequency
|
||||
(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of
|
||||
regions (``min-age`` and ``max-age``) are specified. Note that the ranges are
|
||||
closed interval.
|
||||
|
||||
Action
|
||||
~~~~~~
|
||||
|
||||
The ``<action>`` is a predefined integer for memory management :ref:`actions
|
||||
<damon_design_damos_action>`. The mapping between the ``<action>`` values and
|
||||
the memory management actions is as below. For the detailed meaning of the
|
||||
action and DAMON operations set supporting each action, please refer to the
|
||||
list on :ref:`design doc <damon_design_damos_action>`.
|
||||
|
||||
- 0: ``willneed``
|
||||
- 1: ``cold``
|
||||
- 2: ``pageout``
|
||||
- 3: ``hugepage``
|
||||
- 4: ``nohugepage``
|
||||
- 5: ``stat``
|
||||
|
||||
Quota
|
||||
~~~~~
|
||||
|
||||
Users can set the :ref:`quotas <damon_design_damos_quotas>` of the given scheme
|
||||
via the ``<quota>`` in below form::
|
||||
|
||||
<ms> <sz> <reset interval> <priority weights>
|
||||
|
||||
This makes DAMON to try to use only up to ``<ms>`` milliseconds for applying
|
||||
the action to memory regions of the ``target access pattern`` within the
|
||||
``<reset interval>`` milliseconds, and to apply the action to only up to
|
||||
``<sz>`` bytes of memory regions within the ``<reset interval>``. Setting both
|
||||
``<ms>`` and ``<sz>`` zero disables the quota limits.
|
||||
|
||||
For the :ref:`prioritization <damon_design_damos_quotas_prioritization>`, users
|
||||
can set the weights for the three properties in ``<priority weights>`` in below
|
||||
form::
|
||||
|
||||
<size weight> <access frequency weight> <age weight>
|
||||
|
||||
Watermarks
|
||||
~~~~~~~~~~
|
||||
|
||||
Users can specify :ref:`watermarks <damon_design_damos_watermarks>` of the
|
||||
given scheme via ``<watermarks>`` in below form::
|
||||
|
||||
<metric> <check interval> <high mark> <middle mark> <low mark>
|
||||
|
||||
``<metric>`` is a predefined integer for the metric to be checked. The
|
||||
supported numbers and their meanings are as below.
|
||||
|
||||
- 0: Ignore the watermarks
|
||||
- 1: System's free memory rate (per thousand)
|
||||
|
||||
The value of the metric is checked every ``<check interval>`` microseconds.
|
||||
|
||||
If the value is higher than ``<high mark>`` or lower than ``<low mark>``, the
|
||||
scheme is deactivated. If the value is lower than ``<mid mark>``, the scheme
|
||||
is activated.
|
||||
|
||||
.. _damos_stats:
|
||||
|
||||
Statistics
|
||||
~~~~~~~~~~
|
||||
|
||||
It also counts the total number and bytes of regions that each scheme is tried
|
||||
to be applied, the two numbers for the regions that each scheme is successfully
|
||||
applied, and the total number of the quota limit exceeds. This statistics can
|
||||
be used for online analysis or tuning of the schemes.
|
||||
|
||||
The statistics can be shown by reading the ``schemes`` file. Reading the file
|
||||
will show each scheme you entered in each line, and the five numbers for the
|
||||
statistics will be added at the end of each line.
|
||||
|
||||
Example
|
||||
~~~~~~~
|
||||
|
||||
Below commands applies a scheme saying "If a memory region of size in [4KiB,
|
||||
8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
|
||||
interval in [10, 20], page out the region. For the paging out, use only up to
|
||||
10ms per second, and also don't page out more than 1GiB per second. Under the
|
||||
limitation, page out memory regions having longer age first. Also, check the
|
||||
free memory rate of the system every 5 seconds, start the monitoring and paging
|
||||
out when the free memory rate becomes lower than 50%, but stop it if the free
|
||||
memory rate becomes larger than 60%, or lower than 30%".::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# scheme="4096 8192 0 5 10 20 2" # target access pattern and action
|
||||
# scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
|
||||
# scheme+=" 0 0 100" # prioritization weights
|
||||
# scheme+=" 1 5000000 600 500 300" # watermarks
|
||||
# echo "$scheme" > schemes
|
||||
|
||||
|
||||
Turning On/Off
|
||||
--------------
|
||||
|
||||
Setting the files as described above doesn't incur effect unless you explicitly
|
||||
start the monitoring. You can start, stop, and check the current status of the
|
||||
monitoring by writing to and reading from the ``monitor_on_DEPRECATED`` file.
|
||||
Writing ``on`` to the file starts the monitoring of the targets with the
|
||||
attributes. Writing ``off`` to the file stops those. DAMON also stops if
|
||||
every target process is terminated. Below example commands turn on, off, and
|
||||
check the status of DAMON::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo on > monitor_on_DEPRECATED
|
||||
# echo off > monitor_on_DEPRECATED
|
||||
# cat monitor_on_DEPRECATED
|
||||
off
|
||||
|
||||
Please note that you cannot write to the above-mentioned debugfs files while
|
||||
the monitoring is turned on. If you write to the files while DAMON is running,
|
||||
an error code such as ``-EBUSY`` will be returned.
|
||||
|
||||
|
||||
Monitoring Thread PID
|
||||
---------------------
|
||||
|
||||
DAMON does requested monitoring with a kernel thread called ``kdamond``. You
|
||||
can get the pid of the thread by reading the ``kdamond_pid`` file. When the
|
||||
monitoring is turned off, reading the file returns ``none``. ::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# cat monitor_on_DEPRECATED
|
||||
off
|
||||
# cat kdamond_pid
|
||||
none
|
||||
# echo on > monitor_on_DEPRECATED
|
||||
# cat kdamond_pid
|
||||
18594
|
||||
|
||||
|
||||
Using Multiple Monitoring Threads
|
||||
---------------------------------
|
||||
|
||||
One ``kdamond`` thread is created for each monitoring context. You can create
|
||||
and remove monitoring contexts for multiple ``kdamond`` required use case using
|
||||
the ``mk_contexts`` and ``rm_contexts`` files.
|
||||
|
||||
Writing the name of the new context to the ``mk_contexts`` file creates a
|
||||
directory of the name on the DAMON debugfs directory. The directory will have
|
||||
DAMON debugfs files for the context. ::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# ls foo
|
||||
# ls: cannot access 'foo': No such file or directory
|
||||
# echo foo > mk_contexts
|
||||
# ls foo
|
||||
# attrs init_regions kdamond_pid schemes target_ids
|
||||
|
||||
If the context is not needed anymore, you can remove it and the corresponding
|
||||
directory by putting the name of the context to the ``rm_contexts`` file. ::
|
||||
|
||||
# echo foo > rm_contexts
|
||||
# ls foo
|
||||
# ls: cannot access 'foo': No such file or directory
|
||||
|
||||
Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files
|
||||
are in the root directory only.
|
||||
|
@ -280,8 +280,8 @@ The following files are currently defined:
|
||||
blocks; configure auto-onlining.
|
||||
|
||||
The default value depends on the
|
||||
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel configuration
|
||||
option.
|
||||
CONFIG_MHP_DEFAULT_ONLINE_TYPE kernel configuration
|
||||
options.
|
||||
|
||||
See the ``state`` property of memory blocks for details.
|
||||
``block_size_bytes`` read-only: the size in bytes of a memory block.
|
||||
|
@ -332,6 +332,12 @@ allocation policy for the internal shmem mount by using the kernel parameter
|
||||
six valid policies for shmem (``always``, ``within_size``, ``advise``,
|
||||
``never``, ``deny``, and ``force``).
|
||||
|
||||
Similarly to ``transparent_hugepage_shmem``, you can control the default
|
||||
hugepage allocation policy for the tmpfs mount by using the kernel parameter
|
||||
``transparent_hugepage_tmpfs=<policy>``, where ``<policy>`` is one of the
|
||||
four valid policies for tmpfs (``always``, ``within_size``, ``advise``,
|
||||
``never``). The tmpfs mount default policy is ``never``.
|
||||
|
||||
In the same manner as ``thp_anon`` controls each supported anonymous THP
|
||||
size, ``thp_shmem`` controls each supported shmem THP size. ``thp_shmem``
|
||||
has the same format as ``thp_anon``, but also supports the policy
|
||||
@ -352,8 +358,21 @@ default to ``never``.
|
||||
Hugepages in tmpfs/shmem
|
||||
========================
|
||||
|
||||
You can control hugepage allocation policy in tmpfs with mount option
|
||||
``huge=``. It can have following values:
|
||||
Traditionally, tmpfs only supported a single huge page size ("PMD"). Today,
|
||||
it also supports smaller sizes just like anonymous memory, often referred
|
||||
to as "multi-size THP" (mTHP). Huge pages of any size are commonly
|
||||
represented in the kernel as "large folios".
|
||||
|
||||
While there is fine control over the huge page sizes to use for the internal
|
||||
shmem mount (see below), ordinary tmpfs mounts will make use of all available
|
||||
huge page sizes without any control over the exact sizes, behaving more like
|
||||
other file systems.
|
||||
|
||||
tmpfs mounts
|
||||
------------
|
||||
|
||||
The THP allocation policy for tmpfs mounts can be adjusted using the mount
|
||||
option: ``huge=``. It can have following values:
|
||||
|
||||
always
|
||||
Attempt to allocate huge pages every time we need a new page;
|
||||
@ -363,24 +382,24 @@ never
|
||||
|
||||
within_size
|
||||
Only allocate huge page if it will be fully within i_size.
|
||||
Also respect fadvise()/madvise() hints;
|
||||
Also respect madvise() hints;
|
||||
|
||||
advise
|
||||
Only allocate huge pages if requested with fadvise()/madvise();
|
||||
Only allocate huge pages if requested with madvise();
|
||||
|
||||
The default policy is ``never``.
|
||||
Remember, that the kernel may use huge pages of all available sizes, and
|
||||
that no fine control as for the internal tmpfs mount is available.
|
||||
|
||||
The default policy in the past was ``never``, but it can now be adjusted
|
||||
using the kernel parameter ``transparent_hugepage_tmpfs=<policy>``.
|
||||
|
||||
``mount -o remount,huge= /mountpoint`` works fine after mount: remounting
|
||||
``huge=never`` will not attempt to break up huge pages at all, just stop more
|
||||
from being allocated.
|
||||
|
||||
There's also sysfs knob to control hugepage allocation policy for internal
|
||||
shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
|
||||
is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
|
||||
MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
|
||||
|
||||
In addition to policies listed above, shmem_enabled allows two further
|
||||
values:
|
||||
In addition to policies listed above, the sysfs knob
|
||||
/sys/kernel/mm/transparent_hugepage/shmem_enabled will affect the
|
||||
allocation policy of tmpfs mounts, when set to the following values:
|
||||
|
||||
deny
|
||||
For use in emergencies, to force the huge option off from
|
||||
@ -388,13 +407,24 @@ deny
|
||||
force
|
||||
Force the huge option on for all - very useful for testing;
|
||||
|
||||
Shmem can also use "multi-size THP" (mTHP) by adding a new sysfs knob to
|
||||
control mTHP allocation:
|
||||
'/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/shmem_enabled',
|
||||
and its value for each mTHP is essentially consistent with the global
|
||||
setting. An 'inherit' option is added to ensure compatibility with these
|
||||
global settings. Conversely, the options 'force' and 'deny' are dropped,
|
||||
which are rather testing artifacts from the old ages.
|
||||
shmem / internal tmpfs
|
||||
----------------------
|
||||
The internal tmpfs mount is used for SysV SHM, memfds, shared anonymous
|
||||
mmaps (of /dev/zero or MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
|
||||
|
||||
To control the THP allocation policy for this internal tmpfs mount, the
|
||||
sysfs knob /sys/kernel/mm/transparent_hugepage/shmem_enabled and the knobs
|
||||
per THP size in
|
||||
'/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/shmem_enabled'
|
||||
can be used.
|
||||
|
||||
The global knob has the same semantics as the ``huge=`` mount options
|
||||
for tmpfs mounts, except that the different huge page sizes can be controlled
|
||||
individually, and will only use the setting of the global knob when the
|
||||
per-size knob is set to 'inherit'.
|
||||
|
||||
The options 'force' and 'deny' are dropped for the individual sizes, which
|
||||
are rather testing artifacts from the old ages.
|
||||
|
||||
always
|
||||
Attempt to allocate <size> huge pages every time we need a new page;
|
||||
@ -408,10 +438,10 @@ never
|
||||
|
||||
within_size
|
||||
Only allocate <size> huge page if it will be fully within i_size.
|
||||
Also respect fadvise()/madvise() hints;
|
||||
Also respect madvise() hints;
|
||||
|
||||
advise
|
||||
Only allocate <size> huge pages if requested with fadvise()/madvise();
|
||||
Only allocate <size> huge pages if requested with madvise();
|
||||
|
||||
Need of application restart
|
||||
===========================
|
||||
@ -561,6 +591,16 @@ swpin
|
||||
is incremented every time a huge page is swapped in from a non-zswap
|
||||
swap device in one piece.
|
||||
|
||||
swpin_fallback
|
||||
is incremented if swapin fails to allocate or charge a huge page
|
||||
and instead falls back to using huge pages with lower orders or
|
||||
small pages.
|
||||
|
||||
swpin_fallback_charge
|
||||
is incremented if swapin fails to charge a huge page and instead
|
||||
falls back to using huge pages with lower orders or small pages
|
||||
even though the allocation was successful.
|
||||
|
||||
swpout
|
||||
is incremented every time a huge page is swapped out to a non-zswap
|
||||
swap device in one piece without splitting.
|
||||
|
@ -4,6 +4,8 @@
|
||||
Min Heap API
|
||||
============
|
||||
|
||||
:Author: Kuan-Wei Chiu <visitorckw@gmail.com>
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
|
@ -42,8 +42,8 @@ call xa_tag_pointer() to create an entry with a tag, xa_untag_pointer()
|
||||
to turn a tagged entry back into an untagged pointer and xa_pointer_tag()
|
||||
to retrieve the tag of an entry. Tagged pointers use the same bits that
|
||||
are used to distinguish value entries from normal pointers, so you must
|
||||
decide whether they want to store value entries or tagged pointers in
|
||||
any particular XArray.
|
||||
decide whether you want to store value entries or tagged pointers in any
|
||||
particular XArray.
|
||||
|
||||
The XArray does not support storing IS_ERR() pointers as some
|
||||
conflict with value entries or internal entries.
|
||||
@ -52,8 +52,9 @@ An unusual feature of the XArray is the ability to create entries which
|
||||
occupy a range of indices. Once stored to, looking up any index in
|
||||
the range will return the same entry as looking up any other index in
|
||||
the range. Storing to any index will store to all of them. Multi-index
|
||||
entries can be explicitly split into smaller entries, or storing ``NULL``
|
||||
into any entry will cause the XArray to forget about the range.
|
||||
entries can be explicitly split into smaller entries. Unsetting (using
|
||||
xa_erase() or xa_store() with ``NULL``) any entry will cause the XArray
|
||||
to forget about the range.
|
||||
|
||||
Normal API
|
||||
==========
|
||||
@ -63,13 +64,14 @@ for statically allocated XArrays or xa_init() for dynamically
|
||||
allocated ones. A freshly-initialised XArray contains a ``NULL``
|
||||
pointer at every index.
|
||||
|
||||
You can then set entries using xa_store() and get entries
|
||||
using xa_load(). xa_store will overwrite any entry with the
|
||||
new entry and return the previous entry stored at that index. You can
|
||||
use xa_erase() instead of calling xa_store() with a
|
||||
``NULL`` entry. There is no difference between an entry that has never
|
||||
been stored to, one that has been erased and one that has most recently
|
||||
had ``NULL`` stored to it.
|
||||
You can then set entries using xa_store() and get entries using
|
||||
xa_load(). xa_store() will overwrite any entry with the new entry and
|
||||
return the previous entry stored at that index. You can unset entries
|
||||
using xa_erase() or by setting the entry to ``NULL`` using xa_store().
|
||||
There is no difference between an entry that has never been stored to
|
||||
and one that has been erased with xa_erase(); an entry that has most
|
||||
recently had ``NULL`` stored to it is also equivalent except if the
|
||||
XArray was initialized with ``XA_FLAGS_ALLOC``.
|
||||
|
||||
You can conditionally replace an entry at an index by using
|
||||
xa_cmpxchg(). Like cmpxchg(), it will only succeed if
|
||||
|
@ -48,6 +48,7 @@ fixes/update part 1.1 Stefani Seibold <stefani@seibold.net> June 9 2009
|
||||
3.11 /proc/<pid>/patch_state - Livepatch patch operation state
|
||||
3.12 /proc/<pid>/arch_status - Task architecture specific information
|
||||
3.13 /proc/<pid>/fd - List of symlinks to open files
|
||||
3.14 /proc/<pid>/ksm_stat - Information about the process's ksm status.
|
||||
|
||||
4 Configuring procfs
|
||||
4.1 Mount options
|
||||
@ -484,14 +485,15 @@ Memory Area, or VMA) there is a series of lines such as the following::
|
||||
THPeligible: 0
|
||||
VmFlags: rd ex mr mw me dw
|
||||
|
||||
The first of these lines shows the same information as is displayed for the
|
||||
mapping in /proc/PID/maps. Following lines show the size of the mapping
|
||||
(size); the size of each page allocated when backing a VMA (KernelPageSize),
|
||||
which is usually the same as the size in the page table entries; the page size
|
||||
used by the MMU when backing a VMA (in most cases, the same as KernelPageSize);
|
||||
the amount of the mapping that is currently resident in RAM (RSS); the
|
||||
process' proportional share of this mapping (PSS); and the number of clean and
|
||||
dirty shared and private pages in the mapping.
|
||||
The first of these lines shows the same information as is displayed for
|
||||
the mapping in /proc/PID/maps. Following lines show the size of the
|
||||
mapping (size); the size of each page allocated when backing a VMA
|
||||
(KernelPageSize), which is usually the same as the size in the page table
|
||||
entries; the page size used by the MMU when backing a VMA (in most cases,
|
||||
the same as KernelPageSize); the amount of the mapping that is currently
|
||||
resident in RAM (RSS); the process's proportional share of this mapping
|
||||
(PSS); and the number of clean and dirty shared and private pages in the
|
||||
mapping.
|
||||
|
||||
The "proportional set size" (PSS) of a process is the count of pages it has
|
||||
in memory, where each page is divided by the number of processes sharing it.
|
||||
@ -2232,6 +2234,74 @@ The number of open files for the process is stored in 'size' member
|
||||
of stat() output for /proc/<pid>/fd for fast access.
|
||||
-------------------------------------------------------
|
||||
|
||||
3.14 /proc/<pid>/ksm_stat - Information about the process's ksm status
|
||||
----------------------------------------------------------------------
|
||||
When CONFIG_KSM is enabled, each process has this file which displays
|
||||
the information of ksm merging status.
|
||||
|
||||
Example
|
||||
~~~~~~~
|
||||
|
||||
::
|
||||
|
||||
/ # cat /proc/self/ksm_stat
|
||||
ksm_rmap_items 0
|
||||
ksm_zero_pages 0
|
||||
ksm_merging_pages 0
|
||||
ksm_process_profit 0
|
||||
ksm_merge_any: no
|
||||
ksm_mergeable: no
|
||||
|
||||
Description
|
||||
~~~~~~~~~~~
|
||||
|
||||
ksm_rmap_items
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
The number of ksm_rmap_item structures in use. The structure of
|
||||
ksm_rmap_item is to store the reverse mapping information for virtual
|
||||
addresses. KSM will generate a ksm_rmap_item for each ksm-scanned page
|
||||
of the process.
|
||||
|
||||
ksm_zero_pages
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
When /sys/kernel/mm/ksm/use_zero_pages is enabled, it represents how many
|
||||
empty pages are merged with kernel zero pages by KSM.
|
||||
|
||||
ksm_merging_pages
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
It represents how many pages of this process are involved in KSM merging
|
||||
(not including ksm_zero_pages). It is the same as what
|
||||
/proc/<pid>/ksm_merging_pages shows.
|
||||
|
||||
ksm_process_profit
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The profit that KSM brings (Saved bytes). KSM can save memory by merging
|
||||
identical pages, but also can consume additional memory, because it needs
|
||||
to generate a number of rmap_items to save each scanned page's brief rmap
|
||||
information. Some of these pages may be merged, but some may not be able
|
||||
to be merged after being checked several times, which are unprofitable
|
||||
memory consumed.
|
||||
|
||||
ksm_merge_any
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
It specifies whether the process's mm is added by prctl() into the
|
||||
candidate list of KSM or not, and KSM scanning is fully enabled at process
|
||||
level.
|
||||
|
||||
ksm_mergeable
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
It specifies whether any VMAs of the process's mm are currently applicable
|
||||
to KSM.
|
||||
|
||||
More information about KSM can be found at
|
||||
Documentation/admin-guide/mm/ksm.rst.
|
||||
|
||||
|
||||
Chapter 4: Configuring procfs
|
||||
=============================
|
||||
@ -2261,7 +2331,7 @@ arguments are now protected against local eavesdroppers.
|
||||
hidepid=invisible or hidepid=2 means hidepid=1 plus all /proc/<pid>/ will be
|
||||
fully invisible to other users. It doesn't mean that it hides a fact whether a
|
||||
process with a specific pid value exists (it can be learned by other means, e.g.
|
||||
by "kill -0 $PID"), but it hides process' uid and gid, which may be learned by
|
||||
by "kill -0 $PID"), but it hides process's uid and gid, which may be learned by
|
||||
stat()'ing /proc/<pid>/ otherwise. It greatly complicates an intruder's task of
|
||||
gathering information about running processes, whether some daemon runs with
|
||||
elevated privileges, whether other user runs some sensitive program, whether
|
||||
|
@ -6,7 +6,7 @@ Squashfs 4.0 Filesystem
|
||||
|
||||
Squashfs is a compressed read-only filesystem for Linux.
|
||||
|
||||
It uses zlib, lz4, lzo, or xz compression to compress files, inodes and
|
||||
It uses zlib, lz4, lzo, xz or zstd compression to compress files, inodes and
|
||||
directories. Inodes in the system are very small and all blocks are packed to
|
||||
minimise data overhead. Block sizes greater than 4K are supported up to a
|
||||
maximum of 1Mbytes (default block size 128K).
|
||||
@ -16,8 +16,8 @@ use (i.e. in cases where a .tar.gz file may be used), and in constrained
|
||||
block device/memory systems (e.g. embedded systems) where low overhead is
|
||||
needed.
|
||||
|
||||
Mailing list: squashfs-devel@lists.sourceforge.net
|
||||
Web site: www.squashfs.org
|
||||
Mailing list (kernel code): linux-fsdevel@vger.kernel.org
|
||||
Web site: github.com/plougher/squashfs-tools
|
||||
|
||||
1. Filesystem Features
|
||||
----------------------
|
||||
@ -58,11 +58,9 @@ inodes have different sizes).
|
||||
|
||||
As squashfs is a read-only filesystem, the mksquashfs program must be used to
|
||||
create populated squashfs filesystems. This and other squashfs utilities
|
||||
can be obtained from http://www.squashfs.org. Usage instructions can be
|
||||
obtained from this site also.
|
||||
|
||||
The squashfs-tools development tree is now located on kernel.org
|
||||
git://git.kernel.org/pub/scm/fs/squashfs/squashfs-tools.git
|
||||
are very likely packaged by your linux distribution (called squashfs-tools).
|
||||
The source code can be obtained from github.com/plougher/squashfs-tools.
|
||||
Usage instructions can also be obtained from this site.
|
||||
|
||||
2.1 Mount options
|
||||
-----------------
|
||||
|
@ -203,6 +203,8 @@ This scheme, however, cannot preserve the quality of the output if the
|
||||
assumption is not guaranteed.
|
||||
|
||||
|
||||
.. _damon_design_adaptive_regions_adjustment:
|
||||
|
||||
Adaptive Regions Adjustment
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -264,6 +266,61 @@ tracepoints. For more details, please refer to the documentations for
|
||||
respectively.
|
||||
|
||||
|
||||
.. _damon_design_monitoring_params_tuning_guide:
|
||||
|
||||
Monitoring Parameters Tuning Guide
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
In short, set ``aggregation interval`` to capture a meaningful amount of accesses
|
||||
for the purpose. The amount of accesses can be measured using ``nr_accesses``
|
||||
and ``age`` of regions in the aggregated monitoring results snapshot. The
|
||||
default value of the interval, ``100ms``, turns out to be too short in many
|
||||
cases. Set ``sampling interval`` proportional to ``aggregation interval``. By
|
||||
default, ``1/20`` is recommended as the ratio.
|
||||
|
||||
``Aggregation interval`` should be set as the time interval that the workload
|
||||
can make an amount of accesses for the monitoring purpose, within the interval.
|
||||
If the interval is too short, only a small number of accesses are captured. As a
|
||||
result, the monitoring results look like everything is equally accessed only rarely.
|
||||
For many purposes, that would be useless. If it is too long, however, the time
|
||||
to converge regions with the :ref:`regions adjustment mechanism
|
||||
<damon_design_adaptive_regions_adjustment>` can be too long, depending on the
|
||||
time scale of the given purpose. This could happen if the workload is actually
|
||||
making only rare accesses but the user thinks the amount of accesses for the
|
||||
monitoring purpose is too high. For such cases, the target amount of access to
|
||||
capture per ``aggregation interval`` should be carefully reconsidered. Also, note
|
||||
that the captured amount of accesses is represented with not only
|
||||
``nr_accesses``, but also ``age``. For example, even if every region on the
|
||||
monitoring results show zero ``nr_accesses``, regions could still be
|
||||
distinguished using ``age`` values as the recency information.
|
||||
|
||||
Hence the optimum value of ``aggregation interval`` depends on the access
|
||||
intensiveness of the workload. The user should tune the interval based on the
|
||||
amount of access that is captured on each aggregated snapshot of the monitoring
|
||||
results.
|
||||
|
||||
Note that the default value of the interval is 100 milliseconds, which is too
|
||||
short in many cases, especially on large systems.
|
||||
|
||||
``Sampling interval`` defines the resolution of each aggregation. If it is set
|
||||
too large, monitoring results will look like every region was equally rarely
|
||||
accessed, or equally frequently accessed. That is, regions become
|
||||
indistinguishable based on access pattern, and therefore the results will be
|
||||
useless in many use cases. If ``sampling interval`` is too small, it will not
|
||||
degrade the resolution, but will increase the monitoring overhead. If it is
|
||||
appropriate enough to provide a resolution of the monitoring results that
|
||||
is sufficient for the given purpose, it shouldn't be unnecessarily further
|
||||
lowered. It is recommended to be set proportional to ``aggregation interval``.
|
||||
By default, the ratio is set as ``1/20``, and it is still recommended.
|
||||
|
||||
Refer to below documents for an example tuning based on the above guide.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
monitoring_intervals_tuning_example
|
||||
|
||||
|
||||
.. _damon_design_damos:
|
||||
|
||||
Operation Schemes
|
||||
@ -504,9 +561,34 @@ have a list of latency-critical processes.
|
||||
|
||||
To let users optimize DAMOS schemes with such special knowledge, DAMOS provides
|
||||
a feature called DAMOS filters. The feature allows users to set an arbitrary
|
||||
number of filters for each scheme. Each filter specifies the type of target
|
||||
memory, and whether it should exclude the memory of the type (filter-out), or
|
||||
all except the memory of the type (filter-in).
|
||||
number of filters for each scheme. Each filter specifies
|
||||
|
||||
- a type of memory (``type``),
|
||||
- whether it is for the memory of the type or all except the type
|
||||
(``matching``), and
|
||||
- whether it is to allow (include) or reject (exclude) applying
|
||||
the scheme's action to the memory (``allow``).
|
||||
|
||||
When multiple filters are installed, each filter is evaluated in the installed
|
||||
order. If a part of memory is matched to one of the filters, next filters are
|
||||
ignored. If the memory passes through the filters evaluation stage because it
|
||||
is not matched to any of the filters, applying the scheme's action to it is
|
||||
allowed, same as the behavior when no filter exists.
|
||||
|
||||
For example, let's assume 1) a filter for allowing anonymous pages and 2)
|
||||
another filter for rejecting young pages are installed in the order. If a page
|
||||
of a region that is eligible to apply the scheme's action is an anonymous page,
|
||||
the scheme's action will be applied to the page regardless of whether it is
|
||||
young or not, since it matches with the first allow-filter. If the page is
|
||||
not anonymous but young, the scheme's action will not be applied, since the
|
||||
second reject-filter blocks it. If the page is neither anonymous nor young,
|
||||
the page will pass through the filters evaluation stage since there is no
|
||||
matching filter, and the action will be applied to the page.
|
||||
|
||||
Note that the action can equally be applied to memory that is either explicitly
|
||||
filter-allowed or passed the filters evaluation stage. It means that installing
|
||||
allow-filters at the end of the list makes no practical change but only
|
||||
filters-checking overhead.
|
||||
|
||||
For efficient handling of filters, some types of filters are handled by the
|
||||
core layer, while others are handled by operations set. In the latter case,
|
||||
@ -516,7 +598,7 @@ filter are not counted as the scheme has tried to the region. In contrast, if
|
||||
a memory region is filtered by an operations set layer-handled filter, it is
|
||||
counted as the scheme has tried. This difference affects the statistics.
|
||||
|
||||
Below types of filters are currently supported.
|
||||
Below ``type`` of filters are currently supported.
|
||||
|
||||
- anonymous page
|
||||
- Applied to pages containing data that is not stored in files.
|
||||
@ -539,6 +621,60 @@ To know how user-space can set the watermarks via :ref:`DAMON sysfs interface
|
||||
<sysfs_interface>`, refer to :ref:`filters <sysfs_filters>` part of the
|
||||
documentation.
|
||||
|
||||
.. _damon_design_damos_stat:
|
||||
|
||||
Statistics
|
||||
~~~~~~~~~~
|
||||
|
||||
The statistics of DAMOS behaviors that are designed to help monitoring, tuning and
|
||||
debugging of DAMOS.
|
||||
|
||||
DAMOS accounts below statistics for each scheme, from the beginning of the
|
||||
scheme's execution.
|
||||
|
||||
- ``nr_tried``: Total number of regions that the scheme is tried to be applied.
|
||||
- ``sz_tried``: Total size of regions that the scheme is tried to be applied.
|
||||
- ``sz_ops_filter_passed``: Total bytes that passed operations set
|
||||
layer-handled DAMOS filters.
|
||||
- ``nr_applied``: Total number of regions that the scheme is applied.
|
||||
- ``sz_applied``: Total size of regions that the scheme is applied.
|
||||
- ``qt_exceeds``: Total number of times the quota of the scheme has exceeded.
|
||||
|
||||
"A scheme is tried to be applied to a region" means DAMOS core logic determined
|
||||
the region is eligible to apply the scheme's :ref:`action
|
||||
<damon_design_damos_action>`. The :ref:`access pattern
|
||||
<damon_design_damos_access_pattern>`, :ref:`quotas
|
||||
<damon_design_damos_quotas>`, :ref:`watermarks
|
||||
<damon_design_damos_watermarks>`, and :ref:`filters
|
||||
<damon_design_damos_filters>` that handled on core logic could affect this.
|
||||
The core logic will only ask the underlying :ref:`operation set
|
||||
<damon_operations_set>` to apply the action to the region, so whether the
|
||||
action is really applied or not is unclear. That's why it is called "tried".
|
||||
|
||||
"A scheme is applied to a region" means the :ref:`operation set
|
||||
<damon_operations_set>` has applied the action to at least a part of the
|
||||
region. The :ref:`filters <damon_design_damos_filters>` that are handled by the
|
||||
operation set, and the types of the :ref:`action <damon_design_damos_action>`
|
||||
and the pages of the region can affect this. For example, if a filter is set
|
||||
to exclude anonymous pages and the region has only anonymous pages, or if the
|
||||
action is ``pageout`` while all pages of the region are unreclaimable, applying
|
||||
the action to the region will fail.
|
||||
|
||||
To know how user-space can read the stats via :ref:`DAMON sysfs interface
|
||||
<sysfs_interface>`, refer to :ref:`stats <sysfs_stats>` part of the
|
||||
documentation.
|
||||
|
||||
Regions Walking
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
DAMOS feature allowing users to access each region that a DAMOS action has just
|
||||
applied. Using this feature, DAMON :ref:`API <damon_design_api>` allows users
|
||||
to access full properties of the regions including the access monitoring results
|
||||
and amount of the region's internal memory that passed the DAMOS filters.
|
||||
:ref:`DAMON sysfs interface <sysfs_interface>` also allows users to read the data
|
||||
via special :ref:`files <sysfs_schemes_tried_regions>`.
|
||||
|
||||
.. _damon_design_api:
|
||||
|
||||
Application Programming Interface
|
||||
---------------------------------
|
||||
@ -573,15 +709,11 @@ General Purpose User Interface Modules
|
||||
DAMON modules that provide user space ABIs for general purpose DAMON usage in
|
||||
runtime.
|
||||
|
||||
DAMON user interface modules, namely 'DAMON sysfs interface' and 'DAMON debugfs
|
||||
interface' are DAMON API user kernel modules that provide ABIs to the
|
||||
user-space. Please note that DAMON debugfs interface is currently deprecated.
|
||||
|
||||
Like many other ABIs, the modules create files on sysfs and debugfs, allow
|
||||
users to specify their requests to and get the answers from DAMON by writing to
|
||||
and reading from the files. As a response to such I/O, DAMON user interface
|
||||
modules control DAMON and retrieve the results as user requested via the DAMON
|
||||
API, and return the results to the user-space.
|
||||
Like many other ABIs, the modules create files on pseudo file systems like
|
||||
'sysfs', allow users to specify their requests to and get the answers from
|
||||
DAMON by writing to and reading from the files. As a response to such I/O,
|
||||
DAMON user interface modules control DAMON and retrieve the results as user
|
||||
requested via the DAMON API, and return the results to the user-space.
|
||||
|
||||
The ABIs are designed to be used for user space applications development,
|
||||
rather than human beings' fingers. Human users are recommended to use such
|
||||
@ -590,8 +722,9 @@ Github (https://github.com/damonitor/damo), Pypi
|
||||
(https://pypistats.org/packages/damo), and Fedora
|
||||
(https://packages.fedoraproject.org/pkgs/python-damo/damo/).
|
||||
|
||||
Please refer to the ABI :doc:`document </admin-guide/mm/damon/usage>` for
|
||||
details of the interfaces.
|
||||
Currently, one module for this type, namely 'DAMON sysfs interface' is
|
||||
available. Please refer to the ABI :ref:`doc <sysfs_interface>` for details of
|
||||
the interfaces.
|
||||
|
||||
|
||||
Special-Purpose Access-aware Kernel Modules
|
||||
@ -599,8 +732,8 @@ Special-Purpose Access-aware Kernel Modules
|
||||
|
||||
DAMON modules that provide user space ABI for specific purpose DAMON usage.
|
||||
|
||||
DAMON sysfs/debugfs user interfaces are for full control of all DAMON features
|
||||
in runtime. For each special-purpose system-wide data access-aware system
|
||||
DAMON user interface modules are for full control of all DAMON features in
|
||||
runtime. For each special-purpose system-wide data access-aware system
|
||||
operations such as proactive reclamation or LRU lists balancing, the interfaces
|
||||
could be simplified by removing unnecessary knobs for the specific purpose, and
|
||||
extended for boot-time and even compile time control. Default values of DAMON
|
||||
|
247
Documentation/mm/damon/monitoring_intervals_tuning_example.rst
Normal file
247
Documentation/mm/damon/monitoring_intervals_tuning_example.rst
Normal file
@ -0,0 +1,247 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===================================================
|
||||
DAMON Monitoring Interval Parameters Tuning Example
|
||||
===================================================
|
||||
|
||||
DAMON's monitoring parameters need tuning based on given workload and the
|
||||
monitoring purpose. There is a :ref:`tuning guide
|
||||
<damon_design_monitoring_params_tuning_guide>` for that. This document
|
||||
provides an example tuning based on the guide.
|
||||
|
||||
Setup
|
||||
=====
|
||||
|
||||
For below example, DAMON of Linux kernel v6.11 and `damo
|
||||
<https://github.com/damonitor/damo>`_ (DAMON user-space tool) v2.5.9 were used to
|
||||
monitor and visualize access patterns on the physical address space of a system
|
||||
running a real-world server workload.
|
||||
|
||||
5ms/100ms intervals: Too Short Interval
|
||||
=======================================
|
||||
|
||||
Let's start by capturing the access pattern snapshot on the physical address
|
||||
space of the system using DAMON, with the default interval parameters (5
|
||||
milliseconds and 100 milliseconds for the sampling and the aggregation
|
||||
intervals, respectively). Wait ten minutes between the start of DAMON and
|
||||
the capturing of the snapshot, to show meaningful time-wise access patterns.
|
||||
::
|
||||
|
||||
# damo start
|
||||
# sleep 600
|
||||
# damo record --snapshot 0 1
|
||||
# damo stop
|
||||
|
||||
Then, list the DAMON-found regions of different access patterns, sorted by the
|
||||
"access temperature". "Access temperature" is a metric representing the
|
||||
access-hotness of a region. It is calculated as a weighted sum of the access
|
||||
frequency and the age of the region. If the access frequency is 0 %, the
|
||||
temperature is multiplied by minus one.  That is, if a region is not accessed,
|
||||
it gets minus temperature and it gets lower as not accessed for longer time.
|
||||
The sorting is in temperature-ascending order, so the region at the top of the
|
||||
list is the coldest, and the one at the bottom is the hottest one. ::
|
||||
|
||||
# damo report access --sort_regions_by temperature
|
||||
0 addr 16.052 GiB size 5.985 GiB access 0 % age 5.900 s # coldest
|
||||
1 addr 22.037 GiB size 6.029 GiB access 0 % age 5.300 s
|
||||
2 addr 28.065 GiB size 6.045 GiB access 0 % age 5.200 s
|
||||
3 addr 10.069 GiB size 5.983 GiB access 0 % age 4.500 s
|
||||
4 addr 4.000 GiB size 6.069 GiB access 0 % age 4.400 s
|
||||
5 addr 62.008 GiB size 3.992 GiB access 0 % age 3.700 s
|
||||
6 addr 56.795 GiB size 5.213 GiB access 0 % age 3.300 s
|
||||
7 addr 39.393 GiB size 6.096 GiB access 0 % age 2.800 s
|
||||
8 addr 50.782 GiB size 6.012 GiB access 0 % age 2.800 s
|
||||
9 addr 34.111 GiB size 5.282 GiB access 0 % age 2.300 s
|
||||
10 addr 45.489 GiB size 5.293 GiB access 0 % age 1.800 s # hottest
|
||||
total size: 62.000 GiB
|
||||
|
||||
The list shows no seemingly hot regions, and only minimal access pattern
|
||||
diversity.  Every region has zero access frequency.  The number of regions is
|
||||
10, which is the default ``min_nr_regions`` value.  Size of each region is also
|
||||
nearly identical.  We can suspect this is because “adaptive regions adjustment”
|
||||
mechanism was not working well.  As the guide suggested, we can get relative
|
||||
hotness of regions using ``age`` as the recency information. That would be
|
||||
better than nothing, but given the fact that the longest age is only about 6
|
||||
seconds while we waited about ten minutes, it is unclear how useful this will
|
||||
be.
|
||||
|
||||
The temperature ranges to total size of regions of each range histogram
|
||||
visualization of the results also shows no interesting distribution pattern. ::
|
||||
|
||||
# damo report access --style temperature-sz-hist
|
||||
<temperature> <total size>
|
||||
[-,590,000,000, -,549,000,000) 5.985 GiB |********** |
|
||||
[-,549,000,000, -,508,000,000) 12.074 GiB |********************|
|
||||
[-,508,000,000, -,467,000,000) 0 B | |
|
||||
[-,467,000,000, -,426,000,000) 12.052 GiB |********************|
|
||||
[-,426,000,000, -,385,000,000) 0 B | |
|
||||
[-,385,000,000, -,344,000,000) 3.992 GiB |******* |
|
||||
[-,344,000,000, -,303,000,000) 5.213 GiB |********* |
|
||||
[-,303,000,000, -,262,000,000) 12.109 GiB |********************|
|
||||
[-,262,000,000, -,221,000,000) 5.282 GiB |********* |
|
||||
[-,221,000,000, -,180,000,000) 0 B | |
|
||||
[-,180,000,000, -,139,000,000) 5.293 GiB |********* |
|
||||
total size: 62.000 GiB
|
||||
|
||||
In short, the parameters provide poor quality monitoring results for hot
|
||||
regions detection. According to the :ref:`guide
|
||||
<damon_design_monitoring_params_tuning_guide>`, this is due to the too short
|
||||
aggregation interval.
|
||||
|
||||
100ms/2s intervals: Starts Showing Small Hot Regions
|
||||
====================================================
|
||||
|
||||
Following the guide, increase the interval 20 times (100 milliseconds and 2
|
||||
seconds for sampling and aggregation intervals, respectively). ::
|
||||
|
||||
# damo start -s 100ms -a 2s
|
||||
# sleep 600
|
||||
# damo record --snapshot 0 1
|
||||
# damo stop
|
||||
# damo report access --sort_regions_by temperature
|
||||
0 addr 10.180 GiB size 6.117 GiB access 0 % age 7 m 8 s # coldest
|
||||
1 addr 49.275 GiB size 6.195 GiB access 0 % age 6 m 14 s
|
||||
2 addr 62.421 GiB size 3.579 GiB access 0 % age 6 m 4 s
|
||||
3 addr 40.154 GiB size 6.127 GiB access 0 % age 5 m 40 s
|
||||
4 addr 16.296 GiB size 6.182 GiB access 0 % age 5 m 32 s
|
||||
5 addr 34.254 GiB size 5.899 GiB access 0 % age 5 m 24 s
|
||||
6 addr 46.281 GiB size 2.995 GiB access 0 % age 5 m 20 s
|
||||
7 addr 28.420 GiB size 5.835 GiB access 0 % age 5 m 6 s
|
||||
8 addr 4.000 GiB size 6.180 GiB access 0 % age 4 m 16 s
|
||||
9 addr 22.478 GiB size 5.942 GiB access 0 % age 3 m 58 s
|
||||
10 addr 55.470 GiB size 915.645 MiB access 0 % age 3 m 6 s
|
||||
11 addr 56.364 GiB size 6.056 GiB access 0 % age 2 m 8 s
|
||||
12 addr 56.364 GiB size 4.000 KiB access 95 % age 16 s
|
||||
13 addr 49.275 GiB size 4.000 KiB access 100 % age 8 m 24 s # hottest
|
||||
total size: 62.000 GiB
|
||||
# damo report access --style temperature-sz-hist
|
||||
<temperature> <total size>
|
||||
[-42,800,000,000, -33,479,999,000) 22.018 GiB |***************** |
|
||||
[-33,479,999,000, -24,159,998,000) 27.090 GiB |********************|
|
||||
[-24,159,998,000, -14,839,997,000) 6.836 GiB |****** |
|
||||
[-14,839,997,000, -5,519,996,000) 6.056 GiB |***** |
|
||||
[-5,519,996,000, 3,800,005,000) 4.000 KiB |* |
|
||||
[3,800,005,000, 13,120,006,000) 0 B | |
|
||||
[13,120,006,000, 22,440,007,000) 0 B | |
|
||||
[22,440,007,000, 31,760,008,000) 0 B | |
|
||||
[31,760,008,000, 41,080,009,000) 0 B | |
|
||||
[41,080,009,000, 50,400,010,000) 0 B | |
|
||||
[50,400,010,000, 59,720,011,000) 4.000 KiB |* |
|
||||
total size: 62.000 GiB
|
||||
|
||||
DAMON found two distinct 4 KiB regions that are pretty hot.  The regions are also
|
||||
well aged. The hottest 4 KiB region was keeping the access frequency for about
|
||||
8 minutes, and the coldest region was keeping no access for about 7 minutes.
|
||||
The distribution on the histogram also appears to have a pattern.
|
||||
|
||||
Especially, the finding of the 4 KiB regions among the 62 GiB total memory
|
||||
shows DAMON’s adaptive regions adjustment is working as designed.
|
||||
|
||||
Still the number of regions is close to the ``min_nr_regions``, and sizes of
|
||||
cold regions are similar, though. Apparently it is improved, but it still has
|
||||
room to improve.
|
||||
|
||||
400ms/8s intervals: Pretty Improved Results
|
||||
===========================================
|
||||
|
||||
Increase the intervals four times (400 milliseconds and 8 seconds
|
||||
for sampling and aggregation intervals, respectively). ::
|
||||
|
||||
# damo start -s 400ms -a 8s
|
||||
# sleep 600
|
||||
# damo record --snapshot 0 1
|
||||
# damo stop
|
||||
# damo report access --sort_regions_by temperature
|
||||
0 addr 64.492 GiB size 1.508 GiB access 0 % age 6 m 48 s # coldest
|
||||
1 addr 21.749 GiB size 5.674 GiB access 0 % age 6 m 8 s
|
||||
2 addr 27.422 GiB size 5.801 GiB access 0 % age 6 m
|
||||
3 addr 49.431 GiB size 8.675 GiB access 0 % age 5 m 28 s
|
||||
4 addr 33.223 GiB size 5.645 GiB access 0 % age 5 m 12 s
|
||||
5 addr 58.321 GiB size 6.170 GiB access 0 % age 5 m 4 s
|
||||
[...]
|
||||
25 addr 6.615 GiB size 297.531 MiB access 15 % age 0 ns
|
||||
26 addr 9.513 GiB size 12.000 KiB access 20 % age 0 ns
|
||||
27 addr 9.511 GiB size 108.000 KiB access 25 % age 0 ns
|
||||
28 addr 9.513 GiB size 20.000 KiB access 25 % age 0 ns
|
||||
29 addr 9.511 GiB size 12.000 KiB access 30 % age 0 ns
|
||||
30 addr 9.520 GiB size 4.000 KiB access 40 % age 0 ns
|
||||
[...]
|
||||
41 addr 9.520 GiB size 4.000 KiB access 80 % age 56 s
|
||||
42 addr 9.511 GiB size 12.000 KiB access 100 % age 6 m 16 s
|
||||
43 addr 58.321 GiB size 4.000 KiB access 100 % age 6 m 24 s
|
||||
44 addr 9.512 GiB size 4.000 KiB access 100 % age 6 m 48 s
|
||||
45 addr 58.106 GiB size 4.000 KiB access 100 % age 6 m 48 s # hottest
|
||||
total size: 62.000 GiB
|
||||
# damo report access --style temperature-sz-hist
|
||||
<temperature> <total size>
|
||||
[-40,800,000,000, -32,639,999,000) 21.657 GiB |********************|
|
||||
[-32,639,999,000, -24,479,998,000) 17.938 GiB |***************** |
|
||||
[-24,479,998,000, -16,319,997,000) 16.885 GiB |**************** |
|
||||
[-16,319,997,000, -8,159,996,000) 586.879 MiB |* |
|
||||
[-8,159,996,000, 5,000) 4.946 GiB |***** |
|
||||
[5,000, 8,160,006,000) 260.000 KiB |* |
|
||||
[8,160,006,000, 16,320,007,000) 0 B | |
|
||||
[16,320,007,000, 24,480,008,000) 0 B | |
|
||||
[24,480,008,000, 32,640,009,000) 0 B | |
|
||||
[32,640,009,000, 40,800,010,000) 16.000 KiB |* |
|
||||
[40,800,010,000, 48,960,011,000) 8.000 KiB |* |
|
||||
total size: 62.000 GiB
|
||||
|
||||
The number of regions having different access patterns has significantly
|
||||
increased. Size of each region is also more varied. Total size of non-zero
|
||||
access frequency regions is also significantly increased. Maybe this is already
|
||||
good enough to make some meaningful memory management efficiency changes.
|
||||
|
||||
800ms/16s intervals: Another bias
|
||||
=================================
|
||||
|
||||
Further double the intervals (800 milliseconds and 16 seconds for sampling
|
||||
and aggregation intervals, respectively).  The results are further improved for
|
||||
hot regions detection, but cold regions detection starts to look degraded. ::
|
||||
|
||||
# damo start -s 800ms -a 16s
|
||||
# sleep 600
|
||||
# damo record --snapshot 0 1
|
||||
# damo stop
|
||||
# damo report access --sort_regions_by temperature
|
||||
0 addr 64.781 GiB size 1.219 GiB access 0 % age 4 m 48 s
|
||||
1 addr 24.505 GiB size 2.475 GiB access 0 % age 4 m 16 s
|
||||
2 addr 26.980 GiB size 504.273 MiB access 0 % age 4 m
|
||||
3 addr 29.443 GiB size 2.462 GiB access 0 % age 4 m
|
||||
4 addr 37.264 GiB size 5.645 GiB access 0 % age 4 m
|
||||
5 addr 31.905 GiB size 5.359 GiB access 0 % age 3 m 44 s
|
||||
[...]
|
||||
20 addr 8.711 GiB size 40.000 KiB access 5 % age 2 m 40 s
|
||||
21 addr 27.473 GiB size 1.970 GiB access 5 % age 4 m
|
||||
22 addr 48.185 GiB size 4.625 GiB access 5 % age 4 m
|
||||
23 addr 47.304 GiB size 902.117 MiB access 10 % age 4 m
|
||||
24 addr 8.711 GiB size 4.000 KiB access 100 % age 4 m
|
||||
25 addr 20.793 GiB size 3.713 GiB access 5 % age 4 m 16 s
|
||||
26 addr 8.773 GiB size 4.000 KiB access 100 % age 4 m 16 s
|
||||
total size: 62.000 GiB
|
||||
# damo report access --style temperature-sz-hist
|
||||
<temperature> <total size>
|
||||
[-28,800,000,000, -23,359,999,000) 12.294 GiB |***************** |
|
||||
[-23,359,999,000, -17,919,998,000) 9.753 GiB |************* |
|
||||
[-17,919,998,000, -12,479,997,000) 15.131 GiB |********************|
|
||||
[-12,479,997,000, -7,039,996,000) 0 B | |
|
||||
[-7,039,996,000, -1,599,995,000) 7.506 GiB |********** |
|
||||
[-1,599,995,000, 3,840,006,000) 6.127 GiB |********* |
|
||||
[3,840,006,000, 9,280,007,000) 0 B | |
|
||||
[9,280,007,000, 14,720,008,000) 136.000 KiB |* |
|
||||
[14,720,008,000, 20,160,009,000) 40.000 KiB |* |
|
||||
[20,160,009,000, 25,600,010,000) 11.188 GiB |*************** |
|
||||
[25,600,010,000, 31,040,011,000) 4.000 KiB |* |
|
||||
total size: 62.000 GiB
|
||||
|
||||
It found more non-zero access frequency regions. The number of regions is still
|
||||
much higher than the ``min_nr_regions``, but it is reduced from that of the
|
||||
previous setup.  And apparently the distribution seems a bit biased toward hot
|
||||
regions.
|
||||
|
||||
Conclusion
|
||||
==========
|
||||
|
||||
With the above experimental tuning results, we can conclude the theory and the
|
||||
guide make sense for at least this workload, and could be applied to similar
|
||||
cases.
|
@ -531,6 +531,10 @@ are extra requirements for accessing them:
|
||||
new page table has been installed in the same location and filled with
|
||||
entries. Writers normally need to take the PTE lock and revalidate that the
|
||||
PMD entry still refers to the same PTE-level page table.
|
||||
If the writer does not care whether it is the same PTE-level page table, it
|
||||
can take the PMD lock and revalidate that the contents of pmd entry still meet
|
||||
the requirements. In particular, this also happens in :c:func:`!retract_page_tables`
|
||||
when handling :c:macro:`!MADV_COLLAPSE`.
|
||||
|
||||
To access PTE-level page tables, a helper like :c:func:`!pte_offset_map_lock` or
|
||||
:c:func:`!pte_offset_map` can be used depending on stability requirements.
|
||||
@ -712,9 +716,14 @@ calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU
|
||||
critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`,
|
||||
before releasing the RCU lock via :c:func:`!rcu_read_unlock`.
|
||||
|
||||
VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for
|
||||
their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it
|
||||
via :c:func:`!vma_end_read`.
|
||||
In cases when the user already holds mmap read lock, :c:func:`!vma_start_read_locked`
|
||||
and :c:func:`!vma_start_read_locked_nested` can be used. These functions do not
|
||||
fail due to lock contention but the caller should still check their return values
|
||||
in case they fail for other reasons.
|
||||
|
||||
VMA read locks increment :c:member:`!vma.vm_refcnt` reference counter for their
|
||||
duration and the caller of :c:func:`!lock_vma_under_rcu` must drop it via
|
||||
:c:func:`!vma_end_read`.
|
||||
|
||||
VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a
|
||||
VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always
|
||||
@ -722,9 +731,9 @@ acquired. An mmap write lock **must** be held for the duration of the VMA write
|
||||
lock, releasing or downgrading the mmap write lock also releases the VMA write
|
||||
lock so there is no :c:func:`!vma_end_write` function.
|
||||
|
||||
Note that a semaphore write lock is not held across a VMA lock. Rather, a
|
||||
sequence number is used for serialisation, and the write semaphore is only
|
||||
acquired at the point of write lock to update this.
|
||||
Note that when write-locking a VMA lock, the :c:member:`!vma.vm_refcnt` is temporarily
|
||||
modified so that readers can detect the presence of a writer. The reference counter is
|
||||
restored once the vma sequence number used for serialisation is updated.
|
||||
|
||||
This ensures the semantics we require - VMA write locks provide exclusive write
|
||||
access to the VMA.
|
||||
@ -734,7 +743,7 @@ Implementation details
|
||||
|
||||
The VMA lock mechanism is designed to be a lightweight means of avoiding the use
|
||||
of the heavily contended mmap lock. It is implemented using a combination of a
|
||||
read/write semaphore and sequence numbers belonging to the containing
|
||||
reference counter and sequence numbers belonging to the containing
|
||||
:c:struct:`!struct mm_struct` and the VMA.
|
||||
|
||||
Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic
|
||||
@ -775,28 +784,31 @@ release of any VMA locks on its release makes sense, as you would never want to
|
||||
keep VMAs locked across entirely separate write operations. It also maintains
|
||||
correct lock ordering.
|
||||
|
||||
Each time a VMA read lock is acquired, we acquire a read lock on the
|
||||
:c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that
|
||||
the sequence count of the VMA does not match that of the mm.
|
||||
Each time a VMA read lock is acquired, we increment :c:member:`!vma.vm_refcnt`
|
||||
reference counter and check that the sequence count of the VMA does not match
|
||||
that of the mm.
|
||||
|
||||
If it does, the read lock fails. If it does not, we hold the lock, excluding
|
||||
writers, but permitting other readers, who will also obtain this lock under RCU.
|
||||
If it does, the read lock fails and :c:member:`!vma.vm_refcnt` is dropped.
|
||||
If it does not, we keep the reference counter raised, excluding writers, but
|
||||
permitting other readers, who can also obtain this lock under RCU.
|
||||
|
||||
Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu`
|
||||
are also RCU safe, so the whole read lock operation is guaranteed to function
|
||||
correctly.
|
||||
|
||||
On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock`
|
||||
read/write semaphore, before setting the VMA's sequence number under this lock,
|
||||
also simultaneously holding the mmap write lock.
|
||||
On the write side, we set a bit in :c:member:`!vma.vm_refcnt` which can't be
|
||||
modified by readers and wait for all readers to drop their reference count.
|
||||
Once there are no readers, VMA's sequence number is set to match that of the
|
||||
mm. During this entire operation mmap write lock is held.
|
||||
|
||||
This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep
|
||||
until these are finished and mutual exclusion is achieved.
|
||||
|
||||
After setting the VMA's sequence number, the lock is released, avoiding
|
||||
complexity with a long-term held write lock.
|
||||
After setting the VMA's sequence number, the bit in :c:member:`!vma.vm_refcnt`
|
||||
indicating a writer is cleared. From this point on, VMA's sequence number will
|
||||
indicate VMA's write-locked state until mmap write lock is dropped or downgraded.
|
||||
|
||||
This clever combination of a read/write semaphore and sequence count allows for
|
||||
This clever combination of a reference counter and sequence count allows for
|
||||
fast RCU-based per-VMA lock acquisition (especially on page fault, though
|
||||
utilised elsewhere) with minimal complexity around lock ordering.
|
||||
|
||||
|
@ -62,7 +62,7 @@ Support of split page table lock by an architecture
|
||||
===================================================
|
||||
|
||||
There's no need in special enabling of PTE split page table lock: everything
|
||||
required is done by pagetable_pte_ctor() and pagetable_pte_dtor(), which
|
||||
required is done by pagetable_pte_ctor() and pagetable_dtor(), which
|
||||
must be called on PTE table allocation / freeing.
|
||||
|
||||
Make sure the architecture doesn't use slab allocator for page table
|
||||
@ -73,7 +73,7 @@ PMD split lock only makes sense if you have more than two page table
|
||||
levels.
|
||||
|
||||
PMD split lock enabling requires pagetable_pmd_ctor() call on PMD table
|
||||
allocation and pagetable_pmd_dtor() on freeing.
|
||||
allocation and pagetable_dtor() on freeing.
|
||||
|
||||
Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
|
||||
pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
|
||||
|
@ -26,12 +26,7 @@ DAMON 为不同的用户提供了下面这些接口。
|
||||
使用它,用户可以通过读取和写入特殊的sysfs文件来使用DAMON的主要功能。因此,你可以编写和使
|
||||
用你个性化的DAMON sysfs包装程序,代替你读/写sysfs文件。 `DAMON用户空间工具
|
||||
<https://github.com/damonitor/damo>`_ 就是这种程序的一个例子 它同时支持虚拟和物理地址
|
||||
空间的监测。注意,这个界面只提供简单的监测结果 :ref:`统计 <damos_stats>`。对于详细的监测
|
||||
结果,DAMON提供了一个:ref:`跟踪点 <tracepoint>`。
|
||||
- *debugfs interface.*
|
||||
:ref:`这 <debugfs_interface>` 几乎与:ref:`sysfs interface <sysfs_interface>` 接
|
||||
口相同。这将在下一个LTS内核发布后被移除,所以用户应该转移到
|
||||
:ref:`sysfs interface <sysfs_interface>`。
|
||||
空间的监测。
|
||||
- *内核空间编程接口。*
|
||||
:doc:`这 </mm/damon/api>` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内
|
||||
核空间的DAMON应用程序,最灵活有效地利用DAMON的每一个功能。你甚至可以为各种地址空间扩展DAMON。
|
||||
@ -335,247 +330,6 @@ tried_regions/<N>/
|
||||
请注意,我们强烈建议使用用户空间的工具,如 `damo <https://github.com/damonitor/damo>`_ ,
|
||||
而不是像上面那样手动读写文件。以上只是一个例子。
|
||||
|
||||
debugfs接口
|
||||
===========
|
||||
|
||||
.. note::
|
||||
|
||||
DAMON debugfs接口将在下一个LTS内核发布后被移除,所以用户应该转移到
|
||||
:ref:`sysfs接口<sysfs_interface>`。
|
||||
|
||||
DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``,
|
||||
``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和
|
||||
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
|
||||
|
||||
|
||||
属性
|
||||
----
|
||||
|
||||
用户可以通过读取和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 、 ``聚集间隔`` 、 ``更新间隔``
|
||||
以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/mm/damon/design` 。例如,
|
||||
下面的命令将这些值设置为5ms、100ms、1000ms、10和1000,然后再次检查::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo 5000 100000 1000000 10 1000 > attrs
|
||||
# cat attrs
|
||||
5000 100000 1000000 10 1000
|
||||
|
||||
|
||||
目标ID
|
||||
------
|
||||
|
||||
一些类型的地址空间支持多个监测目标。例如,虚拟内存地址空间的监测可以有多个进程作为监测目标。用户
|
||||
可以通过写入目标的相关id值来设置目标,并通过读取 ``target_ids`` 文件来获得当前目标的id。在监
|
||||
测虚拟地址空间的情况下,这些值应该是监测目标进程的pid。例如,下面的命令将pid为42和4242的进程设
|
||||
为监测目标,并再次检查::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo 42 4242 > target_ids
|
||||
# cat target_ids
|
||||
42 4242
|
||||
|
||||
用户还可以通过在文件中写入一个特殊的关键字 "paddr\n" 来监测系统的物理内存地址空间。因为物理地
|
||||
址空间监测不支持多个目标,读取文件会显示一个假值,即 ``42`` ,如下图所示::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo paddr > target_ids
|
||||
# cat target_ids
|
||||
42
|
||||
|
||||
请注意,设置目标ID并不启动监测。
|
||||
|
||||
|
||||
初始监测目标区域
|
||||
----------------
|
||||
|
||||
在虚拟地址空间监测的情况下,DAMON自动设置和更新监测的目标区域,这样就可以覆盖目标进程的整个
|
||||
内存映射。然而,用户可能希望将监测区域限制在特定的地址范围内,如堆、栈或特定的文件映射区域。
|
||||
或者,一些用户可以知道他们工作负载的初始访问模式,因此希望为“自适应区域调整”设置最佳初始区域。
|
||||
|
||||
相比之下,DAMON在物理内存监测的情况下不会自动设置和更新监测目标区域。因此,用户应该自己设置
|
||||
监测目标区域。
|
||||
|
||||
在这种情况下,用户可以通过在 ``init_regions`` 文件中写入适当的值,明确地设置他们想要的初
|
||||
始监测目标区域。输入应该是一个由三个整数组成的队列,用空格隔开,代表一个区域的形式如下::
|
||||
|
||||
<target idx> <start address> <end address>
|
||||
|
||||
目标idx应该是 ``target_ids`` 文件中目标的索引,从 ``0`` 开始,区域应该按照地址顺序传递。
|
||||
例如,下面的命令将设置几个地址范围, ``1-100`` 和 ``100-200`` 作为pid 42的初始监测目标
|
||||
区域,这是 ``target_ids`` 中的第一个(索引 ``0`` ),另外几个地址范围, ``20-40`` 和
|
||||
``50-100`` 作为pid 4242的地址,这是 ``target_ids`` 中的第二个(索引 ``1`` )::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# cat target_ids
|
||||
42 4242
|
||||
# echo "0 1 100 \
|
||||
0 100 200 \
|
||||
1 20 40 \
|
||||
1 50 100" > init_regions
|
||||
|
||||
请注意,这只是设置了初始的监测目标区域。在虚拟内存监测的情况下,DAMON会在一个 ``更新间隔``
|
||||
后自动更新区域的边界。因此,在这种情况下,如果用户不希望更新的话,应该把 ``更新间隔`` 设
|
||||
置得足够大。
|
||||
|
||||
|
||||
方案
|
||||
----
|
||||
|
||||
对于通常的基于DAMON的数据访问感知的内存管理优化,用户只是希望系统对特定访问模式的内存区域应用内
|
||||
存管理操作。DAMON从用户那里接收这种形式化的操作方案,并将这些方案应用到目标进程中。
|
||||
|
||||
用户可以通过读取和写入 ``scheme`` debugfs文件来获得和设置这些方案。读取该文件还可以显示每个
|
||||
方案的统计数据。在文件中,每一个方案都应该在每一行中以下列形式表示出来::
|
||||
|
||||
<target access pattern> <action> <quota> <watermarks>
|
||||
|
||||
你可以通过简单地在文件中写入一个空字符串来禁用方案。
|
||||
|
||||
目标访问模式
|
||||
~~~~~~~~~~~~
|
||||
|
||||
``<目标访问模式>`` 是由三个范围构成的,形式如下::
|
||||
|
||||
min-size max-size min-acc max-acc min-age max-age
|
||||
|
||||
具体来说,区域大小的字节数( `min-size` 和 `max-size` ),访问频率的每聚合区间的监测访问次
|
||||
数( `min-acc` 和 `max-acc` ),区域年龄的聚合区间数( `min-age` 和 `max-age` )都被指定。
|
||||
请注意,这些范围是封闭区间。
|
||||
|
||||
动作
|
||||
~~~~
|
||||
|
||||
``<action>`` 是一个预定义的内存管理动作的整数,DAMON将应用于具有目标访问模式的区域。支持
|
||||
的数字和它们的含义如下::
|
||||
|
||||
- 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
|
||||
- 1: Call ``madvise()`` for the region with ``MADV_COLD``
|
||||
- 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
|
||||
- 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
|
||||
- 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
|
||||
- 5: Do nothing but count the statistics
|
||||
|
||||
配额
|
||||
~~~~
|
||||
|
||||
每个 ``动作`` 的最佳 ``目标访问模式`` 取决于工作负载,所以不容易找到。更糟糕的是,将某个
|
||||
动作的方案设置得过于激进会导致严重的开销。为了避免这种开销,用户可以通过下面表格中的 ``<quota>``
|
||||
来限制方案的时间和大小配额::
|
||||
|
||||
<ms> <sz> <reset interval> <priority weights>
|
||||
|
||||
这使得DAMON在 ``<reset interval>`` 毫秒内,尽量只用 ``<ms>`` 毫秒的时间对 ``目标访
|
||||
问模式`` 的内存区域应用动作,并在 ``<reset interval>`` 内只对最多<sz>字节的内存区域应
|
||||
用动作。将 ``<ms>`` 和 ``<sz>`` 都设置为零,可以禁用配额限制。
|
||||
|
||||
当预计超过配额限制时,DAMON会根据 ``目标访问模式`` 的大小、访问频率和年龄,对发现的内存
|
||||
区域进行优先排序。为了实现个性化的优先级,用户可以在 ``<优先级权重>`` 中设置这三个属性的
|
||||
权重,具体形式如下::
|
||||
|
||||
<size weight> <access frequency weight> <age weight>
|
||||
|
||||
水位
|
||||
~~~~
|
||||
|
||||
有些方案需要根据系统特定指标的当前值来运行,如自由内存比率。对于这种情况,用户可以为该条
|
||||
件指定水位。::
|
||||
|
||||
<metric> <check interval> <high mark> <middle mark> <low mark>
|
||||
|
||||
``<metric>`` 是一个预定义的整数,用于要检查的度量。支持的数字和它们的含义如下。
|
||||
|
||||
- 0: 忽视水位
|
||||
- 1: 系统空闲内存率 (千分比)
|
||||
|
||||
每隔 ``<检查间隔>`` 微秒检查一次公制的值。
|
||||
|
||||
如果该值高于 ``<高标>`` 或低于 ``<低标>`` ,该方案被停用。如果该值低于 ``<中标>`` ,
|
||||
该方案将被激活。
|
||||
|
||||
统计数据
|
||||
~~~~~~~~
|
||||
|
||||
它还统计每个方案被尝试应用的区域的总数量和字节数,每个方案被成功应用的区域的两个数量,以
|
||||
及超过配额限制的总数量。这些统计数据可用于在线分析或调整方案。
|
||||
|
||||
统计数据可以通过读取方案文件来显示。读取该文件将显示你在每一行中输入的每个 ``方案`` ,
|
||||
统计的五个数字将被加在每一行的末尾。
|
||||
|
||||
例子
|
||||
~~~~
|
||||
|
||||
下面的命令应用了一个方案:”如果一个大小为[4KiB, 8KiB]的内存区域在[10, 20]的聚合时间
|
||||
间隔内显示出每一个聚合时间间隔[0, 5]的访问量,请分页出该区域。对于分页,每秒最多只能使
|
||||
用10ms,而且每秒分页不能超过1GiB。在这一限制下,首先分页出具有较长年龄的内存区域。另外,
|
||||
每5秒钟检查一次系统的可用内存率,当可用内存率低于50%时开始监测和分页,但如果可用内存率
|
||||
大于60%,或低于30%,则停止监测“::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# scheme="4096 8192 0 5 10 20 2" # target access pattern and action
|
||||
# scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
|
||||
# scheme+=" 0 0 100" # prioritization weights
|
||||
# scheme+=" 1 5000000 600 500 300" # watermarks
|
||||
# echo "$scheme" > schemes
|
||||
|
||||
|
||||
开关
|
||||
----
|
||||
|
||||
除非你明确地启动监测,否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on_DEPRECATED``
|
||||
文件来启动、停止和检查监测的当前状态。写入 ``on`` 该文件可以启动对有属性的目标的监测。写入
|
||||
``off`` 该文件则停止这些目标。如果每个目标进程被终止,DAMON也会停止。下面的示例命令开启、关
|
||||
闭和检查DAMON的状态::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo on > monitor_on_DEPRECATED
|
||||
# echo off > monitor_on_DEPRECATED
|
||||
# cat monitor_on_DEPRECATED
|
||||
off
|
||||
|
||||
请注意,当监测开启时,你不能写到上述的debugfs文件。如果你在DAMON运行时写到这些文件,将会返
|
||||
回一个错误代码,如 ``-EBUSY`` 。
|
||||
|
||||
|
||||
监测线程PID
|
||||
-----------
|
||||
|
||||
DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以通过读取 ``kdamond_pid`` 文件获
|
||||
得该线程的 ``pid`` 。当监测被 ``关闭`` 时,读取该文件不会返回任何信息::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# cat monitor_on_DEPRECATED
|
||||
off
|
||||
# cat kdamond_pid
|
||||
none
|
||||
# echo on > monitor_on_DEPRECATED
|
||||
# cat kdamond_pid
|
||||
18594
|
||||
|
||||
|
||||
使用多个监测线程
|
||||
----------------
|
||||
|
||||
每个监测上下文都会创建一个 ``kdamond`` 线程。你可以使用 ``mk_contexts`` 和 ``rm_contexts``
|
||||
文件为多个 ``kdamond`` 需要的用例创建和删除监测上下文。
|
||||
|
||||
将新上下文的名称写入 ``mk_contexts`` 文件,在 ``DAMON debugfs`` 目录上创建一个该名称的目录。
|
||||
该目录将有该上下文的 ``DAMON debugfs`` 文件::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# ls foo
|
||||
# ls: cannot access 'foo': No such file or directory
|
||||
# echo foo > mk_contexts
|
||||
# ls foo
|
||||
# attrs init_regions kdamond_pid schemes target_ids
|
||||
|
||||
如果不再需要上下文,你可以通过把上下文的名字放到 ``rm_contexts`` 文件中来删除它和相应的目录::
|
||||
|
||||
# echo foo > rm_contexts
|
||||
# ls foo
|
||||
# ls: cannot access 'foo': No such file or directory
|
||||
|
||||
注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目录下。
|
||||
|
||||
|
||||
监测结果的监测点
|
||||
================
|
||||
|
@ -26,12 +26,7 @@ DAMON 爲不同的用戶提供了下面這些接口。
|
||||
使用它,用戶可以通過讀取和寫入特殊的sysfs文件來使用DAMON的主要功能。因此,你可以編寫和使
|
||||
用你個性化的DAMON sysfs包裝程序,代替你讀/寫sysfs文件。 `DAMON用戶空間工具
|
||||
<https://github.com/damonitor/damo>`_ 就是這種程序的一個例子 它同時支持虛擬和物理地址
|
||||
空間的監測。注意,這個界面只提供簡單的監測結果 :ref:`統計 <damos_stats>`。對於詳細的監測
|
||||
結果,DAMON提供了一個:ref:`跟蹤點 <tracepoint>`。
|
||||
- *debugfs interface.*
|
||||
:ref:`這 <debugfs_interface>` 幾乎與:ref:`sysfs interface <sysfs_interface>` 接
|
||||
口相同。這將在下一個LTS內核發佈後被移除,所以用戶應該轉移到
|
||||
:ref:`sysfs interface <sysfs_interface>`。
|
||||
空間的監測。
|
||||
- *內核空間編程接口。*
|
||||
:doc:`這 </mm/damon/api>` 這是爲內核空間程序員準備的。使用它,用戶可以通過爲你編寫內
|
||||
核空間的DAMON應用程序,最靈活有效地利用DAMON的每一個功能。你甚至可以爲各種地址空間擴展DAMON。
|
||||
@ -335,247 +330,6 @@ tried_regions/<N>/
|
||||
請注意,我們強烈建議使用用戶空間的工具,如 `damo <https://github.com/damonitor/damo>`_ ,
|
||||
而不是像上面那樣手動讀寫文件。以上只是一個例子。
|
||||
|
||||
debugfs接口
|
||||
===========
|
||||
|
||||
.. note::
|
||||
|
||||
DAMON debugfs接口將在下一個LTS內核發佈後被移除,所以用戶應該轉移到
|
||||
:ref:`sysfs接口<sysfs_interface>`。
|
||||
|
||||
DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``,
|
||||
``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和
|
||||
``rm_contexts`` ,它們位於其debugfs目錄 ``<debugfs>/damon/`` 下。
|
||||
|
||||
|
||||
屬性
|
||||
----
|
||||
|
||||
用戶可以通過讀取和寫入 ``attrs`` 文件獲得和設置 ``採樣間隔`` 、 ``聚集間隔`` 、 ``更新間隔``
|
||||
以及監測目標區域的最小/最大數量。要詳細瞭解監測屬性,請參考 :doc:`/mm/damon/design` 。例如,
|
||||
下面的命令將這些值設置爲5ms、100ms、1000ms、10和1000,然後再次檢查::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo 5000 100000 1000000 10 1000 > attrs
|
||||
# cat attrs
|
||||
5000 100000 1000000 10 1000
|
||||
|
||||
|
||||
目標ID
|
||||
------
|
||||
|
||||
一些類型的地址空間支持多個監測目標。例如,虛擬內存地址空間的監測可以有多個進程作爲監測目標。用戶
|
||||
可以通過寫入目標的相關id值來設置目標,並通過讀取 ``target_ids`` 文件來獲得當前目標的id。在監
|
||||
測虛擬地址空間的情況下,這些值應該是監測目標進程的pid。例如,下面的命令將pid爲42和4242的進程設
|
||||
爲監測目標,並再次檢查::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo 42 4242 > target_ids
|
||||
# cat target_ids
|
||||
42 4242
|
||||
|
||||
用戶還可以通過在文件中寫入一個特殊的關鍵字 "paddr\n" 來監測系統的物理內存地址空間。因爲物理地
|
||||
址空間監測不支持多個目標,讀取文件會顯示一個假值,即 ``42`` ,如下圖所示::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo paddr > target_ids
|
||||
# cat target_ids
|
||||
42
|
||||
|
||||
請注意,設置目標ID並不啓動監測。
|
||||
|
||||
|
||||
初始監測目標區域
|
||||
----------------
|
||||
|
||||
在虛擬地址空間監測的情況下,DAMON自動設置和更新監測的目標區域,這樣就可以覆蓋目標進程的整個
|
||||
內存映射。然而,用戶可能希望將監測區域限制在特定的地址範圍內,如堆、棧或特定的文件映射區域。
|
||||
或者,一些用戶可以知道他們工作負載的初始訪問模式,因此希望爲“自適應區域調整”設置最佳初始區域。
|
||||
|
||||
相比之下,DAMON在物理內存監測的情況下不會自動設置和更新監測目標區域。因此,用戶應該自己設置
|
||||
監測目標區域。
|
||||
|
||||
在這種情況下,用戶可以通過在 ``init_regions`` 文件中寫入適當的值,明確地設置他們想要的初
|
||||
始監測目標區域。輸入應該是一個由三個整數組成的隊列,用空格隔開,代表一個區域的形式如下::
|
||||
|
||||
<target idx> <start address> <end address>
|
||||
|
||||
目標idx應該是 ``target_ids`` 文件中目標的索引,從 ``0`` 開始,區域應該按照地址順序傳遞。
|
||||
例如,下面的命令將設置幾個地址範圍, ``1-100`` 和 ``100-200`` 作爲pid 42的初始監測目標
|
||||
區域,這是 ``target_ids`` 中的第一個(索引 ``0`` ),另外幾個地址範圍, ``20-40`` 和
|
||||
``50-100`` 作爲pid 4242的地址,這是 ``target_ids`` 中的第二個(索引 ``1`` )::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# cat target_ids
|
||||
42 4242
|
||||
# echo "0 1 100 \
|
||||
0 100 200 \
|
||||
1 20 40 \
|
||||
1 50 100" > init_regions
|
||||
|
||||
請注意,這只是設置了初始的監測目標區域。在虛擬內存監測的情況下,DAMON會在一個 ``更新間隔``
|
||||
後自動更新區域的邊界。因此,在這種情況下,如果用戶不希望更新的話,應該把 ``更新間隔`` 設
|
||||
置得足夠大。
|
||||
|
||||
|
||||
方案
|
||||
----
|
||||
|
||||
對於通常的基於DAMON的數據訪問感知的內存管理優化,用戶只是希望系統對特定訪問模式的內存區域應用內
|
||||
存管理操作。DAMON從用戶那裏接收這種形式化的操作方案,並將這些方案應用到目標進程中。
|
||||
|
||||
用戶可以通過讀取和寫入 ``schemes`` debugfs文件來獲得和設置這些方案。讀取該文件還可以顯示每個
|
||||
方案的統計數據。在文件中,每一個方案都應該在每一行中以下列形式表示出來::
|
||||
|
||||
<target access pattern> <action> <quota> <watermarks>
|
||||
|
||||
你可以通過簡單地在文件中寫入一個空字符串來禁用方案。
|
||||
|
||||
目標訪問模式
|
||||
~~~~~~~~~~~~
|
||||
|
||||
``<目標訪問模式>`` 是由三個範圍構成的,形式如下::
|
||||
|
||||
min-size max-size min-acc max-acc min-age max-age
|
||||
|
||||
具體來說,區域大小的字節數( `min-size` 和 `max-size` ),訪問頻率的每聚合區間的監測訪問次
|
||||
數( `min-acc` 和 `max-acc` ),區域年齡的聚合區間數( `min-age` 和 `max-age` )都被指定。
|
||||
請注意,這些範圍是封閉區間。
|
||||
|
||||
動作
|
||||
~~~~
|
||||
|
||||
``<action>`` 是一個預定義的內存管理動作的整數,DAMON將應用於具有目標訪問模式的區域。支持
|
||||
的數字和它們的含義如下::
|
||||
|
||||
- 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
|
||||
- 1: Call ``madvise()`` for the region with ``MADV_COLD``
|
||||
- 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
|
||||
- 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
|
||||
- 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
|
||||
- 5: Do nothing but count the statistics
|
||||
|
||||
配額
|
||||
~~~~
|
||||
|
||||
每個 ``動作`` 的最佳 ``目標訪問模式`` 取決於工作負載,所以不容易找到。更糟糕的是,將某個
|
||||
動作的方案設置得過於激進會導致嚴重的開銷。爲了避免這種開銷,用戶可以通過下面表格中的 ``<quota>``
|
||||
來限制方案的時間和大小配額::
|
||||
|
||||
<ms> <sz> <reset interval> <priority weights>
|
||||
|
||||
這使得DAMON在 ``<reset interval>`` 毫秒內,儘量只用 ``<ms>`` 毫秒的時間對 ``目標訪
|
||||
問模式`` 的內存區域應用動作,並在 ``<reset interval>`` 內只對最多<sz>字節的內存區域應
|
||||
用動作。將 ``<ms>`` 和 ``<sz>`` 都設置爲零,可以禁用配額限制。
|
||||
|
||||
當預計超過配額限制時,DAMON會根據 ``目標訪問模式`` 的大小、訪問頻率和年齡,對發現的內存
|
||||
區域進行優先排序。爲了實現個性化的優先級,用戶可以在 ``<優先級權重>`` 中設置這三個屬性的
|
||||
權重,具體形式如下::
|
||||
|
||||
<size weight> <access frequency weight> <age weight>
|
||||
|
||||
水位
|
||||
~~~~
|
||||
|
||||
有些方案需要根據系統特定指標的當前值來運行,如自由內存比率。對於這種情況,用戶可以爲該條
|
||||
件指定水位。::
|
||||
|
||||
<metric> <check interval> <high mark> <middle mark> <low mark>
|
||||
|
||||
``<metric>`` 是一個預定義的整數,用於要檢查的度量。支持的數字和它們的含義如下。
|
||||
|
||||
- 0: 忽視水位
|
||||
- 1: 系統空閒內存率 (千分比)
|
||||
|
||||
每隔 ``<檢查間隔>`` 微秒檢查一次度量的值。
|
||||
|
||||
如果該值高於 ``<高標>`` 或低於 ``<低標>`` ,該方案被停用。如果該值低於 ``<中標>`` ,
|
||||
該方案將被激活。
|
||||
|
||||
統計數據
|
||||
~~~~~~~~
|
||||
|
||||
它還統計每個方案被嘗試應用的區域的總數量和字節數,每個方案被成功應用的區域的兩個數量,以
|
||||
及超過配額限制的總數量。這些統計數據可用於在線分析或調整方案。
|
||||
|
||||
統計數據可以通過讀取方案文件來顯示。讀取該文件將顯示你在每一行中輸入的每個 ``方案`` ,
|
||||
統計的五個數字將被加在每一行的末尾。
|
||||
|
||||
例子
|
||||
~~~~
|
||||
|
||||
下面的命令應用了一個方案:“如果一個大小爲[4KiB, 8KiB]的內存區域在[10, 20]的聚合時間
|
||||
間隔內顯示出每一個聚合時間間隔[0, 5]的訪問量,請分頁出該區域。對於分頁,每秒最多隻能使
|
||||
用10ms,而且每秒分頁不能超過1GiB。在這一限制下,首先分頁出具有較長年齡的內存區域。另外,
|
||||
每5秒鐘檢查一次系統的可用內存率,當可用內存率低於50%時開始監測和分頁,但如果可用內存率
|
||||
大於60%,或低於30%,則停止監測”::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# scheme="4096 8192 0 5 10 20 2" # target access pattern and action
|
||||
# scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
|
||||
# scheme+=" 0 0 100" # prioritization weights
|
||||
# scheme+=" 1 5000000 600 500 300" # watermarks
|
||||
# echo "$scheme" > schemes
|
||||
|
||||
|
||||
開關
|
||||
----
|
||||
|
||||
除非你明確地啓動監測,否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on_DEPRECATED``
|
||||
文件來啓動、停止和檢查監測的當前狀態。寫入 ``on`` 該文件可以啓動對有屬性的目標的監測。寫入
|
||||
``off`` 該文件則停止這些目標。如果每個目標進程被終止,DAMON也會停止。下面的示例命令開啓、關
|
||||
閉和檢查DAMON的狀態::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo on > monitor_on_DEPRECATED
|
||||
# echo off > monitor_on_DEPRECATED
|
||||
# cat monitor_on_DEPRECATED
|
||||
off
|
||||
|
||||
請注意,當監測開啓時,你不能寫到上述的debugfs文件。如果你在DAMON運行時寫到這些文件,將會返
|
||||
回一個錯誤代碼,如 ``-EBUSY`` 。
|
||||
|
||||
|
||||
監測線程PID
|
||||
-----------
|
||||
|
||||
DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以通過讀取 ``kdamond_pid`` 文件獲
|
||||
得該線程的 ``pid`` 。當監測被 ``關閉`` 時,讀取該文件會返回 ``none`` ,如下所示::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# cat monitor_on_DEPRECATED
|
||||
off
|
||||
# cat kdamond_pid
|
||||
none
|
||||
# echo on > monitor_on_DEPRECATED
|
||||
# cat kdamond_pid
|
||||
18594
|
||||
|
||||
|
||||
使用多個監測線程
|
||||
----------------
|
||||
|
||||
每個監測上下文都會創建一個 ``kdamond`` 線程。你可以使用 ``mk_contexts`` 和 ``rm_contexts``
|
||||
文件爲多個 ``kdamond`` 需要的用例創建和刪除監測上下文。
|
||||
|
||||
將新上下文的名稱寫入 ``mk_contexts`` 文件,在 ``DAMON debugfs`` 目錄上創建一個該名稱的目錄。
|
||||
該目錄將有該上下文的 ``DAMON debugfs`` 文件::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# ls foo
|
||||
# ls: cannot access 'foo': No such file or directory
|
||||
# echo foo > mk_contexts
|
||||
# ls foo
|
||||
# attrs init_regions kdamond_pid schemes target_ids
|
||||
|
||||
如果不再需要上下文,你可以通過把上下文的名字放到 ``rm_contexts`` 文件中來刪除它和相應的目錄::
|
||||
|
||||
# echo foo > rm_contexts
|
||||
# ls foo
|
||||
# ls: cannot access 'foo': No such file or directory
|
||||
|
||||
注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目錄下。
|
||||
|
||||
|
||||
監測結果的監測點
|
||||
================
|
||||
|
34
MAINTAINERS
34
MAINTAINERS
@ -2827,7 +2827,7 @@ ARM/NXP S32G ARCHITECTURE
|
||||
R: Chester Lin <chester62515@gmail.com>
|
||||
R: Matthias Brugger <mbrugger@suse.com>
|
||||
R: Ghennadi Procopciuc <ghennadi.procopciuc@oss.nxp.com>
|
||||
L: NXP S32 Linux Team <s32@nxp.com>
|
||||
R: NXP S32 Linux Team <s32@nxp.com>
|
||||
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
|
||||
S: Maintained
|
||||
F: arch/arm64/boot/dts/freescale/s32g*.dts*
|
||||
@ -6327,6 +6327,7 @@ F: Documentation/mm/damon/
|
||||
F: include/linux/damon.h
|
||||
F: include/trace/events/damon.h
|
||||
F: mm/damon/
|
||||
F: samples/damon/
|
||||
F: tools/testing/selftests/damon/
|
||||
|
||||
DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER
|
||||
@ -15068,7 +15069,15 @@ L: linux-mm@kvack.org
|
||||
S: Maintained
|
||||
W: http://www.linux-mm.org
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
|
||||
F: mm/mlock.c
|
||||
F: mm/mmap.c
|
||||
F: mm/mprotect.c
|
||||
F: mm/mremap.c
|
||||
F: mm/mseal.c
|
||||
F: mm/vma.c
|
||||
F: mm/vma.h
|
||||
F: mm/vma_internal.h
|
||||
F: tools/testing/vma/
|
||||
|
||||
MEMORY TECHNOLOGY DEVICES (MTD)
|
||||
M: Miquel Raynal <miquel.raynal@bootlin.com>
|
||||
@ -16600,8 +16609,8 @@ F: arch/nios2/
|
||||
|
||||
NITRO ENCLAVES (NE)
|
||||
M: Alexandru Ciobotaru <alcioa@amazon.com>
|
||||
R: The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
|
||||
L: linux-kernel@vger.kernel.org
|
||||
L: The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
|
||||
S: Supported
|
||||
W: https://aws.amazon.com/ec2/nitro/nitro-enclaves/
|
||||
F: Documentation/virt/ne_overview.rst
|
||||
@ -16612,8 +16621,8 @@ F: samples/nitro_enclaves/
|
||||
|
||||
NITRO SECURE MODULE (NSM)
|
||||
M: Alexander Graf <graf@amazon.com>
|
||||
R: The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
|
||||
L: linux-kernel@vger.kernel.org
|
||||
L: The AWS Nitro Enclaves Team <aws-nitro-enclaves-devel@amazon.com>
|
||||
S: Supported
|
||||
W: https://aws.amazon.com/ec2/nitro/nitro-enclaves/
|
||||
F: drivers/misc/nsm.c
|
||||
@ -18425,8 +18434,8 @@ M: Fabio Estevam <festevam@gmail.com>
|
||||
M: Shawn Guo <shawnguo@kernel.org>
|
||||
M: Jacky Bai <ping.bai@nxp.com>
|
||||
R: Pengutronix Kernel Team <kernel@pengutronix.de>
|
||||
R: NXP S32 Linux Team <s32@nxp.com>
|
||||
L: linux-gpio@vger.kernel.org
|
||||
L: NXP S32 Linux Team <s32@nxp.com>
|
||||
S: Maintained
|
||||
F: Documentation/devicetree/bindings/pinctrl/fsl,*
|
||||
F: Documentation/devicetree/bindings/pinctrl/nxp,s32*
|
||||
@ -19561,7 +19570,7 @@ F: drivers/ras/amd/fmpm.c
|
||||
|
||||
RASPBERRY PI PISP BACK END
|
||||
M: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
|
||||
L: Raspberry Pi Kernel Maintenance <kernel-list@raspberrypi.com>
|
||||
R: Raspberry Pi Kernel Maintenance <kernel-list@raspberrypi.com>
|
||||
L: linux-media@vger.kernel.org
|
||||
S: Maintained
|
||||
F: Documentation/devicetree/bindings/media/raspberrypi,pispbe.yaml
|
||||
@ -25018,21 +25027,6 @@ F: include/uapi/linux/vsockmon.h
|
||||
F: net/vmw_vsock/
|
||||
F: tools/testing/vsock/
|
||||
|
||||
VMA
|
||||
M: Andrew Morton <akpm@linux-foundation.org>
|
||||
M: Liam R. Howlett <Liam.Howlett@oracle.com>
|
||||
M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
R: Vlastimil Babka <vbabka@suse.cz>
|
||||
R: Jann Horn <jannh@google.com>
|
||||
L: linux-mm@kvack.org
|
||||
S: Maintained
|
||||
W: https://www.linux-mm.org
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
|
||||
F: mm/vma.c
|
||||
F: mm/vma.h
|
||||
F: mm/vma_internal.h
|
||||
F: tools/testing/vma/
|
||||
|
||||
VMALLOC
|
||||
M: Andrew Morton <akpm@linux-foundation.org>
|
||||
R: Uladzislau Rezki <urezki@gmail.com>
|
||||
|
@ -331,10 +331,7 @@ cia_prepare_tbia_workaround(int window)
|
||||
long i;
|
||||
|
||||
/* Use minimal 1K map. */
|
||||
ppte = memblock_alloc(CIA_BROKEN_TBIA_SIZE, 32768);
|
||||
if (!ppte)
|
||||
panic("%s: Failed to allocate %u bytes align=0x%x\n",
|
||||
__func__, CIA_BROKEN_TBIA_SIZE, 32768);
|
||||
ppte = memblock_alloc_or_panic(CIA_BROKEN_TBIA_SIZE, 32768);
|
||||
pte = (virt_to_phys(ppte) >> (PAGE_SHIFT - 1)) | 1;
|
||||
|
||||
for (i = 0; i < CIA_BROKEN_TBIA_SIZE / sizeof(unsigned long); ++i)
|
||||
|
@ -81,10 +81,7 @@ mk_resource_name(int pe, int port, char *str)
|
||||
char *name;
|
||||
|
||||
sprintf(tmp, "PCI %s PE %d PORT %d", str, pe, port);
|
||||
name = memblock_alloc(strlen(tmp) + 1, SMP_CACHE_BYTES);
|
||||
if (!name)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
strlen(tmp) + 1);
|
||||
name = memblock_alloc_or_panic(strlen(tmp) + 1, SMP_CACHE_BYTES);
|
||||
strcpy(name, tmp);
|
||||
|
||||
return name;
|
||||
@ -119,10 +116,7 @@ alloc_io7(unsigned int pe)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
io7 = memblock_alloc(sizeof(*io7), SMP_CACHE_BYTES);
|
||||
if (!io7)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
sizeof(*io7));
|
||||
io7 = memblock_alloc_or_panic(sizeof(*io7), SMP_CACHE_BYTES);
|
||||
io7->pe = pe;
|
||||
raw_spin_lock_init(&io7->irq_lock);
|
||||
|
||||
|
@ -391,10 +391,7 @@ alloc_pci_controller(void)
|
||||
{
|
||||
struct pci_controller *hose;
|
||||
|
||||
hose = memblock_alloc(sizeof(*hose), SMP_CACHE_BYTES);
|
||||
if (!hose)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
sizeof(*hose));
|
||||
hose = memblock_alloc_or_panic(sizeof(*hose), SMP_CACHE_BYTES);
|
||||
|
||||
*hose_tail = hose;
|
||||
hose_tail = &hose->next;
|
||||
@ -405,13 +402,7 @@ alloc_pci_controller(void)
|
||||
struct resource * __init
|
||||
alloc_resource(void)
|
||||
{
|
||||
void *ptr = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
|
||||
|
||||
if (!ptr)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
sizeof(struct resource));
|
||||
|
||||
return ptr;
|
||||
return memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES);
|
||||
}
|
||||
|
||||
|
||||
|
@ -71,14 +71,8 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base,
|
||||
if (align < mem_size)
|
||||
align = mem_size;
|
||||
|
||||
arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES);
|
||||
if (!arena)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
sizeof(*arena));
|
||||
arena->ptes = memblock_alloc(mem_size, align);
|
||||
if (!arena->ptes)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, mem_size, align);
|
||||
arena = memblock_alloc_or_panic(sizeof(*arena), SMP_CACHE_BYTES);
|
||||
arena->ptes = memblock_alloc_or_panic(mem_size, align);
|
||||
|
||||
spin_lock_init(&arena->lock);
|
||||
arena->hose = hose;
|
||||
|
@ -10,7 +10,6 @@
|
||||
#include <linux/preempt.h>
|
||||
#include <asm/fpu.h>
|
||||
#include <asm/thread_info.h>
|
||||
#include <asm/fpu.h>
|
||||
|
||||
#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67)
|
||||
#define STT(reg,val) asm volatile ("ftoit $f"#reg",%0" : "=r"(val));
|
||||
|
@ -42,7 +42,7 @@ pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *ret, *init;
|
||||
|
||||
ret = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
|
||||
ret = __pgd_alloc(mm, 0);
|
||||
init = pgd_offset(&init_mm, 0UL);
|
||||
if (ret) {
|
||||
#ifdef CONFIG_ALPHA_LARGE_VMALLOC
|
||||
|
@ -53,19 +53,14 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_
|
||||
|
||||
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *ret = (pgd_t *) __get_free_page(GFP_KERNEL);
|
||||
pgd_t *ret = __pgd_alloc(mm, 0);
|
||||
|
||||
if (ret) {
|
||||
int num, num2;
|
||||
num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE;
|
||||
memzero(ret, num * sizeof(pgd_t));
|
||||
|
||||
num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE;
|
||||
num2 = VMALLOC_SIZE / PGDIR_SIZE;
|
||||
memcpy(ret + num, swapper_pg_dir + num, num2 * sizeof(pgd_t));
|
||||
|
||||
memzero(ret + num + num2,
|
||||
(PTRS_PER_PGD - num - num2) * sizeof(pgd_t));
|
||||
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -200,7 +200,6 @@ int misaligned_fixup(unsigned long address, struct pt_regs *regs,
|
||||
struct callee_regs *cregs)
|
||||
{
|
||||
struct disasm_state state;
|
||||
char buf[TASK_COMM_LEN];
|
||||
|
||||
/* handle user mode only and only if enabled by sysadmin */
|
||||
if (!user_mode(regs) || !unaligned_enabled)
|
||||
@ -212,11 +211,11 @@ int misaligned_fixup(unsigned long address, struct pt_regs *regs,
|
||||
" performance significantly\n. To enable further"
|
||||
" logging of such instances, please \n"
|
||||
" echo 0 > /proc/sys/kernel/ignore-unaligned-usertrap\n",
|
||||
get_task_comm(buf, current), task_pid_nr(current));
|
||||
current->comm, task_pid_nr(current));
|
||||
} else {
|
||||
/* Add rate limiting if it gets down to it */
|
||||
pr_warn("%s(%d): unaligned access to/from 0x%lx by PC: 0x%lx\n",
|
||||
get_task_comm(buf, current), task_pid_nr(current),
|
||||
current->comm, task_pid_nr(current),
|
||||
address, regs->ret);
|
||||
|
||||
}
|
||||
|
@ -26,14 +26,7 @@
|
||||
|
||||
#else /* !CONFIG_MMU */
|
||||
|
||||
#include <linux/swap.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
static inline void __tlb_remove_table(void *_table)
|
||||
{
|
||||
free_page_and_swap_cache((struct page *)_table);
|
||||
}
|
||||
|
||||
#include <asm-generic/tlb.h>
|
||||
|
||||
static inline void
|
||||
@ -41,8 +34,6 @@ __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
|
||||
{
|
||||
struct ptdesc *ptdesc = page_ptdesc(pte);
|
||||
|
||||
pagetable_pte_dtor(ptdesc);
|
||||
|
||||
#ifndef CONFIG_ARM_LPAE
|
||||
/*
|
||||
* With the classic ARM MMU, a pte page has two corresponding pmd
|
||||
@ -61,7 +52,6 @@ __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr)
|
||||
#ifdef CONFIG_ARM_LPAE
|
||||
struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);
|
||||
|
||||
pagetable_pmd_dtor(ptdesc);
|
||||
tlb_remove_ptdesc(tlb, ptdesc);
|
||||
#endif
|
||||
}
|
||||
|
@ -880,10 +880,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
|
||||
*/
|
||||
boot_alias_start = phys_to_idmap(start);
|
||||
if (arm_has_idmap_alias() && boot_alias_start != IDMAP_INVALID_ADDR) {
|
||||
res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES);
|
||||
if (!res)
|
||||
panic("%s: Failed to allocate %zu bytes\n",
|
||||
__func__, sizeof(*res));
|
||||
res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES);
|
||||
res->name = "System RAM (boot alias)";
|
||||
res->start = boot_alias_start;
|
||||
res->end = phys_to_idmap(res_end);
|
||||
@ -891,10 +888,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
|
||||
request_resource(&iomem_resource, res);
|
||||
}
|
||||
|
||||
res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES);
|
||||
if (!res)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
sizeof(*res));
|
||||
res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES);
|
||||
res->name = "System RAM";
|
||||
res->start = start;
|
||||
res->end = res_end;
|
||||
|
@ -31,9 +31,9 @@
|
||||
/*
|
||||
* Constants
|
||||
*/
|
||||
#define SHARPSL_CHARGE_ON_TIME_INTERVAL (msecs_to_jiffies(1*60*1000)) /* 1 min */
|
||||
#define SHARPSL_CHARGE_FINISH_TIME (msecs_to_jiffies(10*60*1000)) /* 10 min */
|
||||
#define SHARPSL_BATCHK_TIME (msecs_to_jiffies(15*1000)) /* 15 sec */
|
||||
#define SHARPSL_CHARGE_ON_TIME_INTERVAL (secs_to_jiffies(60))
|
||||
#define SHARPSL_CHARGE_FINISH_TIME (secs_to_jiffies(10*60))
|
||||
#define SHARPSL_BATCHK_TIME (secs_to_jiffies(15))
|
||||
#define SHARPSL_BATCHK_TIME_SUSPEND (60*10) /* 10 min */
|
||||
|
||||
#define SHARPSL_WAIT_CO_TIME 15 /* 15 sec */
|
||||
|
@ -726,13 +726,8 @@ EXPORT_SYMBOL(phys_mem_access_prot);
|
||||
|
||||
static void __init *early_alloc(unsigned long sz)
|
||||
{
|
||||
void *ptr = memblock_alloc(sz, sz);
|
||||
return memblock_alloc_or_panic(sz, sz);
|
||||
|
||||
if (!ptr)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, sz, sz);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
static void *__init late_alloc(unsigned long sz)
|
||||
@ -1027,10 +1022,7 @@ void __init iotable_init(struct map_desc *io_desc, int nr)
|
||||
if (!nr)
|
||||
return;
|
||||
|
||||
svm = memblock_alloc(sizeof(*svm) * nr, __alignof__(*svm));
|
||||
if (!svm)
|
||||
panic("%s: Failed to allocate %zu bytes align=0x%zx\n",
|
||||
__func__, sizeof(*svm) * nr, __alignof__(*svm));
|
||||
svm = memblock_alloc_or_panic(sizeof(*svm) * nr, __alignof__(*svm));
|
||||
|
||||
for (md = io_desc; nr; md++, nr--) {
|
||||
create_mapping(md);
|
||||
@ -1052,10 +1044,7 @@ void __init vm_reserve_area_early(unsigned long addr, unsigned long size,
|
||||
struct vm_struct *vm;
|
||||
struct static_vm *svm;
|
||||
|
||||
svm = memblock_alloc(sizeof(*svm), __alignof__(*svm));
|
||||
if (!svm)
|
||||
panic("%s: Failed to allocate %zu bytes align=0x%zx\n",
|
||||
__func__, sizeof(*svm), __alignof__(*svm));
|
||||
svm = memblock_alloc_or_panic(sizeof(*svm), __alignof__(*svm));
|
||||
|
||||
vm = &svm->vm;
|
||||
vm->addr = (void *)addr;
|
||||
|
@ -162,10 +162,7 @@ void __init paging_init(const struct machine_desc *mdesc)
|
||||
mpu_setup();
|
||||
|
||||
/* allocate the zero page. */
|
||||
zero_page = (void *)memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!zero_page)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, PAGE_SIZE, PAGE_SIZE);
|
||||
zero_page = (void *)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
|
||||
bootmem_init();
|
||||
|
||||
|
@ -17,11 +17,11 @@
|
||||
#include "mm.h"
|
||||
|
||||
#ifdef CONFIG_ARM_LPAE
|
||||
#define __pgd_alloc() kmalloc_array(PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL)
|
||||
#define __pgd_free(pgd) kfree(pgd)
|
||||
#define _pgd_alloc(mm) kmalloc_array(PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL | __GFP_ZERO)
|
||||
#define _pgd_free(mm, pgd) kfree(pgd)
|
||||
#else
|
||||
#define __pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL, 2)
|
||||
#define __pgd_free(pgd) free_pages((unsigned long)pgd, 2)
|
||||
#define _pgd_alloc(mm) __pgd_alloc(mm, 2)
|
||||
#define _pgd_free(mm, pgd) __pgd_free(mm, pgd)
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -35,12 +35,10 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
pmd_t *new_pmd, *init_pmd;
|
||||
pte_t *new_pte, *init_pte;
|
||||
|
||||
new_pgd = __pgd_alloc();
|
||||
new_pgd = _pgd_alloc(mm);
|
||||
if (!new_pgd)
|
||||
goto no_pgd;
|
||||
|
||||
memset(new_pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
|
||||
|
||||
/*
|
||||
* Copy over the kernel and IO PGD entries
|
||||
*/
|
||||
@ -134,7 +132,7 @@ no_pmd:
|
||||
no_pud:
|
||||
p4d_free(mm, new_p4d);
|
||||
no_p4d:
|
||||
__pgd_free(new_pgd);
|
||||
_pgd_free(mm, new_pgd);
|
||||
no_pgd:
|
||||
return NULL;
|
||||
}
|
||||
@ -207,5 +205,5 @@ no_pgd:
|
||||
p4d_free(mm, p4d);
|
||||
}
|
||||
#endif
|
||||
__pgd_free(pgd_base);
|
||||
_pgd_free(mm, pgd_base);
|
||||
}
|
||||
|
@ -85,24 +85,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp)
|
||||
__pgd_populate(pgdp, __pa(p4dp), pgdval);
|
||||
}
|
||||
|
||||
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
|
||||
{
|
||||
gfp_t gfp = GFP_PGTABLE_USER;
|
||||
|
||||
if (mm == &init_mm)
|
||||
gfp = GFP_PGTABLE_KERNEL;
|
||||
return (p4d_t *)get_zeroed_page(gfp);
|
||||
}
|
||||
|
||||
static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
|
||||
{
|
||||
if (!pgtable_l5_enabled())
|
||||
return;
|
||||
BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
|
||||
free_page((unsigned long)p4d);
|
||||
}
|
||||
|
||||
#define __p4d_free_tlb(tlb, p4d, addr) p4d_free((tlb)->mm, p4d)
|
||||
#else
|
||||
static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot)
|
||||
{
|
||||
|
@ -9,12 +9,7 @@
|
||||
#define __ASM_TLB_H
|
||||
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/swap.h>
|
||||
|
||||
static inline void __tlb_remove_table(void *_table)
|
||||
{
|
||||
free_page_and_swap_cache((struct page *)_table);
|
||||
}
|
||||
|
||||
#define tlb_flush tlb_flush
|
||||
static void tlb_flush(struct mmu_gather *tlb);
|
||||
@ -82,7 +77,6 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
|
||||
{
|
||||
struct ptdesc *ptdesc = page_ptdesc(pte);
|
||||
|
||||
pagetable_pte_dtor(ptdesc);
|
||||
tlb_remove_ptdesc(tlb, ptdesc);
|
||||
}
|
||||
|
||||
@ -92,7 +86,6 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
|
||||
{
|
||||
struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);
|
||||
|
||||
pagetable_pmd_dtor(ptdesc);
|
||||
tlb_remove_ptdesc(tlb, ptdesc);
|
||||
}
|
||||
#endif
|
||||
@ -106,7 +99,19 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
|
||||
if (!pgtable_l4_enabled())
|
||||
return;
|
||||
|
||||
pagetable_pud_dtor(ptdesc);
|
||||
tlb_remove_ptdesc(tlb, ptdesc);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if CONFIG_PGTABLE_LEVELS > 4
|
||||
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4dp,
|
||||
unsigned long addr)
|
||||
{
|
||||
struct ptdesc *ptdesc = virt_to_ptdesc(p4dp);
|
||||
|
||||
if (!pgtable_l5_enabled())
|
||||
return;
|
||||
|
||||
tlb_remove_ptdesc(tlb, ptdesc);
|
||||
}
|
||||
#endif
|
||||
|
@ -223,9 +223,7 @@ static void __init request_standard_resources(void)
|
||||
|
||||
num_standard_resources = memblock.memory.cnt;
|
||||
res_size = num_standard_resources * sizeof(*standard_resources);
|
||||
standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES);
|
||||
if (!standard_resources)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__, res_size);
|
||||
standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES);
|
||||
|
||||
for_each_mem_region(region) {
|
||||
res = &standard_resources[i++];
|
||||
|
@ -33,7 +33,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
gfp_t gfp = GFP_PGTABLE_USER;
|
||||
|
||||
if (pgdir_is_page_size())
|
||||
return (pgd_t *)__get_free_page(gfp);
|
||||
return __pgd_alloc(mm, 0);
|
||||
else
|
||||
return kmem_cache_alloc(pgd_cache, gfp);
|
||||
}
|
||||
@ -41,7 +41,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
|
||||
{
|
||||
if (pgdir_is_page_size())
|
||||
free_page((unsigned long)pgd);
|
||||
__pgd_free(mm, pgd);
|
||||
else
|
||||
kmem_cache_free(pgd_cache, pgd);
|
||||
}
|
||||
|
@ -44,7 +44,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
pgd_t *ret;
|
||||
pgd_t *init;
|
||||
|
||||
ret = (pgd_t *) __get_free_page(GFP_KERNEL);
|
||||
ret = __pgd_alloc(mm, 0);
|
||||
if (ret) {
|
||||
init = pgd_offset(&init_mm, 0UL);
|
||||
pgd_init((unsigned long *)ret);
|
||||
@ -63,7 +63,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
|
||||
#define __pte_free_tlb(tlb, pte, address) \
|
||||
do { \
|
||||
pagetable_pte_dtor(page_ptdesc(pte)); \
|
||||
pagetable_dtor(page_ptdesc(pte)); \
|
||||
tlb_remove_page_ptdesc(tlb, page_ptdesc(pte)); \
|
||||
} while (0)
|
||||
|
||||
|
@ -22,7 +22,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
|
||||
pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
|
||||
pgd = __pgd_alloc(mm, 0);
|
||||
|
||||
/*
|
||||
* There may be better ways to do this, but to ensure
|
||||
@ -89,7 +89,7 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
|
||||
|
||||
#define __pte_free_tlb(tlb, pte, addr) \
|
||||
do { \
|
||||
pagetable_pte_dtor((page_ptdesc(pte))); \
|
||||
pagetable_dtor((page_ptdesc(pte))); \
|
||||
tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
|
||||
} while (0)
|
||||
|
||||
|
@ -113,7 +113,10 @@ CONFIG_ZBUD=y
|
||||
CONFIG_ZSMALLOC=m
|
||||
# CONFIG_COMPAT_BRK is not set
|
||||
CONFIG_MEMORY_HOTPLUG=y
|
||||
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
|
||||
# CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE is not set
|
||||
CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO=y
|
||||
# CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL is not set
|
||||
# CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE is not set
|
||||
CONFIG_MEMORY_HOTREMOVE=y
|
||||
CONFIG_KSM=y
|
||||
CONFIG_TRANSPARENT_HUGEPAGE=y
|
||||
|
@ -57,7 +57,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
|
||||
|
||||
#define __pte_free_tlb(tlb, pte, address) \
|
||||
do { \
|
||||
pagetable_pte_dtor(page_ptdesc(pte)); \
|
||||
pagetable_dtor(page_ptdesc(pte)); \
|
||||
tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
|
||||
} while (0)
|
||||
|
||||
|
@ -431,7 +431,7 @@ static void __init resource_init(void)
|
||||
|
||||
num_standard_resources = memblock.memory.cnt;
|
||||
res_size = num_standard_resources * sizeof(*standard_resources);
|
||||
standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES);
|
||||
standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES);
|
||||
|
||||
for_each_mem_region(region) {
|
||||
res = &standard_resources[i++];
|
||||
|
@ -174,9 +174,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr)
|
||||
pmd_t *pmd;
|
||||
|
||||
if (p4d_none(p4dp_get(p4d))) {
|
||||
pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!pud)
|
||||
panic("%s: Failed to allocate memory\n", __func__);
|
||||
pud = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
p4d_populate(&init_mm, p4d, pud);
|
||||
#ifndef __PAGETABLE_PUD_FOLDED
|
||||
pud_init(pud);
|
||||
@ -185,9 +183,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr)
|
||||
|
||||
pud = pud_offset(p4d, addr);
|
||||
if (pud_none(pudp_get(pud))) {
|
||||
pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!pmd)
|
||||
panic("%s: Failed to allocate memory\n", __func__);
|
||||
pmd = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
pud_populate(&init_mm, pud, pmd);
|
||||
#ifndef __PAGETABLE_PMD_FOLDED
|
||||
pmd_init(pmd);
|
||||
@ -198,10 +194,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr)
|
||||
if (!pmd_present(pmdp_get(pmd))) {
|
||||
pte_t *pte;
|
||||
|
||||
pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!pte)
|
||||
panic("%s: Failed to allocate memory\n", __func__);
|
||||
|
||||
pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
pmd_populate_kernel(&init_mm, pmd, pte);
|
||||
kernel_pte_init(pte);
|
||||
}
|
||||
|
@ -23,11 +23,10 @@ EXPORT_SYMBOL(tlb_virt_to_page);
|
||||
|
||||
pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *init, *ret = NULL;
|
||||
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
|
||||
pgd_t *init, *ret;
|
||||
|
||||
if (ptdesc) {
|
||||
ret = (pgd_t *)ptdesc_address(ptdesc);
|
||||
ret = __pgd_alloc(mm, 0);
|
||||
if (ret) {
|
||||
init = pgd_offset(&init_mm, 0UL);
|
||||
pgd_init(ret);
|
||||
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
|
||||
|
@ -629,7 +629,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -586,7 +586,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -606,7 +606,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -578,7 +578,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -588,7 +588,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -605,7 +605,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -692,7 +692,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -578,7 +578,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -579,7 +579,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -595,7 +595,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -575,7 +575,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -576,7 +576,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -37,7 +37,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable,
|
||||
{
|
||||
struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
|
||||
|
||||
pagetable_pte_dtor(ptdesc);
|
||||
pagetable_dtor(ptdesc);
|
||||
pagetable_free(ptdesc);
|
||||
}
|
||||
|
||||
@ -61,7 +61,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable)
|
||||
{
|
||||
struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
|
||||
|
||||
pagetable_pte_dtor(ptdesc);
|
||||
pagetable_dtor(ptdesc);
|
||||
pagetable_free(ptdesc);
|
||||
}
|
||||
|
||||
@ -73,7 +73,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable)
|
||||
|
||||
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
|
||||
{
|
||||
pagetable_free(virt_to_ptdesc(pgd));
|
||||
pagetable_dtor_free(virt_to_ptdesc(pgd));
|
||||
}
|
||||
|
||||
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
@ -84,6 +84,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
|
||||
if (!ptdesc)
|
||||
return NULL;
|
||||
pagetable_pgd_ctor(ptdesc);
|
||||
new_pgd = ptdesc_address(ptdesc);
|
||||
|
||||
memcpy(new_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t));
|
||||
|
@ -9,9 +9,9 @@ extern void mmu_page_ctor(void *page);
|
||||
extern void mmu_page_dtor(void *page);
|
||||
|
||||
enum m68k_table_types {
|
||||
TABLE_PGD = 0,
|
||||
TABLE_PMD = 0, /* same size as PGD */
|
||||
TABLE_PTE = 1,
|
||||
TABLE_PGD,
|
||||
TABLE_PMD,
|
||||
TABLE_PTE,
|
||||
};
|
||||
|
||||
extern void init_pointer_table(void *table, int type);
|
||||
|
@ -19,7 +19,7 @@ extern const char bad_pmd_string[];
|
||||
|
||||
#define __pte_free_tlb(tlb, pte, addr) \
|
||||
do { \
|
||||
pagetable_pte_dtor(page_ptdesc(pte)); \
|
||||
pagetable_dtor(page_ptdesc(pte)); \
|
||||
tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
|
||||
} while (0)
|
||||
|
||||
@ -43,7 +43,7 @@ static inline pgd_t * pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *new_pgd;
|
||||
|
||||
new_pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
|
||||
new_pgd = __pgd_alloc(mm, 0);
|
||||
memcpy(new_pgd, swapper_pg_dir, PAGE_SIZE);
|
||||
memset(new_pgd, 0, (PAGE_OFFSET >> PGDIR_SHIFT));
|
||||
return new_pgd;
|
||||
|
@ -68,10 +68,7 @@ void __init paging_init(void)
|
||||
|
||||
high_memory = (void *) end_mem;
|
||||
|
||||
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!empty_zero_page)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, PAGE_SIZE, PAGE_SIZE);
|
||||
empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT;
|
||||
free_area_init(max_zone_pfn);
|
||||
}
|
||||
|
@ -42,20 +42,14 @@ void __init paging_init(void)
|
||||
unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
|
||||
int i;
|
||||
|
||||
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!empty_zero_page)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, PAGE_SIZE, PAGE_SIZE);
|
||||
empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
|
||||
pg_dir = swapper_pg_dir;
|
||||
memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir));
|
||||
|
||||
size = num_pages * sizeof(pte_t);
|
||||
size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1);
|
||||
next_pgtable = (unsigned long) memblock_alloc(size, PAGE_SIZE);
|
||||
if (!next_pgtable)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, size, PAGE_SIZE);
|
||||
next_pgtable = (unsigned long) memblock_alloc_or_panic(size, PAGE_SIZE);
|
||||
|
||||
pg_dir += PAGE_OFFSET >> PGDIR_SHIFT;
|
||||
|
||||
|
@ -97,17 +97,19 @@ void mmu_page_dtor(void *page)
|
||||
|
||||
typedef struct list_head ptable_desc;
|
||||
|
||||
static struct list_head ptable_list[2] = {
|
||||
static struct list_head ptable_list[3] = {
|
||||
LIST_HEAD_INIT(ptable_list[0]),
|
||||
LIST_HEAD_INIT(ptable_list[1]),
|
||||
LIST_HEAD_INIT(ptable_list[2]),
|
||||
};
|
||||
|
||||
#define PD_PTABLE(page) ((ptable_desc *)&(virt_to_page((void *)(page))->lru))
|
||||
#define PD_PAGE(ptable) (list_entry(ptable, struct page, lru))
|
||||
#define PD_MARKBITS(dp) (*(unsigned int *)&PD_PAGE(dp)->index)
|
||||
|
||||
static const int ptable_shift[2] = {
|
||||
7+2, /* PGD, PMD */
|
||||
static const int ptable_shift[3] = {
|
||||
7+2, /* PGD */
|
||||
7+2, /* PMD */
|
||||
6+2, /* PTE */
|
||||
};
|
||||
|
||||
@ -156,12 +158,20 @@ void *get_pointer_table(int type)
|
||||
if (!(page = (void *)get_zeroed_page(GFP_KERNEL)))
|
||||
return NULL;
|
||||
|
||||
if (type == TABLE_PTE) {
|
||||
switch (type) {
|
||||
case TABLE_PTE:
|
||||
/*
|
||||
* m68k doesn't have SPLIT_PTE_PTLOCKS for not having
|
||||
* SMP.
|
||||
*/
|
||||
pagetable_pte_ctor(virt_to_ptdesc(page));
|
||||
break;
|
||||
case TABLE_PMD:
|
||||
pagetable_pmd_ctor(virt_to_ptdesc(page));
|
||||
break;
|
||||
case TABLE_PGD:
|
||||
pagetable_pgd_ctor(virt_to_ptdesc(page));
|
||||
break;
|
||||
}
|
||||
|
||||
mmu_page_ctor(page);
|
||||
@ -200,8 +210,7 @@ int free_pointer_table(void *table, int type)
|
||||
/* all tables in page are free, free page */
|
||||
list_del(dp);
|
||||
mmu_page_dtor((void *)page);
|
||||
if (type == TABLE_PTE)
|
||||
pagetable_pte_dtor(virt_to_ptdesc((void *)page));
|
||||
pagetable_dtor(virt_to_ptdesc((void *)page));
|
||||
free_page (page);
|
||||
return 1;
|
||||
} else if (ptable_list[type].next != dp) {
|
||||
@ -491,10 +500,7 @@ void __init paging_init(void)
|
||||
* initialize the bad page table and bad page to point
|
||||
* to a couple of allocated pages
|
||||
*/
|
||||
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!empty_zero_page)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, PAGE_SIZE, PAGE_SIZE);
|
||||
empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
|
||||
/*
|
||||
* Set up SFC/DFC registers
|
||||
|
@ -44,10 +44,7 @@ void __init paging_init(void)
|
||||
unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, };
|
||||
unsigned long size;
|
||||
|
||||
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!empty_zero_page)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, PAGE_SIZE, PAGE_SIZE);
|
||||
empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
|
||||
address = PAGE_OFFSET;
|
||||
pg_dir = swapper_pg_dir;
|
||||
@ -57,10 +54,7 @@ void __init paging_init(void)
|
||||
size = num_pages * sizeof(pte_t);
|
||||
size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1);
|
||||
|
||||
next_pgtable = (unsigned long)memblock_alloc(size, PAGE_SIZE);
|
||||
if (!next_pgtable)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, size, PAGE_SIZE);
|
||||
next_pgtable = (unsigned long)memblock_alloc_or_panic(size, PAGE_SIZE);
|
||||
bootmem_end = (next_pgtable + size + PAGE_SIZE) & PAGE_MASK;
|
||||
|
||||
/* Map whole memory from PAGE_OFFSET (0x0E000000) */
|
||||
|
@ -252,12 +252,8 @@ void __init dvma_init(void)
|
||||
|
||||
list_add(&(hole->list), &hole_list);
|
||||
|
||||
iommu_use = memblock_alloc(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long),
|
||||
iommu_use = memblock_alloc_or_panic(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long),
|
||||
SMP_CACHE_BYTES);
|
||||
if (!iommu_use)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
IOMMU_TOTAL_ENTRIES * sizeof(unsigned long));
|
||||
|
||||
dvma_unmap_iommu(DVMA_START, DVMA_SIZE);
|
||||
|
||||
sun3_dvma_init();
|
||||
|
@ -21,12 +21,7 @@
|
||||
|
||||
extern void __bad_pte(pmd_t *pmd);
|
||||
|
||||
static inline pgd_t *get_pgd(void)
|
||||
{
|
||||
return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
|
||||
}
|
||||
|
||||
#define pgd_alloc(mm) get_pgd()
|
||||
#define pgd_alloc(mm) __pgd_alloc(mm, 0)
|
||||
|
||||
extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
|
||||
#define __HAVE_ARCH_PMD_ALLOC_ONE
|
||||
#define __HAVE_ARCH_PUD_ALLOC_ONE
|
||||
#define __HAVE_ARCH_PGD_FREE
|
||||
#include <asm-generic/pgalloc.h>
|
||||
|
||||
static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
|
||||
@ -49,14 +48,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
|
||||
extern void pgd_init(void *addr);
|
||||
extern pgd_t *pgd_alloc(struct mm_struct *mm);
|
||||
|
||||
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
|
||||
{
|
||||
pagetable_free(virt_to_ptdesc(pgd));
|
||||
}
|
||||
|
||||
#define __pte_free_tlb(tlb, pte, address) \
|
||||
do { \
|
||||
pagetable_pte_dtor(page_ptdesc(pte)); \
|
||||
pagetable_dtor(page_ptdesc(pte)); \
|
||||
tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
|
||||
} while (0)
|
||||
|
||||
|
@ -704,10 +704,7 @@ static void __init resource_init(void)
|
||||
for_each_mem_range(i, &start, &end) {
|
||||
struct resource *res;
|
||||
|
||||
res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
|
||||
if (!res)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
sizeof(struct resource));
|
||||
res = memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES);
|
||||
|
||||
res->start = start;
|
||||
/*
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <linux/ioport.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/slab.h>
|
||||
@ -97,11 +98,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
|
||||
return -EINTR;
|
||||
|
||||
if (IS_ENABLED(CONFIG_MIPS_FP_SUPPORT)) {
|
||||
unsigned long unused;
|
||||
|
||||
/* Map delay slot emulation page */
|
||||
base = mmap_region(NULL, STACK_TOP, PAGE_SIZE,
|
||||
VM_READ | VM_EXEC |
|
||||
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
|
||||
0, NULL);
|
||||
base = do_mmap(NULL, STACK_TOP, PAGE_SIZE, PROT_READ | PROT_EXEC,
|
||||
MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, 0, 0, &unused,
|
||||
NULL);
|
||||
if (IS_ERR_VALUE(base)) {
|
||||
ret = base;
|
||||
goto out;
|
||||
|
@ -10,12 +10,10 @@
|
||||
|
||||
pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *init, *ret = NULL;
|
||||
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM,
|
||||
PGD_TABLE_ORDER);
|
||||
pgd_t *init, *ret;
|
||||
|
||||
if (ptdesc) {
|
||||
ret = ptdesc_address(ptdesc);
|
||||
ret = __pgd_alloc(mm, PGD_TABLE_ORDER);
|
||||
if (ret) {
|
||||
init = pgd_offset(&init_mm, 0UL);
|
||||
pgd_init(ret);
|
||||
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
|
||||
|
@ -30,7 +30,7 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm);
|
||||
|
||||
#define __pte_free_tlb(tlb, pte, addr) \
|
||||
do { \
|
||||
pagetable_pte_dtor(page_ptdesc(pte)); \
|
||||
pagetable_dtor(page_ptdesc(pte)); \
|
||||
tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
|
||||
} while (0)
|
||||
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <linux/sched.h>
|
||||
|
||||
#include <asm/cpuinfo.h>
|
||||
#include <asm/pgalloc.h>
|
||||
|
||||
/* pteaddr:
|
||||
* ptbase | vpn* | zero
|
||||
@ -54,7 +55,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *ret, *init;
|
||||
|
||||
ret = (pgd_t *) __get_free_page(GFP_KERNEL);
|
||||
ret = __pgd_alloc(mm, 0);
|
||||
if (ret) {
|
||||
init = pgd_offset(&init_mm, 0UL);
|
||||
pgd_init(ret);
|
||||
|
@ -41,15 +41,13 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
|
||||
*/
|
||||
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *ret = (pgd_t *)__get_free_page(GFP_KERNEL);
|
||||
pgd_t *ret = __pgd_alloc(mm, 0);
|
||||
|
||||
if (ret) {
|
||||
memset(ret, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
|
||||
if (ret)
|
||||
memcpy(ret + USER_PTRS_PER_PGD,
|
||||
swapper_pg_dir + USER_PTRS_PER_PGD,
|
||||
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
|
||||
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -68,7 +66,7 @@ extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
|
||||
|
||||
#define __pte_free_tlb(tlb, pte, addr) \
|
||||
do { \
|
||||
pagetable_pte_dtor(page_ptdesc(pte)); \
|
||||
pagetable_dtor(page_ptdesc(pte)); \
|
||||
tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
|
||||
} while (0)
|
||||
|
||||
|
@ -38,10 +38,7 @@ pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm)
|
||||
if (likely(mem_init_done)) {
|
||||
pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
|
||||
} else {
|
||||
pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!pte)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, PAGE_SIZE, PAGE_SIZE);
|
||||
pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
}
|
||||
|
||||
return pte;
|
||||
|
@ -11,27 +11,12 @@
|
||||
#include <asm/cache.h>
|
||||
|
||||
#define __HAVE_ARCH_PMD_ALLOC_ONE
|
||||
#define __HAVE_ARCH_PMD_FREE
|
||||
#define __HAVE_ARCH_PGD_FREE
|
||||
#include <asm-generic/pgalloc.h>
|
||||
|
||||
/* Allocate the top level pgd (page directory) */
|
||||
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
|
||||
pgd = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER);
|
||||
if (unlikely(pgd == NULL))
|
||||
return NULL;
|
||||
|
||||
memset(pgd, 0, PAGE_SIZE << PGD_TABLE_ORDER);
|
||||
|
||||
return pgd;
|
||||
}
|
||||
|
||||
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
|
||||
{
|
||||
free_pages((unsigned long)pgd, PGD_TABLE_ORDER);
|
||||
return __pgd_alloc(mm, PGD_TABLE_ORDER);
|
||||
}
|
||||
|
||||
#if CONFIG_PGTABLE_LEVELS == 3
|
||||
@ -46,17 +31,19 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
|
||||
|
||||
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
|
||||
{
|
||||
pmd_t *pmd;
|
||||
struct ptdesc *ptdesc;
|
||||
gfp_t gfp = GFP_PGTABLE_USER;
|
||||
|
||||
pmd = (pmd_t *)__get_free_pages(GFP_PGTABLE_KERNEL, PMD_TABLE_ORDER);
|
||||
if (likely(pmd))
|
||||
memset ((void *)pmd, 0, PAGE_SIZE << PMD_TABLE_ORDER);
|
||||
return pmd;
|
||||
if (mm == &init_mm)
|
||||
gfp = GFP_PGTABLE_KERNEL;
|
||||
ptdesc = pagetable_alloc(gfp, PMD_TABLE_ORDER);
|
||||
if (!ptdesc)
|
||||
return NULL;
|
||||
if (!pagetable_pmd_ctor(ptdesc)) {
|
||||
pagetable_free(ptdesc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
|
||||
{
|
||||
free_pages((unsigned long)pmd, PMD_TABLE_ORDER);
|
||||
return ptdesc_address(ptdesc);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -377,10 +377,8 @@ static void __ref map_pages(unsigned long start_vaddr,
|
||||
|
||||
#if CONFIG_PGTABLE_LEVELS == 3
|
||||
if (pud_none(*pud)) {
|
||||
pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER,
|
||||
pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER,
|
||||
PAGE_SIZE << PMD_TABLE_ORDER);
|
||||
if (!pmd)
|
||||
panic("pmd allocation failed.\n");
|
||||
pud_populate(NULL, pud, pmd);
|
||||
}
|
||||
#endif
|
||||
@ -388,9 +386,7 @@ static void __ref map_pages(unsigned long start_vaddr,
|
||||
pmd = pmd_offset(pud, vaddr);
|
||||
for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++, pmd++) {
|
||||
if (pmd_none(*pmd)) {
|
||||
pg_table = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!pg_table)
|
||||
panic("page table allocation failed\n");
|
||||
pg_table = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
pmd_populate_kernel(NULL, pmd, pg_table);
|
||||
}
|
||||
|
||||
@ -648,9 +644,7 @@ static void __init pagetable_init(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!empty_zero_page)
|
||||
panic("zero page allocation failed.\n");
|
||||
empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
|
||||
}
|
||||
|
||||
@ -687,19 +681,15 @@ static void __init fixmap_init(void)
|
||||
|
||||
#if CONFIG_PGTABLE_LEVELS == 3
|
||||
if (pud_none(*pud)) {
|
||||
pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER,
|
||||
pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER,
|
||||
PAGE_SIZE << PMD_TABLE_ORDER);
|
||||
if (!pmd)
|
||||
panic("fixmap: pmd allocation failed.\n");
|
||||
pud_populate(NULL, pud, pmd);
|
||||
}
|
||||
#endif
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
do {
|
||||
pte_t *pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!pte)
|
||||
panic("fixmap: pte allocation failed.\n");
|
||||
pte_t *pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
|
||||
|
||||
pmd_populate_kernel(&init_mm, pmd, pte);
|
||||
|
||||
|
@ -451,7 +451,6 @@ CONFIG_TEST_PRINTF=m
|
||||
CONFIG_TEST_SCANF=m
|
||||
CONFIG_TEST_BITMAP=m
|
||||
CONFIG_TEST_UUID=m
|
||||
CONFIG_TEST_XARRAY=m
|
||||
CONFIG_TEST_MAPLE_TREE=m
|
||||
CONFIG_TEST_RHASHTABLE=m
|
||||
CONFIG_TEST_IDA=m
|
||||
|
@ -37,6 +37,7 @@ extern void tlb_flush(struct mmu_gather *tlb);
|
||||
*/
|
||||
#define tlb_needs_table_invalidate() radix_enabled()
|
||||
|
||||
#define __HAVE_ARCH_TLB_REMOVE_TABLE
|
||||
/* Get the generic bits... */
|
||||
#include <asm-generic/tlb.h>
|
||||
|
||||
|
@ -1087,10 +1087,8 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char
|
||||
/* Count and allocate space for cpu features */
|
||||
of_scan_flat_dt_subnodes(node, count_cpufeatures_subnodes,
|
||||
&nr_dt_cpu_features);
|
||||
dt_cpu_features = memblock_alloc(sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, PAGE_SIZE);
|
||||
if (!dt_cpu_features)
|
||||
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
|
||||
__func__,
|
||||
dt_cpu_features =
|
||||
memblock_alloc_or_panic(
|
||||
sizeof(struct dt_cpu_feature) * nr_dt_cpu_features,
|
||||
PAGE_SIZE);
|
||||
|
||||
|
@ -213,11 +213,8 @@ pci_create_OF_bus_map(void)
|
||||
struct property* of_prop;
|
||||
struct device_node *dn;
|
||||
|
||||
of_prop = memblock_alloc(sizeof(struct property) + 256,
|
||||
of_prop = memblock_alloc_or_panic(sizeof(struct property) + 256,
|
||||
SMP_CACHE_BYTES);
|
||||
if (!of_prop)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
sizeof(struct property) + 256);
|
||||
dn = of_find_node_by_path("/");
|
||||
if (dn) {
|
||||
memset(of_prop, -1, sizeof(struct property) + 256);
|
||||
|
@ -458,11 +458,8 @@ void __init smp_setup_cpu_maps(void)
|
||||
|
||||
DBG("smp_setup_cpu_maps()\n");
|
||||
|
||||
cpu_to_phys_id = memblock_alloc(nr_cpu_ids * sizeof(u32),
|
||||
cpu_to_phys_id = memblock_alloc_or_panic(nr_cpu_ids * sizeof(u32),
|
||||
__alignof__(u32));
|
||||
if (!cpu_to_phys_id)
|
||||
panic("%s: Failed to allocate %zu bytes align=0x%zx\n",
|
||||
__func__, nr_cpu_ids * sizeof(u32), __alignof__(u32));
|
||||
|
||||
for_each_node_by_type(dn, "cpu") {
|
||||
const __be32 *intserv;
|
||||
|
@ -140,13 +140,7 @@ arch_initcall(ppc_init);
|
||||
|
||||
static void *__init alloc_stack(void)
|
||||
{
|
||||
void *ptr = memblock_alloc(THREAD_SIZE, THREAD_ALIGN);
|
||||
|
||||
if (!ptr)
|
||||
panic("cannot allocate %d bytes for stack at %pS\n",
|
||||
THREAD_SIZE, (void *)_RET_IP_);
|
||||
|
||||
return ptr;
|
||||
return memblock_alloc_or_panic(THREAD_SIZE, THREAD_ALIGN);
|
||||
}
|
||||
|
||||
void __init irqstack_early_init(void)
|
||||
|
@ -4957,7 +4957,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
|
||||
* states are synchronized from L0 to L1. L1 needs to inform L0 about
|
||||
* MER=1 only when there are pending external interrupts.
|
||||
* In the above if check, MER bit is set if there are pending
|
||||
* external interrupts. Hence, explicity mask off MER bit
|
||||
* external interrupts. Hence, explicitly mask off MER bit
|
||||
* here as otherwise it may generate spurious interrupts in L2 KVM
|
||||
* causing an endless loop, which results in L2 guest getting hung.
|
||||
*/
|
||||
|
@ -377,10 +377,7 @@ void __init MMU_init_hw(void)
|
||||
* Find some memory for the hash table.
|
||||
*/
|
||||
if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
|
||||
Hash = memblock_alloc(Hash_size, Hash_size);
|
||||
if (!Hash)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, Hash_size, Hash_size);
|
||||
Hash = memblock_alloc_or_panic(Hash_size, Hash_size);
|
||||
_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
|
||||
|
||||
pr_info("Total memory = %lldMB; using %ldkB for hash table\n",
|
||||
|
@ -253,7 +253,7 @@ static void pmd_frag_destroy(void *pmd_frag)
|
||||
count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
|
||||
/* We allow PTE_FRAG_NR fragments from a PTE page */
|
||||
if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
|
||||
pagetable_pmd_dtor(ptdesc);
|
||||
pagetable_dtor(ptdesc);
|
||||
pagetable_free(ptdesc);
|
||||
}
|
||||
}
|
||||
|
@ -330,11 +330,7 @@ void __init mmu_partition_table_init(void)
|
||||
unsigned long ptcr;
|
||||
|
||||
/* Initialize the Partition Table with no entries */
|
||||
partition_tb = memblock_alloc(patb_size, patb_size);
|
||||
if (!partition_tb)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, patb_size, patb_size);
|
||||
|
||||
partition_tb = memblock_alloc_or_panic(patb_size, patb_size);
|
||||
ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
|
||||
set_ptcr_when_no_uv(ptcr);
|
||||
powernv_set_nmmu_ptcr(ptcr);
|
||||
@ -477,7 +473,7 @@ void pmd_fragment_free(unsigned long *pmd)
|
||||
|
||||
BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
|
||||
if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
|
||||
pagetable_pmd_dtor(ptdesc);
|
||||
pagetable_dtor(ptdesc);
|
||||
pagetable_free(ptdesc);
|
||||
}
|
||||
}
|
||||
|
@ -40,19 +40,19 @@ static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgpr
|
||||
pgdp = pgd_offset_k(ea);
|
||||
p4dp = p4d_offset(pgdp, ea);
|
||||
if (kasan_pud_table(*p4dp)) {
|
||||
pudp = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
|
||||
pudp = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
|
||||
memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE);
|
||||
p4d_populate(&init_mm, p4dp, pudp);
|
||||
}
|
||||
pudp = pud_offset(p4dp, ea);
|
||||
if (kasan_pmd_table(*pudp)) {
|
||||
pmdp = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
|
||||
pmdp = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
|
||||
memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE);
|
||||
pud_populate(&init_mm, pudp, pmdp);
|
||||
}
|
||||
pmdp = pmd_offset(pudp, ea);
|
||||
if (kasan_pte_table(*pmdp)) {
|
||||
ptep = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
|
||||
ptep = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
|
||||
memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE);
|
||||
pmd_populate_kernel(&init_mm, pmdp, ptep);
|
||||
}
|
||||
@ -74,7 +74,7 @@ static void __init kasan_init_phys_region(void *start, void *end)
|
||||
k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE);
|
||||
k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE);
|
||||
|
||||
va = memblock_alloc(k_end - k_start, PAGE_SIZE);
|
||||
va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE);
|
||||
for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE)
|
||||
kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL);
|
||||
}
|
||||
|
@ -32,7 +32,7 @@ static void __init kasan_init_phys_region(void *start, void *end)
|
||||
k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE);
|
||||
k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE);
|
||||
|
||||
va = memblock_alloc(k_end - k_start, PAGE_SIZE);
|
||||
va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE);
|
||||
for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE)
|
||||
map_kernel_page(k_cur, __pa(va), PAGE_KERNEL);
|
||||
}
|
||||
|
@ -385,21 +385,11 @@ void __init mmu_context_init(void)
|
||||
/*
|
||||
* Allocate the maps used by context management
|
||||
*/
|
||||
context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
|
||||
if (!context_map)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
CTX_MAP_SIZE);
|
||||
context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1),
|
||||
context_map = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
|
||||
context_mm = memblock_alloc_or_panic(sizeof(void *) * (LAST_CONTEXT + 1),
|
||||
SMP_CACHE_BYTES);
|
||||
if (!context_mm)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
sizeof(void *) * (LAST_CONTEXT + 1));
|
||||
if (IS_ENABLED(CONFIG_SMP)) {
|
||||
stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
|
||||
if (!stale_map[boot_cpuid])
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__,
|
||||
CTX_MAP_SIZE);
|
||||
|
||||
stale_map[boot_cpuid] = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
|
||||
cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
|
||||
"powerpc/mmu/ctx:prepare",
|
||||
mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead);
|
||||
|
@ -25,7 +25,7 @@ void pte_frag_destroy(void *pte_frag)
|
||||
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
|
||||
/* We allow PTE_FRAG_NR fragments from a PTE page */
|
||||
if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
|
||||
pagetable_pte_dtor(ptdesc);
|
||||
pagetable_dtor(ptdesc);
|
||||
pagetable_free(ptdesc);
|
||||
}
|
||||
}
|
||||
@ -111,7 +111,7 @@ static void pte_free_now(struct rcu_head *head)
|
||||
struct ptdesc *ptdesc;
|
||||
|
||||
ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
|
||||
pagetable_pte_dtor(ptdesc);
|
||||
pagetable_dtor(ptdesc);
|
||||
pagetable_free(ptdesc);
|
||||
}
|
||||
|
||||
|
@ -50,13 +50,8 @@ notrace void __init early_ioremap_init(void)
|
||||
|
||||
void __init *early_alloc_pgtable(unsigned long size)
|
||||
{
|
||||
void *ptr = memblock_alloc(size, size);
|
||||
return memblock_alloc_or_panic(size, size);
|
||||
|
||||
if (!ptr)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, size, size);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va)
|
||||
|
@ -514,10 +514,7 @@ static int __init core99_nvram_setup(struct device_node *dp, unsigned long addr)
|
||||
printk(KERN_ERR "nvram: no address\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
nvram_image = memblock_alloc(NVRAM_SIZE, SMP_CACHE_BYTES);
|
||||
if (!nvram_image)
|
||||
panic("%s: Failed to allocate %u bytes\n", __func__,
|
||||
NVRAM_SIZE);
|
||||
nvram_image = memblock_alloc_or_panic(NVRAM_SIZE, SMP_CACHE_BYTES);
|
||||
nvram_data = ioremap(addr, NVRAM_SIZE*2);
|
||||
nvram_naddrs = 1; /* Make sure we get the correct case */
|
||||
|
||||
|
@ -88,26 +88,6 @@ static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
|
||||
}
|
||||
}
|
||||
|
||||
static void memtrace_clear_range(unsigned long start_pfn,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
unsigned long pfn;
|
||||
|
||||
/* As HIGHMEM does not apply, use clear_page() directly. */
|
||||
for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
|
||||
if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
|
||||
cond_resched();
|
||||
clear_page(__va(PFN_PHYS(pfn)));
|
||||
}
|
||||
/*
|
||||
* Before we go ahead and use this range as cache inhibited range
|
||||
* flush the cache.
|
||||
*/
|
||||
flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn),
|
||||
(unsigned long)pfn_to_kaddr(start_pfn + nr_pages),
|
||||
FLUSH_CHUNK_SIZE);
|
||||
}
|
||||
|
||||
static u64 memtrace_alloc_node(u32 nid, u64 size)
|
||||
{
|
||||
const unsigned long nr_pages = PHYS_PFN(size);
|
||||
@ -119,17 +99,18 @@ static u64 memtrace_alloc_node(u32 nid, u64 size)
|
||||
* by alloc_contig_pages().
|
||||
*/
|
||||
page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE |
|
||||
__GFP_NOWARN, nid, NULL);
|
||||
__GFP_NOWARN | __GFP_ZERO, nid, NULL);
|
||||
if (!page)
|
||||
return 0;
|
||||
start_pfn = page_to_pfn(page);
|
||||
|
||||
/*
|
||||
* Clear the range while we still have a linear mapping.
|
||||
*
|
||||
* TODO: use __GFP_ZERO with alloc_contig_pages() once supported.
|
||||
* Before we go ahead and use this range as cache inhibited range
|
||||
* flush the cache.
|
||||
*/
|
||||
memtrace_clear_range(start_pfn, nr_pages);
|
||||
flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn),
|
||||
(unsigned long)pfn_to_kaddr(start_pfn + nr_pages),
|
||||
FLUSH_CHUNK_SIZE);
|
||||
|
||||
/*
|
||||
* Set pages PageOffline(), to indicate that nobody (e.g., hibernation,
|
||||
|
@ -180,10 +180,7 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
|
||||
/*
|
||||
* Allocate a buffer to hold the MC recoverable ranges.
|
||||
*/
|
||||
mc_recoverable_range = memblock_alloc(size, __alignof__(u64));
|
||||
if (!mc_recoverable_range)
|
||||
panic("%s: Failed to allocate %u bytes align=0x%lx\n",
|
||||
__func__, size, __alignof__(u64));
|
||||
mc_recoverable_range = memblock_alloc_or_panic(size, __alignof__(u64));
|
||||
|
||||
for (i = 0; i < mc_recoverable_range_len; i++) {
|
||||
mc_recoverable_range[i].start_addr =
|
||||
|
@ -115,10 +115,7 @@ static void __init prealloc(struct ps3_prealloc *p)
|
||||
if (!p->size)
|
||||
return;
|
||||
|
||||
p->address = memblock_alloc(p->size, p->align);
|
||||
if (!p->address)
|
||||
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
|
||||
__func__, p->size, p->align);
|
||||
p->address = memblock_alloc_or_panic(p->size, p->align);
|
||||
|
||||
printk(KERN_INFO "%s: %lu bytes at %p\n", p->name, p->size,
|
||||
p->address);
|
||||
|
@ -544,7 +544,7 @@ static int drc_pmem_query_health(struct papr_scm_priv *p)
|
||||
|
||||
/* Jiffies offset for which the health data is assumed to be same */
|
||||
cache_timeout = p->lasthealth_jiffies +
|
||||
msecs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL * 1000);
|
||||
secs_to_jiffies(MIN_HEALTH_QUERY_INTERVAL);
|
||||
|
||||
/* Fetch new health info is its older than MIN_HEALTH_QUERY_INTERVAL */
|
||||
if (time_after(jiffies, cache_timeout))
|
||||
|
@ -124,10 +124,7 @@ int __ref msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count,
|
||||
if (bmp->bitmap_from_slab)
|
||||
bmp->bitmap = kzalloc(size, GFP_KERNEL);
|
||||
else {
|
||||
bmp->bitmap = memblock_alloc(size, SMP_CACHE_BYTES);
|
||||
if (!bmp->bitmap)
|
||||
panic("%s: Failed to allocate %u bytes\n", __func__,
|
||||
size);
|
||||
bmp->bitmap = memblock_alloc_or_panic(size, SMP_CACHE_BYTES);
|
||||
/* the bitmap won't be freed from memblock allocator */
|
||||
kmemleak_not_leak(bmp->bitmap);
|
||||
}
|
||||
|
@ -12,17 +12,26 @@
|
||||
#include <asm/tlb.h>
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
#define __HAVE_ARCH_PUD_ALLOC_ONE
|
||||
#define __HAVE_ARCH_PUD_FREE
|
||||
#include <asm-generic/pgalloc.h>
|
||||
|
||||
/*
|
||||
* While riscv platforms with riscv_ipi_for_rfence as true require an IPI to
|
||||
* perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use
|
||||
* SBI to perform TLB shootdown. To keep software pagetable walkers safe in this
|
||||
* case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the
|
||||
* comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
|
||||
* for more details.
|
||||
*/
|
||||
static inline void riscv_tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt)
|
||||
{
|
||||
if (riscv_use_sbi_for_rfence())
|
||||
if (riscv_use_sbi_for_rfence()) {
|
||||
tlb_remove_ptdesc(tlb, pt);
|
||||
else
|
||||
} else {
|
||||
pagetable_dtor(pt);
|
||||
tlb_remove_page_ptdesc(tlb, pt);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void pmd_populate_kernel(struct mm_struct *mm,
|
||||
pmd_t *pmd, pte_t *pte)
|
||||
@ -88,15 +97,6 @@ static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd,
|
||||
}
|
||||
}
|
||||
|
||||
#define pud_alloc_one pud_alloc_one
|
||||
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
|
||||
{
|
||||
if (pgtable_l4_enabled)
|
||||
return __pud_alloc_one(mm, addr);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#define pud_free pud_free
|
||||
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
|
||||
{
|
||||
@ -107,39 +107,8 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
|
||||
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
|
||||
unsigned long addr)
|
||||
{
|
||||
if (pgtable_l4_enabled) {
|
||||
struct ptdesc *ptdesc = virt_to_ptdesc(pud);
|
||||
|
||||
pagetable_pud_dtor(ptdesc);
|
||||
riscv_tlb_remove_ptdesc(tlb, ptdesc);
|
||||
}
|
||||
}
|
||||
|
||||
#define p4d_alloc_one p4d_alloc_one
|
||||
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
|
||||
{
|
||||
if (pgtable_l5_enabled) {
|
||||
gfp_t gfp = GFP_PGTABLE_USER;
|
||||
|
||||
if (mm == &init_mm)
|
||||
gfp = GFP_PGTABLE_KERNEL;
|
||||
return (p4d_t *)get_zeroed_page(gfp);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d)
|
||||
{
|
||||
BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
|
||||
free_page((unsigned long)p4d);
|
||||
}
|
||||
|
||||
#define p4d_free p4d_free
|
||||
static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
|
||||
{
|
||||
if (pgtable_l5_enabled)
|
||||
__p4d_free(mm, p4d);
|
||||
if (pgtable_l4_enabled)
|
||||
riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
|
||||
}
|
||||
|
||||
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
|
||||
@ -161,9 +130,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
|
||||
pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
|
||||
pgd = __pgd_alloc(mm, 0);
|
||||
if (likely(pgd != NULL)) {
|
||||
memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
|
||||
/* Copy kernel mappings */
|
||||
sync_kernel_mappings(pgd);
|
||||
}
|
||||
@ -175,10 +143,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
|
||||
unsigned long addr)
|
||||
{
|
||||
struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
|
||||
|
||||
pagetable_pmd_dtor(ptdesc);
|
||||
riscv_tlb_remove_ptdesc(tlb, ptdesc);
|
||||
riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
|
||||
}
|
||||
|
||||
#endif /* __PAGETABLE_PMD_FOLDED */
|
||||
@ -186,10 +151,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
|
||||
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
|
||||
unsigned long addr)
|
||||
{
|
||||
struct ptdesc *ptdesc = page_ptdesc(pte);
|
||||
|
||||
pagetable_pte_dtor(ptdesc);
|
||||
riscv_tlb_remove_ptdesc(tlb, ptdesc);
|
||||
riscv_tlb_remove_ptdesc(tlb, page_ptdesc(pte));
|
||||
}
|
||||
#endif /* CONFIG_MMU */
|
||||
|
||||
|
@ -10,24 +10,6 @@ struct mmu_gather;
|
||||
|
||||
static void tlb_flush(struct mmu_gather *tlb);
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
#include <linux/swap.h>
|
||||
|
||||
/*
|
||||
* While riscv platforms with riscv_ipi_for_rfence as true require an IPI to
|
||||
* perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use
|
||||
* SBI to perform TLB shootdown. To keep software pagetable walkers safe in this
|
||||
* case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the
|
||||
* comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
|
||||
* for more details.
|
||||
*/
|
||||
static inline void __tlb_remove_table(void *table)
|
||||
{
|
||||
free_page_and_swap_cache(table);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_MMU */
|
||||
|
||||
#define tlb_flush tlb_flush
|
||||
#include <asm-generic/tlb.h>
|
||||
|
||||
|
@ -147,9 +147,7 @@ static void __init init_resources(void)
|
||||
res_idx = num_resources - 1;
|
||||
|
||||
mem_res_sz = num_resources * sizeof(*mem_res);
|
||||
mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES);
|
||||
if (!mem_res)
|
||||
panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz);
|
||||
mem_res = memblock_alloc_or_panic(mem_res_sz, SMP_CACHE_BYTES);
|
||||
|
||||
/*
|
||||
* Start by adding the reserved regions, if they overlap
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user