Merge branch 'mm-everything' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
# Conflicts:
#	include/linux/pagevec.h
commit 5f9df76887
@@ -22,6 +22,7 @@ Description:
    MMUPageSize:           4 kB
    Rss:                 884 kB
    Pss:                 385 kB
+   Pss_Dirty:            68 kB
    Pss_Anon:            301 kB
    Pss_File:             80 kB
    Pss_Shmem:             4 kB
@@ -1667,6 +1667,19 @@
	hlt		[BUGS=ARM,SH]

+	hostname=	[KNL] Set the hostname (aka UTS nodename).
+			Format: <string>
+			This allows setting the system's hostname during early
+			startup. This sets the name returned by gethostname.
+			Using this parameter to set the hostname makes it
+			possible to ensure the hostname is correctly set before
+			any userspace processes run, avoiding the possibility
+			that a process may call gethostname before the hostname
+			has been explicitly set, resulting in the calling
+			process getting an incorrect result. The string must
+			not exceed the maximum allowed hostname length (usually
+			64 characters) and will be truncated otherwise.
+
	hpet=		[X86-32,HPET] option to control HPET usage
			Format: { enable (default) | disable | force |
				verbose }
@@ -1722,9 +1735,11 @@
			Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
			the default is on.

-			This is not compatible with memory_hotplug.memmap_on_memory.
-			If both parameters are enabled, hugetlb_free_vmemmap takes
-			precedence over memory_hotplug.memmap_on_memory.
+			Note that the vmemmap pages may be allocated from the added
+			memory block itself when memory_hotplug.memmap_on_memory is
+			enabled, those vmemmap pages cannot be optimized even if this
+			feature is enabled. Other vmemmap pages not allocated from
+			the added memory block itself are not affected.

	hung_task_panic=
			[KNL] Should the hung task detector generate panics.
@@ -3083,10 +3098,12 @@
			[KNL,X86,ARM] Boolean flag to enable this feature.
			Format: {on | off (default)}
			When enabled, runtime hotplugged memory will
-			allocate its internal metadata (struct pages)
-			from the hotadded memory which will allow to
-			hotadd a lot of memory without requiring
-			additional memory to do so.
+			allocate its internal metadata (struct pages,
+			those vmemmap pages cannot be optimized even
+			if hugetlb_free_vmemmap is enabled) from the
+			hotadded memory which will allow to hotadd a
+			lot of memory without requiring additional
+			memory to do so.
			This feature is disabled by default because it
			has some implication on large (e.g. GB)
			allocations in some configurations (e.g. small
@@ -3096,10 +3113,6 @@
			Note that even when enabled, there are a few cases where
			the feature is not effective.

-			This is not compatible with hugetlb_free_vmemmap. If
-			both parameters are enabled, hugetlb_free_vmemmap takes
-			precedence over memory_hotplug.memmap_on_memory.
-
	memtest=	[KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
			Format: <integer>
			default : 0 <disable>
@@ -14,3 +14,4 @@ optimize those.
   start
   usage
   reclaim
+   lru_sort

Documentation/admin-guide/mm/damon/lru_sort.rst (new file, 294 lines)
@@ -0,0 +1,294 @@
.. SPDX-License-Identifier: GPL-2.0

=============================
DAMON-based LRU-lists Sorting
=============================

DAMON-based LRU-lists Sorting (DAMON_LRU_SORT) is a static kernel module that
aims to be used for proactive and lightweight data access pattern based
(de)prioritization of pages on their LRU-lists for making LRU-lists a more
trustworthy data access pattern source.

Where Proactive LRU-lists Sorting is Required?
==============================================

As page-granularity access checking overhead could be significant on huge
systems, LRU lists are normally not proactively sorted but partially and
reactively sorted for special events including specific user requests, system
calls and memory pressure. As a result, LRU lists are sometimes not so
perfectly prepared to be used as a trustworthy access pattern source for some
situations including reclamation target pages selection under sudden memory
pressure.

Because DAMON can identify access patterns of best-effort accuracy while
inducing only user-specified range of overhead, proactively running
DAMON_LRU_SORT could be helpful for making LRU lists more trustworthy access
pattern source with low and controlled overhead.

How It Works?
=============

DAMON_LRU_SORT finds hot pages (pages of memory regions showing access rates
higher than a user-specified threshold) and cold pages (pages of memory
regions showing no access for a time longer than a user-specified threshold)
using DAMON, and prioritizes hot pages while deprioritizing cold pages on
their LRU-lists. To avoid it consuming too much CPU for the prioritizations, a
CPU time usage limit can be configured. Under the limit, it prioritizes and
deprioritizes more hot and cold pages first, respectively. System
administrators can also configure under what situation this scheme should be
automatically activated and deactivated with three memory pressure watermarks.

Its default parameters for hotness/coldness thresholds and CPU quota limit are
conservatively chosen. That is, the module under its default parameters could
be widely used without harm for common situations while providing a level of
benefits for systems having clear hot/cold access patterns under memory
pressure while consuming only a limited small portion of CPU time.

Interface: Module Parameters
============================

To use this feature, you should first ensure your system is running on a kernel
that is built with ``CONFIG_DAMON_LRU_SORT=y``.

To let sysadmins enable or disable it and tune for the given system,
DAMON_LRU_SORT utilizes module parameters. That is, you can put
``damon_lru_sort.<parameter>=<value>`` on the kernel boot command line or write
proper values to ``/sys/modules/damon_lru_sort/parameters/<parameter>`` files.

Below are the descriptions of each parameter.

enabled
-------

Enable or disable DAMON_LRU_SORT.

You can enable DAMON_LRU_SORT by setting the value of this parameter as ``Y``.
Setting it as ``N`` disables DAMON_LRU_SORT. Note that DAMON_LRU_SORT could do
no real monitoring and LRU-lists sorting due to the watermarks-based activation
condition. Refer to the descriptions of the watermark parameters below for
this.

commit_inputs
-------------

Make DAMON_LRU_SORT read the input parameters again, except ``enabled``.

Input parameters that are updated while DAMON_LRU_SORT is running are not
applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT reads
values of parameters except ``enabled`` again. Once the re-reading is done,
this parameter is set as ``N``. If invalid parameters are found while the
re-reading, DAMON_LRU_SORT will be disabled.

hot_thres_access_freq
---------------------

Access frequency threshold for hot memory regions identification in permil.

If a memory region is accessed in frequency of this or higher, DAMON_LRU_SORT
identifies the region as hot, and marks it as accessed on the LRU list, so that
it could not be reclaimed under memory pressure. 50% by default.

cold_min_age
------------

Time threshold for cold memory regions identification in microseconds.

If a memory region is not accessed for this or longer time, DAMON_LRU_SORT
identifies the region as cold, and marks it as unaccessed on the LRU list, so
that it could be reclaimed first under memory pressure. 120 seconds by
default.

quota_ms
--------

Limit of time for trying the LRU lists sorting in milliseconds.

DAMON_LRU_SORT tries to use only up to this time within a time window
(quota_reset_interval_ms) for trying LRU lists sorting. This can be used
for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the
limit is disabled.

10 ms by default.

quota_reset_interval_ms
-----------------------

The time quota charge reset interval in milliseconds.

The charge reset interval for the quota of time (quota_ms). That is,
DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms
milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds.

1 second by default.

wmarks_interval
---------------

The watermarks check time interval in microseconds.

Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is
enabled but inactive due to its watermarks rule. 5 seconds by default.

wmarks_high
-----------

Free memory rate (per thousand) for the high watermark.

If free memory of the system in bytes per thousand bytes is higher than this,
DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks the
watermarks. 200 (20%) by default.

wmarks_mid
----------

Free memory rate (per thousand) for the middle watermark.

If free memory of the system in bytes per thousand bytes is between this and
the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring and
the LRU-lists sorting. 150 (15%) by default.

wmarks_low
----------

Free memory rate (per thousand) for the low watermark.

If free memory of the system in bytes per thousand bytes is lower than this,
DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks the
watermarks. 50 (5%) by default.

sample_interval
---------------

Sampling interval for the monitoring in microseconds.

The sampling interval of DAMON for the cold memory monitoring. Please refer to
the DAMON documentation (:doc:`usage`) for more detail. 5 ms by default.

aggr_interval
-------------

Aggregation interval for the monitoring in microseconds.

The aggregation interval of DAMON for the cold memory monitoring. Please
refer to the DAMON documentation (:doc:`usage`) for more detail. 100 ms by
default.

min_nr_regions
--------------

Minimum number of monitoring regions.

The minimal number of monitoring regions of DAMON for the cold memory
monitoring. This can be used to set a lower bound on the monitoring quality.
But, setting this too high could result in increased monitoring overhead.
Please refer to the DAMON documentation (:doc:`usage`) for more detail. 10 by
default.

max_nr_regions
--------------

Maximum number of monitoring regions.

The maximum number of monitoring regions of DAMON for the cold memory
monitoring. This can be used to set an upper bound on the monitoring overhead.
However, setting this too low could result in bad monitoring quality. Please
refer to the DAMON documentation (:doc:`usage`) for more detail. 1000 by
default.

monitor_region_start
--------------------

Start of target memory region in physical address.

The start physical address of memory region that DAMON_LRU_SORT will do work
against. By default, the biggest System RAM region is used.

monitor_region_end
------------------

End of target memory region in physical address.

The end physical address of memory region that DAMON_LRU_SORT will do work
against. By default, the biggest System RAM region is used.

kdamond_pid
-----------

PID of the DAMON thread.

If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. Else,
-1.

nr_lru_sort_tried_hot_regions
-----------------------------

Number of hot memory regions for which LRU sorting was tried.

bytes_lru_sort_tried_hot_regions
--------------------------------

Total bytes of hot memory regions for which LRU sorting was tried.

nr_lru_sorted_hot_regions
-------------------------

Number of hot memory regions that were successfully LRU-sorted.

bytes_lru_sorted_hot_regions
----------------------------

Total bytes of hot memory regions that were successfully LRU-sorted.

nr_hot_quota_exceeds
--------------------

Number of times that the time quota limit for hot regions has been exceeded.

nr_lru_sort_tried_cold_regions
------------------------------

Number of cold memory regions for which LRU sorting was tried.

bytes_lru_sort_tried_cold_regions
---------------------------------

Total bytes of cold memory regions for which LRU sorting was tried.

nr_lru_sorted_cold_regions
--------------------------

Number of cold memory regions that were successfully LRU-sorted.

bytes_lru_sorted_cold_regions
-----------------------------

Total bytes of cold memory regions that were successfully LRU-sorted.

nr_cold_quota_exceeds
---------------------

Number of times that the time quota limit for cold regions has been exceeded.

Example
=======

The runtime example commands below make DAMON_LRU_SORT find memory regions
having >=50% access frequency and LRU-prioritize them, while LRU-deprioritizing
memory regions that are not accessed for 120 seconds. The prioritization and
deprioritization is limited to using only up to 1% CPU time to avoid
DAMON_LRU_SORT consuming too much CPU time for the (de)prioritization. It also
asks DAMON_LRU_SORT to do nothing if the system's free memory rate is more than
50%, but to start the real work if it becomes lower than 40%. If DAMON_RECLAIM
doesn't make progress and therefore the free memory rate becomes lower than
20%, it asks DAMON_LRU_SORT to do nothing again, so that we can fall back to
the LRU-list based page granularity reclamation. ::

    # cd /sys/modules/damon_lru_sort/parameters
    # echo 500 > hot_thres_access_freq
    # echo 120000000 > cold_min_age
    # echo 10 > quota_ms
    # echo 1000 > quota_reset_interval_ms
    # echo 500 > wmarks_high
    # echo 400 > wmarks_mid
    # echo 200 > wmarks_low
    # echo Y > enabled
@@ -48,12 +48,6 @@ DAMON_RECLAIM utilizes module parameters. That is, you can put
``damon_reclaim.<parameter>=<value>`` on the kernel boot command line or write
proper values to ``/sys/modules/damon_reclaim/parameters/<parameter>`` files.

-Note that the parameter values except ``enabled`` are applied only when
-DAMON_RECLAIM starts. Therefore, if you want to apply new parameter values in
-runtime and DAMON_RECLAIM is already enabled, you should disable and re-enable
-it via ``enabled`` parameter file. Writing of the new values to proper
-parameter values should be done before the re-enablement.
-
Below are the description of each parameter.

enabled
@@ -264,6 +264,8 @@ that can be written to and read from the file and their meaning are as below.
- ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
- ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
- ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
+- ``lru_prio``: Prioritize the region on its LRU lists.
+- ``lru_deprio``: Deprioritize the region on its LRU lists.
- ``stat``: Do nothing but count the statistics

schemes/<N>/access_pattern/
@@ -36,6 +36,7 @@ the Linux memory management.
   numa_memory_policy
   numaperf
   pagemap
+   shrinker_debugfs
   soft-dirty
   swap_numa
   transhuge

Documentation/admin-guide/mm/shrinker_debugfs.rst (new file, 135 lines)
@@ -0,0 +1,135 @@
.. _shrinker_debugfs:

==========================
Shrinker Debugfs Interface
==========================

Shrinker debugfs interface provides visibility into the kernel memory
shrinkers subsystem and allows one to get information about individual
shrinkers and interact with them.

For each shrinker registered in the system a directory in **<debugfs>/shrinker/**
is created. The directory's name is composed from the shrinker's name and a
unique id: e.g. *kfree_rcu-0* or *sb-xfs:vda1-36*.

Each shrinker directory contains **count** and **scan** files, which allow one
to trigger *count_objects()* and *scan_objects()* callbacks for each memcg and
numa node (if applicable).

Usage:
------

1. *List registered shrinkers*

  ::

    $ cd /sys/kernel/debug/shrinker/
    $ ls
    dquota-cache-16     sb-devpts-28     sb-proc-47       sb-tmpfs-42
    mm-shadow-18        sb-devtmpfs-5    sb-proc-48       sb-tmpfs-43
    mm-zspool:zram0-34  sb-hugetlbfs-17  sb-pstore-31     sb-tmpfs-44
    rcu-kfree-0         sb-hugetlbfs-33  sb-rootfs-2      sb-tmpfs-49
    sb-aio-20           sb-iomem-12      sb-securityfs-6  sb-tracefs-13
    sb-anon_inodefs-15  sb-mqueue-21     sb-selinuxfs-22  sb-xfs:vda1-36
    sb-bdev-3           sb-nsfs-4        sb-sockfs-8      sb-zsmalloc-19
    sb-bpf-32           sb-pipefs-14     sb-sysfs-26      thp-deferred_split-10
    sb-btrfs:vda2-24    sb-proc-25       sb-tmpfs-1       thp-zero-9
    sb-cgroup2-30       sb-proc-39       sb-tmpfs-27      xfs-buf:vda1-37
    sb-configfs-23      sb-proc-41       sb-tmpfs-29      xfs-inodegc:vda1-38
    sb-dax-11           sb-proc-45       sb-tmpfs-35
    sb-debugfs-7        sb-proc-46       sb-tmpfs-40

2. *Get information about a specific shrinker*

  ::

    $ cd sb-btrfs\:vda2-24/
    $ ls
    count  scan

3. *Count objects*

  Each line in the output has the following format::

    <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ...
    <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ...
    ...

  If there are no objects on any numa node, the line is omitted. If there
  are no objects at all, the output might be empty.

  If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is printed
  as cgroup inode id. If the shrinker is not numa-aware, 0's are printed
  for all nodes except the first one.
  ::

    $ cat count
    1 224 2
    21 98 0
    55 818 10
    2367 2 0
    2401 30 0
    225 13 0
    599 35 0
    939 124 0
    1041 3 0
    1075 1 0
    1109 1 0
    1279 60 0
    1313 7 0
    1347 39 0
    1381 3 0
    1449 14 0
    1483 63 0
    1517 53 0
    1551 6 0
    1585 1 0
    1619 6 0
    1653 40 0
    1687 11 0
    1721 8 0
    1755 4 0
    1789 52 0
    1823 888 0
    1857 1 0
    1925 2 0
    1959 32 0
    2027 22 0
    2061 9 0
    2469 799 0
    2537 861 0
    2639 1 0
    2707 70 0
    2775 4 0
    2877 84 0
    293 1 0
    735 8 0

4. *Scan objects*

  The expected input format::

    <cgroup inode id> <numa id> <number of objects to scan>

  For a non-memcg-aware shrinker or on a system with no memory
  cgroups **0** should be passed as cgroup id.
  ::

    $ cd /sys/kernel/debug/shrinker/
    $ cd sb-btrfs\:vda2-24/

    $ cat count | head -n 5
    1 212 0
    21 97 0
    55 802 5
    2367 2 0
    225 13 0

    $ echo "55 0 200" > scan

    $ cat count | head -n 5
    1 212 0
    21 96 0
    55 752 5
    2367 2 0
    225 13 0
@@ -17,7 +17,10 @@ of the ``PROT_NONE+SIGSEGV`` trick.
Design
======

Userfaults are delivered and resolved through the ``userfaultfd`` syscall.
+Userspace creates a new userfaultfd, initializes it, and registers one or more
+regions of virtual memory with it. Then, any page faults which occur within the
+region(s) result in a message being delivered to the userfaultfd, notifying
+userspace of the fault.

The ``userfaultfd`` (aside from registering and unregistering virtual
memory ranges) provides two primary functionalities:
@@ -34,12 +37,11 @@ The real advantage of userfaults if compared to regular virtual memory
management of mremap/mprotect is that the userfaults in all their
operations never involve heavyweight structures like vmas (in fact the
``userfaultfd`` runtime load never takes the mmap_lock for writing).

Vmas are not suitable for page- (or hugepage) granular fault tracking
when dealing with virtual address spaces that could span
Terabytes. Too many vmas would be needed for that.

-The ``userfaultfd`` once opened by invoking the syscall, can also be
+The ``userfaultfd``, once created, can also be
passed using unix domain sockets to a manager process, so the same
manager process could handle the userfaults of a multitude of
different processes without them being aware about what is going on
@@ -50,6 +52,38 @@ is a corner case that would currently return ``-EBUSY``).
API
===

+Creating a userfaultfd
+----------------------
+
+There are two ways to create a new userfaultfd, each of which provide ways to
+restrict access to this functionality (since historically userfaultfds which
+handle kernel page faults have been a useful tool for exploiting the kernel).
+
+The first way, supported by older kernels, is the userfaultfd(2) syscall.
+Access to this is controlled in several ways:
+
+- By default, the userfaultfd will be able to handle kernel page faults. This
+  can be disabled by passing in UFFD_USER_MODE_ONLY.
+
+- If vm.unprivileged_userfaultfd is 0, then the caller must *either* have
+  CAP_SYS_PTRACE, or pass in UFFD_USER_MODE_ONLY.
+
+- If vm.unprivileged_userfaultfd is 1, then no particular privilege is needed to
+  use this syscall, even if UFFD_USER_MODE_ONLY is *not* set.
+
+The second way, added to the kernel more recently, is by opening and issuing a
+USERFAULTFD_IOC_NEW ioctl to /dev/userfaultfd. This method yields equivalent
+userfaultfds to the userfaultfd(2) syscall; its benefit is in how access to
+creating userfaultfds is controlled.
+
+Access to /dev/userfaultfd is controlled via normal filesystem permissions
+(user/group/mode for example), which gives fine grained access to userfaultfd
+specifically, without also granting other unrelated privileges at the same time
+(as e.g. granting CAP_SYS_PTRACE would do).
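
A minimal userspace sketch of this second creation path (an illustration added
here, not part of the patch; error handling is trimmed and the flag choice is
just an example)::

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/userfaultfd.h>

    static int uffd_from_dev(void)
    {
        /* Open the device; access is governed by its file permissions. */
        int dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
        if (dev < 0)
            return -1;

        /* The ioctl argument carries the same flags userfaultfd(2) accepts. */
        int uffd = ioctl(dev, USERFAULTFD_IOC_NEW, O_CLOEXEC | O_NONBLOCK);
        close(dev);
        if (uffd < 0)
            return -1;

        /* Handshake with UFFDIO_API before using any other uffd ioctl. */
        struct uffdio_api api = { .api = UFFD_API, .features = 0 };
        if (ioctl(uffd, UFFDIO_API, &api) < 0) {
            close(uffd);
            return -1;
        }
        return uffd;
    }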

+Initializing a userfaultfd
+--------------------------
+
When first opened the ``userfaultfd`` must be enabled invoking the
``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or
a later API version) which will specify the ``read/POLLIN`` protocol
@@ -565,9 +565,8 @@ See Documentation/admin-guide/mm/hugetlbpage.rst
hugetlb_optimize_vmemmap
========================

-This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter)
-is configured or the size of 'struct page' (a structure defined in
-include/linux/mm_types.h) is not power of two (an unusual system config could
+This knob is not available when the size of 'struct page' (a structure defined
+in include/linux/mm_types.h) is not power of two (an unusual system config could
result in this).

Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
@@ -928,6 +927,9 @@ calls without any restrictions.

The default value is 0.

+An alternative to this sysctl / the userfaultfd(2) syscall is to create
+userfaultfds via /dev/userfaultfd. See
+Documentation/admin-guide/mm/userfaultfd.rst.
+
user_reserve_kbytes
===================
@@ -36,6 +36,7 @@ Library functionality that is used throughout the kernel.
   kref
   assoc_array
   xarray
+   maple_tree
   idr
   circular-buffers
   rbtree

Documentation/core-api/maple_tree.rst (new file, 217 lines)
@@ -0,0 +1,217 @@
.. SPDX-License-Identifier: GPL-2.0+


==========
Maple Tree
==========

:Author: Liam R. Howlett

Overview
========

The Maple Tree is a B-Tree data type which is optimized for storing
non-overlapping ranges, including ranges of size 1. The tree was designed to
be simple to use and does not require a user written search method. It
supports iterating over a range of entries and going to the previous or next
entry in a cache-efficient manner. The tree can also be put into an RCU-safe
mode of operation which allows reading and writing concurrently. Writers must
synchronize on a lock, which can be the default spinlock, or the user can set
the lock to an external lock of a different type.

The Maple Tree maintains a small memory footprint and was designed to use
modern processor cache efficiently. The majority of the users will be able to
use the normal API. An :ref:`maple-tree-advanced-api` exists for more complex
scenarios. The most important usage of the Maple Tree is the tracking of the
virtual memory areas.

The Maple Tree can store values between ``0`` and ``ULONG_MAX``. The Maple
Tree reserves values with the bottom two bits set to '10' which are below 4096
(ie 2, 6, 10 .. 4094) for internal use. If the entries may use reserved
entries then the users can convert the entries using xa_mk_value() and convert
them back by calling xa_to_value(). If the user needs to use a reserved
value, then the user can convert the value when using the
:ref:`maple-tree-advanced-api`, but are blocked by the normal API.

The Maple Tree can also be configured to support searching for a gap of a given
size (or larger).

Pre-allocating of nodes is also supported using the
:ref:`maple-tree-advanced-api`. This is useful for users who must guarantee a
successful store operation within a given code segment when allocating cannot
be done. Allocations of nodes are relatively small at around 256 bytes.

.. _maple-tree-normal-api:

Normal API
==========

Start by initialising a maple tree, either with DEFINE_MTREE() for statically
allocated maple trees or mt_init() for dynamically allocated ones. A
freshly-initialised maple tree contains a ``NULL`` pointer for the range ``0``
- ``ULONG_MAX``. There are currently two types of maple trees supported: the
allocation tree and the regular tree. The regular tree has a higher branching
factor for internal nodes. The allocation tree has a lower branching factor
but allows the user to search for a gap of a given size or larger from either
``0`` upwards or ``ULONG_MAX`` down. An allocation tree can be used by
passing in the ``MT_FLAGS_ALLOC_RANGE`` flag when initialising the tree.

You can then set entries using mtree_store() or mtree_store_range().
mtree_store() will overwrite any entry with the new entry and return 0 on
success or an error code otherwise. mtree_store_range() works in the same way
but takes a range. mtree_load() is used to retrieve the entry stored at a
given index. You can use mtree_erase() to erase an entire range by only
knowing one value within that range, or a mtree_store() call with an entry of
NULL may be used to partially erase a range or many ranges at once.

If you want to only store a new entry to a range (or index) if that range is
currently ``NULL``, you can use mtree_insert_range() or mtree_insert() which
return -EEXIST if the range is not empty.

You can search for an entry from an index upwards by using mt_find().

You can walk each entry within a range by calling mt_for_each(). You must
provide a temporary variable to store a cursor. If you want to walk each
element of the tree then ``0`` and ``ULONG_MAX`` may be used as the range. If
the caller is going to hold the lock for the duration of the walk then it is
worth looking at the mas_for_each() API in the :ref:`maple-tree-advanced-api`
section.

Sometimes it is necessary to ensure the next call to store to a maple tree does
not allocate memory, please see :ref:`maple-tree-advanced-api` for this use case.

Finally, you can remove all entries from a maple tree by calling
mtree_destroy(). If the maple tree entries are pointers, you may wish to free
the entries first.
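
A rough sketch of the normal API described above (an illustration added by the
editor, not part of the patch; assumes kernel context where ``GFP_KERNEL``
allocations are allowed)::

    #include <linux/maple_tree.h>
    #include <linux/printk.h>

    static DEFINE_MTREE(mt);    /* statically initialised regular tree */

    static int maple_example(void)
    {
            void *entry;
            unsigned long index = 0;
            int ret;

            /* Store "A" over the range [10, 19] and "B" at index 30. */
            ret = mtree_store_range(&mt, 10, 19, (void *)"A", GFP_KERNEL);
            if (ret)
                    return ret;
            ret = mtree_store(&mt, 30, (void *)"B", GFP_KERNEL);
            if (ret)
                    return ret;

            entry = mtree_load(&mt, 15);        /* returns "A" */

            /* Walk every entry; index is the search cursor and advances. */
            mt_for_each(&mt, entry, index, ULONG_MAX)
                    pr_info("found entry %s\n", (char *)entry);

            mtree_erase(&mt, 12);       /* erases the whole [10, 19] range */
            mtree_destroy(&mt);
            return 0;
    }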

Allocating Nodes
----------------

The allocations are handled by the internal tree code. See
:ref:`maple-tree-advanced-alloc` for other options.

Locking
-------

You do not have to worry about locking. See :ref:`maple-tree-advanced-locks`
for other options.

The Maple Tree uses RCU and an internal spinlock to synchronise access:

Takes RCU read lock:
 * mtree_load()
 * mt_find()
 * mt_for_each()
 * mt_next()
 * mt_prev()

Takes ma_lock internally:
 * mtree_store()
 * mtree_store_range()
 * mtree_insert()
 * mtree_insert_range()
 * mtree_erase()
 * mtree_destroy()
 * mt_set_in_rcu()
 * mt_clear_in_rcu()

If you want to take advantage of the internal lock to protect the data
structures that you are storing in the Maple Tree, you can call mtree_lock()
before calling mtree_load(), then take a reference count on the object you
have found before calling mtree_unlock(). This will prevent stores from
removing the object from the tree between looking up the object and
incrementing the refcount. You can also use RCU to avoid dereferencing
freed memory, but an explanation of that is beyond the scope of this
document.
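
The lookup-then-refcount pattern described above might look roughly like this
(an editorial sketch, not from the patch; ``struct my_obj`` and its kref are
made-up names for illustration)::

    #include <linux/maple_tree.h>
    #include <linux/kref.h>

    struct my_obj {
            struct kref ref;
            /* ... payload ... */
    };

    static struct my_obj *lookup_and_get(struct maple_tree *mt,
                                         unsigned long index)
    {
            struct my_obj *obj;

            mtree_lock(mt);                 /* take the internal ma_lock */
            obj = mtree_load(mt, index);
            if (obj)
                    kref_get(&obj->ref);    /* pin before dropping the lock */
            mtree_unlock(mt);

            return obj;
    }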

.. _maple-tree-advanced-api:

Advanced API
============

The advanced API offers more flexibility and better performance at the
cost of an interface which can be harder to use and has fewer safeguards.
You must take care of your own locking while using the advanced API.
You can use the ma_lock, RCU or an external lock for protection.
You can mix advanced and normal operations on the same array, as long
as the locking is compatible. The :ref:`maple-tree-normal-api` is implemented
in terms of the advanced API.

The advanced API is based around the ma_state; this is where the 'mas'
prefix originates. The ma_state struct keeps track of tree operations to make
life easier for both internal and external tree users.

Initialising the maple tree is the same as in the :ref:`maple-tree-normal-api`.
Please see above.

The maple state keeps track of the range start and end in mas->index and
mas->last, respectively.

mas_walk() will walk the tree to the location of mas->index and set the
mas->index and mas->last according to the range for the entry.

You can set entries using mas_store(). mas_store() will overwrite any entry
with the new entry and return the first existing entry that is overwritten.
The range is passed in as members of the maple state: index and last.

You can use mas_erase() to erase an entire range by setting index and
last of the maple state to the desired range to erase. This will erase
the first range that is found in that range, set the maple state index
and last as the range that was erased and return the entry that existed
at that location.

You can walk each entry within a range by using mas_for_each(). If you want
to walk each element of the tree then ``0`` and ``ULONG_MAX`` may be used as
the range. If the lock needs to be periodically dropped, see the locking
section mas_pause().

Using a maple state allows mas_next() and mas_prev() to function as if the
tree was a linked list. With such a high branching factor the amortized
performance penalty is outweighed by cache optimization. mas_next() will
return the next entry which occurs after the entry at index. mas_prev()
will return the previous entry which occurs before the entry at index.

mas_find() will find the first entry which exists at or above index on
the first call, and the next entry on every subsequent call.

mas_find_rev() will find the first entry which exists at or below the last on
the first call, and the previous entry on every subsequent call.

If the user needs to yield the lock during an operation, then the maple state
must be paused using mas_pause().
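
As an editorial sketch of the iteration and mas_pause() pattern described
above (not part of the patch; assumes the internal spinlock is used and the
caller owns the locking, as the advanced API requires)::

    #include <linux/maple_tree.h>
    #include <linux/sched.h>
    #include <linux/printk.h>

    static void dump_range(struct maple_tree *mt, unsigned long first,
                           unsigned long last)
    {
            MA_STATE(mas, mt, first, last); /* mas.index = first, mas.last = last */
            void *entry;

            mas_lock(&mas);                 /* advanced API: caller locks */
            mas_for_each(&mas, entry, last) {
                    /* mas.index/mas.last now describe this entry's range */
                    pr_info("entry spans [%lu, %lu]\n", mas.index, mas.last);

                    if (need_resched()) {
                            mas_pause(&mas);        /* safe point to yield */
                            mas_unlock(&mas);
                            cond_resched();
                            mas_lock(&mas);
                    }
            }
            mas_unlock(&mas);
    }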

There are a few extra interfaces provided when using an allocation tree.
If you wish to search for a gap within a range, then mas_empty_area()
or mas_empty_area_rev() can be used. mas_empty_area() searches for a gap
starting at the lowest index given up to the maximum of the range.
mas_empty_area_rev() searches for a gap starting at the highest index given
and continues downward to the lower bound of the range.

.. _maple-tree-advanced-alloc:

Advanced Allocating Nodes
-------------------------

Allocations are usually handled internally to the tree, however if allocations
need to occur before a write occurs then calling mas_expected_entries() will
allocate the worst-case number of needed nodes to insert the provided number of
ranges. This also causes the tree to enter mass insertion mode. Once
insertions are complete calling mas_destroy() on the maple state will free the
unused allocations.

.. _maple-tree-advanced-locks:

Advanced Locking
----------------

The maple tree uses a spinlock by default, but external locks can be used for
tree updates as well. To use an external lock, the tree must be initialized
with the ``MT_FLAGS_LOCK_EXTERN`` flag; this is usually done with the
MTREE_INIT_EXT() #define, which takes an external lock as an argument.

Functions and structures
========================

.. kernel-doc:: include/linux/maple_tree.h
.. kernel-doc:: lib/maple_tree.c
@@ -448,6 +448,7 @@ Memory Area, or VMA) there is a series of lines such as the following::
    MMUPageSize:           4 kB
    Rss:                 892 kB
    Pss:                 374 kB
+   Pss_Dirty:             0 kB
    Shared_Clean:        892 kB
    Shared_Dirty:          0 kB
    Private_Clean:         0 kB
@@ -479,7 +480,9 @@ dirty shared and private pages in the mapping.
The "proportional set size" (PSS) of a process is the count of pages it has
in memory, where each page is divided by the number of processes sharing it.
So if a process has 1000 pages all to itself, and 1000 shared with one other
-process, its PSS will be 1500.
+process, its PSS will be 1500. "Pss_Dirty" is the portion of PSS which
+consists of dirty pages. ("Pss_Clean" is not included, but it can be
+calculated by subtracting "Pss_Dirty" from "Pss".)

Note that even a page which is part of a MAP_SHARED mapping, but has only
a single pte mapped, i.e. is currently used by only one process, is accounted
@@ -514,8 +517,10 @@ replaced by copy-on-write) part of the underlying shmem object out on swap.
"SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this
does not take into account swapped out page of underlying shmem objects.
"Locked" indicates whether the mapping is locked in memory or not.

"THPeligible" indicates whether the mapping is eligible for allocating THP
-pages - 1 if true, 0 otherwise. It just shows the current status.
+pages as well as the THP is PMD mappable or not - 1 if true, 0 otherwise.
+It just shows the current status.

"VmFlags" field deserves a separate description. This member represents the
kernel flags associated with the particular virtual memory area in two letter
@@ -1886,13 +1891,14 @@ if precise results are needed.
3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file
---------------------------------------------------------------
This file provides information associated with an opened file. The regular
-files have at least four fields -- 'pos', 'flags', 'mnt_id' and 'ino'.
+files have at least five fields -- 'pos', 'flags', 'mnt_id', 'ino', and 'size'.

The 'pos' represents the current offset of the opened file in decimal
form [see lseek(2) for details], 'flags' denotes the octal O_xxx mask the
file has been created with [see open(2) for details] and 'mnt_id' represents
mount ID of the file system containing the opened file [see 3.5
/proc/<pid>/mountinfo for details]. 'ino' represents the inode number of
-the file.
+the file, and 'size' represents the size of the file in bytes.

A typical output is::

@@ -1900,11 +1906,15 @@ A typical output is::
    flags:  0100002
    mnt_id: 19
    ino:    63107
+   size:   0

All locks associated with a file descriptor are shown in its fdinfo too::

    lock:   1: FLOCK ADVISORY WRITE 359 00:13:11691 0 EOF

+Files with anonymous inodes have an additional 'path' field which represents
+the anonymous file path.
+
The files such as eventfd, fsnotify, signalfd, epoll among the regular pos/flags
pair provide additional information particular to the objects they represent.

@@ -1917,6 +1927,8 @@ Eventfd files
    flags:  04002
    mnt_id: 9
    ino:    63107
+   size:   0
+   path:   anon_inode:[eventfd]
    eventfd-count:  5a

where 'eventfd-count' is hex value of a counter.
@@ -1930,6 +1942,8 @@ Signalfd files
    flags:  04002
    mnt_id: 9
    ino:    63107
+   size:   0
+   path:   anon_inode:[signalfd]
    sigmask:        0000000000000200

where 'sigmask' is hex value of the signal mask associated
@@ -1944,6 +1958,8 @@ Epoll files
    flags:  02
    mnt_id: 9
    ino:    63107
+   size:   0
+   path:   anon_inode:[eventpoll]
    tfd:    5 events: 1d data: ffffffffffffffff pos:0 ino:61af sdev:7

where 'tfd' is a target file descriptor number in decimal form,
@@ -1962,6 +1978,8 @@ For inotify files the format is the following::
    flags:  02000000
    mnt_id: 9
    ino:    63107
+   size:   0
+   path:   anon_inode:inotify
    inotify wd:3 ino:9e7e sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:7e9e0000640d1b6d

where 'wd' is a watch descriptor in decimal form, i.e. a target file
@@ -1985,6 +2003,8 @@ For fanotify files the format is::
    flags:  02
    mnt_id: 9
    ino:    63107
+   size:   0
+   path:   anon_inode:[fanotify]
    fanotify flags:10 event-flags:0
    fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003
    fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4
@@ -2010,6 +2030,8 @@ Timerfd files
    flags:  02
    mnt_id: 9
    ino:    63107
+   size:   0
+   path:   anon_inode:[timerfd]
    clockid: 0
    ticks: 0
    settime flags: 01
@@ -2034,6 +2056,7 @@ DMA Buffer files
    mnt_id: 9
    ino:    63107
    size:   32768
+   path:   /dmabuf:
    count:  2
    exp_name:       system-heap
@@ -6,7 +6,7 @@ Memory Balancing

Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>

-Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
+Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as
well as for non __GFP_IO allocations.

The first reason why a caller may avoid reclaim is that the caller can not
MAINTAINERS (12 lines changed)

@@ -11923,6 +11923,18 @@ L:	linux-man@vger.kernel.org
S:	Maintained
W:	http://www.kernel.org/doc/man-pages

+MAPLE TREE
+M:	Liam R. Howlett <Liam.Howlett@oracle.com>
+L:	linux-mm@kvack.org
+S:	Supported
+F:	Documentation/core-api/maple_tree.rst
+F:	include/linux/maple_tree.h
+F:	include/trace/events/maple_tree.h
+F:	lib/maple_tree.c
+F:	lib/test_maple_tree.c
+F:	tools/testing/radix-tree/linux/maple_tree.h
+F:	tools/testing/radix-tree/maple.c
+
MARDUK (CREATOR CI40) DEVICE TREE SUPPORT
M:	Rahul Bedarkar <rahulbedarkar89@gmail.com>
L:	linux-mips@vger.kernel.org
@@ -46,9 +46,6 @@ extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
                           pte_t *ptep, unsigned long sz);
#define __HAVE_ARCH_HUGE_PTEP_GET
extern pte_t huge_ptep_get(pte_t *ptep);
-extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
-                                pte_t *ptep, pte_t pte, unsigned long sz);
-#define set_huge_swap_pte_at set_huge_swap_pte_at

void __init arm64_hugetlb_cma_reserve(void);
@@ -8,9 +8,9 @@
#include <asm/cpufeature.h>
#include <asm/mte.h>

-#define for_each_mte_vma(tsk, vma)                                     \
+#define for_each_mte_vma(vmi, vma)                                     \
        if (system_supports_mte())                                      \
-               for (vma = tsk->mm->mmap; vma; vma = vma->vm_next)      \
+               for_each_vma(vmi, vma)                                  \
                        if (vma->vm_flags & VM_MTE)

static unsigned long mte_vma_tag_dump_size(struct vm_area_struct *vma)
@@ -81,8 +81,9 @@ Elf_Half elf_core_extra_phdrs(void)
{
        struct vm_area_struct *vma;
        int vma_count = 0;
+       VMA_ITERATOR(vmi, current->mm, 0);

-       for_each_mte_vma(current, vma)
+       for_each_mte_vma(vmi, vma)
                vma_count++;

        return vma_count;
@@ -91,8 +92,9 @@ Elf_Half elf_core_extra_phdrs(void)
int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
{
        struct vm_area_struct *vma;
+       VMA_ITERATOR(vmi, current->mm, 0);

-       for_each_mte_vma(current, vma) {
+       for_each_mte_vma(vmi, vma) {
                struct elf_phdr phdr;

                phdr.p_type = PT_AARCH64_MEMTAG_MTE;
@@ -116,8 +118,9 @@ size_t elf_core_extra_data_size(void)
{
        struct vm_area_struct *vma;
        size_t data_size = 0;
+       VMA_ITERATOR(vmi, current->mm, 0);

-       for_each_mte_vma(current, vma)
+       for_each_mte_vma(vmi, vma)
                data_size += mte_vma_tag_dump_size(vma);

        return data_size;
@@ -126,8 +129,9 @@ size_t elf_core_extra_data_size(void)
int elf_core_write_extra_data(struct coredump_params *cprm)
{
        struct vm_area_struct *vma;
+       VMA_ITERATOR(vmi, current->mm, 0);

-       for_each_mte_vma(current, vma) {
+       for_each_mte_vma(vmi, vma) {
                if (vma->vm_flags & VM_DONTDUMP)
                        continue;
@@ -136,10 +136,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
        struct mm_struct *mm = task->mm;
        struct vm_area_struct *vma;
+       VMA_ITERATOR(vmi, mm, 0);

        mmap_read_lock(mm);

-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+       for_each_vma(vmi, vma) {
                unsigned long size = vma->vm_end - vma->vm_start;

                if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm))
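The same mechanical conversion from the mm's vma linked list to the maple-tree
VMA iterator repeats in the hunks below. As an editorial reference (a sketch,
not taken from the patch; the function is hypothetical and the caller is
assumed to hold mmap_lock), the pattern looks roughly like this:

    #include <linux/mm.h>

    static unsigned long count_vmas(struct mm_struct *mm)
    {
            struct vm_area_struct *vma;
            unsigned long nr = 0;
            VMA_ITERATOR(vmi, mm, 0);       /* iterate from address 0 upwards */

            mmap_assert_locked(mm);         /* caller must hold mmap_lock */

            /*
             * Old style (removed by this series):
             *     for (vma = mm->mmap; vma; vma = vma->vm_next)
             * New style: walk the maple tree that now backs the mm.
             */
            for_each_vma(vmi, vma)
                    nr++;

            return nr;
    }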
@@ -238,6 +238,13 @@ static void clear_flush(struct mm_struct *mm,
        flush_tlb_range(&vma, saddr, addr);
}

+static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry)
+{
+       VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry));
+
+       return page_folio(pfn_to_page(swp_offset(entry)));
+}
+
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pte)
{
@@ -247,11 +254,16 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
        unsigned long pfn, dpfn;
        pgprot_t hugeprot;

-       /*
-        * Code needs to be expanded to handle huge swap and migration
-        * entries. Needed for HUGETLB and MEMORY_FAILURE.
-        */
-       WARN_ON(!pte_present(pte));
+       if (!pte_present(pte)) {
+               struct folio *folio;
+
+               folio = hugetlb_swap_entry_to_folio(pte_to_swp_entry(pte));
+               ncontig = num_contig_ptes(folio_size(folio), &pgsize);
+
+               for (i = 0; i < ncontig; i++, ptep++)
+                       set_pte_at(mm, addr, ptep, pte);
+               return;
+       }

        if (!pte_cont(pte)) {
                set_pte_at(mm, addr, ptep, pte);
@@ -269,18 +281,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
}

-void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
-                         pte_t *ptep, pte_t pte, unsigned long sz)
-{
-       int i, ncontig;
-       size_t pgsize;
-
-       ncontig = num_contig_ptes(sz, &pgsize);
-
-       for (i = 0; i < ncontig; i++, ptep++)
-               set_pte(ptep, pte);
-}
-
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, unsigned long sz)
{
@@ -368,6 +368,26 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
        return NULL;
}

+unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+       unsigned long hp_size = huge_page_size(h);
+
+       switch (hp_size) {
+       case PUD_SIZE:
+               return PGDIR_SIZE - PUD_SIZE;
+       case CONT_PMD_SIZE:
+               return PUD_SIZE - CONT_PMD_SIZE;
+       case PMD_SIZE:
+               return PUD_SIZE - PMD_SIZE;
+       case CONT_PTE_SIZE:
+               return PMD_SIZE - CONT_PTE_SIZE;
+       default:
+               break;
+       }
+
+       return 0UL;
+}
+
pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
        size_t pagesize = 1UL << shift;
@@ -230,7 +230,7 @@ void mips_mt_set_cpuoptions(void)

struct class *mt_class;

-static int __init mt_init(void)
+static int __init mips_mt_init(void)
{
        struct class *mtc;

@@ -243,4 +243,4 @@ static int __init mt_init(void)
        return 0;
}

-subsys_initcall(mt_init);
+subsys_initcall(mips_mt_init);
@@ -660,15 +660,20 @@ static inline unsigned long mm_total_size(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        unsigned long usize = 0;
+       VMA_ITERATOR(vmi, mm, 0);

-       for (vma = mm->mmap; vma && usize < parisc_cache_flush_threshold; vma = vma->vm_next)
+       for_each_vma(vmi, vma) {
+               if (usize >= parisc_cache_flush_threshold)
+                       break;
                usize += vma->vm_end - vma->vm_start;
+       }
        return usize;
}

void flush_cache_mm(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
+       VMA_ITERATOR(vmi, mm, 0);

        /*
         * Flushing the whole cache on each cpu takes forever on
@@ -688,7 +693,7 @@ void flush_cache_mm(struct mm_struct *mm)
        }

        /* Flush mm */
-       for (vma = mm->mmap; vma; vma = vma->vm_next)
+       for_each_vma(vmi, vma)
                flush_cache_pages(vma, vma->vm_start, vma->vm_end);
}
@@ -113,18 +113,18 @@ struct vdso_data *arch_get_vdso_data(void *vvar_page)

int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
        struct mm_struct *mm = task->mm;
+       VMA_ITERATOR(vmi, mm, 0);
        struct vm_area_struct *vma;

        mmap_read_lock(mm);

-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+       for_each_vma(vmi, vma) {
                unsigned long size = vma->vm_end - vma->vm_start;

                if (vma_is_special_mapping(vma, &vvar_spec))
                        zap_page_range(vma, vma->vm_start, size);
        }

        mmap_read_unlock(mm);

        return 0;
}
@@ -81,14 +81,15 @@ EXPORT_SYMBOL(hash__flush_range);

void hash__flush_tlb_mm(struct mm_struct *mm)
{
        struct vm_area_struct *mp;
+       VMA_ITERATOR(vmi, mm, 0);

        /*
-        * It is safe to go down the mm's list of vmas when called
-        * from dup_mmap, holding mmap_lock. It would also be safe from
-        * unmap_region or exit_mmap, but not from vmtruncate on SMP -
-        * but it seems dup_mmap is the only SMP case which gets here.
+        * It is safe to iterate the vmas when called from dup_mmap,
+        * holding mmap_lock. It would also be safe from unmap_region
+        * or exit_mmap, but not from vmtruncate on SMP - but it seems
+        * dup_mmap is the only SMP case which gets here.
         */
-       for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+       for_each_vma(vmi, mp)
                hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
}
EXPORT_SYMBOL(hash__flush_tlb_mm);
|
||||
unsigned long len)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
VMA_ITERATOR(vmi, mm, addr);
|
||||
|
||||
/*
|
||||
* We don't try too hard, we just mark all the vma in that range
|
||||
* VM_NOHUGEPAGE and split them.
|
||||
*/
|
||||
vma = find_vma(mm, addr);
|
||||
/*
|
||||
* If the range is in unmapped range, just return
|
||||
*/
|
||||
if (vma && ((addr + len) <= vma->vm_start))
|
||||
return;
|
||||
|
||||
while (vma) {
|
||||
if (vma->vm_start >= (addr + len))
|
||||
break;
|
||||
for_each_vma_range(vmi, vma, addr + len) {
|
||||
vma->vm_flags |= VM_NOHUGEPAGE;
|
||||
walk_page_vma(vma, &subpage_walk_ops, NULL);
|
||||
vma = vma->vm_next;
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
@@ -114,11 +114,12 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
        struct mm_struct *mm = task->mm;
        struct vm_area_struct *vma;
+       VMA_ITERATOR(vmi, mm, 0);
        struct __vdso_info *vdso_info = mm->context.vdso_info;

        mmap_read_lock(mm);

-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+       for_each_vma(vmi, vma) {
                unsigned long size = vma->vm_end - vma->vm_start;

                if (vma_is_special_mapping(vma, vdso_info->dm))
@@ -69,10 +69,11 @@ static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
        struct mm_struct *mm = task->mm;
+       VMA_ITERATOR(vmi, mm, 0);
        struct vm_area_struct *vma;

        mmap_read_lock(mm);
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+       for_each_vma(vmi, vma) {
                unsigned long size = vma->vm_end - vma->vm_start;

                if (!vma_is_special_mapping(vma, &vvar_mapping))
@@ -2515,8 +2515,9 @@ static const struct mm_walk_ops thp_split_walk_ops = {
static inline void thp_split_mm(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
+       VMA_ITERATOR(vmi, mm, 0);

-       for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+       for_each_vma(vmi, vma) {
                vma->vm_flags &= ~VM_HUGEPAGE;
                vma->vm_flags |= VM_NOHUGEPAGE;
                walk_page_vma(vma, &thp_split_walk_ops, NULL);
@@ -2584,8 +2585,9 @@ int gmap_mark_unmergeable(void)
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        int ret;
+       VMA_ITERATOR(vmi, mm, 0);

-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+       for_each_vma(vmi, vma) {
                ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
                                  MADV_UNMERGEABLE, &vma->vm_flags);
                if (ret)
@@ -584,21 +584,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,

void flush_tlb_mm(struct mm_struct *mm)
{
-       struct vm_area_struct *vma = mm->mmap;
+       struct vm_area_struct *vma;
+       VMA_ITERATOR(vmi, mm, 0);

-       while (vma != NULL) {
+       for_each_vma(vmi, vma)
                fix_range(mm, vma->vm_start, vma->vm_end, 0);
-               vma = vma->vm_next;
-       }
}

void force_flush_all(void)
{
        struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma = mm->mmap;
+       struct vm_area_struct *vma;
+       VMA_ITERATOR(vmi, mm, 0);

-       while (vma != NULL) {
+       for_each_vma(vmi, vma)
                fix_range(mm, vma->vm_start, vma->vm_end, 1);
-               vma = vma->vm_next;
-       }
}
@@ -127,17 +127,17 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
        struct mm_struct *mm = task->mm;
        struct vm_area_struct *vma;
+       VMA_ITERATOR(vmi, mm, 0);

        mmap_read_lock(mm);

-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+       for_each_vma(vmi, vma) {
                unsigned long size = vma->vm_end - vma->vm_start;

                if (vma_is_special_mapping(vma, &vvar_mapping))
                        zap_page_range(vma, vma->vm_start, size);
        }

        mmap_read_unlock(mm);

        return 0;
}
#else
@@ -354,6 +354,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
+       VMA_ITERATOR(vmi, mm, 0);

        mmap_write_lock(mm);
        /*
@@ -363,7 +364,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
         * We could search vma near context.vdso, but it's a slowpath,
         * so let's explicitly check all VMAs to be completely sure.
         */
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+       for_each_vma(vmi, vma) {
                if (vma_is_special_mapping(vma, &vdso_mapping) ||
                    vma_is_special_mapping(vma, &vvar_mapping)) {
                        mmap_write_unlock(mm);
@@ -96,7 +96,7 @@ void __init tboot_probe(void)

static pgd_t *tboot_pg_dir;
static struct mm_struct tboot_mm = {
-       .mm_rb = RB_ROOT,
+       .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock),
        .pgd = swapper_pg_dir,
        .mm_users = ATOMIC_INIT(2),
        .mm_count = ATOMIC_INIT(1),
@@ -6699,7 +6699,7 @@ int kvm_mmu_vendor_module_init(void)
        if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
                goto out;

-       ret = register_shrinker(&mmu_shrinker);
+       ret = register_shrinker(&mmu_shrinker, "x86-mmu");
        if (ret)
                goto out;
@@ -58,6 +58,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct vm_area_struct *vmm;
struct vma_iterator vmi;

if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
@@ -79,15 +80,20 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
else
addr = PAGE_ALIGN(addr);

for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
/* At this point: (!vmm || addr < vmm->vm_end). */
if (TASK_SIZE - len < addr)
return -ENOMEM;
if (!vmm || addr + len <= vm_start_gap(vmm))
return addr;
vma_iter_init(&vmi, current->mm, addr);
for_each_vma(vmi, vmm) {
/* At this point: (addr < vmm->vm_end). */
if (addr + len <= vm_start_gap(vmm))
break;

addr = vmm->vm_end;
if (flags & MAP_SHARED)
addr = COLOUR_ALIGN(addr, pgoff);
}

if (TASK_SIZE - len < addr)
return -ENOMEM;

return addr;
}
#endif
@@ -213,7 +213,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,

if (mm) {
mmap_read_lock(mm);
vma = alloc->vma;
vma = vma_lookup(mm, alloc->vma_addr);
}

if (!vma && need_mm) {
@@ -313,16 +313,22 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
static inline void binder_alloc_set_vma(struct binder_alloc *alloc,
struct vm_area_struct *vma)
{
if (vma)
alloc->vma_vm_mm = vma->vm_mm;
unsigned long vm_start = 0;

/*
* If we see alloc->vma is not NULL, buffer data structures set up
* completely. Look at smp_rmb side binder_alloc_get_vma.
* We also want to guarantee new alloc->vma_vm_mm is always visible
* if alloc->vma is set.
* Allow clearing the vma with holding just the read lock to allow
* munmapping downgrade of the write lock before freeing and closing the
* file using binder_alloc_vma_close().
*/
smp_wmb();
alloc->vma = vma;
if (vma) {
vm_start = vma->vm_start;
alloc->vma_vm_mm = vma->vm_mm;
mmap_assert_write_locked(alloc->vma_vm_mm);
} else {
mmap_assert_locked(alloc->vma_vm_mm);
}

alloc->vma_addr = vm_start;
}

static inline struct vm_area_struct *binder_alloc_get_vma(
@@ -330,11 +336,9 @@ static inline struct vm_area_struct *binder_alloc_get_vma(
{
struct vm_area_struct *vma = NULL;

if (alloc->vma) {
/* Look at description in binder_alloc_set_vma */
smp_rmb();
vma = alloc->vma;
}
if (alloc->vma_addr)
vma = vma_lookup(alloc->vma_vm_mm, alloc->vma_addr);

return vma;
}

@@ -817,7 +821,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc)

buffers = 0;
mutex_lock(&alloc->mutex);
BUG_ON(alloc->vma);
BUG_ON(alloc->vma_addr &&
vma_lookup(alloc->vma_vm_mm, alloc->vma_addr));

while ((n = rb_first(&alloc->allocated_buffers))) {
buffer = rb_entry(n, struct binder_buffer, rb_node);
@@ -1084,7 +1089,7 @@ int binder_alloc_shrinker_init(void)
int ret = list_lru_init(&binder_alloc_lru);

if (ret == 0) {
ret = register_shrinker(&binder_shrinker);
ret = register_shrinker(&binder_shrinker, "android-binder");
if (ret)
list_lru_destroy(&binder_alloc_lru);
}
@@ -100,7 +100,7 @@ struct binder_lru_page {
*/
struct binder_alloc {
struct mutex mutex;
struct vm_area_struct *vma;
unsigned long vma_addr;
struct mm_struct *vma_vm_mm;
void __user *buffer;
struct list_head buffers;
@@ -287,7 +287,7 @@ void binder_selftest_alloc(struct binder_alloc *alloc)
if (!binder_selftest_run)
return;
mutex_lock(&binder_selftest_lock);
if (!binder_selftest_run || !alloc->vma)
if (!binder_selftest_run || !alloc->vma_addr)
goto done;
pr_info("STARTED\n");
binder_selftest_alloc_offset(alloc, end_offset, 0);
@@ -63,12 +63,6 @@ static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp)

bool zcomp_available_algorithm(const char *comp)
{
int i;

i = sysfs_match_string(backends, comp);
if (i >= 0)
return true;

/*
* Crypto does not ignore a trailing new line symbol,
* so make sure you don't supply a string containing
@@ -217,6 +211,11 @@ struct zcomp *zcomp_create(const char *compress)
struct zcomp *comp;
int error;

/*
* Crypto API will execute /sbin/modprobe if the compression module
* is not loaded yet. We must do it here, otherwise we are about to
* call /sbin/modprobe under CPU hot-plug lock.
*/
if (!zcomp_available_algorithm(compress))
return ERR_PTR(-EINVAL);
@@ -22,6 +22,8 @@
* @private: dax driver private data
* @flags: state and boolean properties
* @ops: operations for this device
* @holder_data: holder of a dax_device: could be filesystem or mapped device
* @holder_ops: operations for the inner holder
*/
struct dax_device {
struct inode inode;
@@ -29,6 +31,8 @@ struct dax_device {
void *private;
unsigned long flags;
const struct dax_operations *ops;
void *holder_data;
const struct dax_holder_operations *holder_ops;
};

static dev_t dax_devt;
@@ -71,8 +75,11 @@ EXPORT_SYMBOL_GPL(dax_remove_host);
* fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
* @bdev: block device to find a dax_device for
* @start_off: returns the byte offset into the dax_device that @bdev starts
* @holder: filesystem or mapped device inside the dax_device
* @ops: operations for the inner holder
*/
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
void *holder, const struct dax_holder_operations *ops)
{
struct dax_device *dax_dev;
u64 part_size;
@@ -92,11 +99,26 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
dax_dev = NULL;
else if (holder) {
if (!cmpxchg(&dax_dev->holder_data, NULL, holder))
dax_dev->holder_ops = ops;
else
dax_dev = NULL;
}
dax_read_unlock(id);

return dax_dev;
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);

void fs_put_dax(struct dax_device *dax_dev, void *holder)
{
if (dax_dev && holder &&
cmpxchg(&dax_dev->holder_data, holder, NULL) == holder)
dax_dev->holder_ops = NULL;
put_dax(dax_dev);
}
EXPORT_SYMBOL_GPL(fs_put_dax);
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */

enum dax_device_flags {
@@ -204,6 +226,29 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
}
EXPORT_SYMBOL_GPL(dax_recovery_write);

int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off,
u64 len, int mf_flags)
{
int rc, id;

id = dax_read_lock();
if (!dax_alive(dax_dev)) {
rc = -ENXIO;
goto out;
}

if (!dax_dev->holder_ops) {
rc = -EOPNOTSUPP;
goto out;
}

rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags);
out:
dax_read_unlock(id);
return rc;
}
EXPORT_SYMBOL_GPL(dax_holder_notify_failure);

#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
@@ -277,8 +322,15 @@ void kill_dax(struct dax_device *dax_dev)
if (!dax_dev)
return;

if (dax_dev->holder_data != NULL)
dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);

clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
synchronize_srcu(&dax_srcu);

/* clear holder data */
dax_dev->holder_ops = NULL;
dax_dev->holder_data = NULL;
}
EXPORT_SYMBOL_GPL(kill_dax);

@@ -420,6 +472,19 @@ void put_dax(struct dax_device *dax_dev)
}
EXPORT_SYMBOL_GPL(put_dax);

/**
* dax_holder() - obtain the holder of a dax device
* @dax_dev: a dax_device instance
*
* Return: the holder's data which represents the holder if registered,
* otherwize NULL.
*/
void *dax_holder(struct dax_device *dax_dev)
{
return dax_dev->holder_data;
}
EXPORT_SYMBOL_GPL(dax_holder);

/**
* inode_dax: convert a public inode into its dax_dev
* @inode: An inode with i_cdev pointing to a dax_dev
@@ -484,7 +484,6 @@ static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file)
{
struct dma_buf *dmabuf = file->private_data;

seq_printf(m, "size:\t%zu\n", dmabuf->size);
/* Don't count the temporary reference taken inside procfs seq_show */
seq_printf(m, "count:\t%ld\n", file_count(dmabuf->file) - 1);
seq_printf(m, "exp_name:\t%s\n", dmabuf->exp_name);
@@ -57,7 +57,7 @@ static unsigned long __initdata mem_reserve = EFI_INVALID_TABLE_ADDR;
static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR;

struct mm_struct efi_mm = {
.mm_rb = RB_ROOT,
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock),
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
.write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq),
@@ -671,13 +671,15 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
migrate.vma = vma;
migrate.start = start;
migrate.end = end;
migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
if (adev->gmc.xgmi.connected_to_cpu)
migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
else
migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

buf = kvcalloc(npages,
2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t),
GFP_KERNEL);

if (!buf)
goto out;

@@ -947,7 +949,7 @@ int svm_migrate_init(struct amdgpu_device *adev)
{
struct kfd_dev *kfddev = adev->kfd.dev;
struct dev_pagemap *pgmap;
struct resource *res;
struct resource *res = NULL;
unsigned long size;
void *r;

@@ -962,28 +964,34 @@ int svm_migrate_init(struct amdgpu_device *adev)
* should remove reserved size
*/
size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
if (IS_ERR(res))
return -ENOMEM;
if (adev->gmc.xgmi.connected_to_cpu) {
pgmap->range.start = adev->gmc.aper_base;
pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1;
pgmap->type = MEMORY_DEVICE_COHERENT;
} else {
res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
if (IS_ERR(res))
return -ENOMEM;
pgmap->range.start = res->start;
pgmap->range.end = res->end;
pgmap->type = MEMORY_DEVICE_PRIVATE;
}

pgmap->type = MEMORY_DEVICE_PRIVATE;
pgmap->nr_range = 1;
pgmap->range.start = res->start;
pgmap->range.end = res->end;
pgmap->ops = &svm_migrate_pgmap_ops;
pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

pgmap->flags = 0;
/* Device manager releases device-specific resources, memory region and
* pgmap when driver disconnects from device.
*/
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");

/* Disable SVM support capability */
pgmap->type = 0;
devm_release_mem_region(adev->dev, res->start, resource_size(res));
if (pgmap->type == MEMORY_DEVICE_PRIVATE)
devm_release_mem_region(adev->dev, res->start,
res->end - res->start + 1);
return PTR_ERR(r);
}
@@ -426,7 +426,8 @@ void i915_gem_driver_register__shrinker(struct drm_i915_private *i915)
i915->mm.shrinker.count_objects = i915_gem_shrinker_count;
i915->mm.shrinker.seeks = DEFAULT_SEEKS;
i915->mm.shrinker.batch = 4096;
drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker));
drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker,
"drm-i915_gem"));

i915->mm.oom_notifier.notifier_call = i915_gem_shrinker_oom;
drm_WARN_ON(&i915->drm, register_oom_notifier(&i915->mm.oom_notifier));
@@ -426,12 +426,11 @@ static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = {
static int
probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len)
{
const unsigned long end = addr + len;
VMA_ITERATOR(vmi, mm, addr);
struct vm_area_struct *vma;
int ret = -EFAULT;

mmap_read_lock(mm);
for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
for_each_vma_range(vmi, vma, addr + len) {
/* Check for holes, note that we also update the addr below */
if (vma->vm_start > addr)
break;
@@ -439,16 +438,13 @@ probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len)
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
break;

if (vma->vm_end >= end) {
ret = 0;
break;
}

addr = vma->vm_end;
}
mmap_read_unlock(mm);

return ret;
if (vma)
return -EFAULT;
return 0;
}

/*
@@ -221,7 +221,7 @@ void msm_gem_shrinker_init(struct drm_device *dev)
priv->shrinker.count_objects = msm_gem_shrinker_count;
priv->shrinker.scan_objects = msm_gem_shrinker_scan;
priv->shrinker.seeks = DEFAULT_SEEKS;
WARN_ON(register_shrinker(&priv->shrinker));
WARN_ON(register_shrinker(&priv->shrinker, "drm-msm_gem"));

priv->vmap_notifier.notifier_call = msm_gem_shrinker_vmap;
WARN_ON(register_vmap_purge_notifier(&priv->vmap_notifier));
@@ -103,7 +103,7 @@ void panfrost_gem_shrinker_init(struct drm_device *dev)
pfdev->shrinker.count_objects = panfrost_gem_shrinker_count;
pfdev->shrinker.scan_objects = panfrost_gem_shrinker_scan;
pfdev->shrinker.seeks = DEFAULT_SEEKS;
WARN_ON(register_shrinker(&pfdev->shrinker));
WARN_ON(register_shrinker(&pfdev->shrinker, "drm-panfrost"));
}

/**
@@ -722,7 +722,7 @@ int ttm_pool_mgr_init(unsigned long num_pages)
mm_shrinker.count_objects = ttm_pool_shrinker_count;
mm_shrinker.scan_objects = ttm_pool_shrinker_scan;
mm_shrinker.seeks = 1;
return register_shrinker(&mm_shrinker);
return register_shrinker(&mm_shrinker, "drm-ttm_pool");
}

/**
@@ -671,12 +671,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as,
* allocate page in a sleeping context if GFP flags permit. Hence
* spinlock needs to be unlocked and re-locked after allocation.
*/
if (!(gfp & __GFP_ATOMIC))
if (gfp & __GFP_DIRECT_RECLAIM)
spin_unlock_irqrestore(&as->lock, *flags);

page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO);

if (!(gfp & __GFP_ATOMIC))
if (gfp & __GFP_DIRECT_RECLAIM)
spin_lock_irqsave(&as->lock, *flags);

/*
@@ -812,7 +812,7 @@ int bch_btree_cache_alloc(struct cache_set *c)
c->shrink.seeks = 4;
c->shrink.batch = c->btree_pages * 2;

if (register_shrinker(&c->shrink))
if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid))
pr_warn("bcache: %s: could not register shrinker\n",
__func__);
@@ -1806,7 +1806,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
c->shrinker.scan_objects = dm_bufio_shrink_scan;
c->shrinker.seeks = 1;
c->shrinker.batch = 0;
r = register_shrinker(&c->shrinker);
r = register_shrinker(&c->shrinker, "md-%s:(%u:%u)", slab_name,
MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
if (r)
goto bad;
@@ -2944,7 +2944,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;

/* Metadata cache shrinker */
ret = register_shrinker(&zmd->mblk_shrinker);
ret = register_shrinker(&zmd->mblk_shrinker, "md-meta:(%u:%u)",
MAJOR(dev->bdev->bd_dev),
MINOR(dev->bdev->bd_dev));
if (ret) {
dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
goto err;
@@ -758,7 +758,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
}

td->dm_dev.bdev = bdev;
td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
return 0;
}
@@ -7414,7 +7414,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->shrinker.count_objects = raid5_cache_count;
conf->shrinker.batch = 128;
conf->shrinker.flags = 0;
ret = register_shrinker(&conf->shrinker);
ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev));
if (ret) {
pr_warn("md/raid:%s: couldn't register shrinker.\n",
mdname(mddev));
@@ -280,22 +280,6 @@ void cxl_handle_fault(struct work_struct *fault_work)
mmput(mm);
}

static void cxl_prefault_one(struct cxl_context *ctx, u64 ea)
{
struct mm_struct *mm;

mm = get_mem_context(ctx);
if (mm == NULL) {
pr_devel("cxl_prefault_one unable to get mm %i\n",
pid_nr(ctx->pid));
return;
}

cxl_fault_segment(ctx, mm, ea);

mmput(mm);
}

static u64 next_segment(u64 ea, u64 vsid)
{
if (vsid & SLB_VSID_B_1T)
@@ -306,23 +290,16 @@ static u64 next_segment(u64 ea, u64 vsid)
return ea + 1;
}

static void cxl_prefault_vma(struct cxl_context *ctx)
static void cxl_prefault_vma(struct cxl_context *ctx, struct mm_struct *mm)
{
u64 ea, last_esid = 0;
struct copro_slb slb;
VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int rc;
struct mm_struct *mm;

mm = get_mem_context(ctx);
if (mm == NULL) {
pr_devel("cxl_prefault_vm unable to get mm %i\n",
pid_nr(ctx->pid));
return;
}

mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
for_each_vma(vmi, vma) {
for (ea = vma->vm_start; ea < vma->vm_end;
ea = next_segment(ea, slb.vsid)) {
rc = copro_calculate_slb(mm, ea, &slb);
@@ -337,20 +314,28 @@ static void cxl_prefault_vma(struct cxl_context *ctx)
}
}
mmap_read_unlock(mm);

mmput(mm);
}

void cxl_prefault(struct cxl_context *ctx, u64 wed)
{
struct mm_struct *mm = get_mem_context(ctx);

if (mm == NULL) {
pr_devel("cxl_prefault unable to get mm %i\n",
pid_nr(ctx->pid));
return;
}

switch (ctx->afu->prefault_mode) {
case CXL_PREFAULT_WED:
cxl_prefault_one(ctx, wed);
cxl_fault_segment(ctx, mm, wed);
break;
case CXL_PREFAULT_ALL:
cxl_prefault_vma(ctx);
cxl_prefault_vma(ctx, mm);
break;
default:
break;
}

mmput(mm);
}
@@ -1585,7 +1585,7 @@ static int vmballoon_register_shrinker(struct vmballoon *b)
b->shrinker.count_objects = vmballoon_shrinker_count;
b->shrinker.seeks = DEFAULT_SEEKS;

r = register_shrinker(&b->shrinker);
r = register_shrinker(&b->shrinker, "vmw-balloon");

if (r == 0)
b->shrinker_registered = true;
@@ -453,6 +453,21 @@ static void pmem_release_disk(void *__pmem)
blk_cleanup_disk(pmem->disk);
}

static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap,
unsigned long pfn, unsigned long nr_pages, int mf_flags)
{
struct pmem_device *pmem =
container_of(pgmap, struct pmem_device, pgmap);
u64 offset = PFN_PHYS(pfn) - pmem->phys_addr - pmem->data_offset;
u64 len = nr_pages << PAGE_SHIFT;

return dax_holder_notify_failure(pmem->dax_dev, offset, len, mf_flags);
}

static const struct dev_pagemap_ops fsdax_pagemap_ops = {
.memory_failure = pmem_pagemap_memory_failure,
};

static int pmem_attach_disk(struct device *dev,
struct nd_namespace_common *ndns)
{
@@ -514,6 +529,7 @@ static int pmem_attach_disk(struct device *dev,
pmem->pfn_flags = PFN_DEV;
if (is_nd_pfn(dev)) {
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pfn_sb = nd_pfn->pfn_sb;
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@@ -527,6 +543,7 @@ static int pmem_attach_disk(struct device *dev,
pmem->pgmap.range.end = res->end;
pmem->pgmap.nr_range = 1;
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pmem->pfn_flags |= PFN_MAP;
bb_range = pmem->pgmap.range;
@@ -492,15 +492,18 @@ static bool is_normal_memory(pgprot_t p)
#endif
}

static int __check_mem_type(struct vm_area_struct *vma, unsigned long end)
static int __check_mem_type(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
while (vma && is_normal_memory(vma->vm_page_prot)) {
if (vma->vm_end >= end)
return 0;
vma = vma->vm_next;
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, start);

for_each_vma_range(vmi, vma, end) {
if (!is_normal_memory(vma->vm_page_prot))
return -EINVAL;
}

return -EINVAL;
return 0;
}

int optee_check_mem_type(unsigned long start, size_t num_pages)
@@ -516,8 +519,7 @@ int optee_check_mem_type(unsigned long start, size_t num_pages)
return 0;

mmap_read_lock(mm);
rc = __check_mem_type(find_vma(mm, start),
start + num_pages * PAGE_SIZE);
rc = __check_mem_type(mm, start, start + num_pages * PAGE_SIZE);
mmap_read_unlock(mm);

return rc;
@@ -856,7 +856,7 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb)
vb->shrinker.count_objects = virtio_balloon_shrinker_count;
vb->shrinker.seeks = DEFAULT_SEEKS;

return register_shrinker(&vb->shrinker);
return register_shrinker(&vb->shrinker, "virtio-balloon");
}

static int virtballoon_probe(struct virtio_device *vdev)
@@ -282,7 +282,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
struct page, lru);
struct privcmd_mmap_entry *msg = page_address(page);

vma = find_vma(mm, msg->va);
vma = vma_lookup(mm, msg->va);
rc = -EINVAL;

if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
@@ -305,7 +305,7 @@ static int __init xenbus_probe_backend_init(void)

register_xenstore_notifier(&xenstore_notifier);

if (register_shrinker(&backend_memory_shrinker))
if (register_shrinker(&backend_memory_shrinker, "xen-backend"))
pr_warn("shrinker registration failed\n");

return 0;
@@ -1816,6 +1816,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
error = -EBUSY;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
s->s_id);
btrfs_sb(s)->bdev_holder = fs_type;
if (!strstr(crc32c_impl(), "generic"))
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
@@ -819,8 +819,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
if (retry)
gfp |= __GFP_NOFAIL;

/* The page lock pins the memcg */
memcg = page_memcg(page);
memcg = get_mem_cgroup_from_page(page);
old_memcg = set_active_memcg(memcg);

head = NULL;
@@ -840,6 +839,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
set_bh_page(bh, page, offset);
}
out:
mem_cgroup_put(memcg);
set_active_memcg(old_memcg);
return head;
/*
@@ -1072,30 +1072,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
return vma->vm_end - vma->vm_start;
}

static struct vm_area_struct *first_vma(struct task_struct *tsk,
struct vm_area_struct *gate_vma)
{
struct vm_area_struct *ret = tsk->mm->mmap;

if (ret)
return ret;
return gate_vma;
}

/*
* Helper function for iterating across a vma list. It ensures that the caller
* will visit `gate_vma' prior to terminating the search.
*/
static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
static struct vm_area_struct *coredump_next_vma(struct ma_state *mas,
struct vm_area_struct *vma,
struct vm_area_struct *gate_vma)
{
struct vm_area_struct *ret;

ret = this_vma->vm_next;
if (ret)
return ret;
if (this_vma == gate_vma)
if (gate_vma && (vma == gate_vma))
return NULL;

vma = mas_next(mas, ULONG_MAX);
if (vma)
return vma;
return gate_vma;
}

@@ -1119,9 +1109,10 @@ static void free_vma_snapshot(struct coredump_params *cprm)
*/
static bool dump_vma_snapshot(struct coredump_params *cprm)
{
struct vm_area_struct *vma, *gate_vma;
struct vm_area_struct *gate_vma, *vma = NULL;
struct mm_struct *mm = current->mm;
int i;
MA_STATE(mas, &mm->mm_mt, 0, 0);
int i = 0;

/*
* Once the stack expansion code is fixed to not change VMA bounds
@@ -1141,8 +1132,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
return false;
}

for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma), i++) {
while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) {
struct core_vma_metadata *m = cprm->vma_meta + i;

m->start = vma->vm_start;
@@ -1150,10 +1140,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
m->flags = vma->vm_flags;
m->dump_size = vma_dump_size(vma, cprm->mm_flags);
m->pgoff = vma->vm_pgoff;

m->file = vma->vm_file;
if (m->file)
get_file(m->file);
i++;
}

mmap_write_unlock(mm);
fs/dax.c
@@ -334,13 +334,35 @@ static unsigned long dax_end_pfn(void *entry)
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)

static inline bool dax_mapping_is_cow(struct address_space *mapping)
{
return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
}

/*
* TODO: for reflink+dax we need a way to associate a single page with
* multiple address_space instances at different linear_page_index()
* offsets.
* Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
*/
static inline void dax_mapping_set_cow(struct page *page)
{
if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
/*
* Reset the index if the page was already mapped
* regularly before.
*/
if (page->mapping)
page->index = 1;
page->mapping = (void *)PAGE_MAPPING_DAX_COW;
}
page->index++;
}

/*
* When it is called in dax_insert_entry(), the cow flag will indicate that
* whether this entry is shared by multiple files. If so, set the page->mapping
* FS_DAX_MAPPING_COW, and use page->index as refcount.
*/
static void dax_associate_entry(void *entry, struct address_space *mapping,
struct vm_area_struct *vma, unsigned long address)
struct vm_area_struct *vma, unsigned long address, bool cow)
{
unsigned long size = dax_entry_size(entry), pfn, index;
int i = 0;
@@ -352,9 +374,13 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);

WARN_ON_ONCE(page->mapping);
page->mapping = mapping;
page->index = index + i++;
if (cow) {
dax_mapping_set_cow(page);
} else {
WARN_ON_ONCE(page->mapping);
page->mapping = mapping;
page->index = index + i++;
}
}
}

@@ -370,7 +396,12 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
struct page *page = pfn_to_page(pfn);

WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
if (dax_mapping_is_cow(page->mapping)) {
/* keep the CoW flag if this page is still shared */
if (page->index-- > 0)
continue;
} else
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
page->mapping = NULL;
page->index = 0;
}
@@ -455,6 +486,69 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie)
dax_unlock_entry(&xas, (void *)cookie);
}

/*
* dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
* @mapping: the file's mapping whose entry we want to lock
* @index: the offset within this file
* @page: output the dax page corresponding to this dax entry
*
* Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
* could not be locked.
*/
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
struct page **page)
{
XA_STATE(xas, NULL, 0);
void *entry;

rcu_read_lock();
for (;;) {
entry = NULL;
if (!dax_mapping(mapping))
break;

xas.xa = &mapping->i_pages;
xas_lock_irq(&xas);
xas_set(&xas, index);
entry = xas_load(&xas);
if (dax_is_locked(entry)) {
rcu_read_unlock();
wait_entry_unlocked(&xas, entry);
rcu_read_lock();
continue;
}
if (!entry ||
dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
* Because we are looking for entry from file's mapping
* and index, so the entry may not be inserted for now,
* or even a zero/empty entry. We don't think this is
* an error case. So, return a special value and do
* not output @page.
*/
entry = (void *)~0UL;
} else {
*page = pfn_to_page(dax_to_pfn(entry));
dax_lock_entry(&xas, entry);
}
xas_unlock_irq(&xas);
break;
}
rcu_read_unlock();
return (dax_entry_t)entry;
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
dax_entry_t cookie)
{
XA_STATE(xas, &mapping->i_pages, index);

if (cookie == ~0UL)
return;

dax_unlock_entry(&xas, (void *)cookie);
}

/*
* Find page cache entry at given index. If it is a DAX entry, return it
* with the entry locked. If the page cache doesn't contain an entry at
@@ -735,6 +829,23 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter
return 0;
}

/*
* MAP_SYNC on a dax mapping guarantees dirty metadata is
* flushed on write-faults (non-cow), but not read-faults.
*/
static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
struct vm_area_struct *vma)
{
return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
(iter->iomap.flags & IOMAP_F_DIRTY);
}

static bool dax_fault_is_cow(const struct iomap_iter *iter)
{
return (iter->flags & IOMAP_WRITE) &&
(iter->iomap.flags & IOMAP_F_SHARED);
}

/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
@@ -742,16 +853,19 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
static void *dax_insert_entry(struct xa_state *xas,
struct address_space *mapping, struct vm_fault *vmf,
void *entry, pfn_t pfn, unsigned long flags, bool dirty)
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
const struct iomap_iter *iter, void *entry, pfn_t pfn,
unsigned long flags)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *new_entry = dax_make_entry(pfn, flags);
bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
bool cow = dax_fault_is_cow(iter);

if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@@ -763,11 +877,12 @@ static void *dax_insert_entry(struct xa_state *xas,

xas_reset(xas);
xas_lock_irq(xas);
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;

dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
cow);
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -787,6 +902,9 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

if (cow)
xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);

xas_unlock_irq(xas);
return entry;
}
@@ -931,20 +1049,22 @@ int dax_writeback_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
pfn_t *pfnp)
static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
size_t size, void **kaddr, pfn_t *pfnp)
{
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
int id, rc;
int id, rc = 0;
long length;

id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
DAX_ACCESS, NULL, pfnp);
DAX_ACCESS, kaddr, pfnp);
if (length < 0) {
rc = length;
goto out;
}
if (!pfnp)
goto out_check_addr;
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
@@ -954,11 +1074,71 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
if (length > 1 && !pfn_t_devmap(*pfnp))
goto out;
rc = 0;

out_check_addr:
if (!kaddr)
goto out;
if (!*kaddr)
rc = -EFAULT;
out:
dax_read_unlock(id);
return rc;
}

/**
* dax_iomap_cow_copy - Copy the data from source to destination before write
* @pos: address to do copy from.
* @length: size of copy operation.
* @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
* @srcmap: iomap srcmap
* @daddr: destination address to copy to.
*
* This can be called from two places. Either during DAX write fault (page
* aligned), to copy the length size data to daddr. Or, while doing normal DAX
* write operation, dax_iomap_actor() might call this to do the copy of either
* start or end unaligned address. In the latter case the rest of the copy of
* aligned ranges is taken care by dax_iomap_actor() itself.
*/
static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
const struct iomap *srcmap, void *daddr)
{
loff_t head_off = pos & (align_size - 1);
size_t size = ALIGN(head_off + length, align_size);
loff_t end = pos + length;
loff_t pg_end = round_up(end, align_size);
bool copy_all = head_off == 0 && end == pg_end;
void *saddr = 0;
int ret = 0;

ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
if (ret)
return ret;

if (copy_all) {
ret = copy_mc_to_kernel(daddr, saddr, length);
return ret ? -EIO : 0;
}

/* Copy the head part of the range */
if (head_off) {
ret = copy_mc_to_kernel(daddr, saddr, head_off);
if (ret)
return -EIO;
}

/* Copy the tail part of the range */
if (end < pg_end) {
loff_t tail_off = head_off + length;
loff_t tail_len = pg_end - end;

ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
tail_len);
if (ret)
return -EIO;
}
return 0;
}

/*
* The user has performed a load from a hole in the file. Allocating a new
* page in the file would cause excessive storage usage for workloads with
@@ -966,17 +1146,15 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
* If this page is ever written to we will re-fault and change the mapping to
* point to real DAX storage instead.
*/
static vm_fault_t dax_load_hole(struct xa_state *xas,
struct address_space *mapping, void **entry,
struct vm_fault *vmf)
static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
const struct iomap_iter *iter, void **entry)
{
struct inode *inode = mapping->host;
struct inode *inode = iter->inode;
unsigned long vaddr = vmf->address;
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;

*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
DAX_ZERO_PAGE, false);
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);

ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
@@ -985,7 +1163,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,

#ifdef CONFIG_FS_DAX_PMD
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
const struct iomap *iomap, void **entry)
const struct iomap_iter *iter, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1003,8 +1181,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
goto fallback;

pfn = page_to_pfn_t(zero_page);
*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
DAX_PMD | DAX_ZERO_PAGE, false);
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
DAX_PMD | DAX_ZERO_PAGE);

if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm);
@@ -1037,23 +1215,34 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
const struct iomap *iomap, void **entry)
const struct iomap_iter *iter, void **entry)
{
return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
unsigned int offset, size_t size)
static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
const struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
unsigned offset = offset_in_page(pos);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
void *kaddr;
long ret;

ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL);
if (ret > 0) {
memset(kaddr + offset, 0, size);
dax_flush(dax_dev, kaddr + offset, size);
}
ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
NULL);
if (ret < 0)
return ret;
memset(kaddr + offset, 0, size);
if (srcmap->addr != iomap->addr) {
ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
kaddr);
if (ret < 0)
return ret;
dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
} else
dax_flush(iomap->dax_dev, kaddr + offset, size);
return ret;
}

@@ -1080,7 +1269,7 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
rc = dax_memzero(iter, pos, size);
dax_read_unlock(id);

if (rc < 0)
@@ -1129,15 +1318,17 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
const struct iomap *srcmap = &iomi->srcmap;
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
struct dax_device *dax_dev = iomap->dax_dev;
loff_t end = pos + length, done = 0;
bool write = iov_iter_rw(iter) == WRITE;
ssize_t ret = 0;
size_t xfer;
int id;

if (iov_iter_rw(iter) == READ) {
if (!write) {
end = min(end, i_size_read(iomi->inode));
if (pos >= end)
return 0;
@@ -1146,7 +1337,12 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
return iov_iter_zero(min(length, end - pos), iter);
}

if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
/*
* In DAX mode, enforce either pure overwrites of written extents, or
* writes to unwritten extents as part of a copy-on-write operation.
*/
if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
!(iomap->flags & IOMAP_F_SHARED)))
return -EIO;

/*
@@ -1188,6 +1384,14 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
break;
}

if (write &&
srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
kaddr);
if (ret)
break;
}

map_len = PFN_PHYS(map_len);
kaddr += offset;
map_len -= offset;
@@ -1197,7 +1401,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (recovery)
xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
map_len, iter);
else if (iov_iter_rw(iter) == WRITE)
else if (write)
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
@@ -1267,17 +1471,6 @@ static vm_fault_t dax_fault_return(int error)
return vmf_error(error);
}

/*
* MAP_SYNC on a dax mapping guarantees dirty metadata is
* flushed on write-faults (non-cow), but not read-faults.
*/
static bool dax_fault_is_synchronous(unsigned long flags,
struct vm_area_struct *vma, const struct iomap *iomap)
{
return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
&& (iomap->flags & IOMAP_F_DIRTY);
}

/*
* When handling a synchronous page fault and the inode need a fsync, we can
* insert the PTE/PMD into page tables only after that fsync happened. Skip
@@ -1335,15 +1528,15 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
const struct iomap_iter *iter, pfn_t *pfnp,
struct xa_state *xas, void **entry, bool pmd)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
const struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = &iter->srcmap;
size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap);
bool write = iter->flags & IOMAP_WRITE;
unsigned long entry_flags = pmd ? DAX_PMD : 0;
int err = 0;
pfn_t pfn;
void *kaddr;

if (!pmd && vmf->cow_page)
return dax_fault_cow_page(vmf, iter);
@@ -1352,23 +1545,29 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
if (!write &&
(iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
if (!pmd)
return dax_load_hole(xas, mapping, entry, vmf);
return dax_pmd_load_hole(xas, vmf, iomap, entry);
return dax_load_hole(xas, vmf, iter, entry);
return dax_pmd_load_hole(xas, vmf, iter, entry);
}

if (iomap->type != IOMAP_MAPPED) {
if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
WARN_ON_ONCE(1);
return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
}

err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn);
err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
if (err)
return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);

*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
write && !sync);
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);

if (sync)
if (write &&
srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
if (err)
return dax_fault_return(err);
}

if (dax_fault_is_synchronous(iter, vmf->vma))
return dax_fault_synchronous_pfnp(pfnp, pfn);

/* insert PMD pfn */
@@ -1674,3 +1873,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);

static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
struct iomap_iter *it_dest, u64 len, bool *same)
{
const struct iomap *smap = &it_src->iomap;
const struct iomap *dmap = &it_dest->iomap;
loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
void *saddr, *daddr;
int id, ret;

len = min(len, min(smap->length, dmap->length));

if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
*same = true;
return len;
}

if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
*same = false;
return 0;
}

id = dax_read_lock();
ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
&saddr, NULL);
if (ret < 0)
goto out_unlock;

ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
&daddr, NULL);
if (ret < 0)
goto out_unlock;

*same = !memcmp(saddr, daddr, len);
if (!*same)
len = 0;
dax_read_unlock(id);
return len;

out_unlock:
dax_read_unlock(id);
return -EIO;
}

int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
struct inode *dst, loff_t dstoff, loff_t len, bool *same,
const struct iomap_ops *ops)
{
struct iomap_iter src_iter = {
.inode = src,
.pos = srcoff,
.len = len,
.flags = IOMAP_DAX,
};
struct iomap_iter dst_iter = {
.inode = dst,
.pos = dstoff,
.len = len,
.flags = IOMAP_DAX,
};
int ret;

while ((ret = iomap_iter(&src_iter, ops)) > 0) {
while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
dst_iter.processed = dax_range_compare_iter(&src_iter,
&dst_iter, len, same);
}
if (ret <= 0)
src_iter.processed = ret;
}
return ret;
}

int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags,
const struct iomap_ops *ops)
{
return __generic_remap_file_range_prep(file_in, pos_in, file_out,
pos_out, len, remap_flags, ops);
}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
@@ -255,7 +255,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
dif->bdev = bdev;
dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
NULL, NULL);
}

dif->blocks = le32_to_cpu(dis->blocks);
@@ -720,7 +721,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
}

sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
&sbi->dax_part_off);
&sbi->dax_part_off,
NULL, NULL);
}

err = erofs_read_superblock(sb);
@@ -812,7 +814,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
{
struct erofs_device_info *dif = ptr;

fs_put_dax(dif->dax_dev);
fs_put_dax(dif->dax_dev, NULL);
if (dif->bdev)
blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
erofs_fscache_unregister_cookie(&dif->fscache);
@@ -886,7 +888,7 @@ static void erofs_kill_sb(struct super_block *sb)
return;

erofs_free_dev_context(sbi->devs);
fs_put_dax(sbi->dax_dev);
fs_put_dax(sbi->dax_dev, NULL);
erofs_fscache_unregister_cookie(&sbi->s_fscache);
erofs_fscache_unregister_fs(sb);
kfree(sbi->opt.fsid);
@@ -282,7 +282,7 @@ static struct shrinker erofs_shrinker_info = {

int __init erofs_init_shrinker(void)
{
return register_shrinker(&erofs_shrinker_info);
return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
}

void erofs_exit_shrinker(void)
fs/exec.c
@@ -28,7 +28,6 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
@@ -688,6 +687,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
VMA_ITERATOR(vmi, mm, new_start);
struct vm_area_struct *next;
struct mmu_gather tlb;

BUG_ON(new_start > new_end);
@@ -696,7 +697,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* ensure there are no vmas between where we want to go
* and where we are
*/
if (vma != find_vma(mm, new_start))
if (vma != vma_next(&vmi))
return -EFAULT;

/*
@@ -715,12 +716,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)

lru_add_drain();
tlb_gather_mmu(&tlb, mm);
next = vma_next(&vmi);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
*/
free_pgd_range(&tlb, new_end, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
next ? next->vm_start : USER_PGTABLES_CEILING);
} else {
/*
* otherwise, clean from old_start; this is done to not touch
@@ -729,7 +731,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* for the others its just a little faster.
*/
free_pgd_range(&tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
next ? next->vm_start : USER_PGTABLES_CEILING);
}
tlb_finish_mmu(&tlb);

@@ -1030,8 +1032,6 @@ static int exec_mmap(struct mm_struct *mm)
activate_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
task_unlock(tsk);

if (vfork)
@@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb)
brelse (sbi->s_sbh);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}

@@ -833,7 +833,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_sb_block = sb_block;
sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
NULL, NULL);

spin_lock_init(&sbi->s_lock);
ret = -EINVAL;
@@ -1202,7 +1203,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
failed_mount:
brelse(bh);
failed_sbi:
fs_put_dax(sbi->s_daxdev);
fs_put_dax(sbi->s_daxdev, NULL);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
@@ -1654,7 +1654,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
err = register_shrinker(&sbi->s_es_shrinker);
err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
sbi->s_sb->s_id);
if (err)
goto err4;
@@ -1307,7 +1307,7 @@ static void ext4_put_super(struct super_block *sb)
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
fs_put_dax(sbi->s_daxdev, NULL);
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
@@ -4272,7 +4272,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi)
return;

kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}

@@ -4284,7 +4284,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
if (!sbi)
return NULL;

sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
NULL, NULL);

sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
@@ -4296,7 +4297,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
sbi->s_sb = sb;
return sbi;
err_out:
fs_put_dax(sbi->s_daxdev);
fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
return NULL;
}
@@ -4616,7 +4616,7 @@ static int __init init_f2fs_fs(void)
 	err = f2fs_init_sysfs();
 	if (err)
 		goto free_garbage_collection_cache;
-	err = register_shrinker(&f2fs_shrinker_info);
+	err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker");
 	if (err)
 		goto free_sysfs;
 	err = register_filesystem(&f2fs_fs_type);
@@ -244,15 +244,13 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
 	if (inode_cgwb_enabled(inode)) {
 		struct cgroup_subsys_state *memcg_css;

-		if (page) {
-			memcg_css = mem_cgroup_css_from_page(page);
-			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
-		} else {
-			/* must pin memcg_css, see wb_get_create() */
+		/* must pin memcg_css, see wb_get_create() */
+		if (page)
+			memcg_css = get_mem_cgroup_css_from_page(page);
+		else
 			memcg_css = task_get_css(current, memory_cgrp_id);
-			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
-			css_put(memcg_css);
-		}
+		wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+		css_put(memcg_css);
 	}

 	if (!wb)

@@ -869,16 +867,16 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 	if (!wbc->wb || wbc->no_cgroup_owner)
 		return;

-	css = mem_cgroup_css_from_page(page);
+	css = get_mem_cgroup_css_from_page(page);
 	/* dead cgroups shouldn't contribute to inode ownership arbitration */
 	if (!(css->flags & CSS_ONLINE))
-		return;
+		goto out;

 	id = css->id;

 	if (id == wbc->wb_id) {
 		wbc->wb_bytes += bytes;
-		return;
+		goto out;
 	}

 	if (id == wbc->wb_lcand_id)

@@ -891,6 +889,9 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 		wbc->wb_tcand_bytes += bytes;
 	else
 		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
+
+out:
+	css_put(css);
 }
 EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
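Both writeback hunks above replace mem_cgroup_css_from_page() with get_mem_cgroup_css_from_page(), which these callers treat as returning a css with a reference held, so every exit path now ends in css_put(). A minimal sketch of that acquire/release discipline, grounded only in how the call sites above use the helper; demo_page_css_id() is a placeholder, not a kernel function:

#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm_types.h>

static int demo_page_css_id(struct page *page)
{
	struct cgroup_subsys_state *css;
	int id;

	/* Reference is taken here ... */
	css = get_mem_cgroup_css_from_page(page);
	id = css->id;
	/* ... and must be dropped on every path, hence the new goto out. */
	css_put(css);

	return id;
}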
@@ -2534,7 +2534,7 @@ int __init gfs2_glock_init(void)
 		return -ENOMEM;
 	}

-	ret = register_shrinker(&glock_shrinker);
+	ret = register_shrinker(&glock_shrinker, "gfs2-glock");
 	if (ret) {
 		destroy_workqueue(gfs2_delete_workqueue);
 		destroy_workqueue(glock_workqueue);

@@ -148,7 +148,7 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_trans_cachep)
 		goto fail_cachep8;

-	error = register_shrinker(&gfs2_qd_shrinker);
+	error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd");
 	if (error)
 		goto fail_shrinker;
@@ -1418,7 +1418,8 @@ static journal_t *journal_init_common(struct block_device *bdev,
 	if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL))
 		goto err_cleanup;

-	if (register_shrinker(&journal->j_shrinker)) {
+	if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
+			      MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) {
 		percpu_counter_destroy(&journal->j_checkpoint_jh_count);
 		goto err_cleanup;
 	}
@@ -1217,6 +1217,15 @@ void kfree_link(void *p)
 }
 EXPORT_SYMBOL(kfree_link);

+static const struct address_space_operations anon_aops = {
+	.dirty_folio	= noop_dirty_folio,
+};
+
+bool is_anon_inode(struct inode *inode)
+{
+	return inode->i_mapping->a_ops == &anon_aops;
+}
+
 struct inode *alloc_anon_inode(struct super_block *s)
 {
-	static const struct address_space_operations anon_aops = {
@@ -367,7 +367,7 @@ struct mb_cache *mb_cache_create(int bucket_bits)
 	cache->c_shrink.count_objects = mb_cache_count;
 	cache->c_shrink.scan_objects = mb_cache_scan;
 	cache->c_shrink.seeks = DEFAULT_SEEKS;
-	if (register_shrinker(&cache->c_shrink)) {
+	if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) {
 		kfree(cache->c_hash);
 		kfree(cache);
 		goto err_out;
@@ -1017,15 +1017,16 @@ int __init nfs4_xattr_cache_init(void)
 	if (ret)
 		goto out2;

-	ret = register_shrinker(&nfs4_xattr_cache_shrinker);
+	ret = register_shrinker(&nfs4_xattr_cache_shrinker, "nfs-xattr_cache");
 	if (ret)
 		goto out1;

-	ret = register_shrinker(&nfs4_xattr_entry_shrinker);
+	ret = register_shrinker(&nfs4_xattr_entry_shrinker, "nfs-xattr_entry");
 	if (ret)
 		goto out;

-	ret = register_shrinker(&nfs4_xattr_large_entry_shrinker);
+	ret = register_shrinker(&nfs4_xattr_large_entry_shrinker,
+				"nfs-xattr_large_entry");
 	if (!ret)
 		return 0;
@@ -149,7 +149,7 @@ int __init register_nfs_fs(void)
 	ret = nfs_register_sysctl();
 	if (ret < 0)
 		goto error_2;
-	ret = register_shrinker(&acl_shrinker);
+	ret = register_shrinker(&acl_shrinker, "nfs-acl");
 	if (ret < 0)
 		goto error_3;
 #ifdef CONFIG_NFS_V4_2
@@ -670,7 +670,7 @@ nfsd_file_cache_init(void)
 		goto out_err;
 	}

-	ret = register_shrinker(&nfsd_file_shrinker);
+	ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache");
 	if (ret) {
 		pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
 		goto out_lru;
@@ -176,7 +176,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
 	nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
 	nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
 	nn->nfsd_reply_cache_shrinker.seeks = 1;
-	status = register_shrinker(&nn->nfsd_reply_cache_shrinker);
+	status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
+				   "nfsd-reply:%s", nn->nfsd_name);
 	if (status)
 		goto out_stats_destroy;
@@ -453,8 +453,12 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns,
 leave:
 	if (status < 0 && did_quota_inode)
 		dquot_free_inode(inode);
-	if (handle)
+	if (handle) {
+		if (status < 0 && new_fe_bh != NULL)
+			ocfs2_set_links_count((struct ocfs2_dinode *)
+					new_fe_bh->b_data, 0);
 		ocfs2_commit_trans(osb, handle);
+	}

 	ocfs2_inode_unlock(dir, 1);
 	if (did_block_signals)

@@ -598,6 +602,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 leave:
 	if (status < 0) {
 		if (*new_fe_bh) {
+			if (fe)
+				ocfs2_set_links_count(fe, 0);
 			brelse(*new_fe_bh);
 			*new_fe_bh = NULL;
 		}

@@ -634,7 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
 				      parent_fe_bh, handle, inode_ac,
 				      fe_blkno, suballoc_loc, suballoc_bit);
-	if (status < 0) {
+	if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags &
+			    OCFS2_LOCK_INITIALIZED)) {
 		u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
 		int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
 				inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);

@@ -2027,8 +2034,12 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns,
 					ocfs2_clusters_to_bytes(osb->sb, 1));
 	if (status < 0 && did_quota_inode)
 		dquot_free_inode(inode);
-	if (handle)
+	if (handle) {
+		if (status < 0 && new_fe_bh != NULL)
+			ocfs2_set_links_count((struct ocfs2_dinode *)
+					new_fe_bh->b_data, 0);
 		ocfs2_commit_trans(osb, handle);
+	}

 	ocfs2_inode_unlock(dir, 1);
 	if (did_block_signals)

@@ -2489,6 +2500,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
 }

 int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 struct buffer_head **dir_bh,
 				 int mode,
 				 struct inode **new_inode)
 {

@@ -2597,13 +2609,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,

 	brelse(new_di_bh);

-	if (!status)
-		*new_inode = inode;
-
 	ocfs2_free_dir_lookup_result(&orphan_insert);

-	ocfs2_inode_unlock(dir, 1);
-	brelse(parent_di_bh);
+	if (!status) {
+		*new_inode = inode;
+		*dir_bh = parent_di_bh;
+	} else {
+		ocfs2_inode_unlock(dir, 1);
+		brelse(parent_di_bh);
+	}

 	return status;
 }
@@ -2760,11 +2775,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
 }

 int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct buffer_head *dir_bh,
 				   struct inode *inode,
 				   struct dentry *dentry)
 {
 	int status = 0;
-	struct buffer_head *parent_di_bh = NULL;
 	handle_t *handle = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_dinode *dir_di, *di;

@@ -2778,14 +2793,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 		   (unsigned long long)OCFS2_I(dir)->ip_blkno,
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno);

-	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
-	if (status < 0) {
-		if (status != -ENOENT)
-			mlog_errno(status);
-		return status;
-	}
-
-	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+	dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
 	if (!dir_di->i_links_count) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;

@@ -2798,7 +2806,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 		goto leave;

 	/* get a spot inside the dir. */
-	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+	status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
 					      dentry->d_name.name,
 					      dentry->d_name.len, &lookup);
 	if (status < 0) {

@@ -2862,7 +2870,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 	ocfs2_journal_dirty(handle, di_bh);

 	status = ocfs2_add_entry(handle, dentry, inode,
-				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
+				 OCFS2_I(inode)->ip_blkno, dir_bh,
 				 &lookup);
 	if (status < 0) {
 		mlog_errno(status);

@@ -2886,10 +2894,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 	iput(orphan_dir_inode);
 leave:

-	ocfs2_inode_unlock(dir, 1);
-
 	brelse(di_bh);
-	brelse(parent_di_bh);
 	brelse(orphan_dir_bh);

 	ocfs2_free_dir_lookup_result(&lookup);
@@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 		     struct buffer_head *orphan_dir_bh,
 		     bool dio);
 int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 struct buffer_head **dir_bh,
 				 int mode,
 				 struct inode **new_inode);
 int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,

@@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
 		struct inode *inode, struct buffer_head *di_bh,
 		int update_isize, loff_t end);
 int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct buffer_head *dir_bh,
 				   struct inode *new_inode,
 				   struct dentry *new_dentry);
@@ -4222,7 +4222,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 {
 	int error, had_lock;
 	struct inode *inode = d_inode(old_dentry);
-	struct buffer_head *old_bh = NULL;
+	struct buffer_head *old_bh = NULL, *dir_bh = NULL;
 	struct inode *new_orphan_inode = NULL;
 	struct ocfs2_lock_holder oh;

@@ -4230,7 +4230,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 		return -EOPNOTSUPP;

-	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
 					     &new_orphan_inode);
 	if (error) {
 		mlog_errno(error);

@@ -4276,13 +4276,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,

 	/* If the security isn't preserved, we need to re-initialize them. */
 	if (!preserve) {
-		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+		error = ocfs2_init_security_and_acl(dir, dir_bh,
+						    new_orphan_inode,
 						    &new_dentry->d_name);
 		if (error)
 			mlog_errno(error);
 	}
 	if (!error) {
-		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+		error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
+						       new_orphan_inode,
 						       new_dentry);
 		if (error)
 			mlog_errno(error);

@@ -4300,6 +4302,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 		iput(new_orphan_inode);
 	}

+	if (dir_bh) {
+		ocfs2_inode_unlock(dir, 1);
+		brelse(dir_bh);
+	}
+
 	return error;
 }
@@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
 /*
  * Initialize security and acl for a already created inode.
  * Used for reflink a non-preserve-security file.
- *
- * It uses common api like ocfs2_xattr_set, so the caller
- * must not hold any lock expect i_rwsem.
  */
 int ocfs2_init_security_and_acl(struct inode *dir,
+				struct buffer_head *dir_bh,
 				struct inode *inode,
 				const struct qstr *qstr)
 {
 	int ret = 0;
-	struct buffer_head *dir_bh = NULL;

 	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
 	if (ret) {

@@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
 		goto leave;
 	}

-	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
-	if (ret) {
-		mlog_errno(ret);
-		goto leave;
-	}
 	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
 	if (ret)
 		mlog_errno(ret);

-	ocfs2_inode_unlock(dir, 0);
-	brelse(dir_bh);
 leave:
 	return ret;
 }

@@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
 			 struct buffer_head *new_bh,
 			 bool preserve_security);
 int ocfs2_init_security_and_acl(struct inode *dir,
+				struct buffer_head *dir_bh,
 				struct inode *inode,
 				const struct qstr *qstr);
 #endif /* OCFS2_XATTR_H */
@@ -69,7 +69,6 @@
 #include <linux/sched/cputime.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
-#include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/hugetlb.h>

@@ -2322,6 +2322,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
 	GENRADIX(struct map_files_info) fa;
 	struct map_files_info *p;
 	int ret;
+	MA_STATE(mas, NULL, 0, 0);

 	genradix_init(&fa);

@@ -2349,6 +2350,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
 	}

 	nr_files = 0;
+	mas.tree = &mm->mm_mt;

 	/*
 	 * We need two passes here:

@@ -2360,7 +2362,8 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
 	 * routine might require mmap_lock taken in might_fault().
 	 */

-	for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+	pos = 2;
+	mas_for_each(&mas, vma, ULONG_MAX) {
 		if (!vma->vm_file)
 			continue;
 		if (++pos <= ctx->pos)

fs/proc/fd.c
@@ -23,6 +23,7 @@ static int seq_show(struct seq_file *m, void *v)
 	struct files_struct *files = NULL;
 	int f_flags = 0, ret = -ENOENT;
 	struct file *file = NULL;
+	struct inode *inode = NULL;
 	struct task_struct *task;

 	task = get_proc_task(m->private);

@@ -54,10 +55,19 @@ static int seq_show(struct seq_file *m, void *v)
 	if (ret)
 		return ret;

-	seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n",
-		   (long long)file->f_pos, f_flags,
-		   real_mount(file->f_path.mnt)->mnt_id,
-		   file_inode(file)->i_ino);
+	inode = file_inode(file);
+
+	seq_printf(m, "pos:\t%lli\n", (long long)file->f_pos);
+	seq_printf(m, "flags:\t0%o\n", f_flags);
+	seq_printf(m, "mnt_id:\t%i\n", real_mount(file->f_path.mnt)->mnt_id);
+	seq_printf(m, "ino:\t%lu\n", inode->i_ino);
+	seq_printf(m, "size:\t%lli\n", (long long)inode->i_size);
+
+	if (is_anon_inode(inode)) {
+		seq_puts(m, "path:\t");
+		seq_file_path(m, file, "\n");
+		seq_putc(m, '\n');
+	}

 	/* show_fd_locks() never deferences files so a stale value is safe */
 	show_fd_locks(m, file, files);
@@ -26,8 +26,6 @@
 #include <linux/mount.h>
 #include <linux/bug.h>

-#include <linux/uaccess.h>
-
 #include "internal.h"

 static void proc_evict_inode(struct inode *inode)
@@ -285,7 +285,7 @@ struct proc_maps_private {
 	struct task_struct *task;
 	struct mm_struct *mm;
 #ifdef CONFIG_MMU
-	struct vm_area_struct *tail_vma;
+	struct vma_iterator iter;
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *task_mempolicy;
@@ -15,7 +15,6 @@
 #include <linux/fs.h>
 #include <linux/syslog.h>

-#include <linux/uaccess.h>
 #include <asm/io.h>

 extern wait_queue_head_t log_wait;

@@ -21,7 +21,6 @@
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
 #include <linux/vmalloc.h>
-#include <linux/uaccess.h>
 #include <asm/tlb.h>
 #include <asm/div64.h>
 #include "internal.h"

@@ -8,9 +8,6 @@
  *
  *  proc net directory handling functions
  */
-
-#include <linux/uaccess.h>
-
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>

@@ -4,8 +4,6 @@
  *
  *  Copyright 1997, Theodore Ts'o
  */
-
-#include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/errno.h>

@@ -6,9 +6,6 @@
  *
  *  proc root directory handling functions
  */
-
-#include <linux/uaccess.h>
-
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/pagewalk.h>
-#include <linux/vmacache.h>
 #include <linux/mm_inline.h>
 #include <linux/hugetlb.h>
 #include <linux/huge_mm.h>

@@ -124,12 +123,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
 }
 #endif

+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+						loff_t *ppos)
+{
+	struct vm_area_struct *vma = vma_next(&priv->iter);
+
+	if (vma) {
+		*ppos = vma->vm_start;
+	} else {
+		*ppos = -2UL;
+		vma = get_gate_vma(priv->mm);
+	}
+
+	return vma;
+}
+
 static void *m_start(struct seq_file *m, loff_t *ppos)
 {
 	struct proc_maps_private *priv = m->private;
 	unsigned long last_addr = *ppos;
 	struct mm_struct *mm;
-	struct vm_area_struct *vma;

 	/* See m_next(). Zero at the start or after lseek. */
 	if (last_addr == -1UL)

@@ -153,31 +166,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 		return ERR_PTR(-EINTR);
 	}

+	vma_iter_init(&priv->iter, mm, last_addr);
 	hold_task_mempolicy(priv);
-	priv->tail_vma = get_gate_vma(mm);
+	if (last_addr == -2UL)
+		return get_gate_vma(mm);

-	vma = find_vma(mm, last_addr);
-	if (vma)
-		return vma;
-
-	return priv->tail_vma;
+	return proc_get_vma(priv, ppos);
 }

 static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
 {
-	struct proc_maps_private *priv = m->private;
-	struct vm_area_struct *next, *vma = v;
-
-	if (vma == priv->tail_vma)
-		next = NULL;
-	else if (vma->vm_next)
-		next = vma->vm_next;
-	else
-		next = priv->tail_vma;
-
-	*ppos = next ? next->vm_start : -1UL;
-
-	return next;
+	if (*ppos == -2UL) {
+		*ppos = -1UL;
+		return NULL;
+	}
+	return proc_get_vma(m->private, ppos);
 }

 static void m_stop(struct seq_file *m, void *v)
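The proc hunks above and below replace vma->vm_next and rb-tree walks with the VMA iterator over the maple tree: VMA_ITERATOR()/for_each_vma() for a full walk, vma_iter_init() plus vma_next() for resumable iteration, and mas_for_each() at the raw maple-state level. A minimal sketch of the common pattern, assuming only the iterator API used in these hunks; demo_count_vmas() is a placeholder, not a kernel function:

#include <linux/mm.h>
#include <linux/mm_types.h>

static unsigned long demo_count_vmas(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma;
	unsigned long nr = 0;

	mmap_read_lock(mm);		/* iteration still requires the mmap lock */
	for_each_vma(vmi, vma)
		nr++;
	mmap_read_unlock(mm);

	return nr;
}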
@@ -406,6 +409,7 @@ struct mem_size_stats {
 	u64 pss_anon;
 	u64 pss_file;
 	u64 pss_shmem;
+	u64 pss_dirty;
 	u64 pss_locked;
 	u64 swap_pss;
 };

@@ -427,6 +431,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
 		mss->pss_locked += pss;

 	if (dirty || PageDirty(page)) {
+		mss->pss_dirty += pss;
 		if (private)
 			mss->private_dirty += size;
 		else

@@ -808,6 +813,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
 {
 	SEQ_PUT_DEC("Rss: ", mss->resident);
 	SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
+	SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
 	if (rollup_mode) {
 		/*
 		 * These are meaningful only for smaps_rollup, otherwise two of

@@ -860,7 +866,7 @@ static int show_smap(struct seq_file *m, void *v)
 	__show_smap(m, &mss, false);

 	seq_printf(m, "THPeligible: %d\n",
-		   transparent_hugepage_active(vma));
+		   hugepage_vma_check(vma, vma->vm_flags, true, false));

 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));

@@ -873,16 +879,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
 	struct mem_size_stats mss;
-	struct mm_struct *mm;
+	struct mm_struct *mm = priv->mm;
 	struct vm_area_struct *vma;
-	unsigned long last_vma_end = 0;
+	unsigned long vma_start = 0, last_vma_end = 0;
 	int ret = 0;
+	MA_STATE(mas, &mm->mm_mt, 0, 0);

 	priv->task = get_proc_task(priv->inode);
 	if (!priv->task)
 		return -ESRCH;

-	mm = priv->mm;
 	if (!mm || !mmget_not_zero(mm)) {
 		ret = -ESRCH;
 		goto out_put_task;

@@ -895,8 +901,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 		goto out_put_mm;

 	hold_task_mempolicy(priv);
+	vma = mas_find(&mas, 0);

-	for (vma = priv->mm->mmap; vma;) {
+	if (unlikely(!vma))
+		goto empty_set;
+
+	vma_start = vma->vm_start;
+	do {
 		smap_gather_stats(vma, &mss, 0);
 		last_vma_end = vma->vm_end;

@@ -905,6 +916,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 		 * access it for write request.
 		 */
 		if (mmap_lock_is_contended(mm)) {
+			mas_pause(&mas);
 			mmap_read_unlock(mm);
 			ret = mmap_read_lock_killable(mm);
 			if (ret) {

@@ -948,7 +960,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 			 * contains last_vma_end.
 			 * Iterate VMA' from last_vma_end.
 			 */
-			vma = find_vma(mm, last_vma_end - 1);
+			vma = mas_find(&mas, ULONG_MAX);
 			/* Case 3 above */
 			if (!vma)
 				break;

@@ -962,11 +974,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 				smap_gather_stats(vma, &mss, last_vma_end);
 		}
 		/* Case 2 above */
-		vma = vma->vm_next;
-	}
+	} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);

-	show_vma_header_prefix(m, priv->mm->mmap->vm_start,
-			       last_vma_end, 0, 0, 0, 0);
+empty_set:
+	show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
 	seq_pad(m, ' ');
 	seq_puts(m, "[rollup]\n");

@@ -1259,6 +1270,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		return -ESRCH;
 	mm = get_task_mm(task);
 	if (mm) {
+		MA_STATE(mas, &mm->mm_mt, 0, 0);
 		struct mmu_notifier_range range;
 		struct clear_refs_private cp = {
 			.type = type,

@@ -1278,7 +1290,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		}

 		if (type == CLEAR_REFS_SOFT_DIRTY) {
-			for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			mas_for_each(&mas, vma, ULONG_MAX) {
 				if (!(vma->vm_flags & VM_SOFTDIRTY))
 					continue;
 				vma->vm_flags &= ~VM_SOFTDIRTY;

@@ -1290,8 +1302,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 						0, NULL, mm, 0, -1UL);
 			mmu_notifier_invalidate_range_start(&range);
 		}
-		walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
-				&cp);
+		walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
 		if (type == CLEAR_REFS_SOFT_DIRTY) {
 			mmu_notifier_invalidate_range_end(&range);
 			flush_tlb_mm(mm);

@@ -1792,7 +1803,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 		return NULL;

 	page = vm_normal_page(vma, addr, pte);
-	if (!page)
+	if (!page || is_zone_device_page(page))
 		return NULL;

 	if (PageReserved(page))
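The mem_size_stats/__show_smap() hunks above add a pss_dirty accumulator and print it as a "Pss_Dirty:" line right after "Pss:" in /proc/<pid>/smaps and smaps_rollup. A small userspace sketch that picks the new field out of smaps_rollup on a kernel carrying this change (nothing assumed beyond the text format produced by the code above):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/smaps_rollup", "r");

	if (!f) {
		perror("smaps_rollup");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* lines look like "Pss_Dirty:        1234 kB" */
		if (strncmp(line, "Pss_Dirty:", 10) == 0)
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}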
@@ -20,15 +20,13 @@
  */
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
+	VMA_ITERATOR(vmi, mm, 0);
 	struct vm_area_struct *vma;
 	struct vm_region *region;
-	struct rb_node *p;
 	unsigned long bytes = 0, sbytes = 0, slack = 0, size;

-	mmap_read_lock(mm);
-	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
-		vma = rb_entry(p, struct vm_area_struct, vm_rb);
-
+	mmap_read_lock(mm);
+	for_each_vma(vmi, vma) {
 		bytes += kobjsize(vma);

 		region = vma->vm_region;

@@ -82,15 +80,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)

 unsigned long task_vsize(struct mm_struct *mm)
 {
+	VMA_ITERATOR(vmi, mm, 0);
 	struct vm_area_struct *vma;
-	struct rb_node *p;
 	unsigned long vsize = 0;

 	mmap_read_lock(mm);
-	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
-		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+	for_each_vma(vmi, vma)
 		vsize += vma->vm_end - vma->vm_start;
-	}
 	mmap_read_unlock(mm);
 	return vsize;
 }

@@ -99,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm,
 			 unsigned long *shared, unsigned long *text,
 			 unsigned long *data, unsigned long *resident)
 {
+	VMA_ITERATOR(vmi, mm, 0);
 	struct vm_area_struct *vma;
 	struct vm_region *region;
-	struct rb_node *p;
 	unsigned long size = kobjsize(mm);

 	mmap_read_lock(mm);
-	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
-		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+	for_each_vma(vmi, vma) {
 		size += kobjsize(vma);
 		region = vma->vm_region;
 		if (region) {

@@ -190,17 +185,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
  */
 static int show_map(struct seq_file *m, void *_p)
 {
-	struct rb_node *p = _p;
-
-	return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
+	return nommu_vma_show(m, _p);
 }

 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
 	struct mm_struct *mm;
-	struct rb_node *p;
-	loff_t n = *pos;
+	struct vm_area_struct *vma;
+	unsigned long addr = *pos;
+
+	/* See m_next(). Zero at the start or after lseek. */
+	if (addr == -1UL)
+		return NULL;

 	/* pin the task and mm whilst we play with them */
 	priv->task = get_proc_task(priv->inode);

@@ -216,10 +213,10 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 		return ERR_PTR(-EINTR);
 	}

-	/* start from the Nth VMA */
-	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
-		if (n-- == 0)
-			return p;
+	/* start the next element from addr */
+	vma = find_vma(mm, addr);
+	if (vma)
+		return vma;

 	mmap_read_unlock(mm);
 	mmput(mm);

@@ -242,10 +239,10 @@ static void m_stop(struct seq_file *m, void *_vml)

 static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
 {
-	struct rb_node *p = _p;
+	struct vm_area_struct *vma = _p;

-	(*pos)++;
-	return p ? rb_next(p) : NULL;
+	*pos = vma->vm_end;
+	return find_vma(vma->vm_mm, vma->vm_end);
 }

 static const struct seq_operations proc_pid_maps_ops = {
@@ -25,7 +25,6 @@
 #include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
-#include <linux/uaccess.h>
 #include <linux/uio.h>
 #include <linux/cc_platform.h>
 #include <asm/io.h>
@@ -3002,7 +3002,7 @@ static int __init dquot_init(void)
 	pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
 		" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));

-	if (register_shrinker(&dqcache_shrinker))
+	if (register_shrinker(&dqcache_shrinker, "dquota-cache"))
 		panic("Cannot register dquot shrinker");

 	return 0;
@@ -14,6 +14,7 @@
 #include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/fs.h>
+#include <linux/dax.h>
 #include "internal.h"

 #include <linux/uaccess.h>

@@ -262,9 +263,11 @@ static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff,
  * If there's an error, then the usual negative error code is returned.
  * Otherwise returns 0 with *len set to the request length.
  */
-int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
-				  struct file *file_out, loff_t pos_out,
-				  loff_t *len, unsigned int remap_flags)
+int
+__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				struct file *file_out, loff_t pos_out,
+				loff_t *len, unsigned int remap_flags,
+				const struct iomap_ops *dax_read_ops)
 {
 	struct inode *inode_in = file_inode(file_in);
 	struct inode *inode_out = file_inode(file_out);

@@ -324,8 +327,18 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 	if (remap_flags & REMAP_FILE_DEDUP) {
 		bool is_same = false;

-		ret = vfs_dedupe_file_range_compare(file_in, pos_in,
-				file_out, pos_out, *len, &is_same);
+		if (*len == 0)
+			return 0;
+
+		if (!IS_DAX(inode_in))
+			ret = vfs_dedupe_file_range_compare(file_in, pos_in,
+					file_out, pos_out, *len, &is_same);
+		else if (dax_read_ops)
+			ret = dax_dedupe_file_range_compare(inode_in, pos_in,
+					inode_out, pos_out, *len, &is_same,
+					dax_read_ops);
+		else
+			return -EINVAL;
 		if (ret)
 			return ret;
 		if (!is_same)

@@ -343,6 +356,14 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,

 	return ret;
 }
+
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				  struct file *file_out, loff_t pos_out,
+				  loff_t *len, unsigned int remap_flags)
+{
+	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+					       pos_out, len, remap_flags, NULL);
+}
 EXPORT_SYMBOL(generic_remap_file_range_prep);

 loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
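The remap_range.c hunks above split the prep helper: __generic_remap_file_range_prep() takes an optional iomap_ops table so the dedupe comparison can go through dax_dedupe_file_range_compare() on DAX inodes, while generic_remap_file_range_prep() keeps its old signature and forwards NULL. A sketch of how a caller might choose between them, assuming only the two signatures shown above; demo_remap_prep() and its ops argument are placeholders, not an existing filesystem hook:

#include <linux/fs.h>
#include <linux/iomap.h>

static int demo_remap_prep(struct file *file_in, loff_t pos_in,
			   struct file *file_out, loff_t pos_out,
			   loff_t *len, unsigned int remap_flags,
			   const struct iomap_ops *dax_read_ops)
{
	/* DAX inodes need an iomap_ops table for the dedupe comparison. */
	if (IS_DAX(file_inode(file_in)))
		return __generic_remap_file_range_prep(file_in, pos_in,
						       file_out, pos_out, len,
						       remap_flags,
						       dax_read_ops);

	return generic_remap_file_range_prep(file_in, pos_in, file_out,
					     pos_out, len, remap_flags);
}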
@@ -5,9 +5,9 @@

 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o decompressor.o
+squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o
 squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
-squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
+squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
Some files were not shown because too many files have changed in this diff.