Merge branch 'mm-everything' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

# Conflicts:
#	include/linux/pagevec.h
Stephen Rothwell 2022-06-28 16:54:05 +10:00
commit 5f9df76887
276 changed files with 55123 additions and 4394 deletions


@ -22,6 +22,7 @@ Description:
MMUPageSize: 4 kB
Rss: 884 kB
Pss: 385 kB
Pss_Dirty: 68 kB
Pss_Anon: 301 kB
Pss_File: 80 kB
Pss_Shmem: 4 kB


@ -1667,6 +1667,19 @@
hlt [BUGS=ARM,SH]
hostname= [KNL] Set the hostname (aka UTS nodename).
Format: <string>
This allows setting the system's hostname during early
startup. This sets the name returned by gethostname.
Using this parameter to set the hostname makes it
possible to ensure the hostname is correctly set before
any userspace processes run, avoiding the possibility
that a process may call gethostname before the hostname
has been explicitly set, resulting in the calling
process getting an incorrect result. The string must
not exceed the maximum allowed hostname length (usually
64 characters) and will be truncated otherwise.
hpet= [X86-32,HPET] option to control HPET usage
Format: { enable (default) | disable | force |
verbose }
@ -1722,9 +1735,11 @@
Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
the default is on.
This is not compatible with memory_hotplug.memmap_on_memory.
If both parameters are enabled, hugetlb_free_vmemmap takes
precedence over memory_hotplug.memmap_on_memory.
Note that when memory_hotplug.memmap_on_memory is enabled, the vmemmap
pages may be allocated from the added memory block itself; those vmemmap
pages cannot be optimized even if this feature is enabled. Other vmemmap
pages not allocated from the added memory block itself are not affected.
hung_task_panic=
[KNL] Should the hung task detector generate panics.
@ -3083,10 +3098,12 @@
[KNL,X86,ARM] Boolean flag to enable this feature.
Format: {on | off (default)}
When enabled, runtime hotplugged memory will
allocate its internal metadata (struct pages)
from the hotadded memory which will allow to
hotadd a lot of memory without requiring
additional memory to do so.
allocate its internal metadata (struct pages;
those vmemmap pages cannot be optimized even
if hugetlb_free_vmemmap is enabled) from the
hotadded memory, which allows hotadding a
lot of memory without requiring additional
memory to do so.
This feature is disabled by default because it
has some implication on large (e.g. GB)
allocations in some configurations (e.g. small
@ -3096,10 +3113,6 @@
Note that even when enabled, there are a few cases where
the feature is not effective.
This is not compatible with hugetlb_free_vmemmap. If
both parameters are enabled, hugetlb_free_vmemmap takes
precedence over memory_hotplug.memmap_on_memory.
memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
Format: <integer>
default : 0 <disable>


@ -14,3 +14,4 @@ optimize those.
start
usage
reclaim
lru_sort


@ -0,0 +1,294 @@
.. SPDX-License-Identifier: GPL-2.0
=============================
DAMON-based LRU-lists Sorting
=============================
DAMON-based LRU-lists Sorting (DAMON_LRU_SORT) is a static kernel module that
is aimed to be used for proactive and lightweight data access pattern based
(de)prioritization of pages on their LRU-lists, for making LRU-lists a more
trustworthy data access pattern source.
Where Is Proactive LRU-lists Sorting Required?
==============================================
As page-granularity access checking overhead could be significant on huge
systems, LRU lists are normally not proactively sorted but only partially and
reactively sorted for special events including specific user requests, system
calls and memory pressure. As a result, LRU lists are sometimes not so
perfectly prepared to be used as a trustworthy access pattern source for some
situations, including reclamation target page selection under sudden memory
pressure.
Because DAMON can identify access patterns with best-effort accuracy while
inducing only a user-specified range of overhead, proactively running
DAMON_LRU_SORT could be helpful for making LRU lists a more trustworthy access
pattern source with low and controlled overhead.
How It Works?
=============
DAMON_LRU_SORT finds hot pages (pages of memory regions that show access
rates higher than a user-specified threshold) and cold pages (pages of
memory regions that show no access for a time longer than a
user-specified threshold) using DAMON, and prioritizes hot pages while
deprioritizing cold pages on their LRU-lists. To avoid consuming too much
CPU for the (de)prioritization, a CPU time usage limit can be configured.
Under the limit, it prioritizes the hotter pages and deprioritizes the colder
pages first. System administrators can also configure under what situation
this scheme should automatically be activated and deactivated, using three
memory pressure watermarks.
Its default parameters for the hotness/coldness thresholds and the CPU quota
limit are conservatively chosen. That is, the module under its default
parameters could be widely used without harm for common situations, while
providing a level of benefit for systems having clear hot/cold access patterns
under memory pressure, consuming only a small and limited portion of CPU time.
Interface: Module Parameters
============================
To use this feature, you should first ensure your system is running on a kernel
that is built with ``CONFIG_DAMON_LRU_SORT=y``.
To let sysadmins enable or disable it and tune it for the given system,
DAMON_LRU_SORT utilizes module parameters. That is, you can put
``damon_lru_sort.<parameter>=<value>`` on the kernel boot command line or write
proper values to ``/sys/module/damon_lru_sort/parameters/<parameter>`` files.
Below is a description of each parameter.
enabled
-------
Enable or disable DAMON_LRU_SORT.
You can enable DAMON_LRU_SORT by setting the value of this parameter to ``Y``.
Setting it to ``N`` disables DAMON_LRU_SORT. Note that DAMON_LRU_SORT could do
no real monitoring and LRU-lists sorting due to the watermarks-based activation
condition. Refer to the descriptions of the watermark parameters below for
this.
commit_inputs
-------------
Make DAMON_LRU_SORT read the input parameters again, except ``enabled``.
Input parameters that are updated while DAMON_LRU_SORT is running are not
applied by default. Once this parameter is set to ``Y``, DAMON_LRU_SORT reads
values of the parameters except ``enabled`` again. Once the re-reading is done,
this parameter is set to ``N``. If invalid parameters are found during the
re-reading, DAMON_LRU_SORT will be disabled.
hot_thres_access_freq
---------------------
Access frequency threshold for hot memory region identification, in permil.
If a memory region is accessed at this frequency or higher, DAMON_LRU_SORT
identifies the region as hot and marks it as accessed on the LRU list, so that
it could not be reclaimed under memory pressure. 50% by default.
cold_min_age
------------
Time threshold for cold memory region identification, in microseconds.
If a memory region is not accessed for this time or longer, DAMON_LRU_SORT
identifies the region as cold and marks it as unaccessed on the LRU list, so
that it could be reclaimed first under memory pressure. 120 seconds by
default.
quota_ms
--------
Limit of time for trying the LRU lists sorting in milliseconds.
DAMON_LRU_SORT tries to use only up to this time within a time window
(quota_reset_interval_ms) for trying LRU lists sorting. This can be used
for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the
limit is disabled.
10 ms by default.
quota_reset_interval_ms
-----------------------
The time quota charge reset interval in milliseconds.
The charge reset interval for the quota of time (quota_ms). That is,
DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms
milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds.
1 second by default.
wmarks_interval
---------------
The watermarks check time interval in microseconds.
Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is
enabled but inactive due to its watermarks rule. 5 seconds by default.
wmarks_high
-----------
Free memory rate (per thousand) for the high watermark.
If the free memory of the system, in permil (per thousand bytes), is higher
than this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically
check the watermarks. 200 (20%) by default.
wmarks_mid
----------
Free memory rate (per thousand) for the middle watermark.
If the free memory of the system, in permil (per thousand bytes), is between
this and the low watermark, DAMON_LRU_SORT becomes active, so it starts the
monitoring and the LRU-lists sorting. 150 (15%) by default.
wmarks_low
----------
Free memory rate (per thousand) for the low watermark.
If the free memory of the system, in permil (per thousand bytes), is lower
than this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically
check the watermarks. 50 (5%) by default.
sample_interval
---------------
Sampling interval for the monitoring in microseconds.
The sampling interval of DAMON for the cold memory monitoring. Please refer to
the DAMON documentation (:doc:`usage`) for more detail. 5ms by default.
aggr_interval
-------------
Aggregation interval for the monitoring in microseconds.
The aggregation interval of DAMON for the cold memory monitoring. Please
refer to the DAMON documentation (:doc:`usage`) for more detail. 100ms by
default.
min_nr_regions
--------------
Minimum number of monitoring regions.
The minimal number of monitoring regions of DAMON for the cold memory
monitoring. This can be used to set lower-bound of the monitoring quality.
But, setting this too high could result in increased monitoring overhead.
Please refer to the DAMON documentation (:doc:`usage`) for more detail. 10 by
default.
max_nr_regions
--------------
Maximum number of monitoring regions.
The maximum number of monitoring regions of DAMON for the cold memory
monitoring. This can be used to set upper-bound of the monitoring overhead.
However, setting this too low could result in bad monitoring quality. Please
refer to the DAMON documentation (:doc:`usage`) for more detail. 1000 by
default.
monitor_region_start
--------------------
Start of the target memory region, as a physical address.
The start physical address of the memory region that DAMON_LRU_SORT will work
on. By default, the biggest System RAM region is used.
monitor_region_end
------------------
End of the target memory region, as a physical address.
The end physical address of the memory region that DAMON_LRU_SORT will work
on. By default, the biggest System RAM region is used.
kdamond_pid
-----------
PID of the DAMON thread.
If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. Else,
-1.
nr_lru_sort_tried_hot_regions
-----------------------------
Number of hot memory regions for which LRU-sorting was tried.
bytes_lru_sort_tried_hot_regions
--------------------------------
Total bytes of hot memory regions for which LRU-sorting was tried.
nr_lru_sorted_hot_regions
-------------------------
Number of hot memory regions that were successfully LRU-sorted.
bytes_lru_sorted_hot_regions
----------------------------
Total bytes of hot memory regions that were successfully LRU-sorted.
nr_hot_quota_exceeds
--------------------
Number of times that the time quota limit for hot regions has been exceeded.
nr_lru_sort_tried_cold_regions
------------------------------
Number of cold memory regions for which LRU-sorting was tried.
bytes_lru_sort_tried_cold_regions
---------------------------------
Total bytes of cold memory regions for which LRU-sorting was tried.
nr_lru_sorted_cold_regions
--------------------------
Number of cold memory regions that were successfully LRU-sorted.
bytes_lru_sorted_cold_regions
-----------------------------
Total bytes of cold memory regions that were successfully LRU-sorted.
nr_cold_quota_exceeds
---------------------
Number of times that the time quota limit for cold regions has been exceeded.
Example
=======
The runtime example commands below make DAMON_LRU_SORT find memory regions
having >=50% access frequency and LRU-prioritize them, while LRU-deprioritizing
memory regions that are not accessed for 120 seconds. The prioritization and
deprioritization are limited to using only up to 1% of CPU time, to avoid
DAMON_LRU_SORT consuming too much CPU time for the (de)prioritization. They
also ask DAMON_LRU_SORT to do nothing if the system's free memory rate is more
than 50%, but to start the real work if it becomes lower than 40%. If
DAMON_LRU_SORT doesn't make progress and therefore the free memory rate becomes
lower than 20%, they ask DAMON_LRU_SORT to do nothing again, so that we can fall
back to the LRU-list based page granularity reclamation. ::
# cd /sys/module/damon_lru_sort/parameters
# echo 500 > hot_thres_access_freq
# echo 120000000 > cold_min_age
# echo 10 > quota_ms
# echo 1000 > quota_reset_interval_ms
# echo 500 > wmarks_high
# echo 400 > wmarks_mid
# echo 200 > wmarks_low
# echo Y > enabled


@ -48,12 +48,6 @@ DAMON_RECLAIM utilizes module parameters. That is, you can put
``damon_reclaim.<parameter>=<value>`` on the kernel boot command line or write
proper values to ``/sys/modules/damon_reclaim/parameters/<parameter>`` files.
Note that the parameter values except ``enabled`` are applied only when
DAMON_RECLAIM starts. Therefore, if you want to apply new parameter values in
runtime and DAMON_RECLAIM is already enabled, you should disable and re-enable
it via ``enabled`` parameter file. Writing of the new values to proper
parameter values should be done before the re-enablement.
Below is a description of each parameter.
enabled


@ -264,6 +264,8 @@ that can be written to and read from the file and their meaning are as below.
- ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
- ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
- ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
- ``lru_prio``: Prioritize the region on its LRU lists.
- ``lru_deprio``: Deprioritize the region on its LRU lists.
- ``stat``: Do nothing but count the statistics
schemes/<N>/access_pattern/


@ -36,6 +36,7 @@ the Linux memory management.
numa_memory_policy
numaperf
pagemap
shrinker_debugfs
soft-dirty
swap_numa
transhuge


@ -0,0 +1,135 @@
.. _shrinker_debugfs:
==========================
Shrinker Debugfs Interface
==========================
The shrinker debugfs interface provides visibility into the kernel memory
shrinkers subsystem and allows getting information about individual shrinkers
and interacting with them.
For each shrinker registered in the system, a directory in **<debugfs>/shrinker/**
is created. The directory's name is composed of the shrinker's name and a
unique id: e.g. *kfree_rcu-0* or *sb-xfs:vda1-36*.
Each shrinker directory contains **count** and **scan** files, which allow
triggering the *count_objects()* and *scan_objects()* callbacks for each memcg
and NUMA node (if applicable).
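For orientation, here is a minimal, hedged sketch of the kernel side, assuming a
hypothetical ``my_cache`` with helper functions ``my_cache_nr_objects()`` and
``my_cache_free()``; the name string passed to register_shrinker() (the second
argument introduced by this series) is what the debugfs directory name is
derived from::

#include <linux/shrinker.h>

static unsigned long my_cache_count(struct shrinker *sh,
                                    struct shrink_control *sc)
{
        /* Backs the "count" file: report how many objects could be freed. */
        return my_cache_nr_objects();   /* hypothetical helper */
}

static unsigned long my_cache_scan(struct shrinker *sh,
                                   struct shrink_control *sc)
{
        /* Backs the "scan" file: try to free up to sc->nr_to_scan objects. */
        return my_cache_free(sc->nr_to_scan);   /* hypothetical helper */
}

static struct shrinker my_cache_shrinker = {
        .count_objects = my_cache_count,
        .scan_objects = my_cache_scan,
        .seeks = DEFAULT_SEEKS,
};

static int __init my_cache_init(void)
{
        /* "my-cache" determines the debugfs directory name, e.g. my-cache-<id>. */
        return register_shrinker(&my_cache_shrinker, "my-cache");
}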
Usage:
------
1. *List registered shrinkers*
::
$ cd /sys/kernel/debug/shrinker/
$ ls
dquota-cache-16 sb-devpts-28 sb-proc-47 sb-tmpfs-42
mm-shadow-18 sb-devtmpfs-5 sb-proc-48 sb-tmpfs-43
mm-zspool:zram0-34 sb-hugetlbfs-17 sb-pstore-31 sb-tmpfs-44
rcu-kfree-0 sb-hugetlbfs-33 sb-rootfs-2 sb-tmpfs-49
sb-aio-20 sb-iomem-12 sb-securityfs-6 sb-tracefs-13
sb-anon_inodefs-15 sb-mqueue-21 sb-selinuxfs-22 sb-xfs:vda1-36
sb-bdev-3 sb-nsfs-4 sb-sockfs-8 sb-zsmalloc-19
sb-bpf-32 sb-pipefs-14 sb-sysfs-26 thp-deferred_split-10
sb-btrfs:vda2-24 sb-proc-25 sb-tmpfs-1 thp-zero-9
sb-cgroup2-30 sb-proc-39 sb-tmpfs-27 xfs-buf:vda1-37
sb-configfs-23 sb-proc-41 sb-tmpfs-29 xfs-inodegc:vda1-38
sb-dax-11 sb-proc-45 sb-tmpfs-35
sb-debugfs-7 sb-proc-46 sb-tmpfs-40
2. *Get information about a specific shrinker*
::
$ cd sb-btrfs\:vda2-24/
$ ls
count scan
3. *Count objects*
Each line in the output has the following format::
<cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ...
<cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ...
...
If a cgroup has no objects on any of the NUMA nodes, its line is omitted. If
there are no objects at all, the output might be empty.
If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is printed as the
cgroup inode id. If the shrinker is not NUMA-aware, 0's are printed for all
nodes except the first one.
::
$ cat count
1 224 2
21 98 0
55 818 10
2367 2 0
2401 30 0
225 13 0
599 35 0
939 124 0
1041 3 0
1075 1 0
1109 1 0
1279 60 0
1313 7 0
1347 39 0
1381 3 0
1449 14 0
1483 63 0
1517 53 0
1551 6 0
1585 1 0
1619 6 0
1653 40 0
1687 11 0
1721 8 0
1755 4 0
1789 52 0
1823 888 0
1857 1 0
1925 2 0
1959 32 0
2027 22 0
2061 9 0
2469 799 0
2537 861 0
2639 1 0
2707 70 0
2775 4 0
2877 84 0
293 1 0
735 8 0
4. *Scan objects*
The expected input format::
<cgroup inode id> <numa id> <number of objects to scan>
For a non-memcg-aware shrinker, or on a system with no memory
cgroups, **0** should be passed as the cgroup id.
::
$ cd /sys/kernel/debug/shrinker/
$ cd sb-btrfs\:vda2-24/
$ cat count | head -n 5
1 212 0
21 97 0
55 802 5
2367 2 0
225 13 0
$ echo "55 0 200" > scan
$ cat count | head -n 5
1 212 0
21 96 0
55 752 5
2367 2 0
225 13 0


@ -17,7 +17,10 @@ of the ``PROT_NONE+SIGSEGV`` trick.
Design
======
Userfaults are delivered and resolved through the ``userfaultfd`` syscall.
Userspace creates a new userfaultfd, initializes it, and registers one or more
regions of virtual memory with it. Then, any page faults which occur within the
region(s) result in a message being delivered to the userfaultfd, notifying
userspace of the fault.
The ``userfaultfd`` (aside from registering and unregistering virtual
memory ranges) provides two primary functionalities:
@ -34,12 +37,11 @@ The real advantage of userfaults if compared to regular virtual memory
management of mremap/mprotect is that the userfaults in all their
operations never involve heavyweight structures like vmas (in fact the
``userfaultfd`` runtime load never takes the mmap_lock for writing).
Vmas are not suitable for page- (or hugepage) granular fault tracking
when dealing with virtual address spaces that could span
Terabytes. Too many vmas would be needed for that.
The ``userfaultfd`` once opened by invoking the syscall, can also be
The ``userfaultfd``, once created, can also be
passed using unix domain sockets to a manager process, so the same
manager process could handle the userfaults of a multitude of
different processes without them being aware about what is going on
@ -50,6 +52,38 @@ is a corner case that would currently return ``-EBUSY``).
API
===
Creating a userfaultfd
----------------------
There are two ways to create a new userfaultfd, each of which provides ways to
restrict access to this functionality (since historically userfaultfds which
handle kernel page faults have been a useful tool for exploiting the kernel).
The first way, supported by older kernels, is the userfaultfd(2) syscall.
Access to this is controlled in several ways:
- By default, the userfaultfd will be able to handle kernel page faults. This
can be disabled by passing in UFFD_USER_MODE_ONLY.
- If vm.unprivileged_userfaultfd is 0, then the caller must *either* have
CAP_SYS_PTRACE, or pass in UFFD_USER_MODE_ONLY.
- If vm.unprivileged_userfaultfd is 1, then no particular privilege is needed to
use this syscall, even if UFFD_USER_MODE_ONLY is *not* set.
The second way, added to the kernel more recently, is by opening
/dev/userfaultfd and issuing a USERFAULTFD_IOC_NEW ioctl to it. This method
yields equivalent userfaultfds to the userfaultfd(2) syscall; its benefit is in
how access to creating userfaultfds is controlled.
Access to /dev/userfaultfd is controlled via normal filesystem permissions
(user/group/mode for example), which gives fine grained access to userfaultfd
specifically, without also granting other unrelated privileges at the same time
(as e.g. granting CAP_SYS_PTRACE would do).
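As a rough, hedged illustration (not part of this patch set; it assumes a uapi
header that defines USERFAULTFD_IOC_NEW, error handling is minimal, and the
flag choices are just examples), both creation paths yield an equivalent file
descriptor::

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>

static int create_userfaultfd(void)
{
        int uffd, dev;

        /* Path 1: the userfaultfd(2) syscall, here restricted to user-mode faults. */
        uffd = syscall(__NR_userfaultfd,
                       O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
        if (uffd >= 0)
                return uffd;

        /* Path 2: ask /dev/userfaultfd for a new userfaultfd. */
        dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
        if (dev < 0)
                return -1;
        uffd = ioctl(dev, USERFAULTFD_IOC_NEW, O_CLOEXEC | O_NONBLOCK);
        close(dev);
        return uffd;
}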
Initializing a userfaultfd
--------------------------
When first opened the ``userfaultfd`` must be enabled invoking the
``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or
a later API version) which will specify the ``read/POLLIN`` protocol
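In code, that handshake is a single ioctl (a sketch only, reusing the headers
from the previous example; error handling omitted)::

static int enable_uffd_api(int uffd)
{
        struct uffdio_api api = {
                .api = UFFD_API,        /* request the read/POLLIN protocol */
                .features = 0,          /* optionally request UFFD_FEATURE_* bits */
        };

        /* Must be done once, before any UFFDIO_REGISTER on this uffd. */
        return ioctl(uffd, UFFDIO_API, &api);
}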


@ -565,9 +565,8 @@ See Documentation/admin-guide/mm/hugetlbpage.rst
hugetlb_optimize_vmemmap
========================
This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter)
is configured or the size of 'struct page' (a structure defined in
include/linux/mm_types.h) is not power of two (an unusual system config could
This knob is not available when the size of 'struct page' (a structure defined
in include/linux/mm_types.h) is not a power of two (an unusual system config could
result in this).
Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
@ -928,6 +927,9 @@ calls without any restrictions.
The default value is 0.
An alternative to this sysctl / the userfaultfd(2) syscall is to create
userfaultfds via /dev/userfaultfd. See
Documentation/admin-guide/mm/userfaultfd.rst.
user_reserve_kbytes
===================


@ -36,6 +36,7 @@ Library functionality that is used throughout the kernel.
kref
assoc_array
xarray
maple_tree
idr
circular-buffers
rbtree


@ -0,0 +1,217 @@
.. SPDX-License-Identifier: GPL-2.0+
==========
Maple Tree
==========
:Author: Liam R. Howlett
Overview
========
The Maple Tree is a B-Tree data type which is optimized for storing
non-overlapping ranges, including ranges of size 1. The tree was designed to
be simple to use and does not require a user written search method. It
supports iterating over a range of entries and going to the previous or next
entry in a cache-efficient manner. The tree can also be put into an RCU-safe
mode of operation which allows reading and writing concurrently. Writers must
synchronize on a lock, which can be the default spinlock, or the user can set
the lock to an external lock of a different type.
The Maple Tree maintains a small memory footprint and was designed to use
modern processor cache efficiently. The majority of the users will be able to
use the normal API. An :ref:`maple-tree-advanced-api` exists for more complex
scenarios. The most important usage of the Maple Tree is the tracking of the
virtual memory areas.
The Maple Tree can store values between ``0`` and ``ULONG_MAX``. The Maple
Tree reserves values with the bottom two bits set to '10' which are below 4096
(i.e. 2, 6, 10 .. 4094) for internal use. If the entries may use reserved
entries then the user can convert the entries using xa_mk_value() and convert
them back by calling xa_to_value(). If the user needs to use a reserved
value, then the user can convert the value when using the
:ref:`maple-tree-advanced-api`, but is blocked by the normal API.
The Maple Tree can also be configured to support searching for a gap of a given
size (or larger).
Pre-allocation of nodes is also supported using the
:ref:`maple-tree-advanced-api`. This is useful for users who must guarantee a
successful store operation within a given code segment where allocation cannot
be done. Allocations of nodes are relatively small, at around 256 bytes.
.. _maple-tree-normal-api:
Normal API
==========
Start by initialising a maple tree, either with DEFINE_MTREE() for statically
allocated maple trees or mt_init() for dynamically allocated ones. A
freshly-initialised maple tree contains a ``NULL`` pointer for the range ``0``
- ``ULONG_MAX``. There are currently two types of maple trees supported: the
allocation tree and the regular tree. The regular tree has a higher branching
factor for internal nodes. The allocation tree has a lower branching factor
but allows the user to search for a gap of a given size or larger from either
``0`` upwards or ``ULONG_MAX`` down. An allocation tree can be used by
passing in the ``MT_FLAGS_ALLOC_RANGE`` flag when initialising the tree.
You can then set entries using mtree_store() or mtree_store_range().
mtree_store() will overwrite any entry with the new entry and return 0 on
success or an error code otherwise. mtree_store_range() works in the same way
but takes a range. mtree_load() is used to retrieve the entry stored at a
given index. You can use mtree_erase() to erase an entire range by only
knowing one value within that range, or a call to mtree_store() with an entry
of NULL may be used to partially erase a range or many ranges at once.
If you want to only store a new entry to a range (or index) if that range is
currently ``NULL``, you can use mtree_insert_range() or mtree_insert() which
return -EEXIST if the range is not empty.
You can search for an entry from an index upwards by using mt_find().
You can walk each entry within a range by calling mt_for_each(). You must
provide a temporary variable to store a cursor. If you want to walk each
element of the tree then ``0`` and ``ULONG_MAX`` may be used as the range. If
the caller is going to hold the lock for the duration of the walk then it is
worth looking at the mas_for_each() API in the :ref:`maple-tree-advanced-api`
section.
Sometimes it is necessary to ensure that the next call to store to a maple tree
does not allocate memory; please see :ref:`maple-tree-advanced-api` for this use
case.
Finally, you can remove all entries from a maple tree by calling
mtree_destroy(). If the maple tree entries are pointers, you may wish to free
the entries first.
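As a small, hedged sketch of the normal API (the values stored here are just
xa_mk_value()-encoded integers; any kernel pointer would work as well)::

#include <linux/maple_tree.h>
#include <linux/printk.h>
#include <linux/xarray.h>       /* xa_mk_value() / xa_to_value() */

static DEFINE_MTREE(tree);      /* a statically initialised regular tree */

static int mtree_example(void)
{
        void *entry;
        unsigned long index = 0;
        int ret;

        /* Store one entry at index 5 and one over the range 10-20. */
        ret = mtree_store(&tree, 5, xa_mk_value(5), GFP_KERNEL);
        if (!ret)
                ret = mtree_store_range(&tree, 10, 20, xa_mk_value(10), GFP_KERNEL);
        if (ret)
                return ret;

        entry = mtree_load(&tree, 15);  /* returns the entry stored for 10-20 */

        /* Walk every entry currently in the tree. */
        mt_for_each(&tree, entry, index, ULONG_MAX)
                pr_info("%lu -> %lu\n", index, xa_to_value(entry));

        mtree_erase(&tree, 12);         /* erases the whole 10-20 range */
        mtree_destroy(&tree);
        return 0;
}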
Allocating Nodes
----------------
The allocations are handled by the internal tree code. See
:ref:`maple-tree-advanced-alloc` for other options.
Locking
-------
You do not have to worry about locking. See :ref:`maple-tree-advanced-locks`
for other options.
The Maple Tree uses RCU and an internal spinlock to synchronise access:
Takes RCU read lock:
* mtree_load()
* mt_find()
* mt_for_each()
* mt_next()
* mt_prev()
Takes ma_lock internally:
* mtree_store()
* mtree_store_range()
* mtree_insert()
* mtree_insert_range()
* mtree_erase()
* mtree_destroy()
* mt_set_in_rcu()
* mt_clear_in_rcu()
If you want to take advantage of the internal lock to protect the data
structures that you are storing in the Maple Tree, you can call mtree_lock()
before calling mtree_load(), then take a reference count on the object you
have found before calling mtree_unlock(). This will prevent stores from
removing the object from the tree between looking up the object and
incrementing the refcount. You can also use RCU to avoid dereferencing
freed memory, but an explanation of that is beyond the scope of this
document.
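That lookup-then-refcount pattern, assuming a hypothetical ``struct my_obj``
that embeds a kref, could look like this::

#include <linux/kref.h>
#include <linux/maple_tree.h>

struct my_obj {
        struct kref ref;
        /* ... payload ... */
};

static struct my_obj *lookup_and_get(struct maple_tree *mt, unsigned long index)
{
        struct my_obj *obj;

        mtree_lock(mt);                 /* the tree's internal spinlock */
        obj = mtree_load(mt, index);
        if (obj)
                kref_get(&obj->ref);    /* pin it before a store can remove it */
        mtree_unlock(mt);
        return obj;
}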
.. _maple-tree-advanced-api:
Advanced API
============
The advanced API offers more flexibility and better performance at the
cost of an interface which can be harder to use and has fewer safeguards.
You must take care of your own locking while using the advanced API.
You can use the ma_lock, RCU or an external lock for protection.
You can mix advanced and normal operations on the same array, as long
as the locking is compatible. The :ref:`maple-tree-normal-api` is implemented
in terms of the advanced API.
The advanced API is based around the ma_state; this is where the 'mas'
prefix originates. The ma_state struct keeps track of tree operations to make
life easier for both internal and external tree users.
Initialising the maple tree is the same as in the :ref:`maple-tree-normal-api`.
Please see above.
The maple state keeps track of the range start and end in mas->index and
mas->last, respectively.
mas_walk() will walk the tree to the location of mas->index and set the
mas->index and mas->last according to the range for the entry.
You can set entries using mas_store(). mas_store() will overwrite any entry
with the new entry and return the first existing entry that is overwritten.
The range is passed in as members of the maple state: index and last.
You can use mas_erase() to erase an entire range by setting index and
last of the maple state to the desired range to erase. This will erase
the first range that is found in that range, set the maple state index
and last as the range that was erased and return the entry that existed
at that location.
You can walk each entry within a range by using mas_for_each(). If you want
to walk each element of the tree then ``0`` and ``ULONG_MAX`` may be used as
the range. If the lock needs to be periodically dropped, see the locking
section mas_pause().
Using a maple state allows mas_next() and mas_prev() to function as if the
tree was a linked list. With such a high branching factor the amortized
performance penalty is outweighed by cache optimization. mas_next() will
return the next entry which occurs after the entry at index. mas_prev()
will return the previous entry which occurs before the entry at index.
mas_find() will find the first entry which exists at or above index on
the first call, and the next entry from every subsequent call.
mas_find_rev() will find the first entry which exists at or below the last on
the first call, and the previous entry from every subsequent call.
If the user needs to yield the lock during an operation, then the maple state
must be paused using mas_pause().
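A hedged sketch of the maple state in action (headers as in the earlier Normal
API sketch; mas_store_gfp(), the variant taking gfp flags, is assumed here
instead of plain mas_store())::

static void mas_example(struct maple_tree *mt)
{
        MA_STATE(mas, mt, 0, 0);
        void *entry;

        mas_lock(&mas);                 /* the internal spinlock, taken by hand */

        /* Store one entry covering the range 4096-8191. */
        mas_set_range(&mas, 4096, 8191);
        mas_store_gfp(&mas, xa_mk_value(1), GFP_KERNEL);

        /* Iterate every entry; mas.index/mas.last describe each range found. */
        mas_set(&mas, 0);
        mas_for_each(&mas, entry, ULONG_MAX)
                pr_info("%lu-%lu -> %lu\n", mas.index, mas.last,
                        xa_to_value(entry));

        mas_unlock(&mas);
}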
There are a few extra interfaces provided when using an allocation tree.
If you wish to search for a gap within a range, then mas_empty_area()
or mas_empty_area_rev() can be used. mas_empty_area() searches for a gap
starting at the lowest index given up to the maximum of the range.
mas_empty_area_rev() searches for a gap starting at the highest index given
and continues downward to the lower bound of the range.
.. _maple-tree-advanced-alloc:
Advanced Allocating Nodes
-------------------------
Allocations are usually handled internally to the tree, however if allocations
need to occur before a write occurs then calling mas_expected_entries() will
allocate the worst-case number of needed nodes to insert the provided number of
ranges. This also causes the tree to enter mass insertion mode. Once
insertions are complete calling mas_destroy() on the maple state will free the
unused allocations.
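A hedged sketch of that bulk-insertion pattern, with hypothetical ``first``,
``last`` and ``entries`` arrays describing ``nr`` pre-sorted, non-overlapping
ranges::

static int bulk_insert(struct maple_tree *mt, unsigned long *first,
                       unsigned long *last, void **entries, unsigned long nr)
{
        MA_STATE(mas, mt, 0, 0);
        unsigned long i;
        int ret;

        mas_lock(&mas);
        ret = mas_expected_entries(&mas, nr);   /* worst-case pre-allocation */
        if (!ret) {
                for (i = 0; i < nr; i++) {
                        mas_set_range(&mas, first[i], last[i]);
                        mas_store(&mas, entries[i]);
                }
        }
        mas_destroy(&mas);      /* free any unused pre-allocated nodes */
        mas_unlock(&mas);
        return ret;
}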
.. _maple-tree-advanced-locks:
Advanced Locking
----------------
The maple tree uses a spinlock by default, but external locks can be used for
tree updates as well. To use an external lock, the tree must be initialized
with the ``MT_FLAGS_LOCK_EXTERN`` flag. This is usually done with the
MTREE_INIT_EXT() #define, which takes an external lock as an argument.
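The ``efi_mm`` and ``tboot_mm`` conversions elsewhere in this merge use exactly
that form; a generic, hedged sketch with a spinlock as the external lock::

static DEFINE_SPINLOCK(my_lock);
static struct maple_tree my_tree =
        MTREE_INIT_EXT(my_tree, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN,
                       my_lock);

/* Every update must then be made while holding my_lock. */
static int ext_lock_store(unsigned long first, unsigned long last, void *entry)
{
        MA_STATE(mas, &my_tree, first, last);
        int ret;

        spin_lock(&my_lock);
        ret = mas_store_gfp(&mas, entry, GFP_ATOMIC);
        spin_unlock(&my_lock);
        return ret;
}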
Functions and structures
========================
.. kernel-doc:: include/linux/maple_tree.h
.. kernel-doc:: lib/maple_tree.c


@ -448,6 +448,7 @@ Memory Area, or VMA) there is a series of lines such as the following::
MMUPageSize: 4 kB
Rss: 892 kB
Pss: 374 kB
Pss_Dirty: 0 kB
Shared_Clean: 892 kB
Shared_Dirty: 0 kB
Private_Clean: 0 kB
@ -479,7 +480,9 @@ dirty shared and private pages in the mapping.
The "proportional set size" (PSS) of a process is the count of pages it has
in memory, where each page is divided by the number of processes sharing it.
So if a process has 1000 pages all to itself, and 1000 shared with one other
process, its PSS will be 1500.
process, its PSS will be 1500. "Pss_Dirty" is the portion of PSS which
consists of dirty pages. ("Pss_Clean" is not included, but it can be
calculated by subtracting "Pss_Dirty" from "Pss".)
Note that even a page which is part of a MAP_SHARED mapping, but has only
a single pte mapped, i.e. is currently used by only one process, is accounted
@ -514,8 +517,10 @@ replaced by copy-on-write) part of the underlying shmem object out on swap.
"SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this
does not take into account swapped out page of underlying shmem objects.
"Locked" indicates whether the mapping is locked in memory or not.
"THPeligible" indicates whether the mapping is eligible for allocating THP
pages - 1 if true, 0 otherwise. It just shows the current status.
pages as well as whether the THP is PMD mappable or not - 1 if true, 0 otherwise.
It just shows the current status.
"VmFlags" field deserves a separate description. This member represents the
kernel flags associated with the particular virtual memory area in two letter
@ -1886,13 +1891,14 @@ if precise results are needed.
3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file
---------------------------------------------------------------
This file provides information associated with an opened file. The regular
files have at least four fields -- 'pos', 'flags', 'mnt_id' and 'ino'.
files have at least five fields -- 'pos', 'flags', 'mnt_id', 'ino', and 'size'.
The 'pos' represents the current offset of the opened file in decimal
form [see lseek(2) for details], 'flags' denotes the octal O_xxx mask the
file has been created with [see open(2) for details] and 'mnt_id' represents
mount ID of the file system containing the opened file [see 3.5
/proc/<pid>/mountinfo for details]. 'ino' represents the inode number of
the file.
the file, and 'size' represents the size of the file in bytes.
A typical output is::
@ -1900,11 +1906,15 @@ A typical output is::
flags: 0100002
mnt_id: 19
ino: 63107
size: 0
All locks associated with a file descriptor are shown in its fdinfo too::
lock: 1: FLOCK ADVISORY WRITE 359 00:13:11691 0 EOF
Files with anonymous inodes have an additional 'path' field which represents
the anonymous file path.
The files such as eventfd, fsnotify, signalfd, epoll among the regular pos/flags
pair provide additional information particular to the objects they represent.
@ -1917,6 +1927,8 @@ Eventfd files
flags: 04002
mnt_id: 9
ino: 63107
size: 0
path: anon_inode:[eventfd]
eventfd-count: 5a
where 'eventfd-count' is hex value of a counter.
@ -1930,6 +1942,8 @@ Signalfd files
flags: 04002
mnt_id: 9
ino: 63107
size: 0
path: anon_inode:[signalfd]
sigmask: 0000000000000200
where 'sigmask' is hex value of the signal mask associated
@ -1944,6 +1958,8 @@ Epoll files
flags: 02
mnt_id: 9
ino: 63107
size: 0
path: anon_inode:[eventpoll]
tfd: 5 events: 1d data: ffffffffffffffff pos:0 ino:61af sdev:7
where 'tfd' is a target file descriptor number in decimal form,
@ -1962,6 +1978,8 @@ For inotify files the format is the following::
flags: 02000000
mnt_id: 9
ino: 63107
size: 0
path: anon_inode:inotify
inotify wd:3 ino:9e7e sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:7e9e0000640d1b6d
where 'wd' is a watch descriptor in decimal form, i.e. a target file
@ -1985,6 +2003,8 @@ For fanotify files the format is::
flags: 02
mnt_id: 9
ino: 63107
size: 0
path: anon_inode:[fanotify]
fanotify flags:10 event-flags:0
fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003
fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4
@ -2010,6 +2030,8 @@ Timerfd files
flags: 02
mnt_id: 9
ino: 63107
size: 0
path: anon_inode:[timerfd]
clockid: 0
ticks: 0
settime flags: 01
@ -2034,6 +2056,7 @@ DMA Buffer files
mnt_id: 9
ino: 63107
size: 32768
path: /dmabuf:
count: 2
exp_name: system-heap


@ -6,7 +6,7 @@ Memory Balancing
Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as
well as for non __GFP_IO allocations.
The first reason why a caller may avoid reclaim is that the caller can not


@ -11923,6 +11923,18 @@ L: linux-man@vger.kernel.org
S: Maintained
W: http://www.kernel.org/doc/man-pages
MAPLE TREE
M: Liam R. Howlett <Liam.Howlett@oracle.com>
L: linux-mm@kvack.org
S: Supported
F: Documentation/core-api/maple_tree.rst
F: include/linux/maple_tree.h
F: include/trace/events/maple_tree.h
F: lib/maple_tree.c
F: lib/test_maple_tree.c
F: tools/testing/radix-tree/linux/maple_tree.h
F: tools/testing/radix-tree/maple.c
MARDUK (CREATOR CI40) DEVICE TREE SUPPORT
M: Rahul Bedarkar <rahulbedarkar89@gmail.com>
L: linux-mips@vger.kernel.org


@ -46,9 +46,6 @@ extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz);
#define __HAVE_ARCH_HUGE_PTEP_GET
extern pte_t huge_ptep_get(pte_t *ptep);
extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned long sz);
#define set_huge_swap_pte_at set_huge_swap_pte_at
void __init arm64_hugetlb_cma_reserve(void);


@ -8,9 +8,9 @@
#include <asm/cpufeature.h>
#include <asm/mte.h>
#define for_each_mte_vma(tsk, vma) \
#define for_each_mte_vma(vmi, vma) \
if (system_supports_mte()) \
for (vma = tsk->mm->mmap; vma; vma = vma->vm_next) \
for_each_vma(vmi, vma) \
if (vma->vm_flags & VM_MTE)
static unsigned long mte_vma_tag_dump_size(struct vm_area_struct *vma)
@ -81,8 +81,9 @@ Elf_Half elf_core_extra_phdrs(void)
{
struct vm_area_struct *vma;
int vma_count = 0;
VMA_ITERATOR(vmi, current->mm, 0);
for_each_mte_vma(current, vma)
for_each_mte_vma(vmi, vma)
vma_count++;
return vma_count;
@ -91,8 +92,9 @@ Elf_Half elf_core_extra_phdrs(void)
int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
{
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, current->mm, 0);
for_each_mte_vma(current, vma) {
for_each_mte_vma(vmi, vma) {
struct elf_phdr phdr;
phdr.p_type = PT_AARCH64_MEMTAG_MTE;
@ -116,8 +118,9 @@ size_t elf_core_extra_data_size(void)
{
struct vm_area_struct *vma;
size_t data_size = 0;
VMA_ITERATOR(vmi, current->mm, 0);
for_each_mte_vma(current, vma)
for_each_mte_vma(vmi, vma)
data_size += mte_vma_tag_dump_size(vma);
return data_size;
@ -126,8 +129,9 @@ size_t elf_core_extra_data_size(void)
int elf_core_write_extra_data(struct coredump_params *cprm)
{
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, current->mm, 0);
for_each_mte_vma(current, vma) {
for_each_mte_vma(vmi, vma) {
if (vma->vm_flags & VM_DONTDUMP)
continue;


@ -136,10 +136,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
struct mm_struct *mm = task->mm;
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
for_each_vma(vmi, vma) {
unsigned long size = vma->vm_end - vma->vm_start;
if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm))


@ -238,6 +238,13 @@ static void clear_flush(struct mm_struct *mm,
flush_tlb_range(&vma, saddr, addr);
}
static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry)
{
VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry));
return page_folio(pfn_to_page(swp_offset(entry)));
}
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
@ -247,11 +254,16 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
unsigned long pfn, dpfn;
pgprot_t hugeprot;
/*
* Code needs to be expanded to handle huge swap and migration
* entries. Needed for HUGETLB and MEMORY_FAILURE.
*/
WARN_ON(!pte_present(pte));
if (!pte_present(pte)) {
struct folio *folio;
folio = hugetlb_swap_entry_to_folio(pte_to_swp_entry(pte));
ncontig = num_contig_ptes(folio_size(folio), &pgsize);
for (i = 0; i < ncontig; i++, ptep++)
set_pte_at(mm, addr, ptep, pte);
return;
}
if (!pte_cont(pte)) {
set_pte_at(mm, addr, ptep, pte);
@ -269,18 +281,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
}
void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned long sz)
{
int i, ncontig;
size_t pgsize;
ncontig = num_contig_ptes(sz, &pgsize);
for (i = 0; i < ncontig; i++, ptep++)
set_pte(ptep, pte);
}
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
@ -368,6 +368,26 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
return NULL;
}
unsigned long hugetlb_mask_last_page(struct hstate *h)
{
unsigned long hp_size = huge_page_size(h);
switch (hp_size) {
case PUD_SIZE:
return PGDIR_SIZE - PUD_SIZE;
case CONT_PMD_SIZE:
return PUD_SIZE - CONT_PMD_SIZE;
case PMD_SIZE:
return PUD_SIZE - PMD_SIZE;
case CONT_PTE_SIZE:
return PMD_SIZE - CONT_PTE_SIZE;
default:
break;
}
return 0UL;
}
pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
size_t pagesize = 1UL << shift;


@ -230,7 +230,7 @@ void mips_mt_set_cpuoptions(void)
struct class *mt_class;
static int __init mt_init(void)
static int __init mips_mt_init(void)
{
struct class *mtc;
@ -243,4 +243,4 @@ static int __init mt_init(void)
return 0;
}
subsys_initcall(mt_init);
subsys_initcall(mips_mt_init);


@ -660,15 +660,20 @@ static inline unsigned long mm_total_size(struct mm_struct *mm)
{
struct vm_area_struct *vma;
unsigned long usize = 0;
VMA_ITERATOR(vmi, mm, 0);
for (vma = mm->mmap; vma && usize < parisc_cache_flush_threshold; vma = vma->vm_next)
for_each_vma(vmi, vma) {
if (usize >= parisc_cache_flush_threshold)
break;
usize += vma->vm_end - vma->vm_start;
}
return usize;
}
void flush_cache_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
/*
* Flushing the whole cache on each cpu takes forever on
@ -688,7 +693,7 @@ void flush_cache_mm(struct mm_struct *mm)
}
/* Flush mm */
for (vma = mm->mmap; vma; vma = vma->vm_next)
for_each_vma(vmi, vma)
flush_cache_pages(vma, vma->vm_start, vma->vm_end);
}


@ -113,18 +113,18 @@ struct vdso_data *arch_get_vdso_data(void *vvar_page)
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
struct mm_struct *mm = task->mm;
VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
for_each_vma(vmi, vma) {
unsigned long size = vma->vm_end - vma->vm_start;
if (vma_is_special_mapping(vma, &vvar_spec))
zap_page_range(vma, vma->vm_start, size);
}
mmap_read_unlock(mm);
return 0;
}


@ -81,14 +81,15 @@ EXPORT_SYMBOL(hash__flush_range);
void hash__flush_tlb_mm(struct mm_struct *mm)
{
struct vm_area_struct *mp;
VMA_ITERATOR(vmi, mm, 0);
/*
* It is safe to go down the mm's list of vmas when called
* from dup_mmap, holding mmap_lock. It would also be safe from
* unmap_region or exit_mmap, but not from vmtruncate on SMP -
* but it seems dup_mmap is the only SMP case which gets here.
* It is safe to iterate the vmas when called from dup_mmap,
* holding mmap_lock. It would also be safe from unmap_region
* or exit_mmap, but not from vmtruncate on SMP - but it seems
* dup_mmap is the only SMP case which gets here.
*/
for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
for_each_vma(vmi, mp)
hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
}
EXPORT_SYMBOL(hash__flush_tlb_mm);


@ -149,24 +149,15 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, addr);
/*
* We don't try too hard, we just mark all the vma in that range
* VM_NOHUGEPAGE and split them.
*/
vma = find_vma(mm, addr);
/*
* If the range is in unmapped range, just return
*/
if (vma && ((addr + len) <= vma->vm_start))
return;
while (vma) {
if (vma->vm_start >= (addr + len))
break;
for_each_vma_range(vmi, vma, addr + len) {
vma->vm_flags |= VM_NOHUGEPAGE;
walk_page_vma(vma, &subpage_walk_ops, NULL);
vma = vma->vm_next;
}
}
#else


@ -114,11 +114,12 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
struct mm_struct *mm = task->mm;
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
struct __vdso_info *vdso_info = mm->context.vdso_info;
mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
for_each_vma(vmi, vma) {
unsigned long size = vma->vm_end - vma->vm_start;
if (vma_is_special_mapping(vma, vdso_info->dm))


@ -69,10 +69,11 @@ static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
struct mm_struct *mm = task->mm;
VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
for_each_vma(vmi, vma) {
unsigned long size = vma->vm_end - vma->vm_start;
if (!vma_is_special_mapping(vma, &vvar_mapping))


@ -2515,8 +2515,9 @@ static const struct mm_walk_ops thp_split_walk_ops = {
static inline void thp_split_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
for_each_vma(vmi, vma) {
vma->vm_flags &= ~VM_HUGEPAGE;
vma->vm_flags |= VM_NOHUGEPAGE;
walk_page_vma(vma, &thp_split_walk_ops, NULL);
@ -2584,8 +2585,9 @@ int gmap_mark_unmergeable(void)
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int ret;
VMA_ITERATOR(vmi, mm, 0);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
for_each_vma(vmi, vma) {
ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
MADV_UNMERGEABLE, &vma->vm_flags);
if (ret)


@ -584,21 +584,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
void flush_tlb_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma = mm->mmap;
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
while (vma != NULL) {
for_each_vma(vmi, vma)
fix_range(mm, vma->vm_start, vma->vm_end, 0);
vma = vma->vm_next;
}
}
void force_flush_all(void)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = mm->mmap;
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
while (vma != NULL) {
for_each_vma(vmi, vma)
fix_range(mm, vma->vm_start, vma->vm_end, 1);
vma = vma->vm_next;
}
}


@ -127,17 +127,17 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
struct mm_struct *mm = task->mm;
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
for_each_vma(vmi, vma) {
unsigned long size = vma->vm_end - vma->vm_start;
if (vma_is_special_mapping(vma, &vvar_mapping))
zap_page_range(vma, vma->vm_start, size);
}
mmap_read_unlock(mm);
return 0;
}
#else
@ -354,6 +354,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
mmap_write_lock(mm);
/*
@ -363,7 +364,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
* We could search vma near context.vdso, but it's a slowpath,
* so let's explicitly check all VMAs to be completely sure.
*/
for (vma = mm->mmap; vma; vma = vma->vm_next) {
for_each_vma(vmi, vma) {
if (vma_is_special_mapping(vma, &vdso_mapping) ||
vma_is_special_mapping(vma, &vvar_mapping)) {
mmap_write_unlock(mm);


@ -96,7 +96,7 @@ void __init tboot_probe(void)
static pgd_t *tboot_pg_dir;
static struct mm_struct tboot_mm = {
.mm_rb = RB_ROOT,
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock),
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),


@ -6699,7 +6699,7 @@ int kvm_mmu_vendor_module_init(void)
if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
goto out;
ret = register_shrinker(&mmu_shrinker);
ret = register_shrinker(&mmu_shrinker, "x86-mmu");
if (ret)
goto out;


@ -58,6 +58,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct vm_area_struct *vmm;
struct vma_iterator vmi;
if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
@ -79,15 +80,20 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
else
addr = PAGE_ALIGN(addr);
for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
/* At this point: (!vmm || addr < vmm->vm_end). */
if (TASK_SIZE - len < addr)
return -ENOMEM;
if (!vmm || addr + len <= vm_start_gap(vmm))
return addr;
vma_iter_init(&vmi, current->mm, addr);
for_each_vma(vmi, vmm) {
/* At this point: (addr < vmm->vm_end). */
if (addr + len <= vm_start_gap(vmm))
break;
addr = vmm->vm_end;
if (flags & MAP_SHARED)
addr = COLOUR_ALIGN(addr, pgoff);
}
if (TASK_SIZE - len < addr)
return -ENOMEM;
return addr;
}
#endif


@ -213,7 +213,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
if (mm) {
mmap_read_lock(mm);
vma = alloc->vma;
vma = vma_lookup(mm, alloc->vma_addr);
}
if (!vma && need_mm) {
@ -313,16 +313,22 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
static inline void binder_alloc_set_vma(struct binder_alloc *alloc,
struct vm_area_struct *vma)
{
if (vma)
alloc->vma_vm_mm = vma->vm_mm;
unsigned long vm_start = 0;
/*
* If we see alloc->vma is not NULL, buffer data structures set up
* completely. Look at smp_rmb side binder_alloc_get_vma.
* We also want to guarantee new alloc->vma_vm_mm is always visible
* if alloc->vma is set.
* Allow clearing the vma while holding just the read lock to allow
* munmapping downgrade of the write lock before freeing and closing the
* file using binder_alloc_vma_close().
*/
smp_wmb();
alloc->vma = vma;
if (vma) {
vm_start = vma->vm_start;
alloc->vma_vm_mm = vma->vm_mm;
mmap_assert_write_locked(alloc->vma_vm_mm);
} else {
mmap_assert_locked(alloc->vma_vm_mm);
}
alloc->vma_addr = vm_start;
}
static inline struct vm_area_struct *binder_alloc_get_vma(
@ -330,11 +336,9 @@ static inline struct vm_area_struct *binder_alloc_get_vma(
{
struct vm_area_struct *vma = NULL;
if (alloc->vma) {
/* Look at description in binder_alloc_set_vma */
smp_rmb();
vma = alloc->vma;
}
if (alloc->vma_addr)
vma = vma_lookup(alloc->vma_vm_mm, alloc->vma_addr);
return vma;
}
@ -817,7 +821,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc)
buffers = 0;
mutex_lock(&alloc->mutex);
BUG_ON(alloc->vma);
BUG_ON(alloc->vma_addr &&
vma_lookup(alloc->vma_vm_mm, alloc->vma_addr));
while ((n = rb_first(&alloc->allocated_buffers))) {
buffer = rb_entry(n, struct binder_buffer, rb_node);
@ -1084,7 +1089,7 @@ int binder_alloc_shrinker_init(void)
int ret = list_lru_init(&binder_alloc_lru);
if (ret == 0) {
ret = register_shrinker(&binder_shrinker);
ret = register_shrinker(&binder_shrinker, "android-binder");
if (ret)
list_lru_destroy(&binder_alloc_lru);
}


@ -100,7 +100,7 @@ struct binder_lru_page {
*/
struct binder_alloc {
struct mutex mutex;
struct vm_area_struct *vma;
unsigned long vma_addr;
struct mm_struct *vma_vm_mm;
void __user *buffer;
struct list_head buffers;


@ -287,7 +287,7 @@ void binder_selftest_alloc(struct binder_alloc *alloc)
if (!binder_selftest_run)
return;
mutex_lock(&binder_selftest_lock);
if (!binder_selftest_run || !alloc->vma)
if (!binder_selftest_run || !alloc->vma_addr)
goto done;
pr_info("STARTED\n");
binder_selftest_alloc_offset(alloc, end_offset, 0);


@ -63,12 +63,6 @@ static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp)
bool zcomp_available_algorithm(const char *comp)
{
int i;
i = sysfs_match_string(backends, comp);
if (i >= 0)
return true;
/*
* Crypto does not ignore a trailing new line symbol,
* so make sure you don't supply a string containing
@ -217,6 +211,11 @@ struct zcomp *zcomp_create(const char *compress)
struct zcomp *comp;
int error;
/*
* Crypto API will execute /sbin/modprobe if the compression module
* is not loaded yet. We must do it here, otherwise we are about to
* call /sbin/modprobe under CPU hot-plug lock.
*/
if (!zcomp_available_algorithm(compress))
return ERR_PTR(-EINVAL);


@ -22,6 +22,8 @@
* @private: dax driver private data
* @flags: state and boolean properties
* @ops: operations for this device
* @holder_data: holder of a dax_device: could be filesystem or mapped device
* @holder_ops: operations for the inner holder
*/
struct dax_device {
struct inode inode;
@ -29,6 +31,8 @@ struct dax_device {
void *private;
unsigned long flags;
const struct dax_operations *ops;
void *holder_data;
const struct dax_holder_operations *holder_ops;
};
static dev_t dax_devt;
@ -71,8 +75,11 @@ EXPORT_SYMBOL_GPL(dax_remove_host);
* fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
* @bdev: block device to find a dax_device for
* @start_off: returns the byte offset into the dax_device that @bdev starts
* @holder: filesystem or mapped device inside the dax_device
* @ops: operations for the inner holder
*/
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
void *holder, const struct dax_holder_operations *ops)
{
struct dax_device *dax_dev;
u64 part_size;
@ -92,11 +99,26 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
dax_dev = NULL;
else if (holder) {
if (!cmpxchg(&dax_dev->holder_data, NULL, holder))
dax_dev->holder_ops = ops;
else
dax_dev = NULL;
}
dax_read_unlock(id);
return dax_dev;
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
void fs_put_dax(struct dax_device *dax_dev, void *holder)
{
if (dax_dev && holder &&
cmpxchg(&dax_dev->holder_data, holder, NULL) == holder)
dax_dev->holder_ops = NULL;
put_dax(dax_dev);
}
EXPORT_SYMBOL_GPL(fs_put_dax);
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
enum dax_device_flags {
@ -204,6 +226,29 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
}
EXPORT_SYMBOL_GPL(dax_recovery_write);
int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off,
u64 len, int mf_flags)
{
int rc, id;
id = dax_read_lock();
if (!dax_alive(dax_dev)) {
rc = -ENXIO;
goto out;
}
if (!dax_dev->holder_ops) {
rc = -EOPNOTSUPP;
goto out;
}
rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags);
out:
dax_read_unlock(id);
return rc;
}
EXPORT_SYMBOL_GPL(dax_holder_notify_failure);
#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
@ -277,8 +322,15 @@ void kill_dax(struct dax_device *dax_dev)
if (!dax_dev)
return;
if (dax_dev->holder_data != NULL)
dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
synchronize_srcu(&dax_srcu);
/* clear holder data */
dax_dev->holder_ops = NULL;
dax_dev->holder_data = NULL;
}
EXPORT_SYMBOL_GPL(kill_dax);
@ -420,6 +472,19 @@ void put_dax(struct dax_device *dax_dev)
}
EXPORT_SYMBOL_GPL(put_dax);
/**
* dax_holder() - obtain the holder of a dax device
* @dax_dev: a dax_device instance
* Return: the holder's data which represents the holder if registered,
* otherwise NULL.
*/
void *dax_holder(struct dax_device *dax_dev)
{
return dax_dev->holder_data;
}
EXPORT_SYMBOL_GPL(dax_holder);
/**
* inode_dax: convert a public inode into its dax_dev
* @inode: An inode with i_cdev pointing to a dax_dev


@ -484,7 +484,6 @@ static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file)
{
struct dma_buf *dmabuf = file->private_data;
seq_printf(m, "size:\t%zu\n", dmabuf->size);
/* Don't count the temporary reference taken inside procfs seq_show */
seq_printf(m, "count:\t%ld\n", file_count(dmabuf->file) - 1);
seq_printf(m, "exp_name:\t%s\n", dmabuf->exp_name);


@ -57,7 +57,7 @@ static unsigned long __initdata mem_reserve = EFI_INVALID_TABLE_ADDR;
static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR;
struct mm_struct efi_mm = {
.mm_rb = RB_ROOT,
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock),
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
.write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq),


@ -671,13 +671,15 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
migrate.vma = vma;
migrate.start = start;
migrate.end = end;
migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
if (adev->gmc.xgmi.connected_to_cpu)
migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
else
migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
buf = kvcalloc(npages,
2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t),
GFP_KERNEL);
if (!buf)
goto out;
@ -947,7 +949,7 @@ int svm_migrate_init(struct amdgpu_device *adev)
{
struct kfd_dev *kfddev = adev->kfd.dev;
struct dev_pagemap *pgmap;
struct resource *res;
struct resource *res = NULL;
unsigned long size;
void *r;
@ -962,28 +964,34 @@ int svm_migrate_init(struct amdgpu_device *adev)
* should remove reserved size
*/
size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
if (IS_ERR(res))
return -ENOMEM;
if (adev->gmc.xgmi.connected_to_cpu) {
pgmap->range.start = adev->gmc.aper_base;
pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1;
pgmap->type = MEMORY_DEVICE_COHERENT;
} else {
res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
if (IS_ERR(res))
return -ENOMEM;
pgmap->range.start = res->start;
pgmap->range.end = res->end;
pgmap->type = MEMORY_DEVICE_PRIVATE;
}
pgmap->type = MEMORY_DEVICE_PRIVATE;
pgmap->nr_range = 1;
pgmap->range.start = res->start;
pgmap->range.end = res->end;
pgmap->ops = &svm_migrate_pgmap_ops;
pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
pgmap->flags = 0;
/* Device manager releases device-specific resources, memory region and
* pgmap when driver disconnects from device.
*/
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
/* Disable SVM support capability */
pgmap->type = 0;
devm_release_mem_region(adev->dev, res->start, resource_size(res));
if (pgmap->type == MEMORY_DEVICE_PRIVATE)
devm_release_mem_region(adev->dev, res->start,
res->end - res->start + 1);
return PTR_ERR(r);
}


@ -426,7 +426,8 @@ void i915_gem_driver_register__shrinker(struct drm_i915_private *i915)
i915->mm.shrinker.count_objects = i915_gem_shrinker_count;
i915->mm.shrinker.seeks = DEFAULT_SEEKS;
i915->mm.shrinker.batch = 4096;
drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker));
drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker,
"drm-i915_gem"));
i915->mm.oom_notifier.notifier_call = i915_gem_shrinker_oom;
drm_WARN_ON(&i915->drm, register_oom_notifier(&i915->mm.oom_notifier));
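
The register_shrinker() conversions in this series all gain a printf-style name, which the new shrinker debugfs interface uses to identify each shrinker. A minimal, hedged sketch of the updated call pattern for a hypothetical driver (my_shrinker, my_count and my_scan are illustrative, not part of this commit):

#include <linux/shrinker.h>

static unsigned long my_count(struct shrinker *s, struct shrink_control *sc)
{
	return 0;		/* nothing reclaimable in this sketch */
}

static unsigned long my_scan(struct shrinker *s, struct shrink_control *sc)
{
	return SHRINK_STOP;	/* nothing was freed */
}

static struct shrinker my_shrinker = {
	.count_objects	= my_count,
	.scan_objects	= my_scan,
	.seeks		= DEFAULT_SEEKS,
};

static int __init my_module_init(void)
{
	/* The name argument is new; it names the shrinker in debugfs. */
	return register_shrinker(&my_shrinker, "my-driver:%s", "instance0");
}

As the btrfs hunk later in this diff shows, an already registered shrinker can also be renamed with shrinker_debugfs_rename() once its final identifier (for example the device name) is known.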


@ -426,12 +426,11 @@ static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = {
static int
probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len)
{
const unsigned long end = addr + len;
VMA_ITERATOR(vmi, mm, addr);
struct vm_area_struct *vma;
int ret = -EFAULT;
mmap_read_lock(mm);
for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
for_each_vma_range(vmi, vma, addr + len) {
/* Check for holes, note that we also update the addr below */
if (vma->vm_start > addr)
break;
@ -439,16 +438,13 @@ probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len)
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
break;
if (vma->vm_end >= end) {
ret = 0;
break;
}
addr = vma->vm_end;
}
mmap_read_unlock(mm);
return ret;
if (vma)
return -EFAULT;
return 0;
}
/*


@ -221,7 +221,7 @@ void msm_gem_shrinker_init(struct drm_device *dev)
priv->shrinker.count_objects = msm_gem_shrinker_count;
priv->shrinker.scan_objects = msm_gem_shrinker_scan;
priv->shrinker.seeks = DEFAULT_SEEKS;
WARN_ON(register_shrinker(&priv->shrinker));
WARN_ON(register_shrinker(&priv->shrinker, "drm-msm_gem"));
priv->vmap_notifier.notifier_call = msm_gem_shrinker_vmap;
WARN_ON(register_vmap_purge_notifier(&priv->vmap_notifier));


@ -103,7 +103,7 @@ void panfrost_gem_shrinker_init(struct drm_device *dev)
pfdev->shrinker.count_objects = panfrost_gem_shrinker_count;
pfdev->shrinker.scan_objects = panfrost_gem_shrinker_scan;
pfdev->shrinker.seeks = DEFAULT_SEEKS;
WARN_ON(register_shrinker(&pfdev->shrinker));
WARN_ON(register_shrinker(&pfdev->shrinker, "drm-panfrost"));
}
/**


@ -722,7 +722,7 @@ int ttm_pool_mgr_init(unsigned long num_pages)
mm_shrinker.count_objects = ttm_pool_shrinker_count;
mm_shrinker.scan_objects = ttm_pool_shrinker_scan;
mm_shrinker.seeks = 1;
return register_shrinker(&mm_shrinker);
return register_shrinker(&mm_shrinker, "drm-ttm_pool");
}
/**


@ -671,12 +671,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as,
* allocate page in a sleeping context if GFP flags permit. Hence
* spinlock needs to be unlocked and re-locked after allocation.
*/
if (!(gfp & __GFP_ATOMIC))
if (gfp & __GFP_DIRECT_RECLAIM)
spin_unlock_irqrestore(&as->lock, *flags);
page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO);
if (!(gfp & __GFP_ATOMIC))
if (gfp & __GFP_DIRECT_RECLAIM)
spin_lock_irqsave(&as->lock, *flags);
/*


@ -812,7 +812,7 @@ int bch_btree_cache_alloc(struct cache_set *c)
c->shrink.seeks = 4;
c->shrink.batch = c->btree_pages * 2;
if (register_shrinker(&c->shrink))
if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid))
pr_warn("bcache: %s: could not register shrinker\n",
__func__);


@ -1806,7 +1806,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
c->shrinker.scan_objects = dm_bufio_shrink_scan;
c->shrinker.seeks = 1;
c->shrinker.batch = 0;
r = register_shrinker(&c->shrinker);
r = register_shrinker(&c->shrinker, "md-%s:(%u:%u)", slab_name,
MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
if (r)
goto bad;


@ -2944,7 +2944,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;
/* Metadata cache shrinker */
ret = register_shrinker(&zmd->mblk_shrinker);
ret = register_shrinker(&zmd->mblk_shrinker, "md-meta:(%u:%u)",
MAJOR(dev->bdev->bd_dev),
MINOR(dev->bdev->bd_dev));
if (ret) {
dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
goto err;


@ -758,7 +758,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
}
td->dm_dev.bdev = bdev;
td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
return 0;
}


@ -7414,7 +7414,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->shrinker.count_objects = raid5_cache_count;
conf->shrinker.batch = 128;
conf->shrinker.flags = 0;
ret = register_shrinker(&conf->shrinker);
ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev));
if (ret) {
pr_warn("md/raid:%s: couldn't register shrinker.\n",
mdname(mddev));


@ -280,22 +280,6 @@ void cxl_handle_fault(struct work_struct *fault_work)
mmput(mm);
}
static void cxl_prefault_one(struct cxl_context *ctx, u64 ea)
{
struct mm_struct *mm;
mm = get_mem_context(ctx);
if (mm == NULL) {
pr_devel("cxl_prefault_one unable to get mm %i\n",
pid_nr(ctx->pid));
return;
}
cxl_fault_segment(ctx, mm, ea);
mmput(mm);
}
static u64 next_segment(u64 ea, u64 vsid)
{
if (vsid & SLB_VSID_B_1T)
@ -306,23 +290,16 @@ static u64 next_segment(u64 ea, u64 vsid)
return ea + 1;
}
static void cxl_prefault_vma(struct cxl_context *ctx)
static void cxl_prefault_vma(struct cxl_context *ctx, struct mm_struct *mm)
{
u64 ea, last_esid = 0;
struct copro_slb slb;
VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int rc;
struct mm_struct *mm;
mm = get_mem_context(ctx);
if (mm == NULL) {
pr_devel("cxl_prefault_vm unable to get mm %i\n",
pid_nr(ctx->pid));
return;
}
mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
for_each_vma(vmi, vma) {
for (ea = vma->vm_start; ea < vma->vm_end;
ea = next_segment(ea, slb.vsid)) {
rc = copro_calculate_slb(mm, ea, &slb);
@ -337,20 +314,28 @@ static void cxl_prefault_vma(struct cxl_context *ctx)
}
}
mmap_read_unlock(mm);
mmput(mm);
}
void cxl_prefault(struct cxl_context *ctx, u64 wed)
{
struct mm_struct *mm = get_mem_context(ctx);
if (mm == NULL) {
pr_devel("cxl_prefault unable to get mm %i\n",
pid_nr(ctx->pid));
return;
}
switch (ctx->afu->prefault_mode) {
case CXL_PREFAULT_WED:
cxl_prefault_one(ctx, wed);
cxl_fault_segment(ctx, mm, wed);
break;
case CXL_PREFAULT_ALL:
cxl_prefault_vma(ctx);
cxl_prefault_vma(ctx, mm);
break;
default:
break;
}
mmput(mm);
}
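
The cxl conversion above, like the i915, optee, exec and fs/proc changes elsewhere in this diff, replaces the vma->vm_next linked-list walk with the maple-tree backed VMA iterator. A minimal sketch of the new pattern, assuming the caller already holds a reference on the mm:

#include <linux/mm.h>

/* Count the VMAs of @mm with the VMA iterator instead of vma->vm_next. */
static unsigned long count_vmas(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma;
	unsigned long nr = 0;

	mmap_read_lock(mm);
	for_each_vma(vmi, vma)
		nr++;
	mmap_read_unlock(mm);

	return nr;
}

for_each_vma_range() bounds the same walk at an end address, and the lower-level MA_STATE()/mas_for_each() form used by the coredump and fs/proc hunks iterates the tree directly.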


@ -1585,7 +1585,7 @@ static int vmballoon_register_shrinker(struct vmballoon *b)
b->shrinker.count_objects = vmballoon_shrinker_count;
b->shrinker.seeks = DEFAULT_SEEKS;
r = register_shrinker(&b->shrinker);
r = register_shrinker(&b->shrinker, "vmw-balloon");
if (r == 0)
b->shrinker_registered = true;


@ -453,6 +453,21 @@ static void pmem_release_disk(void *__pmem)
blk_cleanup_disk(pmem->disk);
}
static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap,
unsigned long pfn, unsigned long nr_pages, int mf_flags)
{
struct pmem_device *pmem =
container_of(pgmap, struct pmem_device, pgmap);
u64 offset = PFN_PHYS(pfn) - pmem->phys_addr - pmem->data_offset;
u64 len = nr_pages << PAGE_SHIFT;
return dax_holder_notify_failure(pmem->dax_dev, offset, len, mf_flags);
}
static const struct dev_pagemap_ops fsdax_pagemap_ops = {
.memory_failure = pmem_pagemap_memory_failure,
};
static int pmem_attach_disk(struct device *dev,
struct nd_namespace_common *ndns)
{
@ -514,6 +529,7 @@ static int pmem_attach_disk(struct device *dev,
pmem->pfn_flags = PFN_DEV;
if (is_nd_pfn(dev)) {
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pfn_sb = nd_pfn->pfn_sb;
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@ -527,6 +543,7 @@ static int pmem_attach_disk(struct device *dev,
pmem->pgmap.range.end = res->end;
pmem->pgmap.nr_range = 1;
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pmem->pfn_flags |= PFN_MAP;
bb_range = pmem->pgmap.range;
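
The new ->memory_failure pagemap op above forwards pmem poison to whatever claimed the dax_device as a holder. A hedged sketch of how a filesystem might register holder ops when it grabs the device; the names and callback body are illustrative, while the prototypes follow the callers visible in this diff:

static struct dax_device *my_daxdev;	/* hypothetical filesystem state */

static int my_notify_failure(struct dax_device *dax_dev, u64 offset,
			     u64 len, int mf_flags)
{
	/* translate (offset, len) into file extents and unmap/kill users */
	return 0;
}

static const struct dax_holder_operations my_dax_holder_ops = {
	.notify_failure = my_notify_failure,
};

static int my_claim_dax(struct super_block *sb, struct block_device *bdev)
{
	u64 part_off;

	/* Passing a holder and ops instead of NULL, NULL opts in to failure
	 * notification; dax_holder() then returns the registered holder. */
	my_daxdev = fs_dax_get_by_bdev(bdev, &part_off, sb, &my_dax_holder_ops);
	return my_daxdev ? 0 : -EOPNOTSUPP;
}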


@ -492,15 +492,18 @@ static bool is_normal_memory(pgprot_t p)
#endif
}
static int __check_mem_type(struct vm_area_struct *vma, unsigned long end)
static int __check_mem_type(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
while (vma && is_normal_memory(vma->vm_page_prot)) {
if (vma->vm_end >= end)
return 0;
vma = vma->vm_next;
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, start);
for_each_vma_range(vmi, vma, end) {
if (!is_normal_memory(vma->vm_page_prot))
return -EINVAL;
}
return -EINVAL;
return 0;
}
int optee_check_mem_type(unsigned long start, size_t num_pages)
@ -516,8 +519,7 @@ int optee_check_mem_type(unsigned long start, size_t num_pages)
return 0;
mmap_read_lock(mm);
rc = __check_mem_type(find_vma(mm, start),
start + num_pages * PAGE_SIZE);
rc = __check_mem_type(mm, start, start + num_pages * PAGE_SIZE);
mmap_read_unlock(mm);
return rc;


@ -856,7 +856,7 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb)
vb->shrinker.count_objects = virtio_balloon_shrinker_count;
vb->shrinker.seeks = DEFAULT_SEEKS;
return register_shrinker(&vb->shrinker);
return register_shrinker(&vb->shrinker, "virtio-balloon");
}
static int virtballoon_probe(struct virtio_device *vdev)


@ -282,7 +282,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
struct page, lru);
struct privcmd_mmap_entry *msg = page_address(page);
vma = find_vma(mm, msg->va);
vma = vma_lookup(mm, msg->va);
rc = -EINVAL;
if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)


@ -305,7 +305,7 @@ static int __init xenbus_probe_backend_init(void)
register_xenstore_notifier(&xenstore_notifier);
if (register_shrinker(&backend_memory_shrinker))
if (register_shrinker(&backend_memory_shrinker, "xen-backend"))
pr_warn("shrinker registration failed\n");
return 0;


@ -1816,6 +1816,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
error = -EBUSY;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
s->s_id);
btrfs_sb(s)->bdev_holder = fs_type;
if (!strstr(crc32c_impl(), "generic"))
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);


@ -819,8 +819,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
if (retry)
gfp |= __GFP_NOFAIL;
/* The page lock pins the memcg */
memcg = page_memcg(page);
memcg = get_mem_cgroup_from_page(page);
old_memcg = set_active_memcg(memcg);
head = NULL;
@ -840,6 +839,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
set_bh_page(bh, page, offset);
}
out:
mem_cgroup_put(memcg);
set_active_memcg(old_memcg);
return head;
/*


@ -1072,30 +1072,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
return vma->vm_end - vma->vm_start;
}
static struct vm_area_struct *first_vma(struct task_struct *tsk,
struct vm_area_struct *gate_vma)
{
struct vm_area_struct *ret = tsk->mm->mmap;
if (ret)
return ret;
return gate_vma;
}
/*
* Helper function for iterating across a vma list. It ensures that the caller
* will visit `gate_vma' prior to terminating the search.
*/
static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
static struct vm_area_struct *coredump_next_vma(struct ma_state *mas,
struct vm_area_struct *vma,
struct vm_area_struct *gate_vma)
{
struct vm_area_struct *ret;
ret = this_vma->vm_next;
if (ret)
return ret;
if (this_vma == gate_vma)
if (gate_vma && (vma == gate_vma))
return NULL;
vma = mas_next(mas, ULONG_MAX);
if (vma)
return vma;
return gate_vma;
}
@ -1119,9 +1109,10 @@ static void free_vma_snapshot(struct coredump_params *cprm)
*/
static bool dump_vma_snapshot(struct coredump_params *cprm)
{
struct vm_area_struct *vma, *gate_vma;
struct vm_area_struct *gate_vma, *vma = NULL;
struct mm_struct *mm = current->mm;
int i;
MA_STATE(mas, &mm->mm_mt, 0, 0);
int i = 0;
/*
* Once the stack expansion code is fixed to not change VMA bounds
@ -1141,8 +1132,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
return false;
}
for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma), i++) {
while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) {
struct core_vma_metadata *m = cprm->vma_meta + i;
m->start = vma->vm_start;
@ -1150,10 +1140,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
m->flags = vma->vm_flags;
m->dump_size = vma_dump_size(vma, cprm->mm_flags);
m->pgoff = vma->vm_pgoff;
m->file = vma->vm_file;
if (m->file)
get_file(m->file);
i++;
}
mmap_write_unlock(mm);

fs/dax.c (401 lines changed)

@ -334,13 +334,35 @@ static unsigned long dax_end_pfn(void *entry)
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)
static inline bool dax_mapping_is_cow(struct address_space *mapping)
{
return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
}
/*
* TODO: for reflink+dax we need a way to associate a single page with
* multiple address_space instances at different linear_page_index()
* offsets.
* Set page->mapping to the FS_DAX_MAPPING_COW flag and increase the refcount.
*/
static inline void dax_mapping_set_cow(struct page *page)
{
if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
/*
* Reset the index if the page was already mapped
* regularly before.
*/
if (page->mapping)
page->index = 1;
page->mapping = (void *)PAGE_MAPPING_DAX_COW;
}
page->index++;
}
/*
* When it is called in dax_insert_entry(), the cow flag indicates whether
* this entry is shared by multiple files. If so, set page->mapping to
* FS_DAX_MAPPING_COW and use page->index as a refcount.
*/
static void dax_associate_entry(void *entry, struct address_space *mapping,
struct vm_area_struct *vma, unsigned long address)
struct vm_area_struct *vma, unsigned long address, bool cow)
{
unsigned long size = dax_entry_size(entry), pfn, index;
int i = 0;
@ -352,9 +374,13 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(page->mapping);
page->mapping = mapping;
page->index = index + i++;
if (cow) {
dax_mapping_set_cow(page);
} else {
WARN_ON_ONCE(page->mapping);
page->mapping = mapping;
page->index = index + i++;
}
}
}
@ -370,7 +396,12 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
if (dax_mapping_is_cow(page->mapping)) {
/* keep the CoW flag if this page is still shared */
if (page->index-- > 0)
continue;
} else
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
page->mapping = NULL;
page->index = 0;
}
@ -455,6 +486,69 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie)
dax_unlock_entry(&xas, (void *)cookie);
}
/*
* dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
* @mapping: the file's mapping whose entry we want to lock
* @index: the offset within this file
* @page: output the dax page corresponding to this dax entry
*
* Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
* could not be locked.
*/
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
struct page **page)
{
XA_STATE(xas, NULL, 0);
void *entry;
rcu_read_lock();
for (;;) {
entry = NULL;
if (!dax_mapping(mapping))
break;
xas.xa = &mapping->i_pages;
xas_lock_irq(&xas);
xas_set(&xas, index);
entry = xas_load(&xas);
if (dax_is_locked(entry)) {
rcu_read_unlock();
wait_entry_unlocked(&xas, entry);
rcu_read_lock();
continue;
}
if (!entry ||
dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
* Because we are looking up the entry by the file's mapping
* and index, it may not have been inserted yet, or it may be
* a zero/empty entry. We do not treat this as an error.
* Return a special value and do not output @page.
*/
entry = (void *)~0UL;
} else {
*page = pfn_to_page(dax_to_pfn(entry));
dax_lock_entry(&xas, entry);
}
xas_unlock_irq(&xas);
break;
}
rcu_read_unlock();
return (dax_entry_t)entry;
}
void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
dax_entry_t cookie)
{
XA_STATE(xas, &mapping->i_pages, index);
if (cookie == ~0UL)
return;
dax_unlock_entry(&xas, (void *)cookie);
}
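
A short sketch of how a caller that only knows the mapping and file offset (for example a memory-failure style path) might pair these two helpers; the function name and error handling are illustrative:

static int act_on_dax_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page = NULL;
	dax_entry_t cookie;

	cookie = dax_lock_mapping_entry(mapping, index, &page);
	if (!cookie)
		return -EBUSY;	/* not (or no longer) a DAX mapping */

	if (page) {
		/* A pfn is mapped at @index; act on it while the entry
		 * stays locked. */
	}

	/* Safe even for the special "no page" cookie, which unlocks nothing. */
	dax_unlock_mapping_entry(mapping, index, cookie);
	return 0;
}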
/*
* Find page cache entry at given index. If it is a DAX entry, return it
* with the entry locked. If the page cache doesn't contain an entry at
@ -735,6 +829,23 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter
return 0;
}
/*
* MAP_SYNC on a dax mapping guarantees dirty metadata is
* flushed on write-faults (non-cow), but not read-faults.
*/
static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
struct vm_area_struct *vma)
{
return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
(iter->iomap.flags & IOMAP_F_DIRTY);
}
static bool dax_fault_is_cow(const struct iomap_iter *iter)
{
return (iter->flags & IOMAP_WRITE) &&
(iter->iomap.flags & IOMAP_F_SHARED);
}
/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
@ -742,16 +853,19 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
static void *dax_insert_entry(struct xa_state *xas,
struct address_space *mapping, struct vm_fault *vmf,
void *entry, pfn_t pfn, unsigned long flags, bool dirty)
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
const struct iomap_iter *iter, void *entry, pfn_t pfn,
unsigned long flags)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *new_entry = dax_make_entry(pfn, flags);
bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
bool cow = dax_fault_is_cow(iter);
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@ -763,11 +877,12 @@ static void *dax_insert_entry(struct xa_state *xas,
xas_reset(xas);
xas_lock_irq(xas);
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;
dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
cow);
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@ -787,6 +902,9 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
if (cow)
xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irq(xas);
return entry;
}
@ -931,20 +1049,22 @@ int dax_writeback_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
pfn_t *pfnp)
static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
size_t size, void **kaddr, pfn_t *pfnp)
{
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
int id, rc;
int id, rc = 0;
long length;
id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
DAX_ACCESS, NULL, pfnp);
DAX_ACCESS, kaddr, pfnp);
if (length < 0) {
rc = length;
goto out;
}
if (!pfnp)
goto out_check_addr;
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
@ -954,11 +1074,71 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
if (length > 1 && !pfn_t_devmap(*pfnp))
goto out;
rc = 0;
out_check_addr:
if (!kaddr)
goto out;
if (!*kaddr)
rc = -EFAULT;
out:
dax_read_unlock(id);
return rc;
}
/**
* dax_iomap_cow_copy - Copy the data from source to destination before write
* @pos: address to do copy from.
* @length: size of copy operation.
* @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
* @srcmap: iomap srcmap
* @daddr: destination address to copy to.
*
* This can be called from two places: either during a DAX write fault (page
* aligned), to copy @length bytes of data to @daddr; or from the normal DAX
* write path, where dax_iomap_actor() might call it to copy an unaligned
* start or end of the range. In the latter case the copy of the aligned
* portion is taken care of by dax_iomap_actor() itself.
*/
static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
const struct iomap *srcmap, void *daddr)
{
loff_t head_off = pos & (align_size - 1);
size_t size = ALIGN(head_off + length, align_size);
loff_t end = pos + length;
loff_t pg_end = round_up(end, align_size);
bool copy_all = head_off == 0 && end == pg_end;
void *saddr = 0;
int ret = 0;
ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
if (ret)
return ret;
if (copy_all) {
ret = copy_mc_to_kernel(daddr, saddr, length);
return ret ? -EIO : 0;
}
/* Copy the head part of the range */
if (head_off) {
ret = copy_mc_to_kernel(daddr, saddr, head_off);
if (ret)
return -EIO;
}
/* Copy the tail part of the range */
if (end < pg_end) {
loff_t tail_off = head_off + length;
loff_t tail_len = pg_end - end;
ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
tail_len);
if (ret)
return -EIO;
}
return 0;
}
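
As a concrete example (numbers assumed, PAGE_SIZE of 4096): a 512-byte write at pos 1024 with align_size of PAGE_SIZE gives head_off 1024, end 1536 and pg_end 4096, so copy_all is false; the head copy then brings over bytes 0-1023 from the source page and the tail copy bytes 1536-4095, leaving only the 512 bytes actually being written for the caller to fill in.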
/*
* The user has performed a load from a hole in the file. Allocating a new
* page in the file would cause excessive storage usage for workloads with
@ -966,17 +1146,15 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
* If this page is ever written to we will re-fault and change the mapping to
* point to real DAX storage instead.
*/
static vm_fault_t dax_load_hole(struct xa_state *xas,
struct address_space *mapping, void **entry,
struct vm_fault *vmf)
static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
const struct iomap_iter *iter, void **entry)
{
struct inode *inode = mapping->host;
struct inode *inode = iter->inode;
unsigned long vaddr = vmf->address;
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;
*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
DAX_ZERO_PAGE, false);
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
@ -985,7 +1163,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
#ifdef CONFIG_FS_DAX_PMD
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
const struct iomap *iomap, void **entry)
const struct iomap_iter *iter, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
@ -1003,8 +1181,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
goto fallback;
pfn = page_to_pfn_t(zero_page);
*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
DAX_PMD | DAX_ZERO_PAGE, false);
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
DAX_PMD | DAX_ZERO_PAGE);
if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm);
@ -1037,23 +1215,34 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
const struct iomap *iomap, void **entry)
const struct iomap_iter *iter, void **entry)
{
return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */
static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
unsigned int offset, size_t size)
static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
const struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
unsigned offset = offset_in_page(pos);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
void *kaddr;
long ret;
ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL);
if (ret > 0) {
memset(kaddr + offset, 0, size);
dax_flush(dax_dev, kaddr + offset, size);
}
ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
NULL);
if (ret < 0)
return ret;
memset(kaddr + offset, 0, size);
if (srcmap->addr != iomap->addr) {
ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
kaddr);
if (ret < 0)
return ret;
dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
} else
dax_flush(iomap->dax_dev, kaddr + offset, size);
return ret;
}
@ -1080,7 +1269,7 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
rc = dax_memzero(iter, pos, size);
dax_read_unlock(id);
if (rc < 0)
@ -1129,15 +1318,17 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
const struct iomap *srcmap = &iomi->srcmap;
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
struct dax_device *dax_dev = iomap->dax_dev;
loff_t end = pos + length, done = 0;
bool write = iov_iter_rw(iter) == WRITE;
ssize_t ret = 0;
size_t xfer;
int id;
if (iov_iter_rw(iter) == READ) {
if (!write) {
end = min(end, i_size_read(iomi->inode));
if (pos >= end)
return 0;
@ -1146,7 +1337,12 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
return iov_iter_zero(min(length, end - pos), iter);
}
if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
/*
* In DAX mode, enforce either pure overwrites of written extents, or
* writes to unwritten extents as part of a copy-on-write operation.
*/
if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
!(iomap->flags & IOMAP_F_SHARED)))
return -EIO;
/*
@ -1188,6 +1384,14 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
break;
}
if (write &&
srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
kaddr);
if (ret)
break;
}
map_len = PFN_PHYS(map_len);
kaddr += offset;
map_len -= offset;
@ -1197,7 +1401,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (recovery)
xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
map_len, iter);
else if (iov_iter_rw(iter) == WRITE)
else if (write)
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
@ -1267,17 +1471,6 @@ static vm_fault_t dax_fault_return(int error)
return vmf_error(error);
}
/*
* MAP_SYNC on a dax mapping guarantees dirty metadata is
* flushed on write-faults (non-cow), but not read-faults.
*/
static bool dax_fault_is_synchronous(unsigned long flags,
struct vm_area_struct *vma, const struct iomap *iomap)
{
return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
&& (iomap->flags & IOMAP_F_DIRTY);
}
/*
* When handling a synchronous page fault and the inode need a fsync, we can
* insert the PTE/PMD into page tables only after that fsync happened. Skip
@ -1335,15 +1528,15 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
const struct iomap_iter *iter, pfn_t *pfnp,
struct xa_state *xas, void **entry, bool pmd)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
const struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = &iter->srcmap;
size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap);
bool write = iter->flags & IOMAP_WRITE;
unsigned long entry_flags = pmd ? DAX_PMD : 0;
int err = 0;
pfn_t pfn;
void *kaddr;
if (!pmd && vmf->cow_page)
return dax_fault_cow_page(vmf, iter);
@ -1352,23 +1545,29 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
if (!write &&
(iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
if (!pmd)
return dax_load_hole(xas, mapping, entry, vmf);
return dax_pmd_load_hole(xas, vmf, iomap, entry);
return dax_load_hole(xas, vmf, iter, entry);
return dax_pmd_load_hole(xas, vmf, iter, entry);
}
if (iomap->type != IOMAP_MAPPED) {
if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
WARN_ON_ONCE(1);
return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
}
err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn);
err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
if (err)
return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
write && !sync);
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
if (sync)
if (write &&
srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
if (err)
return dax_fault_return(err);
}
if (dax_fault_is_synchronous(iter, vmf->vma))
return dax_fault_synchronous_pfnp(pfnp, pfn);
/* insert PMD pfn */
@ -1674,3 +1873,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
struct iomap_iter *it_dest, u64 len, bool *same)
{
const struct iomap *smap = &it_src->iomap;
const struct iomap *dmap = &it_dest->iomap;
loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
void *saddr, *daddr;
int id, ret;
len = min(len, min(smap->length, dmap->length));
if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
*same = true;
return len;
}
if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
*same = false;
return 0;
}
id = dax_read_lock();
ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
&saddr, NULL);
if (ret < 0)
goto out_unlock;
ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
&daddr, NULL);
if (ret < 0)
goto out_unlock;
*same = !memcmp(saddr, daddr, len);
if (!*same)
len = 0;
dax_read_unlock(id);
return len;
out_unlock:
dax_read_unlock(id);
return -EIO;
}
int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
struct inode *dst, loff_t dstoff, loff_t len, bool *same,
const struct iomap_ops *ops)
{
struct iomap_iter src_iter = {
.inode = src,
.pos = srcoff,
.len = len,
.flags = IOMAP_DAX,
};
struct iomap_iter dst_iter = {
.inode = dst,
.pos = dstoff,
.len = len,
.flags = IOMAP_DAX,
};
int ret;
while ((ret = iomap_iter(&src_iter, ops)) > 0) {
while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
dst_iter.processed = dax_range_compare_iter(&src_iter,
&dst_iter, len, same);
}
if (ret <= 0)
src_iter.processed = ret;
}
return ret;
}
int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags,
const struct iomap_ops *ops)
{
return __generic_remap_file_range_prep(file_in, pos_in, file_out,
pos_out, len, remap_flags, ops);
}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
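
A hedged sketch of how a filesystem's ->remap_file_range() preparation might branch into the new helper for DAX inodes; my_remap_range_prep() and my_iomap_ops are illustrative, and only dax_remap_file_range_prep() comes from this diff:

extern const struct iomap_ops my_iomap_ops;	/* hypothetical read iomap ops */

static int my_remap_range_prep(struct file *file_in, loff_t pos_in,
			       struct file *file_out, loff_t pos_out,
			       loff_t *len, unsigned int remap_flags)
{
	/* DAX files compare extents through the dax/iomap helpers above. */
	if (IS_DAX(file_inode(file_in)))
		return dax_remap_file_range_prep(file_in, pos_in, file_out,
						 pos_out, len, remap_flags,
						 &my_iomap_ops);

	return generic_remap_file_range_prep(file_in, pos_in, file_out,
					     pos_out, len, remap_flags);
}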


@ -255,7 +255,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
dif->bdev = bdev;
dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
NULL, NULL);
}
dif->blocks = le32_to_cpu(dis->blocks);
@ -720,7 +721,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
}
sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
&sbi->dax_part_off);
&sbi->dax_part_off,
NULL, NULL);
}
err = erofs_read_superblock(sb);
@ -812,7 +814,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
{
struct erofs_device_info *dif = ptr;
fs_put_dax(dif->dax_dev);
fs_put_dax(dif->dax_dev, NULL);
if (dif->bdev)
blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
erofs_fscache_unregister_cookie(&dif->fscache);
@ -886,7 +888,7 @@ static void erofs_kill_sb(struct super_block *sb)
return;
erofs_free_dev_context(sbi->devs);
fs_put_dax(sbi->dax_dev);
fs_put_dax(sbi->dax_dev, NULL);
erofs_fscache_unregister_cookie(&sbi->s_fscache);
erofs_fscache_unregister_fs(sb);
kfree(sbi->opt.fsid);


@ -282,7 +282,7 @@ static struct shrinker erofs_shrinker_info = {
int __init erofs_init_shrinker(void)
{
return register_shrinker(&erofs_shrinker_info);
return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
}
void erofs_exit_shrinker(void)


@ -28,7 +28,6 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
@ -688,6 +687,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
VMA_ITERATOR(vmi, mm, new_start);
struct vm_area_struct *next;
struct mmu_gather tlb;
BUG_ON(new_start > new_end);
@ -696,7 +697,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* ensure there are no vmas between where we want to go
* and where we are
*/
if (vma != find_vma(mm, new_start))
if (vma != vma_next(&vmi))
return -EFAULT;
/*
@ -715,12 +716,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
next = vma_next(&vmi);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
*/
free_pgd_range(&tlb, new_end, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
next ? next->vm_start : USER_PGTABLES_CEILING);
} else {
/*
* otherwise, clean from old_start; this is done to not touch
@ -729,7 +731,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* for the others its just a little faster.
*/
free_pgd_range(&tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
next ? next->vm_start : USER_PGTABLES_CEILING);
}
tlb_finish_mmu(&tlb);
@ -1030,8 +1032,6 @@ static int exec_mmap(struct mm_struct *mm)
activate_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
task_unlock(tsk);
if (vfork)


@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb)
brelse (sbi->s_sbh);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}
@ -833,7 +833,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_sb_block = sb_block;
sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
NULL, NULL);
spin_lock_init(&sbi->s_lock);
ret = -EINVAL;
@ -1202,7 +1203,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
failed_mount:
brelse(bh);
failed_sbi:
fs_put_dax(sbi->s_daxdev);
fs_put_dax(sbi->s_daxdev, NULL);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
kfree(sbi);


@ -1654,7 +1654,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
err = register_shrinker(&sbi->s_es_shrinker);
err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
sbi->s_sb->s_id);
if (err)
goto err4;


@ -1307,7 +1307,7 @@ static void ext4_put_super(struct super_block *sb)
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
fs_put_dax(sbi->s_daxdev, NULL);
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
@ -4272,7 +4272,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi)
return;
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}
@ -4284,7 +4284,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
if (!sbi)
return NULL;
sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
NULL, NULL);
sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
@ -4296,7 +4297,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
sbi->s_sb = sb;
return sbi;
err_out:
fs_put_dax(sbi->s_daxdev);
fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
return NULL;
}


@ -4616,7 +4616,7 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_sysfs();
if (err)
goto free_garbage_collection_cache;
err = register_shrinker(&f2fs_shrinker_info);
err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker");
if (err)
goto free_sysfs;
err = register_filesystem(&f2fs_fs_type);


@ -244,15 +244,13 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
if (inode_cgwb_enabled(inode)) {
struct cgroup_subsys_state *memcg_css;
if (page) {
memcg_css = mem_cgroup_css_from_page(page);
wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
} else {
/* must pin memcg_css, see wb_get_create() */
/* must pin memcg_css, see wb_get_create() */
if (page)
memcg_css = get_mem_cgroup_css_from_page(page);
else
memcg_css = task_get_css(current, memory_cgrp_id);
wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
css_put(memcg_css);
}
wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
css_put(memcg_css);
}
if (!wb)
@ -869,16 +867,16 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
if (!wbc->wb || wbc->no_cgroup_owner)
return;
css = mem_cgroup_css_from_page(page);
css = get_mem_cgroup_css_from_page(page);
/* dead cgroups shouldn't contribute to inode ownership arbitration */
if (!(css->flags & CSS_ONLINE))
return;
goto out;
id = css->id;
if (id == wbc->wb_id) {
wbc->wb_bytes += bytes;
return;
goto out;
}
if (id == wbc->wb_lcand_id)
@ -891,6 +889,9 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
wbc->wb_tcand_bytes += bytes;
else
wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
out:
css_put(css);
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);


@ -2534,7 +2534,7 @@ int __init gfs2_glock_init(void)
return -ENOMEM;
}
ret = register_shrinker(&glock_shrinker);
ret = register_shrinker(&glock_shrinker, "gfs2-glock");
if (ret) {
destroy_workqueue(gfs2_delete_workqueue);
destroy_workqueue(glock_workqueue);


@ -148,7 +148,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_trans_cachep)
goto fail_cachep8;
error = register_shrinker(&gfs2_qd_shrinker);
error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd");
if (error)
goto fail_shrinker;


@ -1418,7 +1418,8 @@ static journal_t *journal_init_common(struct block_device *bdev,
if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL))
goto err_cleanup;
if (register_shrinker(&journal->j_shrinker)) {
if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) {
percpu_counter_destroy(&journal->j_checkpoint_jh_count);
goto err_cleanup;
}


@ -1217,6 +1217,15 @@ void kfree_link(void *p)
}
EXPORT_SYMBOL(kfree_link);
static const struct address_space_operations anon_aops = {
.dirty_folio = noop_dirty_folio,
};
bool is_anon_inode(struct inode *inode)
{
return inode->i_mapping->a_ops == &anon_aops;
}
struct inode *alloc_anon_inode(struct super_block *s)
{
static const struct address_space_operations anon_aops = {


@ -367,7 +367,7 @@ struct mb_cache *mb_cache_create(int bucket_bits)
cache->c_shrink.count_objects = mb_cache_count;
cache->c_shrink.scan_objects = mb_cache_scan;
cache->c_shrink.seeks = DEFAULT_SEEKS;
if (register_shrinker(&cache->c_shrink)) {
if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) {
kfree(cache->c_hash);
kfree(cache);
goto err_out;


@ -1017,15 +1017,16 @@ int __init nfs4_xattr_cache_init(void)
if (ret)
goto out2;
ret = register_shrinker(&nfs4_xattr_cache_shrinker);
ret = register_shrinker(&nfs4_xattr_cache_shrinker, "nfs-xattr_cache");
if (ret)
goto out1;
ret = register_shrinker(&nfs4_xattr_entry_shrinker);
ret = register_shrinker(&nfs4_xattr_entry_shrinker, "nfs-xattr_entry");
if (ret)
goto out;
ret = register_shrinker(&nfs4_xattr_large_entry_shrinker);
ret = register_shrinker(&nfs4_xattr_large_entry_shrinker,
"nfs-xattr_large_entry");
if (!ret)
return 0;


@ -149,7 +149,7 @@ int __init register_nfs_fs(void)
ret = nfs_register_sysctl();
if (ret < 0)
goto error_2;
ret = register_shrinker(&acl_shrinker);
ret = register_shrinker(&acl_shrinker, "nfs-acl");
if (ret < 0)
goto error_3;
#ifdef CONFIG_NFS_V4_2


@ -670,7 +670,7 @@ nfsd_file_cache_init(void)
goto out_err;
}
ret = register_shrinker(&nfsd_file_shrinker);
ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache");
if (ret) {
pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
goto out_lru;


@ -176,7 +176,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
nn->nfsd_reply_cache_shrinker.seeks = 1;
status = register_shrinker(&nn->nfsd_reply_cache_shrinker);
status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
"nfsd-reply:%s", nn->nfsd_name);
if (status)
goto out_stats_destroy;


@ -453,8 +453,12 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns,
leave:
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
if (handle)
if (handle) {
if (status < 0 && new_fe_bh != NULL)
ocfs2_set_links_count((struct ocfs2_dinode *)
new_fe_bh->b_data, 0);
ocfs2_commit_trans(osb, handle);
}
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@ -598,6 +602,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
leave:
if (status < 0) {
if (*new_fe_bh) {
if (fe)
ocfs2_set_links_count(fe, 0);
brelse(*new_fe_bh);
*new_fe_bh = NULL;
}
@ -634,7 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
if (status < 0) {
if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags &
OCFS2_LOCK_INITIALIZED)) {
u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
@ -2027,8 +2034,12 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns,
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
if (handle)
if (handle) {
if (status < 0 && new_fe_bh != NULL)
ocfs2_set_links_count((struct ocfs2_dinode *)
new_fe_bh->b_data, 0);
ocfs2_commit_trans(osb, handle);
}
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@ -2489,6 +2500,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
}
int ocfs2_create_inode_in_orphan(struct inode *dir,
struct buffer_head **dir_bh,
int mode,
struct inode **new_inode)
{
@ -2597,13 +2609,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
brelse(new_di_bh);
if (!status)
*new_inode = inode;
ocfs2_free_dir_lookup_result(&orphan_insert);
ocfs2_inode_unlock(dir, 1);
brelse(parent_di_bh);
if (!status) {
*new_inode = inode;
*dir_bh = parent_di_bh;
} else {
ocfs2_inode_unlock(dir, 1);
brelse(parent_di_bh);
}
return status;
}
@ -2760,11 +2775,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
}
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct buffer_head *dir_bh,
struct inode *inode,
struct dentry *dentry)
{
int status = 0;
struct buffer_head *parent_di_bh = NULL;
handle_t *handle = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dinode *dir_di, *di;
@ -2778,14 +2793,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
return status;
}
dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
if (!dir_di->i_links_count) {
/* can't make a file in a deleted directory. */
status = -ENOENT;
@ -2798,7 +2806,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
goto leave;
/* get a spot inside the dir. */
status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
dentry->d_name.name,
dentry->d_name.len, &lookup);
if (status < 0) {
@ -2862,7 +2870,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
ocfs2_journal_dirty(handle, di_bh);
status = ocfs2_add_entry(handle, dentry, inode,
OCFS2_I(inode)->ip_blkno, parent_di_bh,
OCFS2_I(inode)->ip_blkno, dir_bh,
&lookup);
if (status < 0) {
mlog_errno(status);
@ -2886,10 +2894,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
iput(orphan_dir_inode);
leave:
ocfs2_inode_unlock(dir, 1);
brelse(di_bh);
brelse(parent_di_bh);
brelse(orphan_dir_bh);
ocfs2_free_dir_lookup_result(&lookup);


@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
struct buffer_head *orphan_dir_bh,
bool dio);
int ocfs2_create_inode_in_orphan(struct inode *dir,
struct buffer_head **dir_bh,
int mode,
struct inode **new_inode);
int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
struct inode *inode, struct buffer_head *di_bh,
int update_isize, loff_t end);
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct buffer_head *dir_bh,
struct inode *new_inode,
struct dentry *new_dentry);


@ -4222,7 +4222,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
{
int error, had_lock;
struct inode *inode = d_inode(old_dentry);
struct buffer_head *old_bh = NULL;
struct buffer_head *old_bh = NULL, *dir_bh = NULL;
struct inode *new_orphan_inode = NULL;
struct ocfs2_lock_holder oh;
@ -4230,7 +4230,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
return -EOPNOTSUPP;
error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
&new_orphan_inode);
if (error) {
mlog_errno(error);
@ -4276,13 +4276,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
error = ocfs2_init_security_and_acl(dir, dir_bh,
new_orphan_inode,
&new_dentry->d_name);
if (error)
mlog_errno(error);
}
if (!error) {
error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
new_orphan_inode,
new_dentry);
if (error)
mlog_errno(error);
@ -4300,6 +4302,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
iput(new_orphan_inode);
}
if (dir_bh) {
ocfs2_inode_unlock(dir, 1);
brelse(dir_bh);
}
return error;
}


@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
/*
* Initialize security and acl for an already created inode.
* Used when reflinking a non-preserve-security file.
*
* It uses common api like ocfs2_xattr_set, so the caller
* must not hold any lock except i_rwsem.
*/
int ocfs2_init_security_and_acl(struct inode *dir,
struct buffer_head *dir_bh,
struct inode *inode,
const struct qstr *qstr)
{
int ret = 0;
struct buffer_head *dir_bh = NULL;
ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
if (ret) {
@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
goto leave;
}
ret = ocfs2_inode_lock(dir, &dir_bh, 0);
if (ret) {
mlog_errno(ret);
goto leave;
}
ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
if (ret)
mlog_errno(ret);
ocfs2_inode_unlock(dir, 0);
brelse(dir_bh);
leave:
return ret;
}


@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
struct buffer_head *new_bh,
bool preserve_security);
int ocfs2_init_security_and_acl(struct inode *dir,
struct buffer_head *dir_bh,
struct inode *inode,
const struct qstr *qstr);
#endif /* OCFS2_XATTR_H */


@ -69,7 +69,6 @@
#include <linux/sched/cputime.h>
#include <linux/proc_fs.h>
#include <linux/ioport.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>


@ -2322,6 +2322,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
MA_STATE(mas, NULL, 0, 0);
genradix_init(&fa);
@ -2349,6 +2350,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
}
nr_files = 0;
mas.tree = &mm->mm_mt;
/*
* We need two passes here:
@ -2360,7 +2362,8 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
* routine might require mmap_lock taken in might_fault().
*/
for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
pos = 2;
mas_for_each(&mas, vma, ULONG_MAX) {
if (!vma->vm_file)
continue;
if (++pos <= ctx->pos)


@ -23,6 +23,7 @@ static int seq_show(struct seq_file *m, void *v)
struct files_struct *files = NULL;
int f_flags = 0, ret = -ENOENT;
struct file *file = NULL;
struct inode *inode = NULL;
struct task_struct *task;
task = get_proc_task(m->private);
@ -54,10 +55,19 @@ static int seq_show(struct seq_file *m, void *v)
if (ret)
return ret;
seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n",
(long long)file->f_pos, f_flags,
real_mount(file->f_path.mnt)->mnt_id,
file_inode(file)->i_ino);
inode = file_inode(file);
seq_printf(m, "pos:\t%lli\n", (long long)file->f_pos);
seq_printf(m, "flags:\t0%o\n", f_flags);
seq_printf(m, "mnt_id:\t%i\n", real_mount(file->f_path.mnt)->mnt_id);
seq_printf(m, "ino:\t%lu\n", inode->i_ino);
seq_printf(m, "size:\t%lli\n", (long long)inode->i_size);
if (is_anon_inode(inode)) {
seq_puts(m, "path:\t");
seq_file_path(m, file, "\n");
seq_putc(m, '\n');
}
/* show_fd_locks() never dereferences files so a stale value is safe */
show_fd_locks(m, file, files);
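
With the seq_printf() calls split up as above, /proc/<pid>/fdinfo/<fd> gains a size: line and, for anon inodes, a path: line, so the output looks roughly like the following (all values illustrative):

pos:	0
flags:	02
mnt_id:	15
ino:	1050
size:	0
path:	anon_inode:[eventfd]

followed by the usual lock lines from show_fd_locks().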


@ -26,8 +26,6 @@
#include <linux/mount.h>
#include <linux/bug.h>
#include <linux/uaccess.h>
#include "internal.h"
static void proc_evict_inode(struct inode *inode)


@ -285,7 +285,7 @@ struct proc_maps_private {
struct task_struct *task;
struct mm_struct *mm;
#ifdef CONFIG_MMU
struct vm_area_struct *tail_vma;
struct vma_iterator iter;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;


@ -15,7 +15,6 @@
#include <linux/fs.h>
#include <linux/syslog.h>
#include <linux/uaccess.h>
#include <asm/io.h>
extern wait_queue_head_t log_wait;


@ -21,7 +21,6 @@
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/div64.h>
#include "internal.h"


@ -8,9 +8,6 @@
*
* proc net directory handling functions
*/
#include <linux/uaccess.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>


@ -4,8 +4,6 @@
*
* Copyright 1997, Theodore Ts'o
*/
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>


@ -6,9 +6,6 @@
*
* proc root directory handling functions
*/
#include <linux/uaccess.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>


@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/vmacache.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
@ -124,12 +123,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
loff_t *ppos)
{
struct vm_area_struct *vma = vma_next(&priv->iter);
if (vma) {
*ppos = vma->vm_start;
} else {
*ppos = -2UL;
vma = get_gate_vma(priv->mm);
}
return vma;
}
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
unsigned long last_addr = *ppos;
struct mm_struct *mm;
struct vm_area_struct *vma;
/* See m_next(). Zero at the start or after lseek. */
if (last_addr == -1UL)
@ -153,31 +166,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
return ERR_PTR(-EINTR);
}
vma_iter_init(&priv->iter, mm, last_addr);
hold_task_mempolicy(priv);
priv->tail_vma = get_gate_vma(mm);
if (last_addr == -2UL)
return get_gate_vma(mm);
vma = find_vma(mm, last_addr);
if (vma)
return vma;
return priv->tail_vma;
return proc_get_vma(priv, ppos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
struct vm_area_struct *next, *vma = v;
if (vma == priv->tail_vma)
next = NULL;
else if (vma->vm_next)
next = vma->vm_next;
else
next = priv->tail_vma;
*ppos = next ? next->vm_start : -1UL;
return next;
if (*ppos == -2UL) {
*ppos = -1UL;
return NULL;
}
return proc_get_vma(m->private, ppos);
}
static void m_stop(struct seq_file *m, void *v)
@ -406,6 +409,7 @@ struct mem_size_stats {
u64 pss_anon;
u64 pss_file;
u64 pss_shmem;
u64 pss_dirty;
u64 pss_locked;
u64 swap_pss;
};
@ -427,6 +431,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
mss->pss_locked += pss;
if (dirty || PageDirty(page)) {
mss->pss_dirty += pss;
if (private)
mss->private_dirty += size;
else
@ -808,6 +813,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
{
SEQ_PUT_DEC("Rss: ", mss->resident);
SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
if (rollup_mode) {
/*
* These are meaningful only for smaps_rollup, otherwise two of
@ -860,7 +866,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %d\n",
transparent_hugepage_active(vma));
hugepage_vma_check(vma, vma->vm_flags, true, false));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@ -873,16 +879,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
struct mem_size_stats mss;
struct mm_struct *mm;
struct mm_struct *mm = priv->mm;
struct vm_area_struct *vma;
unsigned long last_vma_end = 0;
unsigned long vma_start = 0, last_vma_end = 0;
int ret = 0;
MA_STATE(mas, &mm->mm_mt, 0, 0);
priv->task = get_proc_task(priv->inode);
if (!priv->task)
return -ESRCH;
mm = priv->mm;
if (!mm || !mmget_not_zero(mm)) {
ret = -ESRCH;
goto out_put_task;
@ -895,8 +901,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
goto out_put_mm;
hold_task_mempolicy(priv);
vma = mas_find(&mas, 0);
for (vma = priv->mm->mmap; vma;) {
if (unlikely(!vma))
goto empty_set;
vma_start = vma->vm_start;
do {
smap_gather_stats(vma, &mss, 0);
last_vma_end = vma->vm_end;
@ -905,6 +916,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
* access it for write request.
*/
if (mmap_lock_is_contended(mm)) {
mas_pause(&mas);
mmap_read_unlock(mm);
ret = mmap_read_lock_killable(mm);
if (ret) {
@ -948,7 +960,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
* contains last_vma_end.
* Iterate VMA' from last_vma_end.
*/
vma = find_vma(mm, last_vma_end - 1);
vma = mas_find(&mas, ULONG_MAX);
/* Case 3 above */
if (!vma)
break;
@ -962,11 +974,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
smap_gather_stats(vma, &mss, last_vma_end);
}
/* Case 2 above */
vma = vma->vm_next;
}
} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
show_vma_header_prefix(m, priv->mm->mmap->vm_start,
last_vma_end, 0, 0, 0, 0);
empty_set:
show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
seq_pad(m, ' ');
seq_puts(m, "[rollup]\n");
@ -1259,6 +1270,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
MA_STATE(mas, &mm->mm_mt, 0, 0);
struct mmu_notifier_range range;
struct clear_refs_private cp = {
.type = type,
@ -1278,7 +1290,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
}
if (type == CLEAR_REFS_SOFT_DIRTY) {
for (vma = mm->mmap; vma; vma = vma->vm_next) {
mas_for_each(&mas, vma, ULONG_MAX) {
if (!(vma->vm_flags & VM_SOFTDIRTY))
continue;
vma->vm_flags &= ~VM_SOFTDIRTY;
@ -1290,8 +1302,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
&cp);
walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
if (type == CLEAR_REFS_SOFT_DIRTY) {
mmu_notifier_invalidate_range_end(&range);
flush_tlb_mm(mm);
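clear_refs_write() gets the same conversion: the vm_next loop becomes mas_for_each(), and walk_page_range() now covers the full address space by passing -1 (i.e. ULONG_MAX) as the end, since mm->highest_vm_end goes away with the maple tree. A small sketch of the mas_for_each() flag-clearing pattern follows; the helper name is made up and the locking is simplified to a plain mmap_write_lock() (which modifying vm_flags requires), so treat it as an illustration rather than the real clear_refs path:

/* Sketch: clear VM_SOFTDIRTY on every VMA via mas_for_each(). */
#include <linux/maple_tree.h>
#include <linux/mm.h>

static void clear_soft_dirty_flags(struct mm_struct *mm)
{
        MA_STATE(mas, &mm->mm_mt, 0, 0);
        struct vm_area_struct *vma;

        mmap_write_lock(mm);
        mas_for_each(&mas, vma, ULONG_MAX) {
                if (!(vma->vm_flags & VM_SOFTDIRTY))
                        continue;
                vma->vm_flags &= ~VM_SOFTDIRTY;
                vma_set_page_prot(vma);
        }
        mmap_write_unlock(mm);
}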
@ -1792,7 +1803,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return NULL;
page = vm_normal_page(vma, addr, pte);
if (!page)
if (!page || is_zone_device_page(page))
return NULL;
if (PageReserved(page))

View File

@ -20,15 +20,13 @@
*/
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
struct rb_node *p;
unsigned long bytes = 0, sbytes = 0, slack = 0, size;
mmap_read_lock(mm);
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
vma = rb_entry(p, struct vm_area_struct, vm_rb);
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
bytes += kobjsize(vma);
region = vma->vm_region;
@ -82,15 +80,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
unsigned long task_vsize(struct mm_struct *mm)
{
VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct rb_node *p;
unsigned long vsize = 0;
mmap_read_lock(mm);
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
vma = rb_entry(p, struct vm_area_struct, vm_rb);
for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
}
mmap_read_unlock(mm);
return vsize;
}
@ -99,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
struct rb_node *p;
unsigned long size = kobjsize(mm);
mmap_read_lock(mm);
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
vma = rb_entry(p, struct vm_area_struct, vm_rb);
for_each_vma(vmi, vma) {
size += kobjsize(vma);
region = vma->vm_region;
if (region) {
@ -190,17 +185,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
*/
static int show_map(struct seq_file *m, void *_p)
{
struct rb_node *p = _p;
return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
return nommu_vma_show(m, _p);
}
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
struct mm_struct *mm;
struct rb_node *p;
loff_t n = *pos;
struct vm_area_struct *vma;
unsigned long addr = *pos;
/* See m_next(). Zero at the start or after lseek. */
if (addr == -1UL)
return NULL;
/* pin the task and mm whilst we play with them */
priv->task = get_proc_task(priv->inode);
@ -216,10 +213,10 @@ static void *m_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-EINTR);
}
/* start from the Nth VMA */
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
if (n-- == 0)
return p;
/* start the next element from addr */
vma = find_vma(mm, addr);
if (vma)
return vma;
mmap_read_unlock(mm);
mmput(mm);
@ -242,10 +239,10 @@ static void m_stop(struct seq_file *m, void *_vml)
static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
{
struct rb_node *p = _p;
struct vm_area_struct *vma = _p;
(*pos)++;
return p ? rb_next(p) : NULL;
*pos = vma->vm_end;
return find_vma(vma->vm_mm, vma->vm_end);
}
static const struct seq_operations proc_pid_maps_ops = {

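The nommu side drops its mm_rb walks in favour of the VMA iterator: task_mem(), task_vsize() and task_statm() now use VMA_ITERATOR()/for_each_vma(), and the maps seq_file positions itself by address (m_next() stores vma->vm_end in *pos and the walk resumes with find_vma()) instead of counting VMAs from the rb-tree. A minimal sketch of the for_each_vma() pattern, mirroring task_vsize() above:

/* Sketch: sum VMA sizes with the VMA iterator instead of an rb-tree walk. */
#include <linux/mm.h>
#include <linux/mmap_lock.h>

static unsigned long total_vma_bytes(struct mm_struct *mm)
{
        VMA_ITERATOR(vmi, mm, 0);
        struct vm_area_struct *vma;
        unsigned long bytes = 0;

        mmap_read_lock(mm);
        for_each_vma(vmi, vma)
                bytes += vma->vm_end - vma->vm_start;
        mmap_read_unlock(mm);

        return bytes;
}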
View File

@ -25,7 +25,6 @@
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
#include <linux/uio.h>
#include <linux/cc_platform.h>
#include <asm/io.h>

View File

@ -3002,7 +3002,7 @@ static int __init dquot_init(void)
pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
if (register_shrinker(&dqcache_shrinker))
if (register_shrinker(&dqcache_shrinker, "dquota-cache"))
panic("Cannot register dquot shrinker");
return 0;
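register_shrinker() now takes a name along with the shrinker itself, which is how the dquot cache becomes identifiable as "dquota-cache" (presumably consumed by the shrinker debugging work elsewhere in this merge). A minimal sketch of a caller against the updated signature; the demo_* names and the do-nothing callbacks are illustrative stubs, not kernel code:

/* Illustrative stub: registering a named shrinker with the new signature. */
#include <linux/module.h>
#include <linux/shrinker.h>

static unsigned long demo_count_objects(struct shrinker *shrink,
                                        struct shrink_control *sc)
{
        return 0;               /* nothing cached in this stub */
}

static unsigned long demo_scan_objects(struct shrinker *shrink,
                                       struct shrink_control *sc)
{
        return SHRINK_STOP;     /* never reached while count returns 0 */
}

static struct shrinker demo_shrinker = {
        .count_objects  = demo_count_objects,
        .scan_objects   = demo_scan_objects,
        .seeks          = DEFAULT_SEEKS,
};

static int __init demo_init(void)
{
        /* The string names the shrinker, as "dquota-cache" does above. */
        return register_shrinker(&demo_shrinker, "demo-cache");
}

static void __exit demo_exit(void)
{
        unregister_shrinker(&demo_shrinker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");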

View File

@ -14,6 +14,7 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/dax.h>
#include "internal.h"
#include <linux/uaccess.h>
@ -262,9 +263,11 @@ static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff,
* If there's an error, then the usual negative error code is returned.
* Otherwise returns 0 with *len set to the request length.
*/
int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags)
int
__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags,
const struct iomap_ops *dax_read_ops)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@ -324,8 +327,18 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
ret = vfs_dedupe_file_range_compare(file_in, pos_in,
file_out, pos_out, *len, &is_same);
if (*len == 0)
return 0;
if (!IS_DAX(inode_in))
ret = vfs_dedupe_file_range_compare(file_in, pos_in,
file_out, pos_out, *len, &is_same);
else if (dax_read_ops)
ret = dax_dedupe_file_range_compare(inode_in, pos_in,
inode_out, pos_out, *len, &is_same,
dax_read_ops);
else
return -EINVAL;
if (ret)
return ret;
if (!is_same)
@ -343,6 +356,14 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
return ret;
}
int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags)
{
return __generic_remap_file_range_prep(file_in, pos_in, file_out,
pos_out, len, remap_flags, NULL);
}
EXPORT_SYMBOL(generic_remap_file_range_prep);
loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,

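The remap-prep path is split so that the new __generic_remap_file_range_prep() accepts an optional const struct iomap_ops *dax_read_ops: on DAX inodes the dedupe comparison goes through dax_dedupe_file_range_compare() rather than the page-cache based vfs_dedupe_file_range_compare(), a NULL argument keeps the old behaviour, and a DAX dedupe without ops fails with -EINVAL. Below is a sketch of how a DAX-capable filesystem might call the new helper, assuming its prototype is visible through linux/fs.h as part of this series; the myfs_* identifiers are hypothetical placeholders, not from this diff:

/* Hypothetical caller: a DAX-aware remap_file_range() prep step.
 * myfs_read_iomap_ops stands in for the filesystem's real iomap ops. */
#include <linux/fs.h>
#include <linux/iomap.h>

extern const struct iomap_ops myfs_read_iomap_ops;      /* assumed to exist */

static int myfs_remap_prep(struct file *file_in, loff_t pos_in,
                           struct file *file_out, loff_t pos_out,
                           loff_t *len, unsigned int remap_flags)
{
        /* Pass the read iomap ops so DAX dedupe can compare file contents;
         * a non-DAX filesystem would keep calling
         * generic_remap_file_range_prep(), which passes NULL. */
        return __generic_remap_file_range_prep(file_in, pos_in,
                                               file_out, pos_out,
                                               len, remap_flags,
                                               &myfs_read_iomap_ops);
}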
View File

@ -5,9 +5,9 @@
obj-$(CONFIG_SQUASHFS) += squashfs.o
squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
squashfs-y += namei.o super.o symlink.o decompressor.o
squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o
squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o

Some files were not shown because too many files have changed in this diff.