Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - A few misc subsystems: kthread, scripts, ntfs, ocfs2, block, and vfs

 - Most of the MM patches which precede the patches in Willy's tree: kasan,
   pagecache, gup, swap, shmem, memcg, selftests, pagemap, mremap,
   sparsemem, vmalloc, pagealloc, memory-failure, mlock, hugetlb,
   userfaultfd, vmscan, compaction, mempolicy, oom-kill, migration, thp,
   cma, autonuma, psi, ksm, page-poison, madvise, memory-hotplug, rmap,
   zswap, uaccess, ioremap, highmem, cleanups, kfence, hmm, and damon.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (227 commits)
  mm/damon/sysfs: remove repeat container_of() in damon_sysfs_kdamond_release()
  Docs/ABI/testing: add DAMON sysfs interface ABI document
  Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface
  selftests/damon: add a test for DAMON sysfs interface
  mm/damon/sysfs: support DAMOS stats
  mm/damon/sysfs: support DAMOS watermarks
  mm/damon/sysfs: support schemes prioritization
  mm/damon/sysfs: support DAMOS quotas
  mm/damon/sysfs: support DAMON-based Operation Schemes
  mm/damon/sysfs: support the physical address space monitoring
  mm/damon/sysfs: link DAMON for virtual address spaces monitoring
  mm/damon: implement a minimal stub for sysfs-based DAMON interface
  mm/damon/core: add number of each enum type values
  mm/damon/core: allow non-exclusive DAMON start/stop
  Docs/damon: update outdated term 'regions update interval'
  Docs/vm/damon/design: update DAMON-Idle Page Tracking interference handling
  Docs/vm/damon: call low level monitoring primitives the operations
  mm/damon: remove unnecessary CONFIG_DAMON option
  mm/damon/paddr,vaddr: remove damon_{p,v}a_{target_valid,set_operations}()
  mm/damon/dbgfs-test: fix is_target_id() change
  ...
This commit is contained in:
commit 3bf03b9a08

 Documentation/ABI/testing/sysfs-kernel-mm-damon | 274 ++++++++++++++ (new file)

@@ -0,0 +1,274 @@
|
||||
What: /sys/kernel/mm/damon/
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Interface for Data Access MONitoring (DAMON). Contains files
|
||||
for controlling DAMON. For more details on DAMON itself,
|
||||
please refer to Documentation/admin-guide/mm/damon/index.rst.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Interface for privileged users of DAMON. Contains files for
|
||||
controlling DAMON that are aimed to be used by privileged users.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/nr_kdamonds
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a number 'N' to this file creates the number of
|
||||
directories for controlling each DAMON worker thread (kdamond)
|
||||
named '0' to 'N-1' under the kdamonds/ directory.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/state
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing 'on' or 'off' to this file makes the kdamond start or
stop, respectively. Reading the file returns the keywords
|
||||
based on the current status. Writing 'update_schemes_stats' to
|
||||
the file updates contents of schemes stats files of the
|
||||
kdamond.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/pid
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the pid of the kdamond if it is
|
||||
running.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/nr_contexts
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a number 'N' to this file creates the number of
|
||||
directories for controlling each DAMON context named '0' to
|
||||
'N-1' under the contexts/ directory.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/operations
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a keyword for a monitoring operations set ('vaddr' for
|
||||
virtual address spaces monitoring, and 'paddr' for the physical
|
||||
address space monitoring) to this file makes the context use
|
||||
the operations set. Reading the file returns the keyword for
|
||||
the operations set the context is set to use.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/sample_us
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a value to this file sets the sampling interval of the
|
||||
DAMON context in microseconds as the value. Reading this file
|
||||
returns the value.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/aggr_us
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a value to this file sets the aggregation interval of
|
||||
the DAMON context in microseconds as the value. Reading this
|
||||
file returns the value.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/update_us
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a value to this file sets the update interval of the
|
||||
DAMON context in microseconds as the value. Reading this file
|
||||
returns the value.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/nr_regions/min
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a value to this file sets the minimum number of
|
||||
monitoring regions of the DAMON context as the value. Reading
|
||||
this file returns the value.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/nr_regions/max
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a value to this file sets the maximum number of
|
||||
monitoring regions of the DAMON context as the value. Reading
|
||||
this file returns the value.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/nr_targets
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a number 'N' to this file creates the number of
|
||||
directories for controlling each DAMON target of the context
|
||||
named '0' to 'N-1' under the targets/ directory.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/<T>/pid_target
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the pid of
|
||||
the target process if the context is for virtual address spaces
|
||||
monitoring, respectively.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/<T>/regions/nr_regions
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a number 'N' to this file creates the number of
|
||||
directories for setting each DAMON target memory region of the
|
||||
context named '0' to 'N-1' under the regions/ directory. In
|
||||
case of the virtual address space monitoring, DAMON
|
||||
automatically sets the target memory region based on the target
|
||||
processes' mappings.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/<T>/regions/<R>/start
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the start
|
||||
address of the monitoring region.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/<T>/regions/<R>/end
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the end
|
||||
address of the monitoring region.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/nr_schemes
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a number 'N' to this file creates the number of
|
||||
directories for controlling each DAMON-based operation scheme
|
||||
of the context named '0' to 'N-1' under the schemes/ directory.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/action
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the action
|
||||
of the scheme.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/sz/min
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the minimum
|
||||
size of the scheme's target regions in bytes.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/sz/max
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the maximum
|
||||
size of the scheme's target regions in bytes.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/nr_accesses/min
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the minimum
|
||||
'nr_accesses' of the scheme's target regions.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/nr_accesses/max
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the maximum
|
||||
'nr_accesses' of the scheme's target regions.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/age/min
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the minimum
|
||||
'age' of the scheme's target regions.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/age/max
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the maximum
|
||||
'age' of the scheme's target regions.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/ms
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the time
|
||||
quota of the scheme in milliseconds.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/bytes
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the size
|
||||
quota of the scheme in bytes.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/reset_interval_ms
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the quotas
|
||||
charge reset interval of the scheme in milliseconds.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/weights/sz_permil
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the
|
||||
under-quota limit regions prioritization weight for 'size' in
|
||||
permil.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/weights/nr_accesses_permil
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the
|
||||
under-quota limit regions prioritization weight for
|
||||
'nr_accesses' in permil.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/weights/age_permil
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the
|
||||
under-quota limit regions prioritization weight for 'age' in
|
||||
permil.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/watermarks/metric
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the metric
|
||||
of the watermarks for the scheme. The writable/readable
|
||||
keywords for this file are 'none' for disabling the watermarks
|
||||
feature, or 'free_mem_rate' for the system's global free memory
|
||||
rate in permil.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/watermarks/interval_us
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the metric
|
||||
check interval of the watermarks for the scheme in
|
||||
microseconds.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/watermarks/high
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the high
|
||||
watermark of the scheme in permil.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/watermarks/mid
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the mid
|
||||
watermark of the scheme in permil.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/watermarks/low
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the low
|
||||
watermark of the scheme in permil.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/nr_tried
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the number of regions to which the
action of the scheme has tried to be applied.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/sz_tried
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the total size of the regions to which
the action of the scheme has tried to be applied, in bytes.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/nr_applied
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the number of regions to which the
action of the scheme has successfully been applied.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/sz_applied
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the total size of the regions to which
the action of the scheme has successfully been applied, in bytes.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/qt_exceeds
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the number of times the quotas of the
scheme have been exceeded.
|
@ -64,6 +64,7 @@ Brief summary of control files.
|
||||
threads
|
||||
cgroup.procs show list of processes
|
||||
cgroup.event_control an interface for event_fd()
|
||||
This knob is not available on CONFIG_PREEMPT_RT systems.
|
||||
memory.usage_in_bytes show current usage for memory
|
||||
(See 5.5 for details)
|
||||
memory.memsw.usage_in_bytes show current usage for memory+Swap
|
||||
@ -75,6 +76,7 @@ Brief summary of control files.
|
||||
memory.max_usage_in_bytes show max memory usage recorded
|
||||
memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded
|
||||
memory.soft_limit_in_bytes set/show soft limit of memory usage
|
||||
This knob is not available on CONFIG_PREEMPT_RT systems.
|
||||
memory.stat show various statistics
|
||||
memory.use_hierarchy set/show hierarchical account enabled
|
||||
This knob is deprecated and shouldn't be
|
||||
|
@ -1301,6 +1301,11 @@ PAGE_SIZE multiple when read back.
|
||||
Amount of memory used to cache filesystem data,
|
||||
including tmpfs and shared memory.
|
||||
|
||||
kernel (npn)
|
||||
Amount of total kernel memory, including
|
||||
(kernel_stack, pagetables, percpu, vmalloc, slab) in
|
||||
addition to other kernel memory use cases.
|
||||
|
||||
kernel_stack
|
||||
Amount of memory allocated to kernel stacks.
|
||||
|
||||
|
@ -1649,7 +1649,7 @@
|
||||
[KNL] Requires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
|
||||
enabled.
|
||||
Allows heavy hugetlb users to free up some more
|
||||
memory (6 * PAGE_SIZE for each 2MB hugetlb page).
|
||||
memory (7 * PAGE_SIZE for each 2MB hugetlb page).
|
||||
Format: { on | off (default) }
|
||||
|
||||
on: enable the feature
|
||||
|
@ -4,7 +4,7 @@
|
||||
Detailed Usages
|
||||
===============
|
||||
|
||||
DAMON provides below three interfaces for different users.
|
||||
DAMON provides below interfaces for different users.
|
||||
|
||||
- *DAMON user space tool.*
|
||||
`This <https://github.com/awslabs/damo>`_ is for privileged people such as
|
||||
@ -14,17 +14,21 @@ DAMON provides below three interfaces for different users.
|
||||
virtual and physical address spaces monitoring. For more detail, please
|
||||
refer to its `usage document
|
||||
<https://github.com/awslabs/damo/blob/next/USAGE.md>`_.
|
||||
- *debugfs interface.*
|
||||
:ref:`This <debugfs_interface>` is for privileged user space programmers who
|
||||
- *sysfs interface.*
|
||||
:ref:`This <sysfs_interface>` is for privileged user space programmers who
|
||||
want more optimized use of DAMON. Using this, users can use DAMON’s major
|
||||
features by reading from and writing to special debugfs files. Therefore,
|
||||
you can write and use your personalized DAMON debugfs wrapper programs that
|
||||
reads/writes the debugfs files instead of you. The `DAMON user space tool
|
||||
features by reading from and writing to special sysfs files. Therefore,
|
||||
you can write and use your personalized DAMON sysfs wrapper programs that
|
||||
reads/writes the sysfs files instead of you. The `DAMON user space tool
|
||||
<https://github.com/awslabs/damo>`_ is one example of such programs. It
|
||||
supports both virtual and physical address spaces monitoring. Note that this
|
||||
interface provides only simple :ref:`statistics <damos_stats>` for the
|
||||
monitoring results. For detailed monitoring results, DAMON provides a
|
||||
:ref:`tracepoint <tracepoint>`.
|
||||
- *debugfs interface.*
|
||||
:ref:`This <debugfs_interface>` is almost identical to :ref:`sysfs interface
|
||||
<sysfs_interface>`. This will be removed after next LTS kernel is released,
|
||||
so users should move to the :ref:`sysfs interface <sysfs_interface>`.
|
||||
- *Kernel Space Programming Interface.*
|
||||
:doc:`This </vm/damon/api>` is for kernel space programmers. Using this,
|
||||
users can utilize every feature of DAMON most flexibly and efficiently by
|
||||
@ -32,6 +36,340 @@ DAMON provides below three interfaces for different users.
|
||||
DAMON for various address spaces. For detail, please refer to the interface
|
||||
:doc:`document </vm/damon/api>`.
|
||||
|
||||
.. _sysfs_interface:
|
||||
|
||||
sysfs Interface
|
||||
===============
|
||||
|
||||
DAMON sysfs interface is built when ``CONFIG_DAMON_SYSFS`` is defined. It
|
||||
creates multiple directories and files under its sysfs directory,
|
||||
``<sysfs>/kernel/mm/damon/``. You can control DAMON by writing to and reading
|
||||
from the files under the directory.
|
||||
|
||||
For a short example, users can monitor the virtual address space of a given
|
||||
workload as below. ::
|
||||
|
||||
# cd /sys/kernel/mm/damon/admin/
|
||||
# echo 1 > kdamonds/nr && echo 1 > kdamonds/0/contexts/nr
|
||||
# echo vaddr > kdamonds/0/contexts/0/operations
|
||||
# echo 1 > kdamonds/0/contexts/0/targets/nr
|
||||
# echo $(pidof <workload>) > kdamonds/0/contexts/0/targets/0/pid
|
||||
# echo on > kdamonds/0/state
|
||||
|
||||
Files Hierarchy
|
||||
---------------
|
||||
|
||||
The files hierarchy of DAMON sysfs interface is shown below. In the below
|
||||
figure, parents-children relations are represented with indentations, each
|
||||
directory has a ``/`` suffix, and files in each directory are separated by
|
||||
comma (","). ::
|
||||
|
||||
/sys/kernel/mm/damon/admin
|
||||
│ kdamonds/nr_kdamonds
|
||||
│ │ 0/state,pid
|
||||
│ │ │ contexts/nr_contexts
|
||||
│ │ │ │ 0/operations
|
||||
│ │ │ │ │ monitoring_attrs/
|
||||
│ │ │ │ │ │ intervals/sample_us,aggr_us,update_us
|
||||
│ │ │ │ │ │ nr_regions/min,max
|
||||
│ │ │ │ │ targets/nr_targets
|
||||
│ │ │ │ │ │ 0/pid_target
|
||||
│ │ │ │ │ │ │ regions/nr_regions
|
||||
│ │ │ │ │ │ │ │ 0/start,end
|
||||
│ │ │ │ │ │ │ │ ...
|
||||
│ │ │ │ │ │ ...
|
||||
│ │ │ │ │ schemes/nr_schemes
|
||||
│ │ │ │ │ │ 0/action
|
||||
│ │ │ │ │ │ │ access_pattern/
|
||||
│ │ │ │ │ │ │ │ sz/min,max
|
||||
│ │ │ │ │ │ │ │ nr_accesses/min,max
|
||||
│ │ │ │ │ │ │ │ age/min,max
|
||||
│ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms
|
||||
│ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil
|
||||
│ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low
|
||||
│ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
|
||||
│ │ │ │ │ │ ...
|
||||
│ │ │ │ ...
|
||||
│ │ ...
|
||||
|
||||
Root
|
||||
----
|
||||
|
||||
The root of the DAMON sysfs interface is ``<sysfs>/kernel/mm/damon/``, and it
|
||||
has one directory named ``admin``. The directory contains the files for
|
||||
privileged user space programs' control of DAMON. User space tools or daemons
|
||||
having the root permission could use this directory.
|
||||
|
||||
kdamonds/
|
||||
---------
|
||||
|
||||
The monitoring-related information including request specifications and results
|
||||
are called DAMON context. DAMON executes each context with a kernel thread
|
||||
called kdamond, and multiple kdamonds could run in parallel.
|
||||
|
||||
Under the ``admin`` directory, one directory, ``kdamonds``, which has files for
|
||||
controlling the kdamonds exist. In the beginning, this directory has only one
|
||||
file, ``nr_kdamonds``. Writing a number (``N``) to the file creates the number
|
||||
of child directories named ``0`` to ``N-1``. Each directory represents each
|
||||
kdamond.
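For example, the below minimal sketch (assuming a fresh state in which no
kdamond directory has been created yet; the number ``2`` is only illustrative)
populates two kdamond directories. ::

# cd /sys/kernel/mm/damon/admin/kdamonds
# echo 2 > nr_kdamonds
# ls
0  1  nr_kdamonds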
|
||||
|
||||
kdamonds/<N>/
|
||||
-------------
|
||||
|
||||
In each kdamond directory, two files (``state`` and ``pid``) and one directory
|
||||
(``contexts``) exist.
|
||||
|
||||
Reading ``state`` returns ``on`` if the kdamond is currently running, or
|
||||
``off`` if it is not running. Writing ``on`` or ``off`` makes the kdamond be
|
||||
in the state. Writing ``update_schemes_stats`` to ``state`` file updates the
|
||||
contents of stats files for each DAMON-based operation scheme of the kdamond.
|
||||
For details of the stats, please refer to :ref:`stats section
|
||||
<sysfs_schemes_stats>`.
|
||||
|
||||
If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread.
|
||||
|
||||
``contexts`` directory contains files for controlling the monitoring contexts
|
||||
that this kdamond will execute.
|
||||
|
||||
kdamonds/<N>/contexts/
|
||||
----------------------
|
||||
|
||||
In the beginning, this directory has only one file, ``nr_contexts``. Writing a
|
||||
number (``N``) to the file creates the number of child directories named as
|
||||
``0`` to ``N-1``. Each directory represents each monitoring context. At the
|
||||
moment, only one context per kdamond is supported, so only ``0`` or ``1`` can
|
||||
be written to the file.
|
||||
|
||||
contexts/<N>/
|
||||
-------------
|
||||
|
||||
In each context directory, one file (``operations``) and three directories
|
||||
(``monitoring_attrs``, ``targets``, and ``schemes``) exist.
|
||||
|
||||
DAMON supports multiple types of monitoring operations, including those for
|
||||
virtual address space and the physical address space. You can set and get what
|
||||
type of monitoring operations DAMON will use for the context by writing one of
|
||||
below keywords to, and reading from the file.
|
||||
|
||||
- vaddr: Monitor virtual address spaces of specific processes
|
||||
- paddr: Monitor the physical address space of the system
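For example, the below sketch (assuming the first context of the first
kdamond) selects the physical address space monitoring operations. ::

# cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0
# echo paddr > operations
# cat operations
paddr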
|
||||
|
||||
contexts/<N>/monitoring_attrs/
|
||||
------------------------------
|
||||
|
||||
Files for specifying attributes of the monitoring including required quality
|
||||
and efficiency of the monitoring are in ``monitoring_attrs`` directory.
|
||||
Specifically, two directories, ``intervals`` and ``nr_regions`` exist in this
|
||||
directory.
|
||||
|
||||
Under ``intervals`` directory, three files for DAMON's sampling interval
|
||||
(``sample_us``), aggregation interval (``aggr_us``), and update interval
|
||||
(``update_us``) exist. You can set and get the values in micro-seconds by
|
||||
writing to and reading from the files.
|
||||
|
||||
Under ``nr_regions`` directory, two files for the lower-bound and upper-bound
|
||||
of DAMON's monitoring regions (``min`` and ``max``, respectively), which
|
||||
controls the monitoring overhead, exist. You can set and get the values by
|
||||
writing to and reading from the files.
|
||||
|
||||
For more details about the intervals and monitoring regions range, please refer
|
||||
to the Design document (:doc:`/vm/damon/design`).
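For example, the below sketch (the indexes and the specific values are only
assumptions) sets the sampling, aggregation, and update intervals to 5 ms,
100 ms, and 1 second, and the monitoring regions range to [10, 1000]. ::

# cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/monitoring_attrs
# echo 5000 > intervals/sample_us
# echo 100000 > intervals/aggr_us
# echo 1000000 > intervals/update_us
# echo 10 > nr_regions/min
# echo 1000 > nr_regions/max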
|
||||
|
||||
contexts/<N>/targets/
|
||||
---------------------
|
||||
|
||||
In the beginning, this directory has only one file, ``nr_targets``. Writing a
|
||||
number (``N``) to the file creates the number of child directories named ``0``
|
||||
to ``N-1``. Each directory represents each monitoring target.
|
||||
|
||||
targets/<N>/
|
||||
------------
|
||||
|
||||
In each target directory, one file (``pid_target``) and one directory
|
||||
(``regions``) exist.
|
||||
|
||||
If you wrote ``vaddr`` to the ``contexts/<N>/operations``, each target should
|
||||
be a process. You can specify the process to DAMON by writing the pid of the
|
||||
process to the ``pid_target`` file.
|
||||
|
||||
targets/<N>/regions
|
||||
-------------------
|
||||
|
||||
When ``vaddr`` monitoring operations set is being used (``vaddr`` is written to
|
||||
the ``contexts/<N>/operations`` file), DAMON automatically sets and updates the
|
||||
monitoring target regions so that entire memory mappings of target processes
|
||||
can be covered. However, users could want to set the initial monitoring region
|
||||
to specific address ranges.
|
||||
|
||||
In contrast, DAMON does not automatically set and update the monitoring target
|
||||
regions when ``paddr`` monitoring operations set is being used (``paddr`` is
|
||||
written to the ``contexts/<N>/operations``). Therefore, users should set the
|
||||
monitoring target regions by themselves in that case.
|
||||
|
||||
For such cases, users can explicitly set the initial monitoring target regions
|
||||
as they want, by writing proper values to the files under this directory.
|
||||
|
||||
In the beginning, this directory has only one file, ``nr_regions``. Writing a
|
||||
number (``N``) to the file creates the number of child directories named ``0``
|
||||
to ``N-1``. Each directory represents each initial monitoring target region.
|
||||
|
||||
regions/<N>/
|
||||
------------
|
||||
|
||||
In each region directory, you will find two files (``start`` and ``end``). You
|
||||
can set and get the start and end addresses of the initial monitoring target
|
||||
region by writing to and reading from the files, respectively.
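For example, the below sketch (assuming ``paddr`` operations, the first target,
and purely illustrative physical addresses) sets a single initial monitoring
region covering the range from 4 GiB to 8 GiB. ::

# cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/targets/0/regions
# echo 1 > nr_regions
# echo 4294967296 > 0/start
# echo 8589934592 > 0/end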
|
||||
|
||||
contexts/<N>/schemes/
|
||||
---------------------
|
||||
|
||||
For usual DAMON-based data access aware memory management optimizations, users
|
||||
would normally want the system to apply a memory management action to a memory
|
||||
region of a specific access pattern. DAMON receives such formalized operation
|
||||
schemes from the user and applies those to the target memory regions. Users
|
||||
can get and set the schemes by reading from and writing to files under this
|
||||
directory.
|
||||
|
||||
In the beginning, this directory has only one file, ``nr_schemes``. Writing a
|
||||
number (``N``) to the file creates the number of child directories named ``0``
|
||||
to ``N-1``. Each directory represents each DAMON-based operation scheme.
|
||||
|
||||
schemes/<N>/
|
||||
------------
|
||||
|
||||
In each scheme directory, four directories (``access_pattern``, ``quotas``,
|
||||
``watermarks``, and ``stats``) and one file (``action``) exist.
|
||||
|
||||
The ``action`` file is for setting and getting what action you want to apply to
|
||||
memory regions having a specific access pattern of interest. The keywords
|
||||
that can be written to and read from the file and their meaning are as below.
|
||||
|
||||
- ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``
|
||||
- ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``
|
||||
- ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
|
||||
- ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
|
||||
- ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
|
||||
- ``stat``: Do nothing but count the statistics
|
||||
|
||||
schemes/<N>/access_pattern/
|
||||
---------------------------
|
||||
|
||||
The target access pattern of each DAMON-based operation scheme is constructed
|
||||
with three ranges including the size of the region in bytes, number of
|
||||
monitored accesses per aggregate interval, and number of aggregated intervals
|
||||
for the age of the region.
|
||||
|
||||
Under the ``access_pattern`` directory, three directories (``sz``,
|
||||
``nr_accesses``, and ``age``) each having two files (``min`` and ``max``)
|
||||
exist. You can set and get the access pattern for the given scheme by writing
|
||||
to and reading from the ``min`` and ``max`` files under ``sz``,
|
||||
``nr_accesses``, and ``age`` directories, respectively.
|
||||
|
||||
schemes/<N>/quotas/
|
||||
-------------------
|
||||
|
||||
Optimal ``target access pattern`` for each ``action`` is workload dependent, so
|
||||
not easy to find. Worse yet, setting a scheme of some action too aggressively
|
||||
can cause severe overhead. To avoid such overhead, users can limit time and
|
||||
size quota for each scheme. In detail, users can ask DAMON to try to use only
|
||||
up to specific time (``time quota``) for applying the action, and to apply the
|
||||
action to only up to specific amount (``size quota``) of memory regions having
|
||||
the target access pattern within a given time interval (``reset interval``).
|
||||
|
||||
When the quota limit is expected to be exceeded, DAMON prioritizes found memory
|
||||
regions of the ``target access pattern`` based on their size, access frequency,
|
||||
and age. For personalized prioritization, users can set the weights for the
|
||||
three properties.
|
||||
|
||||
Under ``quotas`` directory, three files (``ms``, ``bytes``,
|
||||
``reset_interval_ms``) and one directory (``weights``) having three files
|
||||
(``sz_permil``, ``nr_accesses_permil``, and ``age_permil``) in it exist.
|
||||
|
||||
You can set the ``time quota`` in milliseconds, ``size quota`` in bytes, and
|
||||
``reset interval`` in milliseconds by writing the values to the three files,
|
||||
respectively. You can also set the prioritization weights for size, access
|
||||
frequency, and age in per-thousand unit by writing the values to the three
|
||||
files under the ``weights`` directory.
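For example, the below sketch (the indexes and the numbers are only
assumptions) lets the first scheme use up to 100 ms and apply its action to up
to 500 MiB per one second reset interval, preferring bigger and older regions
over frequently accessed ones. ::

# cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/quotas
# echo 100 > ms
# echo $((500 * 1024 * 1024)) > bytes
# echo 1000 > reset_interval_ms
# echo 600 > weights/sz_permil
# echo 100 > weights/nr_accesses_permil
# echo 300 > weights/age_permil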
|
||||
|
||||
schemes/<N>/watermarks/
|
||||
-----------------------
|
||||
|
||||
To allow easy activation and deactivation of each scheme based on system
|
||||
status, DAMON provides a feature called watermarks. The feature receives five
|
||||
values called ``metric``, ``interval``, ``high``, ``mid``, and ``low``. The
|
||||
``metric`` is the system metric such as free memory ratio that can be measured.
|
||||
If the metric value of the system is higher than the value in ``high`` or lower
|
||||
than ``low`` at the moment, the scheme is deactivated. If the value is lower
|
||||
than ``mid``, the scheme is activated.
|
||||
|
||||
Under the watermarks directory, five files (``metric``, ``interval_us``,
|
||||
``high``, ``mid``, and ``low``) for setting each value exist. You can set and
|
||||
get the five values by writing to and reading from the files, respectively.
|
||||
|
||||
Keywords and meanings of those that can be written to the ``metric`` file are
|
||||
as below.
|
||||
|
||||
- none: Ignore the watermarks
|
||||
- free_mem_rate: System's free memory rate (per thousand)
|
||||
|
||||
The ``interval`` should be written in microseconds.
|
||||
|
||||
.. _sysfs_schemes_stats:
|
||||
|
||||
schemes/<N>/stats/
|
||||
------------------
|
||||
|
||||
DAMON counts the total number and bytes of the regions that each scheme has
tried to be applied to, the two numbers for the regions that each scheme has
successfully been applied to, and the total number of times the quota limits
have been exceeded.  These statistics can be used for online analysis or
tuning of the schemes.
|
||||
|
||||
The statistics can be retrieved by reading the files under ``stats`` directory
|
||||
(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, and
|
||||
``qt_exceeds``), respectively. The files are not updated in real time, so you
|
||||
should ask the DAMON sysfs interface to update the content of the files for the
|
||||
stats by writing a special keyword, ``update_schemes_stats`` to the relevant
|
||||
``kdamonds/<N>/state`` file.
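For example, the below sketch (the indexes are assumptions, and the kdamond is
assumed to be currently running) refreshes and reads the stats of the first
scheme of the first kdamond. ::

# cd /sys/kernel/mm/damon/admin/kdamonds/0
# echo update_schemes_stats > state
# grep . contexts/0/schemes/0/stats/*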
|
||||
|
||||
Example
|
||||
~~~~~~~
|
||||
|
||||
The below commands apply a scheme saying "If a memory region of size in [4KiB,
|
||||
8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
|
||||
interval in [10, 20], page out the region. For the paging out, use only up to
|
||||
10ms per second, and also don't page out more than 1GiB per second. Under the
|
||||
limitation, page out memory regions having longer age first. Also, check the
|
||||
free memory rate of the system every 5 seconds, start the monitoring and paging
|
||||
out when the free memory rate becomes lower than 50%, but stop it if the free
|
||||
memory rate becomes larger than 60%, or lower than 30%". ::
|
||||
|
||||
# cd <sysfs>/kernel/mm/damon/admin
|
||||
# # populate directories
|
||||
# echo 1 > kdamonds/nr_kdamonds; echo 1 > kdamonds/0/contexts/nr_contexts;
|
||||
# echo 1 > kdamonds/0/contexts/0/schemes/nr_schemes
|
||||
# cd kdamonds/0/contexts/0/schemes/0
|
||||
# # set the basic access pattern and the action
|
||||
# echo 4096 > access_pattern/sz/min
# echo 8192 > access_pattern/sz/max
# echo 0 > access_pattern/nr_accesses/min
# echo 5 > access_pattern/nr_accesses/max
# echo 10 > access_pattern/age/min
# echo 20 > access_pattern/age/max
|
||||
# echo pageout > action
|
||||
# # set quotas
|
||||
# echo 10 > quotas/ms
|
||||
# echo $((1024*1024*1024)) > quotas/bytes
|
||||
# echo 1000 > quotas/reset_interval_ms
|
||||
# # set watermark
|
||||
# echo free_mem_rate > watermarks/metric
|
||||
# echo 5000000 > watermarks/interval_us
|
||||
# echo 600 > watermarks/high
|
||||
# echo 500 > watermarks/mid
|
||||
# echo 300 > watermarks/low
|
||||
|
||||
Please note that it's highly recommended to use user space tools like `damo
|
||||
<https://github.com/awslabs/damo>`_ rather than manually reading and writing
|
||||
the files as above. The above is only an example.
|
||||
|
||||
.. _debugfs_interface:
|
||||
|
||||
@ -47,7 +385,7 @@ Attributes
|
||||
----------
|
||||
|
||||
Users can get and set the ``sampling interval``, ``aggregation interval``,
|
||||
``regions update interval``, and min/max number of monitoring target regions by
|
||||
``update interval``, and min/max number of monitoring target regions by
|
||||
reading from and writing to the ``attrs`` file. To know about the monitoring
|
||||
attributes in detail, please refer to the :doc:`/vm/damon/design`. For
|
||||
example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and
|
||||
@ -108,24 +446,28 @@ In such cases, users can explicitly set the initial monitoring target regions
|
||||
as they want, by writing proper values to the ``init_regions`` file. Each line
|
||||
of the input should represent one region in below form.::
|
||||
|
||||
<target id> <start address> <end address>
|
||||
<target idx> <start address> <end address>
|
||||
|
||||
The ``target id`` should already in ``target_ids`` file, and the regions should
|
||||
be passed in address order. For example, below commands will set a couple of
|
||||
address ranges, ``1-100`` and ``100-200`` as the initial monitoring target
|
||||
region of process 42, and another couple of address ranges, ``20-40`` and
|
||||
``50-100`` as that of process 4242.::
|
||||
The ``target idx`` should be the index of the target in ``target_ids`` file,
|
||||
starting from ``0``, and the regions should be passed in address order. For
|
||||
example, below commands will set a couple of address ranges, ``1-100`` and
|
||||
``100-200`` as the initial monitoring target region of pid 42, which is the
|
||||
first one (index ``0``) in ``target_ids``, and another couple of address
|
||||
ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one
|
||||
(index ``1``) in ``target_ids``.::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo "42 1 100
|
||||
42 100 200
|
||||
4242 20 40
|
||||
4242 50 100" > init_regions
|
||||
# cat target_ids
|
||||
42 4242
|
||||
# echo "0 1 100
|
||||
0 100 200
|
||||
1 20 40
|
||||
1 50 100" > init_regions
|
||||
|
||||
Note that this sets the initial monitoring target regions only. In case of
|
||||
virtual memory monitoring, DAMON will automatically update the boundary of the
|
||||
regions after one ``regions update interval``. Therefore, users should set the
|
||||
``regions update interval`` large enough in this case, if they don't want the
|
||||
regions after one ``update interval``. Therefore, users should set the
|
||||
``update interval`` large enough in this case, if they don't want the
|
||||
update.
|
||||
|
||||
|
||||
|
@ -130,9 +130,25 @@ attribute, e.g.::
|
||||
echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
|
||||
|
||||
When zswap same-filled page identification is disabled at runtime, it will stop
|
||||
checking for the same-value filled pages during store operation. However, the
|
||||
existing pages which are marked as same-value filled pages remain stored
|
||||
unchanged in zswap until they are either loaded or invalidated.
|
||||
checking for the same-value filled pages during store operation.
|
||||
In other words, every page will be then considered non-same-value filled.
|
||||
However, the existing pages which are marked as same-value filled pages remain
|
||||
stored unchanged in zswap until they are either loaded or invalidated.
|
||||
|
||||
In some circumstances it might be advantageous to make use of just the zswap
|
||||
ability to efficiently store same-filled pages without enabling the whole
|
||||
compressed page storage.
|
||||
In this case the handling of non-same-value pages by zswap (enabled by default)
|
||||
can be disabled by setting the ``non_same_filled_pages_enabled`` attribute
|
||||
to 0, e.g. ``zswap.non_same_filled_pages_enabled=0``.
|
||||
It can also be enabled and disabled at runtime using the sysfs
|
||||
``non_same_filled_pages_enabled`` attribute, e.g.::
|
||||
|
||||
echo 1 > /sys/module/zswap/parameters/non_same_filled_pages_enabled
|
||||
|
||||
Disabling both ``zswap.same_filled_pages_enabled`` and
|
||||
``zswap.non_same_filled_pages_enabled`` effectively disables accepting any new
|
||||
pages by zswap.
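For example, the below minimal sketch disables both behaviors at runtime, so
that zswap accepts no new pages, and then re-enables them. ::

# echo 0 > /sys/module/zswap/parameters/same_filled_pages_enabled
# echo 0 > /sys/module/zswap/parameters/non_same_filled_pages_enabled
# echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
# echo 1 > /sys/module/zswap/parameters/non_same_filled_pages_enabled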
|
||||
|
||||
To prevent zswap from shrinking pool when zswap is full and there's a high
|
||||
pressure on swap (this will result in flipping pages in and out zswap pool
|
||||
|
@ -595,22 +595,34 @@ Documentation/admin-guide/kernel-parameters.rst).
|
||||
numa_balancing
|
||||
==============
|
||||
|
||||
Enables/disables automatic page fault based NUMA memory
|
||||
balancing. Memory is moved automatically to nodes
|
||||
that access it often.
|
||||
Enables/disables and configures automatic page fault based NUMA memory
|
||||
balancing. Memory is moved automatically to nodes that access it often.
|
||||
The value to set can be the result of ORing the following:
|
||||
|
||||
Enables/disables automatic NUMA memory balancing. On NUMA machines, there
|
||||
is a performance penalty if remote memory is accessed by a CPU. When this
|
||||
feature is enabled the kernel samples what task thread is accessing memory
|
||||
by periodically unmapping pages and later trapping a page fault. At the
|
||||
time of the page fault, it is determined if the data being accessed should
|
||||
be migrated to a local memory node.
|
||||
= =================================
|
||||
0 NUMA_BALANCING_DISABLED
|
||||
1 NUMA_BALANCING_NORMAL
|
||||
2 NUMA_BALANCING_MEMORY_TIERING
|
||||
= =================================
|
||||
|
||||
Or NUMA_BALANCING_NORMAL to optimize page placement among different
|
||||
NUMA nodes to reduce remote accessing. On NUMA machines, there is a
|
||||
performance penalty if remote memory is accessed by a CPU. When this
|
||||
feature is enabled the kernel samples what task thread is accessing
|
||||
memory by periodically unmapping pages and later trapping a page
|
||||
fault. At the time of the page fault, it is determined if the data
|
||||
being accessed should be migrated to a local memory node.
|
||||
|
||||
The unmapping of pages and trapping faults incur additional overhead that
|
||||
ideally is offset by improved memory locality but there is no universal
|
||||
guarantee. If the target workload is already bound to NUMA nodes then this
|
||||
feature should be disabled.
|
||||
|
||||
Or NUMA_BALANCING_MEMORY_TIERING to optimize page placement among
|
||||
different types of memory (represented as different NUMA nodes) to
|
||||
place the hot pages in the fast memory. This is implemented based on
|
||||
unmapping and page fault too.
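For example, the below sketch enables both modes at once by writing the ORed
value (``NUMA_BALANCING_NORMAL | NUMA_BALANCING_MEMORY_TIERING``, i.e. 3) to
the sysctl. ::

# echo 3 > /proc/sys/kernel/numa_balancing
# cat /proc/sys/kernel/numa_balancing
3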
|
||||
|
||||
oops_all_cpu_backtrace
|
||||
======================
|
||||
|
||||
|
@ -58,15 +58,30 @@ Virtually Contiguous Mappings
|
||||
File Mapping and Page Cache
|
||||
===========================
|
||||
|
||||
.. kernel-doc:: mm/readahead.c
|
||||
:export:
|
||||
Filemap
|
||||
-------
|
||||
|
||||
.. kernel-doc:: mm/filemap.c
|
||||
:export:
|
||||
|
||||
Readahead
|
||||
---------
|
||||
|
||||
.. kernel-doc:: mm/readahead.c
|
||||
:doc: Readahead Overview
|
||||
|
||||
.. kernel-doc:: mm/readahead.c
|
||||
:export:
|
||||
|
||||
Writeback
|
||||
---------
|
||||
|
||||
.. kernel-doc:: mm/page-writeback.c
|
||||
:export:
|
||||
|
||||
Truncate
|
||||
--------
|
||||
|
||||
.. kernel-doc:: mm/truncate.c
|
||||
:export:
|
||||
|
||||
|
@ -41,6 +41,18 @@ guarded by KFENCE. The default is configurable via the Kconfig option
|
||||
``CONFIG_KFENCE_SAMPLE_INTERVAL``. Setting ``kfence.sample_interval=0``
|
||||
disables KFENCE.
|
||||
|
||||
The sample interval controls a timer that sets up KFENCE allocations. By
|
||||
default, to keep the real sample interval predictable, the normal timer also
|
||||
causes CPU wake-ups when the system is completely idle. This may be undesirable
|
||||
on power-constrained systems. The boot parameter ``kfence.deferrable=1``
|
||||
instead switches to a "deferrable" timer which does not force CPU wake-ups on
|
||||
idle systems, at the risk of unpredictable sample intervals. The default is
|
||||
configurable via the Kconfig option ``CONFIG_KFENCE_DEFERRABLE``.
|
||||
|
||||
.. warning::
|
||||
The KUnit test suite is very likely to fail when using a deferrable timer
|
||||
since it currently causes very unpredictable sample intervals.
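For example, a kernel command line fragment enabling KFENCE with a 100 ms
sample interval and the deferrable timer could look as below; whether these
values fit a given system is workload dependent. ::

kfence.sample_interval=100 kfence.deferrable=1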
|
||||
|
||||
The KFENCE memory pool is of fixed size, and if the pool is exhausted, no
|
||||
further KFENCE allocations occur. With ``CONFIG_KFENCE_NUM_OBJECTS`` (default
|
||||
255), the number of available guarded objects can be controlled. Each object
|
||||
|
@ -45,6 +45,12 @@ typically between calling iget_locked() and unlocking the inode.
|
||||
|
||||
At some point that will become mandatory.
|
||||
|
||||
**mandatory**
|
||||
|
||||
The foo_inode_info should always be allocated through alloc_inode_sb() rather
|
||||
than kmem_cache_alloc() or kmalloc(), in order to set up the inode reclaim context
|
||||
correctly.
|
||||
|
||||
---
|
||||
|
||||
**mandatory**
|
||||
|
@ -806,12 +806,16 @@ cache in your filesystem. The following members are defined:
|
||||
object. The pages are consecutive in the page cache and are
|
||||
locked. The implementation should decrement the page refcount
|
||||
after starting I/O on each page. Usually the page will be
|
||||
unlocked by the I/O completion handler. If the filesystem decides
|
||||
to stop attempting I/O before reaching the end of the readahead
|
||||
window, it can simply return. The caller will decrement the page
|
||||
refcount and unlock the remaining pages for you. Set PageUptodate
|
||||
if the I/O completes successfully. Setting PageError on any page
|
||||
will be ignored; simply unlock the page if an I/O error occurs.
|
||||
unlocked by the I/O completion handler. The set of pages are
|
||||
divided into some sync pages followed by some async pages,
|
||||
rac->ra->async_size gives the number of async pages. The
|
||||
filesystem should attempt to read all sync pages but may decide
|
||||
to stop once it reaches the async pages. If it does decide to
|
||||
stop attempting I/O, it can simply return. The caller will
|
||||
remove the remaining pages from the address space, unlock them
|
||||
and decrement the page refcount. Set PageUptodate if the I/O
|
||||
completes successfully. Setting PageError on any page will be
|
||||
ignored; simply unlock the page if an I/O error occurs.
|
||||
|
||||
``readpages``
|
||||
called by the VM to read pages associated with the address_space
|
||||
|
@ -13,12 +13,13 @@ primitives that dependent on and optimized for the target address space. On
|
||||
the other hand, the accuracy and overhead tradeoff mechanism, which is the core
|
||||
of DAMON, is in the pure logic space. DAMON separates the two parts in
|
||||
different layers and defines its interface to allow various low level
|
||||
primitives implementations configurable with the core logic.
|
||||
primitives implementations configurable with the core logic. We call the low
|
||||
level primitives implementations monitoring operations.
|
||||
|
||||
Due to this separated design and the configurable interface, users can extend
|
||||
DAMON for any address space by configuring the core logics with appropriate low
|
||||
level primitive implementations. If appropriate one is not provided, users can
|
||||
implement the primitives on their own.
|
||||
DAMON for any address space by configuring the core logics with appropriate
|
||||
monitoring operations. If appropriate one is not provided, users can implement
|
||||
the operations on their own.
|
||||
|
||||
For example, physical memory, virtual memory, swap space, those for specific
|
||||
processes, NUMA nodes, files, and backing memory devices would be supportable.
|
||||
@ -26,25 +27,24 @@ Also, if some architectures or devices support special optimized access check
|
||||
primitives, those will be easily configurable.
|
||||
|
||||
|
||||
Reference Implementations of Address Space Specific Primitives
|
||||
==============================================================
|
||||
Reference Implementations of Address Space Specific Monitoring Operations
|
||||
=========================================================================
|
||||
|
||||
The low level primitives for the fundamental access monitoring are defined in
|
||||
two parts:
|
||||
The monitoring operations are defined in two parts:
|
||||
|
||||
1. Identification of the monitoring target address range for the address space.
|
||||
2. Access check of specific address range in the target space.
|
||||
|
||||
DAMON currently provides the implementations of the primitives for the physical
|
||||
DAMON currently provides the implementations of the operations for the physical
|
||||
and virtual address spaces. Below two subsections describe how those work.
|
||||
|
||||
|
||||
VMA-based Target Address Range Construction
|
||||
-------------------------------------------
|
||||
|
||||
This is only for the virtual address space primitives implementation. That for
|
||||
the physical address space simply asks users to manually set the monitoring
|
||||
target address ranges.
|
||||
This is only for the virtual address space monitoring operations
|
||||
implementation. That for the physical address space simply asks users to
|
||||
manually set the monitoring target address ranges.
|
||||
|
||||
Only small parts in the super-huge virtual address space of the processes are
|
||||
mapped to the physical memory and accessed. Thus, tracking the unmapped
|
||||
@ -84,9 +84,10 @@ table having a mapping to the address. In this way, the implementations find
|
||||
and clear the bit(s) for next sampling target address and checks whether the
|
||||
bit(s) set again after one sampling period. This could disturb other kernel
|
||||
subsystems using the Accessed bits, namely Idle page tracking and the reclaim
|
||||
logic. To avoid such disturbances, DAMON makes it mutually exclusive with Idle
|
||||
page tracking and uses ``PG_idle`` and ``PG_young`` page flags to solve the
|
||||
conflict with the reclaim logic, as Idle page tracking does.
|
||||
logic. DAMON does nothing to avoid disturbing Idle page tracking, so handling
|
||||
the interference is the responsibility of sysadmins. However, it solves the
|
||||
conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags,
|
||||
as Idle page tracking does.
|
||||
|
||||
|
||||
Address Space Independent Core Mechanisms
|
||||
@ -94,8 +95,8 @@ Address Space Independent Core Mechanisms
|
||||
|
||||
Below four sections describe each of the DAMON core mechanisms and the five
|
||||
monitoring attributes, ``sampling interval``, ``aggregation interval``,
|
||||
``regions update interval``, ``minimum number of regions``, and ``maximum
|
||||
number of regions``.
|
||||
``update interval``, ``minimum number of regions``, and ``maximum number of
|
||||
regions``.
|
||||
|
||||
|
||||
Access Frequency Monitoring
|
||||
@ -168,6 +169,8 @@ The monitoring target address range could dynamically changed. For example,
|
||||
virtual memory could be dynamically mapped and unmapped. Physical memory could
|
||||
be hot-plugged.
|
||||
|
||||
As the changes could be quite frequent in some cases, DAMON checks the dynamic
|
||||
memory mapping changes and applies it to the abstracted target area only for
|
||||
each of a user-specified time interval (``regions update interval``).
|
||||
As the changes could be quite frequent in some cases, DAMON allows the
|
||||
monitoring operations to check dynamic changes including memory mapping changes
|
||||
and applies it to monitoring operations-related data structures such as the
|
||||
abstracted monitoring target memory area only for each of a user-specified time
|
||||
interval (``update interval``).
|
||||
|
@ -31,7 +31,7 @@ Does DAMON support virtual memory only?
|
||||
=======================================
|
||||
|
||||
No. The core of the DAMON is address space independent. The address space
|
||||
specific low level primitive parts including monitoring target regions
|
||||
specific monitoring operations including monitoring target regions
|
||||
constructions and actual access checks can be implemented and configured on the
|
||||
DAMON core by the users. In this way, DAMON users can monitor any address
|
||||
space with any access check technique.
|
||||
|
@ -5326,6 +5326,7 @@ DATA ACCESS MONITOR
|
||||
M: SeongJae Park <sj@kernel.org>
|
||||
L: linux-mm@kvack.org
|
||||
S: Maintained
|
||||
F: Documentation/ABI/testing/sysfs-kernel-mm-damon
|
||||
F: Documentation/admin-guide/mm/damon/
|
||||
F: Documentation/vm/damon/
|
||||
F: include/linux/damon.h
|
||||
|
@ -38,6 +38,7 @@ config ARM
|
||||
select ARCH_USE_CMPXCHG_LOCKREF
|
||||
select ARCH_USE_MEMTEST
|
||||
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
|
||||
select ARCH_WANT_GENERAL_HUGETLB
|
||||
select ARCH_WANT_IPC_PARSE_VERSION
|
||||
select ARCH_WANT_LD_ORPHAN_WARN
|
||||
select BINFMT_FLAT_ARGVP_ENVP_ON_STACK
|
||||
@ -1509,9 +1510,6 @@ config HW_PERF_EVENTS
|
||||
def_bool y
|
||||
depends on ARM_PMU
|
||||
|
||||
config ARCH_WANT_GENERAL_HUGETLB
def_bool y

config ARM_MODULE_PLTS
bool "Use PLTs to allow module memory to spill over into vmalloc area"
depends on MODULES

@@ -406,9 +406,6 @@ static int __init topology_init(void)
{
int i;

for_each_online_node(i)
register_one_node(i);

for_each_possible_cpu(i) {
struct cpu *cpu = &per_cpu(cpu_data.cpu, i);
cpu->hotpluggable = cpu_can_disable(i);

@@ -356,6 +356,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
size_t pagesize = 1UL << shift;

entry = pte_mkhuge(entry);
if (pagesize == CONT_PTE_SIZE) {
entry = pte_mkcont(entry);
} else if (pagesize == CONT_PMD_SIZE) {

@@ -29,8 +29,6 @@ int max_kernel_seg = 0x303;
/* indicate pfn's of high memory */
unsigned long highstart_pfn, highend_pfn;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/* Default cache attribute for newly created page tables */
unsigned long _dflt_cache_att = CACHEDEF;

@@ -70,16 +70,6 @@ static int __init topology_init(void)
{
int i, err = 0;

#ifdef CONFIG_NUMA
/*
* MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes?
*/
for_each_online_node(i) {
if ((err = register_one_node(i)))
goto out;
}
#endif

sysfs_cpus = kcalloc(NR_CPUS, sizeof(struct ia64_cpu), GFP_KERNEL);
if (!sysfs_cpus)
panic("kzalloc in topology_init failed - NR_CPUS too big?");

@@ -608,17 +608,11 @@ void __init paging_init(void)
zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
}

#ifdef CONFIG_MEMORY_HOTPLUG
pg_data_t *arch_alloc_nodedata(int nid)
pg_data_t * __init arch_alloc_nodedata(int nid)
{
unsigned long size = compute_pernodesize(nid);

return kzalloc(size, GFP_KERNEL);
}

void arch_free_nodedata(pg_data_t *pgdat)
{
kfree(pgdat);
return memblock_alloc(size, SMP_CACHE_BYTES);
}

void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)

@@ -626,7 +620,6 @@ void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
pgdat_list[update_node] = update_pgdat;
scatter_node_data();
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,

@@ -12,11 +12,6 @@ static int __init topology_init(void)
{
int i, ret;

#ifdef CONFIG_NUMA
for_each_online_node(i)
register_one_node(i);
#endif /* CONFIG_NUMA */

for_each_present_cpu(i) {
struct cpu *c = &per_cpu(cpu_devices, i);

@@ -18,7 +18,6 @@
#include <asm/tlb.h>
#include <asm/page.h>

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
DEFINE_SPINLOCK(anon_alias_lock);
extern pgd_t swapper_pg_dir[PTRS_PER_PGD];

@@ -38,8 +38,6 @@

int mem_init_done;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

static void __init zone_sizes_init(void)
{
unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };

@@ -19,11 +19,6 @@

#define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt)

/* Alignment per CMA requirement. */
#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \
max_t(unsigned long, MAX_ORDER - 1, \
pageblock_order))

/* FAD commands */
#define FADUMP_REGISTER 1
#define FADUMP_UNREGISTER 2

@@ -71,9 +71,9 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags
size_t size = 1UL << shift;

if (size == SZ_16K)
return __pte(pte_val(entry) & ~_PAGE_HUGE);
return __pte(pte_val(entry) | _PAGE_SPS);
else
return entry;
return __pte(pte_val(entry) | _PAGE_SPS | _PAGE_HUGE);
}
#define arch_make_huge_pte arch_make_huge_pte
#endif

@@ -112,6 +112,12 @@ static int __init fadump_cma_init(void)
return 1;
}

/*
* If CMA activation fails, keep the pages reserved, instead of
* exposing them to buddy allocator. Same as 'fadump=nocma' case.
*/
cma_reserve_pages_on_error(fadump_cma);

/*
* So we now have successfully initialized cma area for fadump.
*/

@@ -544,7 +550,7 @@ int __init fadump_reserve_mem(void)
if (!fw_dump.nocma) {
fw_dump.boot_memory_size =
ALIGN(fw_dump.boot_memory_size,
FADUMP_CMA_ALIGNMENT);
CMA_MIN_ALIGNMENT_BYTES);
}
#endif
@@ -1110,14 +1110,6 @@ EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group);
/* NUMA stuff */

#ifdef CONFIG_NUMA
static void __init register_nodes(void)
{
int i;

for (i = 0; i < MAX_NUMNODES; i++)
register_one_node(i);
}

int sysfs_add_device_to_node(struct device *dev, int nid)
{
struct node *node = node_devices[nid];

@@ -1132,13 +1124,6 @@ void sysfs_remove_device_from_node(struct device *dev, int nid)
sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj));
}
EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node);

#else
static void __init register_nodes(void)
{
return;
}

#endif

/* Only valid if CPU is present. */

@@ -1155,8 +1140,6 @@ static int __init topology_init(void)
{
int cpu, r;

register_nodes();

for_each_possible_cpu(cpu) {
struct cpu *c = &per_cpu(cpu_devices, cpu);

@@ -40,6 +40,7 @@ config RISCV
select ARCH_USE_MEMTEST
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_GENERAL_HUGETLB
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
select BUILDTIME_TABLE_SORT if MMU

@@ -171,9 +172,6 @@ config ARCH_SPARSEMEM_ENABLE
config ARCH_SELECT_MEMORY_MODEL
def_bool ARCH_SPARSEMEM_ENABLE

config ARCH_WANT_GENERAL_HUGETLB
def_bool y

config ARCH_SUPPORTS_UPROBES
def_bool y

@@ -301,9 +301,6 @@ static int __init topology_init(void)
{
int i, ret;

for_each_online_node(i)
register_one_node(i);

for_each_possible_cpu(i) {
struct cpu *cpu = &per_cpu(cpu_devices, i);

@@ -33,10 +33,3 @@ void __init numa_setup(void)
NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT;
NODE_DATA(0)->node_id = 0;
}

static int __init numa_init_late(void)
{
register_one_node(0);
return 0;
}
arch_initcall(numa_init_late);

@@ -46,11 +46,6 @@ static int __init topology_init(void)
{
int i, ret;

#ifdef CONFIG_NUMA
for_each_online_node(i)
register_one_node(i);
#endif

for_each_present_cpu(i) {
struct cpu *c = &per_cpu(cpu_devices, i);

@@ -244,22 +244,10 @@ static void __init check_mmu_stats(void)
mmu_stats_supported = 1;
}

static void register_nodes(void)
{
#ifdef CONFIG_NUMA
int i;

for (i = 0; i < MAX_NUMNODES; i++)
register_one_node(i);
#endif
}

static int __init topology_init(void)
{
int cpu, ret;

register_nodes();

check_mmu_stats();

for_each_possible_cpu(cpu) {

@@ -181,6 +181,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
pte_t pte;

entry = pte_mkhuge(entry);
pte = hugepage_shift_to_tte(entry, shift);

#ifdef CONFIG_SPARC64
@@ -119,6 +119,7 @@ config X86
select ARCH_WANT_DEFAULT_BPF_JIT if X86_64
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANTS_NO_INSTR
select ARCH_WANT_GENERAL_HUGETLB
select ARCH_WANT_HUGE_PMD_SHARE
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANTS_RT_DELAYED_SIGNALS

@@ -349,9 +350,6 @@ config ARCH_NR_GPIO
config ARCH_SUSPEND_POSSIBLE
def_bool y

config ARCH_WANT_GENERAL_HUGETLB
def_bool y

config AUDIT_ARCH
def_bool y if X86_64

@@ -1299,10 +1299,12 @@ static void kill_me_maybe(struct callback_head *cb)

/*
* -EHWPOISON from memory_failure() means that it already sent SIGBUS
* to the current process with the proper error info, so no need to
* send SIGBUS here again.
* to the current process with the proper error info,
* -EOPNOTSUPP means hwpoison_filter() filtered the error event,
*
* In both cases, no further processing is required.
*/
if (ret == -EHWPOISON)
if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
return;

pr_err("Memory error not recovered");

@@ -154,11 +154,6 @@ static int __init topology_init(void)
{
int i;

#ifdef CONFIG_NUMA
for_each_online_node(i)
register_one_node(i);
#endif

for_each_present_cpu(i)
arch_register_cpu(i);

@@ -738,17 +738,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}

static void __init init_memory_less_node(int nid)
{
/* Allocate and initialize node data. Memory-less node is now online.*/
alloc_node_data(nid);
free_area_init_memoryless_node(nid);

/*
* All zonelists will be built later in start_kernel() after per cpu
* areas are initialized.
*/
}

/*
* A node may exist which has one or more Generic Initiators but no CPUs and no

@@ -766,9 +755,18 @@ void __init init_gi_nodes(void)
{
int nid;

/*
* Exclude this node from
* bringup_nonboot_cpus
* cpu_up
* __try_online_node
* register_one_node
* because node_subsys is not initialized yet.
* TODO remove dependency on node_online
*/
for_each_node_state(nid, N_GENERIC_INITIATOR)
if (!node_online(nid))
init_memory_less_node(nid);
node_set_online(nid);
}

/*

@@ -798,8 +796,17 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;

/*
* Exclude this node from
* bringup_nonboot_cpus
* cpu_up
* __try_online_node
* register_one_node
* because node_subsys is not initialized yet.
* TODO remove dependency on node_online
*/
if (!node_online(node))
init_memory_less_node(node);
node_set_online(node);

numa_set_node(cpu, node);
}

@@ -385,7 +385,7 @@ static struct kmem_cache * bdev_cachep __read_mostly;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);

if (!ei)
return NULL;

@@ -5459,7 +5459,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
bfqq = bic_to_bfqq(bic, false);
if (bfqq) {
bfq_release_process_ref(bfqd, bfqq);
bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true);
bfqq = bfq_get_queue(bfqd, bio, false, bic, true);
bic_set_bfqq(bic, bfqq, false);
}

@@ -35,5 +35,6 @@ void __init driver_init(void)
auxiliary_bus_init();
cpu_dev_init();
memory_dev_init();
node_dev_init();
container_dev_init();
}
@@ -215,6 +215,7 @@ static int memory_block_online(struct memory_block *mem)
adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
nr_vmemmap_pages);

mem->zone = zone;
return ret;
}

@@ -225,6 +226,9 @@ static int memory_block_offline(struct memory_block *mem)
unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
int ret;

if (!mem->zone)
return -EINVAL;

/*
* Unaccount before offlining, such that unpopulated zone and kthreads
* can properly be torn down in offline_pages().

@@ -234,7 +238,7 @@ static int memory_block_offline(struct memory_block *mem)
-nr_vmemmap_pages);

ret = offline_pages(start_pfn + nr_vmemmap_pages,
nr_pages - nr_vmemmap_pages, mem->group);
nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
if (ret) {
/* offline_pages() failed. Account back. */
if (nr_vmemmap_pages)

@@ -246,6 +250,7 @@ static int memory_block_offline(struct memory_block *mem)
if (nr_vmemmap_pages)
mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

mem->zone = NULL;
return ret;
}

@@ -411,11 +416,10 @@ static ssize_t valid_zones_show(struct device *dev,
*/
if (mem->state == MEM_ONLINE) {
/*
* The block contains more than one zone can not be offlined.
* This can happen e.g. for ZONE_DMA and ZONE_DMA32
* If !mem->zone, the memory block spans multiple zones and
* cannot get offlined.
*/
default_zone = test_pages_in_a_zone(start_pfn,
start_pfn + nr_pages);
default_zone = mem->zone;
if (!default_zone)
return sysfs_emit(buf, "%s\n", "none");
len += sysfs_emit_at(buf, len, "%s", default_zone->name);

@@ -555,6 +559,8 @@ static ssize_t hard_offline_page_store(struct device *dev,
return -EINVAL;
pfn >>= PAGE_SHIFT;
ret = memory_failure(pfn, 0);
if (ret == -EOPNOTSUPP)
ret = 0;
return ret ? ret : count;
}

@@ -613,11 +619,7 @@ static const struct attribute_group *memory_memblk_attr_groups[] = {
NULL,
};

/*
* register_memory - Setup a sysfs device for a memory block
*/
static
int register_memory(struct memory_block *memory)
static int __add_memory_block(struct memory_block *memory)
{
int ret;

@@ -641,9 +643,85 @@ int register_memory(struct memory_block *memory)
return ret;
}

static int init_memory_block(unsigned long block_id, unsigned long state,
unsigned long nr_vmemmap_pages,
struct memory_group *group)
static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
int nid)
{
const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
struct zone *zone, *matching_zone = NULL;
pg_data_t *pgdat = NODE_DATA(nid);
int i;

/*
* This logic only works for early memory, when the applicable zones
* already span the memory block. We don't expect overlapping zones on
* a single node for early memory. So if we're told that some PFNs
* of a node fall into this memory block, we can assume that all node
* zones that intersect with the memory block are actually applicable.
* No need to look at the memmap.
*/
for (i = 0; i < MAX_NR_ZONES; i++) {
zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
if (!zone_intersects(zone, start_pfn, nr_pages))
continue;
if (!matching_zone) {
matching_zone = zone;
continue;
}
/* Spans multiple zones ... */
matching_zone = NULL;
break;
}
return matching_zone;
}

#ifdef CONFIG_NUMA
/**
* memory_block_add_nid() - Indicate that system RAM falling into this memory
* block device (partially) belongs to the given node.
* @mem: The memory block device.
* @nid: The node id.
* @context: The memory initialization context.
*
* Indicate that system RAM falling into this memory block (partially) belongs
* to the given node. If the context indicates ("early") that we are adding the
* node during node device subsystem initialization, this will also properly
* set/adjust mem->zone based on the zone ranges of the given node.
*/
void memory_block_add_nid(struct memory_block *mem, int nid,
enum meminit_context context)
{
if (context == MEMINIT_EARLY && mem->nid != nid) {
/*
* For early memory we have to determine the zone when setting
* the node id and handle multiple nodes spanning a single
* memory block by indicate via zone == NULL that we're not
* dealing with a single zone. So if we're setting the node id
* the first time, determine if there is a single zone. If we're
* setting the node id a second time to a different node,
* invalidate the single detected zone.
*/
if (mem->nid == NUMA_NO_NODE)
mem->zone = early_node_zone_for_memory_block(mem, nid);
else
mem->zone = NULL;
}

/*
* If this memory block spans multiple nodes, we only indicate
* the last processed node. If we span multiple nodes (not applicable
* to hotplugged memory), zone == NULL will prohibit memory offlining
* and consequently unplug.
*/
mem->nid = nid;
}
#endif

static int add_memory_block(unsigned long block_id, unsigned long state,
unsigned long nr_vmemmap_pages,
struct memory_group *group)
{
struct memory_block *mem;
int ret = 0;

@@ -663,17 +741,30 @@ static int init_memory_block(unsigned long block_id, unsigned long state,
mem->nr_vmemmap_pages = nr_vmemmap_pages;
INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
if (state == MEM_ONLINE)
/*
* MEM_ONLINE at this point implies early memory. With NUMA,
* we'll determine the zone when setting the node id via
* memory_block_add_nid(). Memory hotplug updated the zone
* manually when memory onlining/offlining succeeds.
*/
mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

ret = __add_memory_block(mem);
if (ret)
return ret;

if (group) {
mem->group = group;
list_add(&mem->group_next, &group->memory_blocks);
}

ret = register_memory(mem);

return ret;
return 0;
}

static int add_memory_block(unsigned long base_section_nr)
static int __init add_boot_memory_block(unsigned long base_section_nr)
{
int section_count = 0;
unsigned long nr;

@@ -685,11 +776,18 @@ static int add_memory_block(unsigned long base_section_nr)

if (section_count == 0)
return 0;
return init_memory_block(memory_block_id(base_section_nr),
MEM_ONLINE, 0, NULL);
return add_memory_block(memory_block_id(base_section_nr),
MEM_ONLINE, 0, NULL);
}

static void unregister_memory(struct memory_block *memory)
static int add_hotplug_memory_block(unsigned long block_id,
unsigned long nr_vmemmap_pages,
struct memory_group *group)
{
return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
}

static void remove_memory_block(struct memory_block *memory)
{
if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
return;

@@ -728,8 +826,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
return -EINVAL;

for (block_id = start_block_id; block_id != end_block_id; block_id++) {
ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages,
group);
ret = add_hotplug_memory_block(block_id, vmemmap_pages, group);
if (ret)
break;
}

@@ -740,7 +837,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
mem = find_memory_block_by_id(block_id);
if (WARN_ON_ONCE(!mem))
continue;
unregister_memory(mem);
remove_memory_block(mem);
}
}
return ret;

@@ -769,7 +866,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
if (WARN_ON_ONCE(!mem))
continue;
unregister_memory_block_under_nodes(mem);
unregister_memory(mem);
remove_memory_block(mem);
}
}

@@ -829,7 +926,7 @@ void __init memory_dev_init(void)
*/
for (nr = 0; nr <= __highest_present_section_nr;
nr += sections_per_block) {
ret = add_memory_block(nr);
ret = add_boot_memory_block(nr);
if (ret)
panic("%s() failed to add memory block: %d\n", __func__,
ret);
@@ -796,15 +796,12 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
}

static void do_register_memory_block_under_node(int nid,
struct memory_block *mem_blk)
struct memory_block *mem_blk,
enum meminit_context context)
{
int ret;

/*
* If this memory block spans multiple nodes, we only indicate
* the last processed node.
*/
mem_blk->nid = nid;
memory_block_add_nid(mem_blk, nid, context);

ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
&mem_blk->dev.kobj,

@@ -857,7 +854,7 @@ static int register_mem_block_under_node_early(struct memory_block *mem_blk,
if (page_nid != nid)
continue;

do_register_memory_block_under_node(nid, mem_blk);
do_register_memory_block_under_node(nid, mem_blk, MEMINIT_EARLY);
return 0;
}
/* mem section does not span the specified node */

@@ -873,7 +870,7 @@ static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk,
{
int nid = *(int *)arg;

do_register_memory_block_under_node(nid, mem_blk);
do_register_memory_block_under_node(nid, mem_blk, MEMINIT_HOTPLUG);
return 0;
}

@@ -892,8 +889,9 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
}

void link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn,
enum meminit_context context)
void register_memory_blocks_under_node(int nid, unsigned long start_pfn,
unsigned long end_pfn,
enum meminit_context context)
{
walk_memory_blocks_func_t func;

@@ -1065,26 +1063,30 @@ static const struct attribute_group *cpu_root_attr_groups[] = {
};

#define NODE_CALLBACK_PRI 2 /* lower than SLAB */
static int __init register_node_type(void)
void __init node_dev_init(void)
{
int ret;
static struct notifier_block node_memory_callback_nb = {
.notifier_call = node_memory_callback,
.priority = NODE_CALLBACK_PRI,
};
int ret, i;

BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES);
BUILD_BUG_ON(ARRAY_SIZE(node_state_attrs)-1 != NR_NODE_STATES);

ret = subsys_system_register(&node_subsys, cpu_root_attr_groups);
if (!ret) {
static struct notifier_block node_memory_callback_nb = {
.notifier_call = node_memory_callback,
.priority = NODE_CALLBACK_PRI,
};
register_hotmemory_notifier(&node_memory_callback_nb);
}
if (ret)
panic("%s() failed to register subsystem: %d\n", __func__, ret);

register_hotmemory_notifier(&node_memory_callback_nb);

/*
* Note: we're not going to unregister the node class if we fail
* to register the node state class attribute files.
* Create all node devices, which will properly link the node
* to applicable memory block devices and already created cpu devices.
*/
return ret;
for_each_online_node(i) {
ret = register_one_node(i);
if (ret)
panic("%s() failed to add node: %d\n", __func__, ret);
}
}
postcore_initcall(register_node_type);
@@ -637,9 +637,6 @@ enum {
STATE_SENT, /* Do not change state/UUIDs while this is set */
CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
* pending, from drbd worker context.
* If set, bdi_write_congested() returns true,
* so shrink_page_list() would not recurse into,
* and potentially deadlock on, this drbd worker.
*/
DISCONNECT_SENT,

@@ -910,8 +910,7 @@ static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t se

switch (rbm) {
case RB_CONGESTED_REMOTE:
return bdi_read_congested(
device->ldev->backing_bdev->bd_disk->bdi);
return 0;
case RB_LEAST_PENDING:
return atomic_read(&device->local_cnt) >
atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt);

@@ -282,7 +282,7 @@ static struct inode *dax_alloc_inode(struct super_block *sb)
struct dax_device *dax_dev;
struct inode *inode;

dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
dax_dev = alloc_inode_sb(sb, dax_cache, GFP_KERNEL);
if (!dax_dev)
return NULL;

@@ -22,6 +22,7 @@
#include <linux/slab.h>
#include <linux/memblock.h>
#include <linux/kmemleak.h>
#include <linux/cma.h>

#include "of_private.h"

@@ -116,12 +117,8 @@ static int __init __reserved_mem_alloc_size(unsigned long node,
if (IS_ENABLED(CONFIG_CMA)
&& of_flat_dt_is_compatible(node, "shared-dma-pool")
&& of_get_flat_dt_prop(node, "reusable", NULL)
&& !nomap) {
unsigned long order =
max_t(unsigned long, MAX_ORDER - 1, pageblock_order);

align = max(align, (phys_addr_t)PAGE_SIZE << order);
}
&& !nomap)
align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES);

prop = of_get_flat_dt_prop(node, "alloc-ranges", &len);
if (prop) {

@@ -3088,7 +3088,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx)
{
struct tty_struct *tty;

tty = kzalloc(sizeof(*tty), GFP_KERNEL);
tty = kzalloc(sizeof(*tty), GFP_KERNEL_ACCOUNT);
if (!tty)
return NULL;

@@ -2476,13 +2476,10 @@ static int virtio_mem_init_hotplug(struct virtio_mem *vm)
VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);

/*
* We want subblocks to span at least MAX_ORDER_NR_PAGES and
* pageblock_nr_pages pages. This:
* - Is required for now for alloc_contig_range() to work reliably -
* it doesn't properly handle smaller granularity on ZONE_NORMAL.
* TODO: once alloc_contig_range() works reliably with pageblock
* granularity on ZONE_NORMAL, use pageblock_nr_pages instead.
*/
sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
pageblock_nr_pages) * PAGE_SIZE;
sb_size = PAGE_SIZE * MAX_ORDER_NR_PAGES;
sb_size = max_t(uint64_t, vm->device_block_size, sb_size);

if (sb_size < memory_block_size_bytes() && !force_bbm) {

@@ -228,7 +228,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
{
struct v9fs_inode *v9inode;

v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL);
v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL);
if (!v9inode)
return NULL;
#ifdef CONFIG_9P_FSCACHE

@@ -220,7 +220,7 @@ static struct kmem_cache *adfs_inode_cachep;
static struct inode *adfs_alloc_inode(struct super_block *sb)
{
struct adfs_inode_info *ei;
ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
ei = alloc_inode_sb(sb, adfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;

@@ -100,7 +100,7 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
{
struct affs_inode_info *i;

i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL);
i = alloc_inode_sb(sb, affs_inode_cachep, GFP_KERNEL);
if (!i)
return NULL;

@@ -679,7 +679,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
{
struct afs_vnode *vnode;

vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
vnode = alloc_inode_sb(sb, afs_inode_cachep, GFP_KERNEL);
if (!vnode)
return NULL;

@@ -277,7 +277,7 @@ befs_alloc_inode(struct super_block *sb)
{
struct befs_inode_info *bi;

bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
bi = alloc_inode_sb(sb, befs_inode_cachep, GFP_KERNEL);
if (!bi)
return NULL;
return &bi->vfs_inode;

@@ -239,7 +239,7 @@ static struct kmem_cache *bfs_inode_cachep;
static struct inode *bfs_alloc_inode(struct super_block *sb)
{
struct bfs_inode_info *bi;
bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL);
bi = alloc_inode_sb(sb, bfs_inode_cachep, GFP_KERNEL);
if (!bi)
return NULL;
return &bi->vfs_inode;

@@ -8819,7 +8819,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_inode *ei;
struct inode *inode;

ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;

@@ -1235,16 +1235,18 @@ static void bh_lru_install(struct buffer_head *bh)
int i;

check_irqs_on();
bh_lru_lock();

/*
* the refcount of buffer_head in bh_lru prevents dropping the
* attached page(i.e., try_to_free_buffers) so it could cause
* failing page migration.
* Skip putting upcoming bh into bh_lru until migration is done.
*/
if (lru_cache_disabled())
if (lru_cache_disabled()) {
bh_lru_unlock();
return;

bh_lru_lock();
}

b = this_cpu_ptr(&bh_lrus);
for (i = 0; i < BH_LRU_SIZE; i++) {
@@ -563,7 +563,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)

if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
fsc->write_congested = true;

req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,

@@ -623,7 +623,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)

if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
fsc->write_congested = false;

return err;
}

@@ -635,6 +635,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
BUG_ON(!inode);
ihold(inode);

if (wbc->sync_mode == WB_SYNC_NONE &&
ceph_inode_to_client(inode)->write_congested)
return AOP_WRITEPAGE_ACTIVATE;

wait_on_page_fscache(page);

err = writepage_nounlock(page, wbc);

@@ -707,8 +711,7 @@ static void writepages_finish(struct ceph_osd_request *req)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
clear_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
fsc->write_congested = false;

ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);

@@ -760,6 +763,10 @@ static int ceph_writepages_start(struct address_space *mapping,
bool done = false;
bool caching = ceph_is_cache_enabled(inode);

if (wbc->sync_mode == WB_SYNC_NONE &&
fsc->write_congested)
return 0;

dout("writepages_start %p (mode=%s)\n", inode,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));

@@ -954,11 +961,8 @@ static int ceph_writepages_start(struct address_space *mapping,

if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
fsc->mount_options->congestion_kb)) {
set_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
}

fsc->mount_options->congestion_kb))
fsc->write_congested = true;

pages[locked_pages++] = page;
pvec.pages[i] = NULL;

@@ -447,7 +447,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
struct ceph_inode_info *ci;
int i;

ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
if (!ci)
return NULL;

@@ -802,6 +802,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->have_copy_from2 = true;

atomic_long_set(&fsc->writeback_count, 0);
fsc->write_congested = false;

err = -ENOMEM;
/*

@@ -121,6 +121,7 @@ struct ceph_fs_client {
struct ceph_mds_client *mdsc;

atomic_long_t writeback_count;
bool write_congested;

struct workqueue_struct *inode_wq;
struct workqueue_struct *cap_wq;

@@ -362,7 +362,7 @@ static struct inode *
cifs_alloc_inode(struct super_block *sb)
{
struct cifsInodeInfo *cifs_inode;
cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL);
cifs_inode = alloc_inode_sb(sb, cifs_inode_cachep, GFP_KERNEL);
if (!cifs_inode)
return NULL;
cifs_inode->cifsAttrs = 0x20; /* default */

@@ -43,7 +43,7 @@ static struct kmem_cache * coda_inode_cachep;
static struct inode *coda_alloc_inode(struct super_block *sb)
{
struct coda_inode_info *ei;
ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
ei = alloc_inode_sb(sb, coda_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
memset(&ei->c_fid, 0, sizeof(struct CodaFid));

@@ -1766,7 +1766,8 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
char *dname;
int err;

dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru,
GFP_KERNEL);
if (!dentry)
return NULL;
@@ -38,7 +38,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
struct ecryptfs_inode_info *inode_info;
struct inode *inode = NULL;

inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL);
inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL);
if (unlikely(!inode_info))
goto out;
if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) {

@@ -69,7 +69,7 @@ static struct kmem_cache * efs_inode_cachep;
static struct inode *efs_alloc_inode(struct super_block *sb)
{
struct efs_inode_info *ei;
ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
ei = alloc_inode_sb(sb, efs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;

@@ -84,7 +84,7 @@ static void erofs_inode_init_once(void *ptr)
static struct inode *erofs_alloc_inode(struct super_block *sb)
{
struct erofs_inode *vi =
kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL);
alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL);

if (!vi)
return NULL;

@@ -183,7 +183,7 @@ static struct inode *exfat_alloc_inode(struct super_block *sb)
{
struct exfat_inode_info *ei;

ei = kmem_cache_alloc(exfat_inode_cachep, GFP_NOFS);
ei = alloc_inode_sb(sb, exfat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;

@@ -170,11 +170,6 @@ static void ext2_preread_inode(struct inode *inode)
unsigned long offset;
unsigned long block;
struct ext2_group_desc * gdp;
struct backing_dev_info *bdi;

bdi = inode_to_bdi(inode);
if (bdi_rw_congested(bdi))
return;

block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL);

@@ -180,7 +180,7 @@ static struct kmem_cache * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
ei = alloc_inode_sb(sb, ext2_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->i_block_alloc_info = NULL;

@@ -1316,7 +1316,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
{
struct ext4_inode_info *ei;

ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;

@@ -1504,9 +1504,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
if (IS_NOQUOTA(cc->inode))
return 0;
ret = 0;
cond_resched();
congestion_wait(BLK_RW_ASYNC,
DEFAULT_IO_TIMEOUT);
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto retry_write;
}
return ret;

@@ -3041,8 +3041,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
} else if (ret == -EAGAIN) {
ret = 0;
if (wbc->sync_mode == WB_SYNC_ALL) {
cond_resched();
congestion_wait(BLK_RW_ASYNC,
f2fs_io_schedule_timeout(
DEFAULT_IO_TIMEOUT);
goto retry_write;
}

@@ -4538,6 +4538,12 @@ static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
}

static inline void f2fs_io_schedule_timeout(long timeout)
{
set_current_state(TASK_UNINTERRUPTIBLE);
io_schedule_timeout(timeout);
}

#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */

@@ -313,8 +313,7 @@ void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure)
skip:
iput(inode);
}
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
cond_resched();
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
if (gc_failure) {
if (++looped >= count)
return;

@@ -803,8 +802,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
do {
ret = __submit_flush_wait(sbi, FDEV(i).bdev);
if (ret)
congestion_wait(BLK_RW_ASYNC,
DEFAULT_IO_TIMEOUT);
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
} while (ret && --count);

if (ret) {

@@ -3137,7 +3135,7 @@ static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi,
blk_finish_plug(&plug);
mutex_unlock(&dcc->cmd_lock);
trimmed += __wait_all_discard_cmd(sbi, NULL);
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto next;
}
skip:

@@ -1345,8 +1345,12 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;

fi = f2fs_kmem_cache_alloc(f2fs_inode_cachep,
GFP_F2FS_ZERO, false, F2FS_SB(sb));
if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) {
f2fs_show_injection_info(F2FS_SB(sb), FAULT_SLAB_ALLOC);
return NULL;
}

fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO);
if (!fi)
return NULL;

@@ -2145,8 +2149,7 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
/* we should flush all the data to keep data consistency */
do {
sync_inodes_sb(sbi->sb);
cond_resched();
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
} while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--);

if (unlikely(retry < 0))

@@ -2514,8 +2517,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
&page, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC,
DEFAULT_IO_TIMEOUT);
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto retry;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);

@@ -745,7 +745,7 @@ static struct kmem_cache *fat_inode_cachep;
static struct inode *fat_alloc_inode(struct super_block *sb)
{
struct msdos_inode_info *ei;
ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
ei = alloc_inode_sb(sb, fat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;

@@ -124,7 +124,7 @@ static struct inode *vxfs_alloc_inode(struct super_block *sb)
{
struct vxfs_inode_info *vi;

vi = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL);
vi = alloc_inode_sb(sb, vxfs_inode_cachep, GFP_KERNEL);
if (!vi)
return NULL;
inode_init_once(&vi->vfs_inode);
@@ -893,43 +893,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);

/**
* inode_congested - test whether an inode is congested
* @inode: inode to test for congestion (may be NULL)
* @cong_bits: mask of WB_[a]sync_congested bits to test
*
* Tests whether @inode is congested. @cong_bits is the mask of congestion
* bits to test and the return value is the mask of set bits.
*
* If cgroup writeback is enabled for @inode, the congestion state is
* determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
* associated with @inode is congested; otherwise, the root wb's congestion
* state is used.
*
* @inode is allowed to be NULL as this function is often called on
* mapping->host which is NULL for the swapper space.
*/
int inode_congested(struct inode *inode, int cong_bits)
{
/*
* Once set, ->i_wb never becomes NULL while the inode is alive.
* Start transaction iff ->i_wb is visible.
*/
if (inode && inode_to_wb_is_valid(inode)) {
struct bdi_writeback *wb;
struct wb_lock_cookie lock_cookie = {};
bool congested;

wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
congested = wb_congested(wb, cong_bits);
unlocked_inode_to_wb_end(inode, &lock_cookie);
return congested;
}

return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
}
EXPORT_SYMBOL_GPL(inode_congested);

/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to

@@ -2233,7 +2196,6 @@ void wb_workfn(struct work_struct *work)
long pages_written;

set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
current->flags |= PF_SWAPWRITE;

if (likely(!current_is_workqueue_rescuer() ||
!test_bit(WB_registered, &wb->state))) {

@@ -2262,8 +2224,6 @@ void wb_workfn(struct work_struct *work)
wb_wakeup(wb);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
wb_wakeup_delayed(wb);

current->flags &= ~PF_SWAPWRITE;
}

/*

@@ -164,7 +164,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
{
unsigned val;
struct fuse_conn *fc;
struct fuse_mount *fm;
ssize_t ret;

ret = fuse_conn_limit_write(file, buf, count, ppos, &val,

@@ -178,22 +177,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
down_read(&fc->killsb);
spin_lock(&fc->bg_lock);
fc->congestion_threshold = val;

/*
* Get any fuse_mount belonging to this fuse_conn; s_bdi is
* shared between all of them
*/

if (!list_empty(&fc->mounts)) {
fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry);
if (fc->num_background < fc->congestion_threshold) {
clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
} else {
set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
}
spin_unlock(&fc->bg_lock);
up_read(&fc->killsb);
fuse_conn_put(fc);

@@ -315,10 +315,6 @@ void fuse_request_end(struct fuse_req *req)
wake_up(&fc->blocked_waitq);
}

if (fc->num_background == fc->congestion_threshold && fm->sb) {
clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
fc->num_background--;
fc->active_background--;
flush_bg_queue(fc);

@@ -540,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_req *req)
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
if (fc->num_background == fc->congestion_threshold && fm->sb) {
set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
queued = true;

@@ -966,6 +966,14 @@ static void fuse_readahead(struct readahead_control *rac)
struct fuse_io_args *ia;
struct fuse_args_pages *ap;

if (fc->num_background >= fc->congestion_threshold &&
rac->ra->async_size >= readahead_count(rac))
/*
* Congested and only async pages left, so skip the
* rest.
*/
break;

nr_pages = readahead_count(rac) - nr_pages;
if (nr_pages > max_pages)
nr_pages = max_pages;

@@ -1959,6 +1967,7 @@ static int fuse_writepage_locked(struct page *page)

static int fuse_writepage(struct page *page, struct writeback_control *wbc)
{
struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
int err;

if (fuse_page_is_writeback(page->mapping->host, page->index)) {

@@ -1974,6 +1983,10 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}

if (wbc->sync_mode == WB_SYNC_NONE &&
fc->num_background >= fc->congestion_threshold)
return AOP_WRITEPAGE_ACTIVATE;

err = fuse_writepage_locked(page);
unlock_page(page);

@@ -2227,6 +2240,10 @@ static int fuse_writepages(struct address_space *mapping,
if (fuse_is_bad(inode))
goto out;

if (wbc->sync_mode == WB_SYNC_NONE &&
fc->num_background >= fc->congestion_threshold)
return 0;

data.inode = inode;
data.wpa = NULL;
data.ff = NULL;

@@ -72,7 +72,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
{
struct fuse_inode *fi;

fi = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL);
fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL);
if (!fi)
return NULL;
@@ -1425,7 +1425,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
{
struct gfs2_inode *ip;

ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL);
if (!ip)
return NULL;
ip->i_flags = 0;

@@ -162,7 +162,7 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
{
struct hfs_inode_info *i;

i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL);
i = alloc_inode_sb(sb, hfs_inode_cachep, GFP_KERNEL);
return i ? &i->vfs_inode : NULL;
}

@@ -624,7 +624,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
{
struct hfsplus_inode_info *i;

i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL);
i = alloc_inode_sb(sb, hfsplus_inode_cachep, GFP_KERNEL);
return i ? &i->vfs_inode : NULL;
}

@@ -222,7 +222,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
struct hostfs_inode_info *hi;

hi = kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
hi = alloc_inode_sb(sb, hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
if (hi == NULL)
return NULL;
hi->fd = -1;

@@ -232,7 +232,7 @@ static struct kmem_cache * hpfs_inode_cachep;
static struct inode *hpfs_alloc_inode(struct super_block *sb)
{
struct hpfs_inode_info *ei;
ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS);
ei = alloc_inode_sb(sb, hpfs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
return &ei->vfs_inode;

@@ -1110,7 +1110,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)

if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
return NULL;
p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
if (unlikely(!p)) {
hugetlbfs_inc_free_inodes(sbinfo);
return NULL;

@@ -259,7 +259,7 @@ static struct inode *alloc_inode(struct super_block *sb)
if (ops->alloc_inode)
inode = ops->alloc_inode(sb);
else
inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);

if (!inode)
return NULL;

@@ -70,7 +70,7 @@ static struct kmem_cache *isofs_inode_cachep;
static struct inode *isofs_alloc_inode(struct super_block *sb)
{
struct iso_inode_info *ei;
ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL);
ei = alloc_inode_sb(sb, isofs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;

@@ -39,7 +39,7 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
{
struct jffs2_inode_info *f;

f = kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL);
f = alloc_inode_sb(sb, jffs2_inode_cachep, GFP_KERNEL);
if (!f)
return NULL;
return &f->vfs_inode;

@@ -102,7 +102,7 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
{
struct jfs_inode_info *jfs_inode;

jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS);
jfs_inode = alloc_inode_sb(sb, jfs_inode_cachep, GFP_NOFS);
if (!jfs_inode)
return NULL;
#ifdef CONFIG_QUOTA

@@ -63,7 +63,7 @@ static struct kmem_cache * minix_inode_cachep;
static struct inode *minix_alloc_inode(struct super_block *sb)
{
struct minix_inode_info *ei;
ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
ei = alloc_inode_sb(sb, minix_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;

@@ -2597,6 +2597,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
struct super_block *sb = mnt->mnt_sb;

if (!__mnt_is_readonly(mnt) &&
(!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
char *buf = (char *)__get_free_page(GFP_KERNEL);
char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);

@@ -2611,6 +2612,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
tm.tm_year+1900, (unsigned long long)sb->s_time_max);

free_page((unsigned long)buf);
sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
}
}

@@ -2238,7 +2238,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
struct inode *nfs_alloc_inode(struct super_block *sb)
{
struct nfs_inode *nfsi;
nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
nfsi = alloc_inode_sb(sb, nfs_inode_cachep, GFP_KERNEL);
if (!nfsi)
return NULL;
nfsi->flags = 0UL;

@@ -417,7 +417,7 @@ static void nfs_set_page_writeback(struct page *page)

if (atomic_long_inc_return(&nfss->writeback) >
NFS_CONGESTION_ON_THRESH)
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
nfss->write_congested = 1;
}

static void nfs_end_page_writeback(struct nfs_page *req)

@@ -433,7 +433,7 @@ static void nfs_end_page_writeback(struct nfs_page *req)

end_page_writeback(req->wb_page);
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
nfss->write_congested = 0;
}

/*

@@ -672,6 +672,10 @@ static int nfs_writepage_locked(struct page *page,
struct inode *inode = page_file_mapping(page)->host;
int err;

if (wbc->sync_mode == WB_SYNC_NONE &&
NFS_SERVER(inode)->write_congested)
return AOP_WRITEPAGE_ACTIVATE;

nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, 0,
false, &nfs_async_write_completion_ops);

@@ -719,6 +723,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
int priority = 0;
int err;

if (wbc->sync_mode == WB_SYNC_NONE &&
NFS_SERVER(inode)->write_congested)
return 0;

nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);

if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||

@@ -1893,7 +1901,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
}
nfss = NFS_SERVER(data->inode);
if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
nfss->write_congested = 0;

nfs_init_cinfo(&cinfo, data->inode, data->dreq);
nfs_commit_end(cinfo.mds);
@@ -340,18 +340,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
struct nilfs_write_info *wi)
{
struct bio *bio = wi->bio;
int err;

if (segbuf->sb_nbio > 0 &&
bdi_write_congested(segbuf->sb_super->s_bdi)) {
wait_for_completion(&segbuf->sb_bio_event);
segbuf->sb_nbio--;
if (unlikely(atomic_read(&segbuf->sb_err))) {
bio_put(bio);
err = -EIO;
goto failed;
}
}

bio->bi_end_io = nilfs_end_bio_write;
bio->bi_private = segbuf;

@@ -363,10 +351,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
wi->start = wi->end;
return 0;

failed:
wi->bio = NULL;
return err;
}

static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,

@@ -151,7 +151,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
{
struct nilfs_inode_info *ii;

ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
ii = alloc_inode_sb(sb, nilfs_inode_cachep, GFP_NOFS);
if (!ii)
return NULL;
ii->i_bh = NULL;

@@ -310,7 +310,7 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
ntfs_inode *ni;

ntfs_debug("Entering.");
ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS);
ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);

@@ -1881,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi)
}
/* Now allocate memory for the attribute list. */
ni->attr_list_size = (u32)ntfs_attr_size(a);
if (!ni->attr_list_size) {
ntfs_error(sb, "Attr_list_size is zero");
goto put_err_out;
}
ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
if (!ni->attr_list) {
ntfs_error(sb, "Not enough memory to allocate buffer "

@@ -399,7 +399,7 @@ static struct kmem_cache *ntfs_inode_cachep;

static struct inode *ntfs_alloc_inode(struct super_block *sb)
{
struct ntfs_inode *ni = kmem_cache_alloc(ntfs_inode_cachep, GFP_NOFS);
struct ntfs_inode *ni = alloc_inode_sb(sb, ntfs_inode_cachep, GFP_NOFS);

if (!ni)
return NULL;
Some files were not shown because too many files have changed in this diff.