mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-01 10:43:43 +00:00
MM patches for 6.2-rc1.
- More userfaultfs work from Peter Xu. - Several convert-to-folios series from Sidhartha Kumar and Huang Ying. - Some filemap cleanups from Vishal Moola. - David Hildenbrand added the ability to selftest anon memory COW handling. - Some cpuset simplifications from Liu Shixin. - Addition of vmalloc tracing support by Uladzislau Rezki. - Some pagecache folioifications and simplifications from Matthew Wilcox. - A pagemap cleanup from Kefeng Wang: we have VM_ACCESS_FLAGS, so use it. - Miguel Ojeda contributed some cleanups for our use of the __no_sanitize_thread__ gcc keyword. This series shold have been in the non-MM tree, my bad. - Naoya Horiguchi improved the interaction between memory poisoning and memory section removal for huge pages. - DAMON cleanups and tuneups from SeongJae Park - Tony Luck fixed the handling of COW faults against poisoned pages. - Peter Xu utilized the PTE marker code for handling swapin errors. - Hugh Dickins reworked compound page mapcount handling, simplifying it and making it more efficient. - Removal of the autonuma savedwrite infrastructure from Nadav Amit and David Hildenbrand. - zram support for multiple compression streams from Sergey Senozhatsky. - David Hildenbrand reworked the GUP code's R/O long-term pinning so that drivers no longer need to use the FOLL_FORCE workaround which didn't work very well anyway. - Mel Gorman altered the page allocator so that local IRQs can remnain enabled during per-cpu page allocations. - Vishal Moola removed the try_to_release_page() wrapper. - Stefan Roesch added some per-BDI sysfs tunables which are used to prevent network block devices from dirtying excessive amounts of pagecache. - David Hildenbrand did some cleanup and repair work on KSM COW breaking. - Nhat Pham and Johannes Weiner have implemented writeback in zswap's zsmalloc backend. - Brian Foster has fixed a longstanding corner-case oddity in file[map]_write_and_wait_range(). 
- sparse-vmemmap changes for MIPS, LoongArch and NIOS2 from Feiyang Chen. - Shiyang Ruan has done some work on fsdax, to make its reflink mode work better under xfstests. Better, but still not perfect. - Christoph Hellwig has removed the .writepage() method from several filesystems. They only need .writepages(). - Yosry Ahmed wrote a series which fixes the memcg reclaim target beancounting. - David Hildenbrand has fixed some of our MM selftests for 32-bit machines. - Many singleton patches, as usual. -----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCY5j6ZwAKCRDdBJ7gKXxA jkDYAP9qNeVqp9iuHjZNTqzMXkfmJPsw2kmy2P+VdzYVuQRcJgEAgoV9d7oMq4ml CodAgiA51qwzId3GRytIo/tfWZSezgA= =d19R -----END PGP SIGNATURE----- Merge tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Pull MM updates from Andrew Morton: - More userfaultfs work from Peter Xu - Several convert-to-folios series from Sidhartha Kumar and Huang Ying - Some filemap cleanups from Vishal Moola - David Hildenbrand added the ability to selftest anon memory COW handling - Some cpuset simplifications from Liu Shixin - Addition of vmalloc tracing support by Uladzislau Rezki - Some pagecache folioifications and simplifications from Matthew Wilcox - A pagemap cleanup from Kefeng Wang: we have VM_ACCESS_FLAGS, so use it - Miguel Ojeda contributed some cleanups for our use of the __no_sanitize_thread__ gcc keyword. 
This series should have been in the non-MM tree, my bad - Naoya Horiguchi improved the interaction between memory poisoning and memory section removal for huge pages - DAMON cleanups and tuneups from SeongJae Park - Tony Luck fixed the handling of COW faults against poisoned pages - Peter Xu utilized the PTE marker code for handling swapin errors - Hugh Dickins reworked compound page mapcount handling, simplifying it and making it more efficient - Removal of the autonuma savedwrite infrastructure from Nadav Amit and David Hildenbrand - zram support for multiple compression streams from Sergey Senozhatsky - David Hildenbrand reworked the GUP code's R/O long-term pinning so that drivers no longer need to use the FOLL_FORCE workaround which didn't work very well anyway - Mel Gorman altered the page allocator so that local IRQs can remnain enabled during per-cpu page allocations - Vishal Moola removed the try_to_release_page() wrapper - Stefan Roesch added some per-BDI sysfs tunables which are used to prevent network block devices from dirtying excessive amounts of pagecache - David Hildenbrand did some cleanup and repair work on KSM COW breaking - Nhat Pham and Johannes Weiner have implemented writeback in zswap's zsmalloc backend - Brian Foster has fixed a longstanding corner-case oddity in file[map]_write_and_wait_range() - sparse-vmemmap changes for MIPS, LoongArch and NIOS2 from Feiyang Chen - Shiyang Ruan has done some work on fsdax, to make its reflink mode work better under xfstests. Better, but still not perfect - Christoph Hellwig has removed the .writepage() method from several filesystems. 
They only need .writepages() - Yosry Ahmed wrote a series which fixes the memcg reclaim target beancounting - David Hildenbrand has fixed some of our MM selftests for 32-bit machines - Many singleton patches, as usual * tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (313 commits) mm/hugetlb: set head flag before setting compound_order in __prep_compound_gigantic_folio mm: mmu_gather: allow more than one batch of delayed rmaps mm: fix typo in struct pglist_data code comment kmsan: fix memcpy tests mm: add cond_resched() in swapin_walk_pmd_entry() mm: do not show fs mm pc for VM_LOCKONFAULT pages selftests/vm: ksm_functional_tests: fixes for 32bit selftests/vm: cow: fix compile warning on 32bit selftests/vm: madv_populate: fix missing MADV_POPULATE_(READ|WRITE) definitions mm/gup_test: fix PIN_LONGTERM_TEST_READ with highmem mm,thp,rmap: fix races between updates of subpages_mapcount mm: memcg: fix swapcached stat accounting mm: add nodes= arg to memory.reclaim mm: disable top-tier fallback to reclaim on proactive reclaim selftests: cgroup: make sure reclaim target memcg is unprotected selftests: cgroup: refactor proactive reclaim code to reclaim_until() mm: memcg: fix stale protection of reclaim target memcg mm/mmap: properly unaccount memory on mas_preallocate() failure omfs: remove ->writepage jfs: remove ->writepage ...
This commit is contained in:
commit
e2ca6ba6ba
@ -137,3 +137,17 @@ Description:
|
||||
The writeback_limit file is read-write and specifies the maximum
|
||||
amount of writeback ZRAM can do. The limit could be changed
|
||||
in run time.
|
||||
|
||||
What: /sys/block/zram<id>/recomp_algorithm
|
||||
Date: November 2022
|
||||
Contact: Sergey Senozhatsky <senozhatsky@chromium.org>
|
||||
Description:
|
||||
The recomp_algorithm file is read-write and allows one to set
|
||||
or show secondary compression algorithms.
|
||||
|
||||
What: /sys/block/zram<id>/recompress
|
||||
Date: November 2022
|
||||
Contact: Sergey Senozhatsky <senozhatsky@chromium.org>
|
||||
Description:
|
||||
The recompress file is write-only and triggers re-compression
|
||||
with secondary compression algorithms.
|
||||
|
@ -44,6 +44,21 @@ Description:
|
||||
|
||||
(read-write)
|
||||
|
||||
What: /sys/class/bdi/<bdi>/min_ratio_fine
|
||||
Date: November 2022
|
||||
Contact: Stefan Roesch <shr@devkernel.io>
|
||||
Description:
|
||||
Under normal circumstances each device is given a part of the
|
||||
total write-back cache that relates to its current average
|
||||
writeout speed in relation to the other devices.
|
||||
|
||||
The 'min_ratio_fine' parameter allows assigning a minimum reserve
|
||||
of the write-back cache to a particular device. The value is
|
||||
expressed as part of 1 million. For example, this is useful for
|
||||
providing a minimum QoS.
|
||||
|
||||
(read-write)
|
||||
|
||||
What: /sys/class/bdi/<bdi>/max_ratio
|
||||
Date: January 2008
|
||||
Contact: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
@ -55,6 +70,59 @@ Description:
|
||||
mount that is prone to get stuck, or a FUSE mount which cannot
|
||||
be trusted to play fair.
|
||||
|
||||
(read-write)
|
||||
|
||||
What: /sys/class/bdi/<bdi>/max_ratio_fine
|
||||
Date: November 2022
|
||||
Contact: Stefan Roesch <shr@devkernel.io>
|
||||
Description:
|
||||
Allows limiting a particular device to use not more than the
|
||||
given value of the write-back cache. The value is given as part
|
||||
of 1 million. This is useful in situations where we want to avoid
|
||||
one device taking all or most of the write-back cache. For example
|
||||
in case of an NFS mount that is prone to get stuck, or a FUSE mount
|
||||
which cannot be trusted to play fair.
|
||||
|
||||
(read-write)
|
||||
|
||||
What: /sys/class/bdi/<bdi>/min_bytes
|
||||
Date: October 2022
|
||||
Contact: Stefan Roesch <shr@devkernel.io>
|
||||
Description:
|
||||
Under normal circumstances each device is given a part of the
|
||||
total write-back cache that relates to its current average
|
||||
writeout speed in relation to the other devices.
|
||||
|
||||
The 'min_bytes' parameter allows assigning a minimum
|
||||
amount of the write-back cache to a particular device
|
||||
expressed in bytes.
|
||||
For example, this is useful for providing a minimum QoS.
|
||||
|
||||
(read-write)
|
||||
|
||||
What: /sys/class/bdi/<bdi>/max_bytes
|
||||
Date: October 2022
|
||||
Contact: Stefan Roesch <shr@devkernel.io>
|
||||
Description:
|
||||
Allows limiting a particular device to use not more than the
|
||||
given 'max_bytes' of the write-back cache. This is useful in
|
||||
situations where we want to avoid one device taking all or
|
||||
most of the write-back cache. For example in case of an NFS
|
||||
mount that is prone to get stuck, a FUSE mount which cannot be
|
||||
trusted to play fair, or a nbd device.
|
||||
|
||||
(read-write)
|
||||
|
||||
What: /sys/class/bdi/<bdi>/strict_limit
|
||||
Date: October 2022
|
||||
Contact: Stefan Roesch <shr@devkernel.io>
|
||||
Description:
|
||||
Forces per-BDI checks for the share of given device in the write-back
|
||||
cache even before the global background dirty limit is reached. This
|
||||
is useful in situations where the global limit is much higher than
|
||||
affordable for given relatively slow (or untrusted) device. Turning
|
||||
strict_limit on has no visible effect if max_ratio is equal to 100%.
|
||||
|
||||
(read-write)
|
||||
What: /sys/class/bdi/<bdi>/stable_pages_required
|
||||
Date: January 2008
|
||||
|
@ -27,6 +27,10 @@ Description: Writing 'on' or 'off' to this file makes the kdamond starts or
|
||||
makes the kdamond read the user inputs in the sysfs files
|
||||
except 'state' again. Writing 'update_schemes_stats' to the
|
||||
file updates contents of schemes stats files of the kdamond.
|
||||
Writing 'update_schemes_tried_regions' to the file updates
|
||||
contents of 'tried_regions' directory of every scheme directory
|
||||
of this kdamond. Writing 'clear_schemes_tried_regions' to the
|
||||
file removes contents of the 'tried_regions' directory.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/pid
|
||||
Date: Mar 2022
|
||||
@ -283,3 +287,31 @@ Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the number of the exceed events of
|
||||
the scheme's quotas.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/start
|
||||
Date: Oct 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the start address of a memory region
|
||||
that corresponding DAMON-based Operation Scheme's action has
|
||||
tried to be applied.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/end
|
||||
Date: Oct 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the end address of a memory region
|
||||
that corresponding DAMON-based Operation Scheme's action has
|
||||
tried to be applied.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/nr_accesses
|
||||
Date: Oct 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the 'nr_accesses' of a memory region
|
||||
that corresponding DAMON-based Operation Scheme's action has
|
||||
tried to be applied.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/age
|
||||
Date: Oct 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Reading this file returns the 'age' of a memory region that
|
||||
corresponding DAMON-based Operation Scheme's action has tried
|
||||
to be applied.
|
||||
|
@ -348,8 +348,13 @@ this can be accomplished with::
|
||||
|
||||
echo huge_idle > /sys/block/zramX/writeback
|
||||
|
||||
If a user chooses to writeback only incompressible pages (pages that none of
|
||||
the algorithms can compress) this can be accomplished with::
|
||||
|
||||
echo incompressible > /sys/block/zramX/writeback
|
||||
|
||||
If an admin wants to write a specific page in zram device to the backing device,
|
||||
they could write a page index into the interface.
|
||||
they could write a page index into the interface::
|
||||
|
||||
echo "page_index=1251" > /sys/block/zramX/writeback
|
||||
|
||||
@ -401,6 +406,87 @@ budget in next setting is user's job.
|
||||
If admin wants to measure writeback count in a certain period, they could
|
||||
know it via /sys/block/zram0/bd_stat's 3rd column.
|
||||
|
||||
recompression
|
||||
-------------
|
||||
|
||||
With CONFIG_ZRAM_MULTI_COMP, zram can recompress pages using alternative
|
||||
(secondary) compression algorithms. The basic idea is that alternative
|
||||
compression algorithm can provide better compression ratio at a price of
|
||||
(potentially) slower compression/decompression speeds. Alternative compression
|
||||
algorithm can, for example, be more successful compressing huge pages (those
|
||||
that default algorithm failed to compress). Another application is idle pages
|
||||
recompression - pages that are cold and sit in the memory can be recompressed
|
||||
using more effective algorithm and, hence, reduce zsmalloc memory usage.
|
||||
|
||||
With CONFIG_ZRAM_MULTI_COMP, zram supports up to 4 compression algorithms:
|
||||
one primary and up to 3 secondary ones. Primary zram compressor is explained
|
||||
in "3) Select compression algorithm", secondary algorithms are configured
|
||||
using recomp_algorithm device attribute.
|
||||
|
||||
Example::
|
||||
|
||||
#show supported recompression algorithms
|
||||
cat /sys/block/zramX/recomp_algorithm
|
||||
#1: lzo lzo-rle lz4 lz4hc [zstd]
|
||||
#2: lzo lzo-rle lz4 [lz4hc] zstd
|
||||
|
||||
Alternative compression algorithms are sorted by priority. In the example
|
||||
above, zstd is used as the first alternative algorithm, which has priority
|
||||
of 1, while lz4hc is configured as a compression algorithm with priority 2.
|
||||
Alternative compression algorithm's priority is provided during algorithms
|
||||
configuration::
|
||||
|
||||
#select zstd recompression algorithm, priority 1
|
||||
echo "algo=zstd priority=1" > /sys/block/zramX/recomp_algorithm
|
||||
|
||||
#select deflate recompression algorithm, priority 2
|
||||
echo "algo=deflate priority=2" > /sys/block/zramX/recomp_algorithm
|
||||
|
||||
Another device attribute that CONFIG_ZRAM_MULTI_COMP enables is recompress,
|
||||
which controls recompression.
|
||||
|
||||
Examples::
|
||||
|
||||
#IDLE pages recompression is activated by `idle` mode
|
||||
echo "type=idle" > /sys/block/zramX/recompress
|
||||
|
||||
#HUGE pages recompression is activated by `huge` mode
|
||||
echo "type=huge" > /sys/block/zramX/recompress
|
||||
|
||||
#HUGE_IDLE pages recompression is activated by `huge_idle` mode
|
||||
echo "type=huge_idle" > /sys/block/zramX/recompress
|
||||
|
||||
The number of idle pages can be significant, so user-space can pass a size
|
||||
threshold (in bytes) to the recompress knob: zram will recompress only pages
|
||||
of equal or greater size::
|
||||
|
||||
#recompress all pages larger than 3000 bytes
|
||||
echo "threshold=3000" > /sys/block/zramX/recompress
|
||||
|
||||
#recompress idle pages larger than 2000 bytes
|
||||
echo "type=idle threshold=2000" > /sys/block/zramX/recompress
|
||||
|
||||
Recompression of idle pages requires memory tracking.
|
||||
|
||||
During re-compression for every page, that matches re-compression criteria,
|
||||
ZRAM iterates the list of registered alternative compression algorithms in
|
||||
order of their priorities. ZRAM stops either when re-compression was
|
||||
successful (re-compressed object is smaller in size than the original one)
|
||||
and matches re-compression criteria (e.g. size threshold) or when there are
|
||||
no secondary algorithms left to try. If none of the secondary algorithms can
|
||||
successfully re-compress the page, such a page is marked as incompressible,
|
||||
so ZRAM will not attempt to re-compress it in the future.
|
||||
|
||||
This re-compression behaviour, when it iterates through the list of
|
||||
registered compression algorithms, increases our chances of finding the
|
||||
algorithm that successfully compresses a particular page. Sometimes, however,
|
||||
it is convenient (and sometimes even necessary) to limit recompression to
|
||||
only one particular algorithm so that it will not try any other algorithms.
|
||||
This can be achieved by providing an algo=NAME parameter::
|
||||
|
||||
#use zstd algorithm only (if registered)
|
||||
echo "type=huge algo=zstd" > /sys/block/zramX/recompress
|
||||
|
||||
memory tracking
|
||||
===============
|
||||
|
||||
@ -411,9 +497,11 @@ pages of the process with*pagemap.
|
||||
If you enable the feature, you could see block state via
|
||||
/sys/kernel/debug/zram/zram0/block_state. The output is as follows::
|
||||
|
||||
300 75.033841 .wh.
|
||||
301 63.806904 s...
|
||||
302 63.806919 ..hi
|
||||
300 75.033841 .wh...
|
||||
301 63.806904 s.....
|
||||
302 63.806919 ..hi..
|
||||
303 62.801919 ....r.
|
||||
304 146.781902 ..hi.n
|
||||
|
||||
First column
|
||||
zram's block index.
|
||||
@ -430,6 +518,10 @@ Third column
|
||||
huge page
|
||||
i:
|
||||
idle page
|
||||
r:
|
||||
recompressed page (secondary compression algorithm)
|
||||
n:
|
||||
none (including secondary) of algorithms could compress it
|
||||
|
||||
First line of above example says 300th block is accessed at 75.033841sec
|
||||
and the block's state is huge so it is written back to the backing
|
||||
|
@ -543,7 +543,8 @@ inactive_anon # of bytes of anonymous and swap cache memory on inactive
|
||||
LRU list.
|
||||
active_anon # of bytes of anonymous and swap cache memory on active
|
||||
LRU list.
|
||||
inactive_file # of bytes of file-backed memory on inactive LRU list.
|
||||
inactive_file # of bytes of file-backed memory and MADV_FREE anonymous memory
|
||||
(LazyFree pages) on inactive LRU list.
|
||||
active_file # of bytes of file-backed memory on active LRU list.
|
||||
unevictable # of bytes of memory that cannot be reclaimed (mlocked etc).
|
||||
=============== ===============================================================
|
||||
|
@ -1245,17 +1245,13 @@ PAGE_SIZE multiple when read back.
|
||||
This is a simple interface to trigger memory reclaim in the
|
||||
target cgroup.
|
||||
|
||||
This file accepts a single key, the number of bytes to reclaim.
|
||||
No nested keys are currently supported.
|
||||
This file accepts a string which contains the number of bytes to
|
||||
reclaim.
|
||||
|
||||
Example::
|
||||
|
||||
echo "1G" > memory.reclaim
|
||||
|
||||
The interface can be later extended with nested keys to
|
||||
configure the reclaim behavior. For example, specify the
|
||||
type of memory to reclaim from (anon, file, ..).
|
||||
|
||||
Please note that the kernel can over or under reclaim from
|
||||
the target cgroup. If fewer bytes are reclaimed than the
|
||||
specified amount, -EAGAIN is returned.
|
||||
@ -1267,6 +1263,13 @@ PAGE_SIZE multiple when read back.
|
||||
This means that the networking layer will not adapt based on
|
||||
reclaim induced by memory.reclaim.
|
||||
|
||||
This file also allows the user to specify the nodes to reclaim from,
|
||||
via the 'nodes=' key, for example::
|
||||
|
||||
echo "1G nodes=0,1" > memory.reclaim
|
||||
|
||||
The above instructs the kernel to reclaim memory from nodes 0,1.
|
||||
|
||||
memory.peak
|
||||
A read-only single value file which exists on non-root
|
||||
cgroups.
|
||||
@ -1488,12 +1491,18 @@ PAGE_SIZE multiple when read back.
|
||||
pgscan_direct (npn)
|
||||
Amount of scanned pages directly (in an inactive LRU list)
|
||||
|
||||
pgscan_khugepaged (npn)
|
||||
Amount of scanned pages by khugepaged (in an inactive LRU list)
|
||||
|
||||
pgsteal_kswapd (npn)
|
||||
Amount of reclaimed pages by kswapd
|
||||
|
||||
pgsteal_direct (npn)
|
||||
Amount of reclaimed pages directly
|
||||
|
||||
pgsteal_khugepaged (npn)
|
||||
Amount of reclaimed pages by khugepaged
|
||||
|
||||
pgfault (npn)
|
||||
Total number of page faults incurred
|
||||
|
||||
|
@ -88,6 +88,9 @@ comma (","). ::
|
||||
│ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil
|
||||
│ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low
|
||||
│ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
|
||||
│ │ │ │ │ │ │ tried_regions/
|
||||
│ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age
|
||||
│ │ │ │ │ │ │ │ ...
|
||||
│ │ │ │ │ │ ...
|
||||
│ │ │ │ ...
|
||||
│ │ ...
|
||||
@ -125,7 +128,14 @@ in the state. Writing ``commit`` to the ``state`` file makes kdamond reads the
|
||||
user inputs in the sysfs files except ``state`` file again. Writing
|
||||
``update_schemes_stats`` to ``state`` file updates the contents of stats files
|
||||
for each DAMON-based operation scheme of the kdamond. For details of the
|
||||
stats, please refer to :ref:`stats section <sysfs_schemes_stats>`.
|
||||
stats, please refer to :ref:`stats section <sysfs_schemes_stats>`. Writing
|
||||
``update_schemes_tried_regions`` to ``state`` file updates the DAMON-based
|
||||
operation scheme action tried regions directory for each DAMON-based operation
|
||||
scheme of the kdamond. Writing ``clear_schemes_tried_regions`` to ``state``
|
||||
file clears the DAMON-based operation scheme action tried regions directory for
|
||||
each DAMON-based operation scheme of the kdamond. For details of the
|
||||
DAMON-based operation scheme action tried regions directory, please refer to
|
||||
:ref:`tried_regions section <sysfs_schemes_tried_regions>`.
|
||||
|
||||
If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread.
|
||||
|
||||
@ -166,6 +176,8 @@ You can set and get what type of monitoring operations DAMON will use for the
|
||||
context by writing one of the keywords listed in ``avail_operations`` file and
|
||||
reading from the ``operations`` file.
|
||||
|
||||
.. _sysfs_monitoring_attrs:
|
||||
|
||||
contexts/<N>/monitoring_attrs/
|
||||
------------------------------
|
||||
|
||||
@ -235,6 +247,9 @@ In each region directory, you will find two files (``start`` and ``end``). You
|
||||
can set and get the start and end addresses of the initial monitoring target
|
||||
region by writing to and reading from the files, respectively.
|
||||
|
||||
Each region should not overlap with others. ``end`` of directory ``N`` should
|
||||
be equal or smaller than ``start`` of directory ``N+1``.
|
||||
|
||||
contexts/<N>/schemes/
|
||||
---------------------
|
||||
|
||||
@ -252,8 +267,9 @@ to ``N-1``. Each directory represents each DAMON-based operation scheme.
|
||||
schemes/<N>/
|
||||
------------
|
||||
|
||||
In each scheme directory, four directories (``access_pattern``, ``quotas``,
|
||||
``watermarks``, and ``stats``) and one file (``action``) exist.
|
||||
In each scheme directory, five directories (``access_pattern``, ``quotas``,
|
||||
``watermarks``, ``stats``, and ``tried_regions``) and one file (``action``)
|
||||
exist.
|
||||
|
||||
The ``action`` file is for setting and getting what action you want to apply to
|
||||
memory regions having specific access pattern of the interest. The keywords
|
||||
@ -348,6 +364,32 @@ should ask DAMON sysfs interface to update the content of the files for the
|
||||
stats by writing a special keyword, ``update_schemes_stats`` to the relevant
|
||||
``kdamonds/<N>/state`` file.
|
||||
|
||||
.. _sysfs_schemes_tried_regions:
|
||||
|
||||
schemes/<N>/tried_regions/
|
||||
--------------------------
|
||||
|
||||
When a special keyword, ``update_schemes_tried_regions``, is written to the
|
||||
relevant ``kdamonds/<N>/state`` file, DAMON creates directories named integer
|
||||
starting from ``0`` under this directory. Each directory contains files
|
||||
exposing detailed information about each of the memory regions that the
|
||||
corresponding scheme's ``action`` has tried to be applied under this directory,
|
||||
during next :ref:`aggregation interval <sysfs_monitoring_attrs>`. The
|
||||
information includes address range, ``nr_accesses``, and ``age`` of the
|
||||
region.
|
||||
|
||||
The directories will be removed when another special keyword,
|
||||
``clear_schemes_tried_regions``, is written to the relevant
|
||||
``kdamonds/<N>/state`` file.
|
||||
|
||||
tried_regions/<N>/
|
||||
------------------
|
||||
|
||||
In each region directory, you will find four files (``start``, ``end``,
|
||||
``nr_accesses``, and ``age``). Reading the files will show the start and end
|
||||
addresses, ``nr_accesses``, and ``age`` of the region that corresponding
|
||||
DAMON-based operation scheme ``action`` has tried to be applied.
|
||||
|
||||
Example
|
||||
~~~~~~~
|
||||
|
||||
@ -465,8 +507,9 @@ regions in case of physical memory monitoring. Therefore, users should set the
|
||||
monitoring target regions by themselves.
|
||||
|
||||
In such cases, users can explicitly set the initial monitoring target regions
|
||||
as they want, by writing proper values to the ``init_regions`` file. Each line
|
||||
of the input should represent one region in below form.::
|
||||
as they want, by writing proper values to the ``init_regions`` file. The input
|
||||
should be a sequence of three integers separated by white spaces that represent
|
||||
one region in below form.::
|
||||
|
||||
<target idx> <start address> <end address>
|
||||
|
||||
@ -481,9 +524,9 @@ ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one
|
||||
# cd <debugfs>/damon
|
||||
# cat target_ids
|
||||
42 4242
|
||||
# echo "0 1 100
|
||||
0 100 200
|
||||
1 20 40
|
||||
# echo "0 1 100 \
|
||||
0 100 200 \
|
||||
1 20 40 \
|
||||
1 50 100" > init_regions
|
||||
|
||||
Note that this sets the initial monitoring target regions only. In case of
|
||||
|
@ -428,14 +428,16 @@ with the memory region, as the case would be with BSS (uninitialized data).
|
||||
The "pathname" shows the name associated file for this mapping. If the mapping
|
||||
is not associated with a file:
|
||||
|
||||
============= ====================================
|
||||
=================== ===========================================
|
||||
[heap] the heap of the program
|
||||
[stack] the stack of the main process
|
||||
[vdso] the "virtual dynamic shared object",
|
||||
the kernel system call handler
|
||||
[anon:<name>] an anonymous mapping that has been
|
||||
[anon:<name>] a private anonymous mapping that has been
|
||||
named by userspace
|
||||
============= ====================================
|
||||
[anon_shmem:<name>] an anonymous shared memory mapping that has
|
||||
been named by userspace
|
||||
=================== ===========================================
|
||||
|
||||
or if empty, the mapping is anonymous.
|
||||
|
||||
|
@ -94,7 +94,7 @@ PMD Page Table Helpers
|
||||
+---------------------------+--------------------------------------------------+
|
||||
| pmd_trans_huge | Tests a Transparent Huge Page (THP) at PMD |
|
||||
+---------------------------+--------------------------------------------------+
|
||||
| pmd_present | Tests a valid mapped PMD |
|
||||
| pmd_present | Tests whether pmd_page() points to valid memory |
|
||||
+---------------------------+--------------------------------------------------+
|
||||
| pmd_young | Tests a young PMD |
|
||||
+---------------------------+--------------------------------------------------+
|
||||
|
@ -117,31 +117,15 @@ pages:
|
||||
- ->_refcount in tail pages is always zero: get_page_unless_zero() never
|
||||
succeeds on tail pages.
|
||||
|
||||
- map/unmap of the pages with PTE entry increment/decrement ->_mapcount
|
||||
on relevant sub-page of the compound page.
|
||||
- map/unmap of PMD entry for the whole compound page increment/decrement
|
||||
->compound_mapcount, stored in the first tail page of the compound page;
|
||||
and also increment/decrement ->subpages_mapcount (also in the first tail)
|
||||
by COMPOUND_MAPPED when compound_mapcount goes from -1 to 0 or 0 to -1.
|
||||
|
||||
- map/unmap of the whole compound page is accounted for in compound_mapcount
|
||||
(stored in first tail page). For file huge pages, we also increment
|
||||
->_mapcount of all sub-pages in order to have race-free detection of
|
||||
last unmap of subpages.
|
||||
|
||||
PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
|
||||
|
||||
For anonymous pages, PageDoubleMap() also indicates ->_mapcount in all
|
||||
subpages is offset up by one. This additional reference is required to
|
||||
get race-free detection of unmap of subpages when we have them mapped with
|
||||
both PMDs and PTEs.
|
||||
|
||||
This optimization is required to lower the overhead of per-subpage mapcount
|
||||
tracking. The alternative is to alter ->_mapcount in all subpages on each
|
||||
map/unmap of the whole compound page.
|
||||
|
||||
For anonymous pages, we set PG_double_map when a PMD of the page is split
|
||||
for the first time, but still have a PMD mapping. The additional references
|
||||
go away with the last compound_mapcount.
|
||||
|
||||
File pages get PG_double_map set on the first map of the page with PTE and
|
||||
goes away when the page gets evicted from the page cache.
|
||||
- map/unmap of sub-pages with PTE entry increment/decrement ->_mapcount
|
||||
on relevant sub-page of the compound page, and also increment/decrement
|
||||
->subpages_mapcount, stored in first tail page of the compound page, when
|
||||
_mapcount goes from -1 to 0 or 0 to -1: counting sub-pages mapped by PTE.
|
||||
|
||||
split_huge_page internally has to distribute the refcounts in the head
|
||||
page to the tail pages before clearing all PG_head/tail bits from the page
|
||||
|
12
MAINTAINERS
12
MAINTAINERS
@ -13399,10 +13399,20 @@ F: include/linux/memory_hotplug.h
|
||||
F: include/linux/mm.h
|
||||
F: include/linux/mmzone.h
|
||||
F: include/linux/pagewalk.h
|
||||
F: include/linux/vmalloc.h
|
||||
F: mm/
|
||||
F: tools/testing/selftests/vm/
|
||||
|
||||
VMALLOC
|
||||
M: Andrew Morton <akpm@linux-foundation.org>
|
||||
R: Uladzislau Rezki <urezki@gmail.com>
|
||||
R: Christoph Hellwig <hch@infradead.org>
|
||||
L: linux-mm@kvack.org
|
||||
S: Maintained
|
||||
W: http://www.linux-mm.org
|
||||
T: git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
|
||||
F: include/linux/vmalloc.h
|
||||
F: mm/vmalloc.c
|
||||
|
||||
MEMORY HOT(UN)PLUG
|
||||
M: David Hildenbrand <david@redhat.com>
|
||||
M: Oscar Salvador <osalvador@suse.de>
|
||||
|
@ -313,8 +313,6 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
|
||||
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
|
||||
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
|
||||
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
#define pte_ERROR(e) \
|
||||
printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
|
||||
#define pmd_ERROR(e) \
|
||||
|
@ -120,8 +120,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
|
||||
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
|
||||
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
|
||||
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
#include <asm/hugepage.h>
|
||||
#endif
|
||||
|
@ -21,8 +21,6 @@
|
||||
#define pgd_none(pgd) (0)
|
||||
#define pgd_bad(pgd) (0)
|
||||
#define pgd_clear(pgdp)
|
||||
#define kern_addr_valid(addr) (1)
|
||||
/* FIXME */
|
||||
/*
|
||||
* PMD_SHIFT determines the size of the area a second-level page table can map
|
||||
* PGDIR_SHIFT determines what a third-level page table entry can map
|
||||
|
@ -300,10 +300,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
|
||||
*/
|
||||
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
|
||||
|
||||
/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
|
||||
/* FIXME: this is not correct */
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
/*
|
||||
* We provide our own arch_get_unmapped_area to cope with VIPT caches.
|
||||
*/
|
||||
|
@ -1020,8 +1020,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
|
||||
*/
|
||||
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
|
||||
|
||||
extern int kern_addr_valid(unsigned long addr);
|
||||
|
||||
#ifdef CONFIG_ARM64_MTE
|
||||
|
||||
#define __HAVE_ARCH_PREPARE_TO_SWAP
|
||||
|
@ -814,53 +814,6 @@ void __init paging_init(void)
|
||||
create_idmap();
|
||||
}
|
||||
|
||||
/*
|
||||
* Check whether a kernel address is valid (derived from arch/x86/).
|
||||
*/
|
||||
int kern_addr_valid(unsigned long addr)
|
||||
{
|
||||
pgd_t *pgdp;
|
||||
p4d_t *p4dp;
|
||||
pud_t *pudp, pud;
|
||||
pmd_t *pmdp, pmd;
|
||||
pte_t *ptep, pte;
|
||||
|
||||
addr = arch_kasan_reset_tag(addr);
|
||||
if ((((long)addr) >> VA_BITS) != -1UL)
|
||||
return 0;
|
||||
|
||||
pgdp = pgd_offset_k(addr);
|
||||
if (pgd_none(READ_ONCE(*pgdp)))
|
||||
return 0;
|
||||
|
||||
p4dp = p4d_offset(pgdp, addr);
|
||||
if (p4d_none(READ_ONCE(*p4dp)))
|
||||
return 0;
|
||||
|
||||
pudp = pud_offset(p4dp, addr);
|
||||
pud = READ_ONCE(*pudp);
|
||||
if (pud_none(pud))
|
||||
return 0;
|
||||
|
||||
if (pud_sect(pud))
|
||||
return pfn_valid(pud_pfn(pud));
|
||||
|
||||
pmdp = pmd_offset(pudp, addr);
|
||||
pmd = READ_ONCE(*pmdp);
|
||||
if (pmd_none(pmd))
|
||||
return 0;
|
||||
|
||||
if (pmd_sect(pmd))
|
||||
return pfn_valid(pmd_pfn(pmd));
|
||||
|
||||
ptep = pte_offset_kernel(pmdp, addr);
|
||||
pte = READ_ONCE(*ptep);
|
||||
if (pte_none(pte))
|
||||
return 0;
|
||||
|
||||
return pfn_valid(pte_pfn(pte));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
static void free_hotplug_page_range(struct page *page, size_t size,
|
||||
struct vmem_altmap *altmap)
|
||||
@ -1184,53 +1137,28 @@ static void free_empty_tables(unsigned long addr, unsigned long end,
|
||||
}
|
||||
#endif
|
||||
|
||||
void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
|
||||
unsigned long addr, unsigned long next)
|
||||
{
|
||||
pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
|
||||
}
|
||||
|
||||
int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
|
||||
unsigned long addr, unsigned long next)
|
||||
{
|
||||
vmemmap_verify((pte_t *)pmdp, node, addr, next);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
|
||||
struct vmem_altmap *altmap)
|
||||
{
|
||||
unsigned long addr = start;
|
||||
unsigned long next;
|
||||
pgd_t *pgdp;
|
||||
p4d_t *p4dp;
|
||||
pud_t *pudp;
|
||||
pmd_t *pmdp;
|
||||
|
||||
WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
|
||||
|
||||
if (!IS_ENABLED(CONFIG_ARM64_4K_PAGES))
|
||||
return vmemmap_populate_basepages(start, end, node, altmap);
|
||||
|
||||
do {
|
||||
next = pmd_addr_end(addr, end);
|
||||
|
||||
pgdp = vmemmap_pgd_populate(addr, node);
|
||||
if (!pgdp)
|
||||
return -ENOMEM;
|
||||
|
||||
p4dp = vmemmap_p4d_populate(pgdp, addr, node);
|
||||
if (!p4dp)
|
||||
return -ENOMEM;
|
||||
|
||||
pudp = vmemmap_pud_populate(p4dp, addr, node);
|
||||
if (!pudp)
|
||||
return -ENOMEM;
|
||||
|
||||
pmdp = pmd_offset(pudp, addr);
|
||||
if (pmd_none(READ_ONCE(*pmdp))) {
|
||||
void *p = NULL;
|
||||
|
||||
p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
|
||||
if (!p) {
|
||||
if (vmemmap_populate_basepages(addr, next, node, altmap))
|
||||
return -ENOMEM;
|
||||
continue;
|
||||
}
|
||||
|
||||
pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
|
||||
} else
|
||||
vmemmap_verify((pte_t *)pmdp, node, addr, next);
|
||||
} while (addr = next, addr != end);
|
||||
|
||||
return 0;
|
||||
else
|
||||
return vmemmap_populate_hugepages(start, end, node, altmap);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
|
@ -202,8 +202,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
|
||||
|
||||
/*
|
||||
* This function is used to determine if a linear map page has been marked as
|
||||
* not-valid. Walk the page table and check the PTE_VALID bit. This is based
|
||||
* on kern_addr_valid(), which almost does what we need.
|
||||
* not-valid. Walk the page table and check the PTE_VALID bit.
|
||||
*
|
||||
* Because this is only called on the kernel linear map, p?d_sect() implies
|
||||
* p?d_present(). When debug_pagealloc is enabled, sections mappings are
|
||||
|
@ -249,9 +249,6 @@ extern void paging_init(void);
|
||||
void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
|
||||
pte_t *pte);
|
||||
|
||||
/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
|
||||
remap_pfn_range(vma, vaddr, pfn, size, prot)
|
||||
|
||||
|
@ -131,13 +131,6 @@ static inline void clear_page(void *page)
|
||||
|
||||
#define page_to_virt(page) __va(page_to_phys(page))
|
||||
|
||||
/*
|
||||
* For port to Hexagon Virtual Machine, MAYBE we check for attempts
|
||||
* to reference reserved HVM space, but in any case, the VM will be
|
||||
* protected.
|
||||
*/
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
#include <asm/mem-layout.h>
|
||||
#include <asm-generic/memory_model.h>
|
||||
/* XXX Todo: implement assembly-optimized version of getorder. */
|
||||
|
@ -181,22 +181,6 @@ ia64_phys_addr_valid (unsigned long addr)
|
||||
return (addr & (local_cpu_data->unimpl_pa_mask)) == 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel
|
||||
* memory. For the return value to be meaningful, ADDR must be >=
|
||||
* PAGE_OFFSET. This operation can be relatively expensive (e.g.,
|
||||
* require a hash-, or multi-level tree-lookup or something of that
|
||||
* sort) but it guarantees to return TRUE only if accessing the page
|
||||
* at that address does not cause an error. Note that there may be
|
||||
* addresses for which kern_addr_valid() returns FALSE even though an
|
||||
* access would not cause an error (e.g., this is typically true for
|
||||
* memory mapped I/O regions.
|
||||
*
|
||||
* XXX Need to implement this for IA-64.
|
||||
*/
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
|
||||
/*
|
||||
* Now come the defines and routines to manage and access the three-level
|
||||
* page table.
|
||||
|
@ -91,21 +91,6 @@ int prepare_hugepage_range(struct file *file,
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
|
||||
{
|
||||
struct page *page;
|
||||
pte_t *ptep;
|
||||
|
||||
if (REGION_NUMBER(addr) != RGN_HPAGE)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
ptep = huge_pte_offset(mm, addr, HPAGE_SIZE);
|
||||
if (!ptep || pte_none(*ptep))
|
||||
return NULL;
|
||||
page = pte_page(*ptep);
|
||||
page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
|
||||
return page;
|
||||
}
|
||||
int pmd_huge(pmd_t pmd)
|
||||
{
|
||||
return 0;
|
||||
|
@ -53,6 +53,7 @@ config LOONGARCH
|
||||
select ARCH_USE_QUEUED_RWLOCKS
|
||||
select ARCH_USE_QUEUED_SPINLOCKS
|
||||
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
|
||||
select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
|
||||
select ARCH_WANT_LD_ORPHAN_WARN
|
||||
select ARCH_WANTS_NO_INSTR
|
||||
select BUILDTIME_TABLE_SORT
|
||||
@ -488,6 +489,7 @@ config ARCH_FLATMEM_ENABLE
|
||||
|
||||
config ARCH_SPARSEMEM_ENABLE
|
||||
def_bool y
|
||||
select SPARSEMEM_VMEMMAP_ENABLE
|
||||
help
|
||||
Say Y to support efficient handling of sparse physical memory,
|
||||
for architectures which are either NUMA (Non-Uniform Memory Access)
|
||||
|
@ -42,15 +42,6 @@ static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
|
||||
|
||||
extern void pagetable_init(void);
|
||||
|
||||
/*
|
||||
* Initialize a new pmd table with invalid pointers.
|
||||
*/
|
||||
extern void pmd_init(unsigned long page, unsigned long pagetable);
|
||||
|
||||
/*
|
||||
* Initialize a new pgd / pmd table with invalid pointers.
|
||||
*/
|
||||
extern void pgd_init(unsigned long page);
|
||||
extern pgd_t *pgd_alloc(struct mm_struct *mm);
|
||||
|
||||
#define __pte_free_tlb(tlb, pte, address) \
|
||||
@ -76,7 +67,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
|
||||
}
|
||||
|
||||
pmd = (pmd_t *)page_address(pg);
|
||||
pmd_init((unsigned long)pmd, (unsigned long)invalid_pte_table);
|
||||
pmd_init(pmd);
|
||||
return pmd;
|
||||
}
|
||||
|
||||
@ -92,7 +83,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
|
||||
|
||||
pud = (pud_t *) __get_free_page(GFP_KERNEL);
|
||||
if (pud)
|
||||
pud_init((unsigned long)pud, (unsigned long)invalid_pmd_table);
|
||||
pud_init(pud);
|
||||
return pud;
|
||||
}
|
||||
|
||||
|
@ -11,6 +11,7 @@
|
||||
|
||||
#include <linux/compiler.h>
|
||||
#include <asm/addrspace.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/pgtable-bits.h>
|
||||
|
||||
#if CONFIG_PGTABLE_LEVELS == 2
|
||||
@ -59,6 +60,7 @@
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/mmzone.h>
|
||||
#include <asm/fixmap.h>
|
||||
#include <asm/sparsemem.h>
|
||||
|
||||
struct mm_struct;
|
||||
struct vm_area_struct;
|
||||
@ -86,7 +88,10 @@ extern unsigned long zero_page_mask;
|
||||
#define VMALLOC_START MODULES_END
|
||||
#define VMALLOC_END \
|
||||
(vm_map_base + \
|
||||
min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE)
|
||||
min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE - VMEMMAP_SIZE)
|
||||
|
||||
#define vmemmap ((struct page *)((VMALLOC_END + PMD_SIZE) & PMD_MASK))
|
||||
#define VMEMMAP_END ((unsigned long)vmemmap + VMEMMAP_SIZE - 1)
|
||||
|
||||
#define pte_ERROR(e) \
|
||||
pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
|
||||
@ -237,11 +242,11 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pm
|
||||
#define pfn_pmd(pfn, prot) __pmd(((pfn) << _PFN_SHIFT) | pgprot_val(prot))
|
||||
|
||||
/*
|
||||
* Initialize a new pgd / pmd table with invalid pointers.
|
||||
* Initialize a new pgd / pud / pmd table with invalid pointers.
|
||||
*/
|
||||
extern void pgd_init(unsigned long page);
|
||||
extern void pud_init(unsigned long page, unsigned long pagetable);
|
||||
extern void pmd_init(unsigned long page, unsigned long pagetable);
|
||||
extern void pgd_init(void *addr);
|
||||
extern void pud_init(void *addr);
|
||||
extern void pmd_init(void *addr);
|
||||
|
||||
/*
|
||||
* Non-present pages: high 40 bits are offset, next 8 bits type,
|
||||
@ -425,8 +430,6 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
|
||||
__update_tlb(vma, address, (pte_t *)pmdp);
|
||||
}
|
||||
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
static inline unsigned long pmd_pfn(pmd_t pmd)
|
||||
{
|
||||
return (pmd_val(pmd) & _PFN_MASK) >> _PFN_SHIFT;
|
||||
|
@ -11,8 +11,16 @@
|
||||
#define SECTION_SIZE_BITS 29 /* 2^29 = Largest Huge Page Size */
|
||||
#define MAX_PHYSMEM_BITS 48
|
||||
|
||||
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
||||
#define VMEMMAP_SIZE (sizeof(struct page) * (1UL << (cpu_pabits + 1 - PAGE_SHIFT)))
|
||||
#endif
|
||||
|
||||
#endif /* CONFIG_SPARSEMEM */
|
||||
|
||||
#ifndef VMEMMAP_SIZE
|
||||
#define VMEMMAP_SIZE 0 /* 1, For FLATMEM; 2, For SPARSEMEM without VMEMMAP. */
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
int memory_add_physaddr_to_nid(u64 addr);
|
||||
#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
|
||||
|
@ -78,7 +78,7 @@ void __init pcpu_populate_pte(unsigned long addr)
|
||||
new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
pgd_populate(&init_mm, pgd, new);
|
||||
#ifndef __PAGETABLE_PUD_FOLDED
|
||||
pud_init((unsigned long)new, (unsigned long)invalid_pmd_table);
|
||||
pud_init(new);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -89,7 +89,7 @@ void __init pcpu_populate_pte(unsigned long addr)
|
||||
new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
pud_populate(&init_mm, pud, new);
|
||||
#ifndef __PAGETABLE_PMD_FOLDED
|
||||
pmd_init((unsigned long)new, (unsigned long)invalid_pte_table);
|
||||
pmd_init(new);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -22,7 +22,7 @@
|
||||
#include <linux/pfn.h>
|
||||
#include <linux/hardirq.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/initrd.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/mmzone.h>
|
||||
|
||||
#include <asm/asm-offsets.h>
|
||||
@ -152,6 +152,45 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
||||
void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
|
||||
unsigned long addr, unsigned long next)
|
||||
{
|
||||
pmd_t entry;
|
||||
|
||||
entry = pfn_pmd(virt_to_pfn(p), PAGE_KERNEL);
|
||||
pmd_val(entry) |= _PAGE_HUGE | _PAGE_HGLOBAL;
|
||||
set_pmd_at(&init_mm, addr, pmd, entry);
|
||||
}
|
||||
|
||||
int __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
|
||||
unsigned long addr, unsigned long next)
|
||||
{
|
||||
int huge = pmd_val(*pmd) & _PAGE_HUGE;
|
||||
|
||||
if (huge)
|
||||
vmemmap_verify((pte_t *)pmd, node, addr, next);
|
||||
|
||||
return huge;
|
||||
}
|
||||
|
||||
int __meminit vmemmap_populate(unsigned long start, unsigned long end,
|
||||
int node, struct vmem_altmap *altmap)
|
||||
{
|
||||
#if CONFIG_PGTABLE_LEVELS == 2
|
||||
return vmemmap_populate_basepages(start, end, node, NULL);
|
||||
#else
|
||||
return vmemmap_populate_hugepages(start, end, node, NULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static pte_t *fixmap_pte(unsigned long addr)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
@ -168,7 +207,7 @@ static pte_t *fixmap_pte(unsigned long addr)
|
||||
new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
|
||||
pgd_populate(&init_mm, pgd, new);
|
||||
#ifndef __PAGETABLE_PUD_FOLDED
|
||||
pud_init((unsigned long)new, (unsigned long)invalid_pmd_table);
|
||||
pud_init(new);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -179,7 +218,7 @@ static pte_t *fixmap_pte(unsigned long addr)
|
||||
new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
|
||||
pud_populate(&init_mm, pud, new);
|
||||
#ifndef __PAGETABLE_PMD_FOLDED
|
||||
pmd_init((unsigned long)new, (unsigned long)invalid_pte_table);
|
||||
pmd_init(new);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -16,7 +16,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
ret = (pgd_t *) __get_free_page(GFP_KERNEL);
|
||||
if (ret) {
|
||||
init = pgd_offset(&init_mm, 0UL);
|
||||
pgd_init((unsigned long)ret);
|
||||
pgd_init(ret);
|
||||
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
|
||||
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
|
||||
}
|
||||
@ -25,7 +25,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(pgd_alloc);
|
||||
|
||||
void pgd_init(unsigned long page)
|
||||
void pgd_init(void *addr)
|
||||
{
|
||||
unsigned long *p, *end;
|
||||
unsigned long entry;
|
||||
@ -38,7 +38,7 @@ void pgd_init(unsigned long page)
|
||||
entry = (unsigned long)invalid_pte_table;
|
||||
#endif
|
||||
|
||||
p = (unsigned long *) page;
|
||||
p = (unsigned long *)addr;
|
||||
end = p + PTRS_PER_PGD;
|
||||
|
||||
do {
|
||||
@ -56,11 +56,12 @@ void pgd_init(unsigned long page)
|
||||
EXPORT_SYMBOL_GPL(pgd_init);
|
||||
|
||||
#ifndef __PAGETABLE_PMD_FOLDED
|
||||
void pmd_init(unsigned long addr, unsigned long pagetable)
|
||||
void pmd_init(void *addr)
|
||||
{
|
||||
unsigned long *p, *end;
|
||||
unsigned long pagetable = (unsigned long)invalid_pte_table;
|
||||
|
||||
p = (unsigned long *) addr;
|
||||
p = (unsigned long *)addr;
|
||||
end = p + PTRS_PER_PMD;
|
||||
|
||||
do {
|
||||
@ -79,9 +80,10 @@ EXPORT_SYMBOL_GPL(pmd_init);
|
||||
#endif
|
||||
|
||||
#ifndef __PAGETABLE_PUD_FOLDED
|
||||
void pud_init(unsigned long addr, unsigned long pagetable)
|
||||
void pud_init(void *addr)
|
||||
{
|
||||
unsigned long *p, *end;
|
||||
unsigned long pagetable = (unsigned long)invalid_pmd_table;
|
||||
|
||||
p = (unsigned long *)addr;
|
||||
end = p + PTRS_PER_PUD;
|
||||
@ -98,6 +100,7 @@ void pud_init(unsigned long addr, unsigned long pagetable)
|
||||
p[-1] = pagetable;
|
||||
} while (p != end);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(pud_init);
|
||||
#endif
|
||||
|
||||
pmd_t mk_pmd(struct page *page, pgprot_t prot)
|
||||
@ -119,12 +122,12 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
|
||||
void __init pagetable_init(void)
|
||||
{
|
||||
/* Initialize the entire pgd. */
|
||||
pgd_init((unsigned long)swapper_pg_dir);
|
||||
pgd_init((unsigned long)invalid_pg_dir);
|
||||
pgd_init(swapper_pg_dir);
|
||||
pgd_init(invalid_pg_dir);
|
||||
#ifndef __PAGETABLE_PUD_FOLDED
|
||||
pud_init((unsigned long)invalid_pud_table, (unsigned long)invalid_pmd_table);
|
||||
pud_init(invalid_pud_table);
|
||||
#endif
|
||||
#ifndef __PAGETABLE_PMD_FOLDED
|
||||
pmd_init((unsigned long)invalid_pmd_table, (unsigned long)invalid_pte_table);
|
||||
pmd_init(invalid_pmd_table);
|
||||
#endif
|
||||
}
|
||||
|
@ -145,8 +145,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
|
||||
|
||||
#endif /* !__ASSEMBLY__ */
|
||||
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
/* MMU-specific headers */
|
||||
|
||||
#ifdef CONFIG_SUN3
|
||||
|
@ -20,7 +20,6 @@
|
||||
#define pgd_none(pgd) (0)
|
||||
#define pgd_bad(pgd) (0)
|
||||
#define pgd_clear(pgdp)
|
||||
#define kern_addr_valid(addr) (1)
|
||||
#define pmd_offset(a, b) ((void *)0)
|
||||
|
||||
#define PAGE_NONE __pgprot(0)
|
||||
|
@ -416,9 +416,6 @@ extern unsigned long iopa(unsigned long addr);
|
||||
#define IOMAP_NOCACHE_NONSER 2
|
||||
#define IOMAP_NO_COPYBACK 3
|
||||
|
||||
/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
void do_page_fault(struct pt_regs *regs, unsigned long address,
|
||||
unsigned long error_code);
|
||||
|
||||
|
@ -33,7 +33,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
|
||||
/*
|
||||
* Initialize a new pmd table with invalid pointers.
|
||||
*/
|
||||
extern void pmd_init(unsigned long page, unsigned long pagetable);
|
||||
extern void pmd_init(void *addr);
|
||||
|
||||
#ifndef __PAGETABLE_PMD_FOLDED
|
||||
|
||||
@ -44,9 +44,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Initialize a new pgd / pmd table with invalid pointers.
|
||||
* Initialize a new pgd table with invalid pointers.
|
||||
*/
|
||||
extern void pgd_init(unsigned long page);
|
||||
extern void pgd_init(void *addr);
|
||||
extern pgd_t *pgd_alloc(struct mm_struct *mm);
|
||||
|
||||
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
|
||||
@ -77,7 +77,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
|
||||
}
|
||||
|
||||
pmd = (pmd_t *)page_address(pg);
|
||||
pmd_init((unsigned long)pmd, (unsigned long)invalid_pte_table);
|
||||
pmd_init(pmd);
|
||||
return pmd;
|
||||
}
|
||||
|
||||
@ -93,7 +93,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
|
||||
|
||||
pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_TABLE_ORDER);
|
||||
if (pud)
|
||||
pud_init((unsigned long)pud, (unsigned long)invalid_pmd_table);
|
||||
pud_init(pud);
|
||||
return pud;
|
||||
}
|
||||
|
||||
|
@ -313,11 +313,11 @@ static inline pmd_t *pud_pgtable(pud_t pud)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Initialize a new pgd / pmd table with invalid pointers.
|
||||
* Initialize a new pgd / pud / pmd table with invalid pointers.
|
||||
*/
|
||||
extern void pgd_init(unsigned long page);
|
||||
extern void pud_init(unsigned long page, unsigned long pagetable);
|
||||
extern void pmd_init(unsigned long page, unsigned long pagetable);
|
||||
extern void pgd_init(void *addr);
|
||||
extern void pud_init(void *addr);
|
||||
extern void pmd_init(void *addr);
|
||||
|
||||
/*
|
||||
* Non-present pages: high 40 bits are offset, next 8 bits type,
|
||||
|
@ -550,8 +550,6 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
|
||||
__update_tlb(vma, address, pte);
|
||||
}
|
||||
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
/*
|
||||
* Allow physical addresses to be fixed up to help 36-bit peripherals.
|
||||
*/
|
||||
|
@ -122,8 +122,7 @@ static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache,
|
||||
if (!cache)
|
||||
return NULL;
|
||||
new_pmd = kvm_mmu_memory_cache_alloc(cache);
|
||||
pmd_init((unsigned long)new_pmd,
|
||||
(unsigned long)invalid_pte_table);
|
||||
pmd_init(new_pmd);
|
||||
pud_populate(NULL, pud, new_pmd);
|
||||
}
|
||||
pmd = pmd_offset(pud, addr);
|
||||
|
@ -13,9 +13,9 @@
|
||||
#include <asm/pgalloc.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
void pgd_init(unsigned long page)
|
||||
void pgd_init(void *addr)
|
||||
{
|
||||
unsigned long *p = (unsigned long *) page;
|
||||
unsigned long *p = (unsigned long *)addr;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < USER_PTRS_PER_PGD; i+=8) {
|
||||
@ -61,9 +61,8 @@ void __init pagetable_init(void)
|
||||
#endif
|
||||
|
||||
/* Initialize the entire pgd. */
|
||||
pgd_init((unsigned long)swapper_pg_dir);
|
||||
pgd_init((unsigned long)swapper_pg_dir
|
||||
+ sizeof(pgd_t) * USER_PTRS_PER_PGD);
|
||||
pgd_init(swapper_pg_dir);
|
||||
pgd_init(&swapper_pg_dir[USER_PTRS_PER_PGD]);
|
||||
|
||||
pgd_base = swapper_pg_dir;
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
#include <asm/pgalloc.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
void pgd_init(unsigned long page)
|
||||
void pgd_init(void *addr)
|
||||
{
|
||||
unsigned long *p, *end;
|
||||
unsigned long entry;
|
||||
@ -26,7 +26,7 @@ void pgd_init(unsigned long page)
|
||||
entry = (unsigned long)invalid_pte_table;
|
||||
#endif
|
||||
|
||||
p = (unsigned long *) page;
|
||||
p = (unsigned long *) addr;
|
||||
end = p + PTRS_PER_PGD;
|
||||
|
||||
do {
|
||||
@ -43,11 +43,12 @@ void pgd_init(unsigned long page)
|
||||
}
|
||||
|
||||
#ifndef __PAGETABLE_PMD_FOLDED
|
||||
void pmd_init(unsigned long addr, unsigned long pagetable)
|
||||
void pmd_init(void *addr)
|
||||
{
|
||||
unsigned long *p, *end;
|
||||
unsigned long pagetable = (unsigned long)invalid_pte_table;
|
||||
|
||||
p = (unsigned long *) addr;
|
||||
p = (unsigned long *)addr;
|
||||
end = p + PTRS_PER_PMD;
|
||||
|
||||
do {
|
||||
@ -66,9 +67,10 @@ EXPORT_SYMBOL_GPL(pmd_init);
|
||||
#endif
|
||||
|
||||
#ifndef __PAGETABLE_PUD_FOLDED
|
||||
void pud_init(unsigned long addr, unsigned long pagetable)
|
||||
void pud_init(void *addr)
|
||||
{
|
||||
unsigned long *p, *end;
|
||||
unsigned long pagetable = (unsigned long)invalid_pmd_table;
|
||||
|
||||
p = (unsigned long *)addr;
|
||||
end = p + PTRS_PER_PUD;
|
||||
@ -108,12 +110,12 @@ void __init pagetable_init(void)
|
||||
pgd_t *pgd_base;
|
||||
|
||||
/* Initialize the entire pgd. */
|
||||
pgd_init((unsigned long)swapper_pg_dir);
|
||||
pgd_init(swapper_pg_dir);
|
||||
#ifndef __PAGETABLE_PUD_FOLDED
|
||||
pud_init((unsigned long)invalid_pud_table, (unsigned long)invalid_pmd_table);
|
||||
pud_init(invalid_pud_table);
|
||||
#endif
|
||||
#ifndef __PAGETABLE_PMD_FOLDED
|
||||
pmd_init((unsigned long)invalid_pmd_table, (unsigned long)invalid_pte_table);
|
||||
pmd_init(invalid_pmd_table);
|
||||
#endif
|
||||
pgd_base = swapper_pg_dir;
|
||||
/*
|
||||
|
@ -15,7 +15,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER);
|
||||
if (ret) {
|
||||
init = pgd_offset(&init_mm, 0UL);
|
||||
pgd_init((unsigned long)ret);
|
||||
pgd_init(ret);
|
||||
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
|
||||
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
|
||||
}
|
||||
|
@ -26,11 +26,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
|
||||
set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize a new pmd table with invalid pointers.
|
||||
*/
|
||||
extern void pmd_init(unsigned long page, unsigned long pagetable);
|
||||
|
||||
extern pgd_t *pgd_alloc(struct mm_struct *mm);
|
||||
|
||||
#define __pte_free_tlb(tlb, pte, addr) \
|
||||
|
@ -249,8 +249,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
|
||||
#define __swp_entry_to_pte(swp) ((pte_t) { (swp).val })
|
||||
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
|
||||
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
extern void __init paging_init(void);
|
||||
extern void __init mmu_init(void);
|
||||
|
||||
|
@ -50,9 +50,6 @@ struct thread_struct {
|
||||
unsigned long kpsr;
|
||||
};
|
||||
|
||||
#define INIT_MMAP \
|
||||
{ &init_mm, (0), (0), __pgprot(0x0), VM_READ | VM_WRITE | VM_EXEC }
|
||||
|
||||
# define INIT_THREAD { \
|
||||
.kregs = NULL, \
|
||||
.ksp = 0, \
|
||||
|
@ -395,8 +395,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
|
||||
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
|
||||
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
|
||||
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
typedef pte_t *pte_addr_t;
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
@ -23,21 +23,6 @@
|
||||
#include <asm/processor.h>
|
||||
#include <asm/cache.h>
|
||||
|
||||
/*
|
||||
* kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel
|
||||
* memory. For the return value to be meaningful, ADDR must be >=
|
||||
* PAGE_OFFSET. This operation can be relatively expensive (e.g.,
|
||||
* require a hash-, or multi-level tree-lookup or something of that
|
||||
* sort) but it guarantees to return TRUE only if accessing the page
|
||||
* at that address does not cause an error. Note that there may be
|
||||
* addresses for which kern_addr_valid() returns FALSE even though an
|
||||
* access would not cause an error (e.g., this is typically true for
|
||||
* memory mapped I/O regions.
|
||||
*
|
||||
* XXX Need to implement this for parisc.
|
||||
*/
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
/* This is for the serialization of PxTLB broadcasts. At least on the N class
|
||||
* systems, only one PxTLB inter processor broadcast can be active at any one
|
||||
* time on the Merced bus. */
|
||||
|
@ -18,8 +18,7 @@
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/initrd.h>
|
||||
#include <linux/pgtable.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/mm.h>
|
||||
|
||||
#include <asm/pdc.h>
|
||||
#include <asm/pdcpat.h>
|
||||
@ -232,7 +231,7 @@ void __init pdc_pdt_init(void)
|
||||
|
||||
/* mark memory page bad */
|
||||
memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE);
|
||||
num_poisoned_pages_inc();
|
||||
num_poisoned_pages_inc(addr >> PAGE_SHIFT);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -401,35 +401,9 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
|
||||
#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
|
||||
#define pmdp_clear_flush_young pmdp_test_and_clear_young
|
||||
|
||||
static inline int __pte_write(pte_t pte)
|
||||
{
|
||||
return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
#define pte_savedwrite pte_savedwrite
|
||||
static inline bool pte_savedwrite(pte_t pte)
|
||||
{
|
||||
/*
|
||||
* Saved write ptes are prot none ptes that doesn't have
|
||||
* privileged bit sit. We mark prot none as one which has
|
||||
* present and pviliged bit set and RWX cleared. To mark
|
||||
* protnone which used to have _PAGE_WRITE set we clear
|
||||
* the privileged bit.
|
||||
*/
|
||||
return !(pte_raw(pte) & cpu_to_be64(_PAGE_RWX | _PAGE_PRIVILEGED));
|
||||
}
|
||||
#else
|
||||
#define pte_savedwrite pte_savedwrite
|
||||
static inline bool pte_savedwrite(pte_t pte)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int pte_write(pte_t pte)
|
||||
{
|
||||
return __pte_write(pte) || pte_savedwrite(pte);
|
||||
return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE));
|
||||
}
|
||||
|
||||
static inline int pte_read(pte_t pte)
|
||||
@ -441,24 +415,16 @@ static inline int pte_read(pte_t pte)
|
||||
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep)
|
||||
{
|
||||
if (__pte_write(*ptep))
|
||||
if (pte_write(*ptep))
|
||||
pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0);
|
||||
else if (unlikely(pte_savedwrite(*ptep)))
|
||||
pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0);
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
|
||||
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
{
|
||||
/*
|
||||
* We should not find protnone for hugetlb, but this complete the
|
||||
* interface.
|
||||
*/
|
||||
if (__pte_write(*ptep))
|
||||
if (pte_write(*ptep))
|
||||
pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1);
|
||||
else if (unlikely(pte_savedwrite(*ptep)))
|
||||
pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 1);
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
|
||||
@ -535,36 +501,6 @@ static inline int pte_protnone(pte_t pte)
|
||||
return (pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE | _PAGE_RWX)) ==
|
||||
cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE);
|
||||
}
|
||||
|
||||
#define pte_mk_savedwrite pte_mk_savedwrite
|
||||
static inline pte_t pte_mk_savedwrite(pte_t pte)
|
||||
{
|
||||
/*
|
||||
* Used by Autonuma subsystem to preserve the write bit
|
||||
* while marking the pte PROT_NONE. Only allow this
|
||||
* on PROT_NONE pte
|
||||
*/
|
||||
VM_BUG_ON((pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_RWX | _PAGE_PRIVILEGED)) !=
|
||||
cpu_to_be64(_PAGE_PRESENT | _PAGE_PRIVILEGED));
|
||||
return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED));
|
||||
}
|
||||
|
||||
#define pte_clear_savedwrite pte_clear_savedwrite
|
||||
static inline pte_t pte_clear_savedwrite(pte_t pte)
|
||||
{
|
||||
/*
|
||||
* Used by KSM subsystem to make a protnone pte readonly.
|
||||
*/
|
||||
VM_BUG_ON(!pte_protnone(pte));
|
||||
return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED));
|
||||
}
|
||||
#else
|
||||
#define pte_clear_savedwrite pte_clear_savedwrite
|
||||
static inline pte_t pte_clear_savedwrite(pte_t pte)
|
||||
{
|
||||
VM_WARN_ON(1);
|
||||
return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE));
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
static inline bool pte_hw_valid(pte_t pte)
|
||||
@ -641,8 +577,6 @@ static inline unsigned long pte_pfn(pte_t pte)
|
||||
/* Generic modifiers for PTE bits */
|
||||
static inline pte_t pte_wrprotect(pte_t pte)
|
||||
{
|
||||
if (unlikely(pte_savedwrite(pte)))
|
||||
return pte_clear_savedwrite(pte);
|
||||
return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE));
|
||||
}
|
||||
|
||||
@ -1139,8 +1073,6 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
|
||||
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
|
||||
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
|
||||
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
|
||||
#define pmd_mk_savedwrite(pmd) pte_pmd(pte_mk_savedwrite(pmd_pte(pmd)))
|
||||
#define pmd_clear_savedwrite(pmd) pte_pmd(pte_clear_savedwrite(pmd_pte(pmd)))
|
||||
|
||||
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
|
||||
#define pmd_soft_dirty(pmd) pte_soft_dirty(pmd_pte(pmd))
|
||||
@ -1162,8 +1094,6 @@ static inline int pmd_protnone(pmd_t pmd)
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
|
||||
#define __pmd_write(pmd) __pte_write(pmd_pte(pmd))
|
||||
#define pmd_savedwrite(pmd) pte_savedwrite(pmd_pte(pmd))
|
||||
|
||||
#define pmd_access_permitted pmd_access_permitted
|
||||
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
|
||||
@ -1241,10 +1171,8 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
|
||||
static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
|
||||
pmd_t *pmdp)
|
||||
{
|
||||
if (__pmd_write((*pmdp)))
|
||||
if (pmd_write(*pmdp))
|
||||
pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0);
|
||||
else if (unlikely(pmd_savedwrite(*pmdp)))
|
||||
pmd_hugepage_update(mm, addr, pmdp, 0, _PAGE_PRIVILEGED);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -81,13 +81,6 @@ void poking_init(void);
|
||||
extern unsigned long ioremap_bot;
|
||||
extern const pgprot_t protection_map[16];
|
||||
|
||||
/*
|
||||
* kern_addr_valid is intended to indicate whether an address is a valid
|
||||
* kernel address. Most 32-bit archs define it as always true (like this)
|
||||
* but most 64-bit archs actually perform a test. What should we do here?
|
||||
*/
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
#define pmd_large(pmd) 0
|
||||
#endif
|
||||
|
@ -265,7 +265,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
|
||||
}
|
||||
pte = kvmppc_read_update_linux_pte(ptep, writing);
|
||||
if (pte_present(pte) && !pte_protnone(pte)) {
|
||||
if (writing && !__pte_write(pte))
|
||||
if (writing && !pte_write(pte))
|
||||
/* make the actual HPTE be read-only */
|
||||
ptel = hpte_make_readonly(ptel);
|
||||
is_ci = pte_ci(pte);
|
||||
|
@ -506,43 +506,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
|
||||
} while (addr = next, addr != end);
|
||||
}
|
||||
|
||||
struct page *follow_huge_pd(struct vm_area_struct *vma,
|
||||
unsigned long address, hugepd_t hpd,
|
||||
int flags, int pdshift)
|
||||
{
|
||||
pte_t *ptep;
|
||||
spinlock_t *ptl;
|
||||
struct page *page = NULL;
|
||||
unsigned long mask;
|
||||
int shift = hugepd_shift(hpd);
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
|
||||
retry:
|
||||
/*
|
||||
* hugepage directory entries are protected by mm->page_table_lock
|
||||
* Use this instead of huge_pte_lockptr
|
||||
*/
|
||||
ptl = &mm->page_table_lock;
|
||||
spin_lock(ptl);
|
||||
|
||||
ptep = hugepte_offset(hpd, address, pdshift);
|
||||
if (pte_present(*ptep)) {
|
||||
mask = (1UL << shift) - 1;
|
||||
page = pte_page(*ptep);
|
||||
page += ((address & mask) >> PAGE_SHIFT);
|
||||
if (flags & FOLL_GET)
|
||||
get_page(page);
|
||||
} else {
|
||||
if (is_hugetlb_entry_migration(*ptep)) {
|
||||
spin_unlock(ptl);
|
||||
__migration_entry_wait(mm, ptep, ptl);
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
spin_unlock(ptl);
|
||||
return page;
|
||||
}
|
||||
|
||||
bool __init arch_hugetlb_valid_size(unsigned long size)
|
||||
{
|
||||
int shift = __ffs(size);
|
||||
|
@ -802,8 +802,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
|
||||
|
||||
#endif /* !CONFIG_MMU */
|
||||
|
||||
#define kern_addr_valid(addr) (1) /* FIXME */
|
||||
|
||||
extern char _start[];
|
||||
extern void *_dtb_early_va;
|
||||
extern uintptr_t _dtb_early_pa;
|
||||
|
@ -1774,8 +1774,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
|
||||
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
|
||||
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
|
||||
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
extern int vmem_add_mapping(unsigned long start, unsigned long size);
|
||||
extern void vmem_remove_mapping(unsigned long start, unsigned long size);
|
||||
extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc);
|
||||
|
@ -25,7 +25,8 @@
|
||||
void __tlb_remove_table(void *_table);
|
||||
static inline void tlb_flush(struct mmu_gather *tlb);
|
||||
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
|
||||
struct page *page, int page_size);
|
||||
struct encoded_page *page,
|
||||
int page_size);
|
||||
|
||||
#define tlb_flush tlb_flush
|
||||
#define pte_free_tlb pte_free_tlb
|
||||
@ -40,11 +41,15 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
|
||||
* Release the page cache reference for a pte removed by
|
||||
* tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
|
||||
* has already been freed, so just do free_page_and_swap_cache.
|
||||
*
|
||||
* s390 doesn't delay rmap removal, so there is nothing encoded in
|
||||
* the page pointer.
|
||||
*/
|
||||
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
|
||||
struct page *page, int page_size)
|
||||
struct encoded_page *page,
|
||||
int page_size)
|
||||
{
|
||||
free_page_and_swap_cache(page);
|
||||
free_page_and_swap_cache(encoded_page_ptr(page));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -336,12 +336,11 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
|
||||
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
|
||||
{
|
||||
struct page *page;
|
||||
unsigned long offset, mask;
|
||||
unsigned long offset;
|
||||
|
||||
offset = (unsigned long) entry / sizeof(unsigned long);
|
||||
offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
|
||||
mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
|
||||
page = virt_to_page((void *)((unsigned long) entry & mask));
|
||||
page = pmd_pgtable_page((pmd_t *) entry);
|
||||
return page->index + offset;
|
||||
}
|
||||
|
||||
|
@ -92,8 +92,6 @@ static inline unsigned long phys_addr_mask(void)
|
||||
|
||||
typedef pte_t *pte_addr_t;
|
||||
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT)))
|
||||
|
||||
struct vm_area_struct;
|
||||
|
@ -368,12 +368,6 @@ __get_iospace (unsigned long addr)
|
||||
}
|
||||
}
|
||||
|
||||
extern unsigned long *sparc_valid_addr_bitmap;
|
||||
|
||||
/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
|
||||
#define kern_addr_valid(addr) \
|
||||
(test_bit(__pa((unsigned long)(addr))>>20, sparc_valid_addr_bitmap))
|
||||
|
||||
/*
|
||||
* For sparc32&64, the pfn in io_remap_pfn_range() carries <iospace> in
|
||||
* its high 4 bits. These macros/functions put it there or get it from there.
|
||||
|
@ -37,8 +37,7 @@
|
||||
|
||||
#include "mm_32.h"
|
||||
|
||||
unsigned long *sparc_valid_addr_bitmap;
|
||||
EXPORT_SYMBOL(sparc_valid_addr_bitmap);
|
||||
static unsigned long *sparc_valid_addr_bitmap;
|
||||
|
||||
unsigned long phys_base;
|
||||
EXPORT_SYMBOL(phys_base);
|
||||
|
@ -1667,7 +1667,6 @@ bool kern_addr_valid(unsigned long addr)
|
||||
|
||||
return pfn_valid(pte_pfn(*pte));
|
||||
}
|
||||
EXPORT_SYMBOL(kern_addr_valid);
|
||||
|
||||
static unsigned long __ref kernel_map_hugepud(unsigned long vstart,
|
||||
unsigned long vend,
|
||||
|
@ -298,8 +298,6 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr);
|
||||
((swp_entry_t) { pte_val(pte_mkuptodate(pte)) })
|
||||
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
|
||||
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
/* Clear a kernel PTE and flush it from the TLB */
|
||||
#define kpte_clear_flush(ptep, vaddr) \
|
||||
do { \
|
||||
|
@ -292,7 +292,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
|
||||
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
|
||||
static inline int pte_uffd_wp(pte_t pte)
|
||||
{
|
||||
return pte_flags(pte) & _PAGE_UFFD_WP;
|
||||
bool wp = pte_flags(pte) & _PAGE_UFFD_WP;
|
||||
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
/*
|
||||
* Having write bit for wr-protect-marked present ptes is fatal,
|
||||
* because it means the uffd-wp bit will be ignored and write will
|
||||
* just go through.
|
||||
*
|
||||
* Use any chance of pgtable walking to verify this (e.g., when
|
||||
* page swapped out or being migrated for all purposes). It means
|
||||
* something is already wrong. Tell the admin even before the
|
||||
* process crashes. We also nail it with wrong pgtable setup.
|
||||
*/
|
||||
WARN_ON_ONCE(wp && pte_write(pte));
|
||||
#endif
|
||||
|
||||
return wp;
|
||||
}
|
||||
|
||||
static inline pte_t pte_mkuffd_wp(pte_t pte)
|
||||
|
@ -47,15 +47,6 @@ do { \
|
||||
|
||||
#endif /* !__ASSEMBLY__ */
|
||||
|
||||
/*
|
||||
* kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM
|
||||
*/
|
||||
#ifdef CONFIG_FLATMEM
|
||||
#define kern_addr_valid(addr) (1)
|
||||
#else
|
||||
#define kern_addr_valid(kaddr) (0)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This is used to calculate the .brk reservation for initial pagetables.
|
||||
* Enough space is reserved to allocate pagetables sufficient to cover all
|
||||
|
@ -240,7 +240,6 @@ static inline void native_pgd_clear(pgd_t *pgd)
|
||||
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
|
||||
#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val })
|
||||
|
||||
extern int kern_addr_valid(unsigned long addr);
|
||||
extern void cleanup_highmap(void);
|
||||
|
||||
#define HAVE_ARCH_UNMAPPED_AREA
|
||||
|
@ -268,7 +268,7 @@ static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
|
||||
unsigned long addr,
|
||||
unsigned long vm_flags)
|
||||
{
|
||||
unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
|
||||
unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
|
||||
struct sgx_encl_page *entry;
|
||||
|
||||
entry = xa_load(&encl->page_array, PFN_DOWN(addr));
|
||||
@ -502,7 +502,7 @@ static void sgx_vma_open(struct vm_area_struct *vma)
|
||||
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
|
||||
unsigned long end, unsigned long vm_flags)
|
||||
{
|
||||
unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
|
||||
unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
|
||||
struct sgx_encl_page *page;
|
||||
unsigned long count = 0;
|
||||
int ret = 0;
|
||||
|
@ -1416,47 +1416,6 @@ void mark_rodata_ro(void)
|
||||
debug_checkwx();
|
||||
}
|
||||
|
||||
int kern_addr_valid(unsigned long addr)
|
||||
{
|
||||
unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
|
||||
if (above != 0 && above != -1UL)
|
||||
return 0;
|
||||
|
||||
pgd = pgd_offset_k(addr);
|
||||
if (pgd_none(*pgd))
|
||||
return 0;
|
||||
|
||||
p4d = p4d_offset(pgd, addr);
|
||||
if (!p4d_present(*p4d))
|
||||
return 0;
|
||||
|
||||
pud = pud_offset(p4d, addr);
|
||||
if (!pud_present(*pud))
|
||||
return 0;
|
||||
|
||||
if (pud_large(*pud))
|
||||
return pfn_valid(pud_pfn(*pud));
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (!pmd_present(*pmd))
|
||||
return 0;
|
||||
|
||||
if (pmd_large(*pmd))
|
||||
return pfn_valid(pmd_pfn(*pmd));
|
||||
|
||||
pte = pte_offset_kernel(pmd, addr);
|
||||
if (pte_none(*pte))
|
||||
return 0;
|
||||
|
||||
return pfn_valid(pte_pfn(*pte));
|
||||
}
|
||||
|
||||
/*
|
||||
* Block size is the minimum amount of memory which can be hotplugged or
|
||||
* hotremoved. It must be power of two and must be equal or larger than
|
||||
@ -1533,72 +1492,44 @@ static long __meminitdata addr_start, addr_end;
|
||||
static void __meminitdata *p_start, *p_end;
|
||||
static int __meminitdata node_start;
|
||||
|
||||
static int __meminit vmemmap_populate_hugepages(unsigned long start,
|
||||
unsigned long end, int node, struct vmem_altmap *altmap)
|
||||
void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
|
||||
unsigned long addr, unsigned long next)
|
||||
{
|
||||
unsigned long addr;
|
||||
unsigned long next;
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t entry;
|
||||
|
||||
for (addr = start; addr < end; addr = next) {
|
||||
next = pmd_addr_end(addr, end);
|
||||
entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
|
||||
PAGE_KERNEL_LARGE);
|
||||
set_pmd(pmd, __pmd(pte_val(entry)));
|
||||
|
||||
pgd = vmemmap_pgd_populate(addr, node);
|
||||
if (!pgd)
|
||||
return -ENOMEM;
|
||||
|
||||
p4d = vmemmap_p4d_populate(pgd, addr, node);
|
||||
if (!p4d)
|
||||
return -ENOMEM;
|
||||
|
||||
pud = vmemmap_pud_populate(p4d, addr, node);
|
||||
if (!pud)
|
||||
return -ENOMEM;
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (pmd_none(*pmd)) {
|
||||
void *p;
|
||||
|
||||
p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
|
||||
if (p) {
|
||||
pte_t entry;
|
||||
|
||||
entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
|
||||
PAGE_KERNEL_LARGE);
|
||||
set_pmd(pmd, __pmd(pte_val(entry)));
|
||||
|
||||
/* check to see if we have contiguous blocks */
|
||||
if (p_end != p || node_start != node) {
|
||||
if (p_start)
|
||||
pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
|
||||
addr_start, addr_end-1, p_start, p_end-1, node_start);
|
||||
addr_start = addr;
|
||||
node_start = node;
|
||||
p_start = p;
|
||||
}
|
||||
|
||||
addr_end = addr + PMD_SIZE;
|
||||
p_end = p + PMD_SIZE;
|
||||
|
||||
if (!IS_ALIGNED(addr, PMD_SIZE) ||
|
||||
!IS_ALIGNED(next, PMD_SIZE))
|
||||
vmemmap_use_new_sub_pmd(addr, next);
|
||||
|
||||
continue;
|
||||
} else if (altmap)
|
||||
return -ENOMEM; /* no fallback */
|
||||
} else if (pmd_large(*pmd)) {
|
||||
vmemmap_verify((pte_t *)pmd, node, addr, next);
|
||||
vmemmap_use_sub_pmd(addr, next);
|
||||
continue;
|
||||
}
|
||||
if (vmemmap_populate_basepages(addr, next, node, NULL))
|
||||
return -ENOMEM;
|
||||
/* check to see if we have contiguous blocks */
|
||||
if (p_end != p || node_start != node) {
|
||||
if (p_start)
|
||||
pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
|
||||
addr_start, addr_end-1, p_start, p_end-1, node_start);
|
||||
addr_start = addr;
|
||||
node_start = node;
|
||||
p_start = p;
|
||||
}
|
||||
return 0;
|
||||
|
||||
addr_end = addr + PMD_SIZE;
|
||||
p_end = p + PMD_SIZE;
|
||||
|
||||
if (!IS_ALIGNED(addr, PMD_SIZE) ||
|
||||
!IS_ALIGNED(next, PMD_SIZE))
|
||||
vmemmap_use_new_sub_pmd(addr, next);
|
||||
}
|
||||
|
||||
/*
 * Check whether the entry at @pmd is already a large mapping.
 *
 * When it is, vmemmap_verify() and vmemmap_use_sub_pmd() are called for
 * the [@addr, @next) range on @node.
 *
 * Returns the pmd_large() result for *pmd (non-zero for a large mapping).
 */
int __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
				unsigned long addr, unsigned long next)
{
	/* Sample the entry once so the branch and the return value agree. */
	int large = pmd_large(*pmd);

	if (large) {
		vmemmap_verify((pte_t *)pmd, node, addr, next);
		vmemmap_use_sub_pmd(addr, next);
	}

	return large;
}
|
||||
|
||||
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
|
||||
|
@ -386,8 +386,6 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
|
||||
|
||||
#else
|
||||
|
||||
#define kern_addr_valid(addr) (1)
|
||||
|
||||
extern void update_mmu_cache(struct vm_area_struct * vma,
|
||||
unsigned long address, pte_t *ptep);
|
||||
|
||||
|
@ -780,11 +780,6 @@ static int hmat_callback(struct notifier_block *self,
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
static struct notifier_block hmat_callback_nb = {
|
||||
.notifier_call = hmat_callback,
|
||||
.priority = 2,
|
||||
};
|
||||
|
||||
static __init void hmat_free_structures(void)
|
||||
{
|
||||
struct memory_target *target, *tnext;
|
||||
@ -867,7 +862,7 @@ static __init int hmat_init(void)
|
||||
hmat_register_targets();
|
||||
|
||||
/* Keep the table and structures if the notifier may use them */
|
||||
if (!register_hotmemory_notifier(&hmat_callback_nb))
|
||||
if (!hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI))
|
||||
return 0;
|
||||
out_put:
|
||||
hmat_free_structures();
|
||||
|
@ -175,6 +175,15 @@ int memory_notify(unsigned long val, void *v)
|
||||
return blocking_notifier_call_chain(&memory_chain, val, v);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
|
||||
static unsigned long memblk_nr_poison(struct memory_block *mem);
|
||||
#else
|
||||
static inline unsigned long memblk_nr_poison(struct memory_block *mem)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int memory_block_online(struct memory_block *mem)
|
||||
{
|
||||
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
|
||||
@ -183,6 +192,9 @@ static int memory_block_online(struct memory_block *mem)
|
||||
struct zone *zone;
|
||||
int ret;
|
||||
|
||||
if (memblk_nr_poison(mem))
|
||||
return -EHWPOISON;
|
||||
|
||||
zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
|
||||
start_pfn, nr_pages);
|
||||
|
||||
@ -864,6 +876,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
|
||||
mem = find_memory_block_by_id(block_id);
|
||||
if (WARN_ON_ONCE(!mem))
|
||||
continue;
|
||||
num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
|
||||
unregister_memory_block_under_nodes(mem);
|
||||
remove_memory_block(mem);
|
||||
}
|
||||
@ -1164,3 +1177,28 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
|
||||
void memblk_nr_poison_inc(unsigned long pfn)
|
||||
{
|
||||
const unsigned long block_id = pfn_to_block_id(pfn);
|
||||
struct memory_block *mem = find_memory_block_by_id(block_id);
|
||||
|
||||
if (mem)
|
||||
atomic_long_inc(&mem->nr_hwpoison);
|
||||
}
|
||||
|
||||
void memblk_nr_poison_sub(unsigned long pfn, long i)
|
||||
{
|
||||
const unsigned long block_id = pfn_to_block_id(pfn);
|
||||
struct memory_block *mem = find_memory_block_by_id(block_id);
|
||||
|
||||
if (mem)
|
||||
atomic_long_sub(i, &mem->nr_hwpoison);
|
||||
}
|
||||
|
||||
static unsigned long memblk_nr_poison(struct memory_block *mem)
|
||||
{
|
||||
return atomic_long_read(&mem->nr_hwpoison);
|
||||
}
|
||||
#endif
|
||||
|
@ -78,3 +78,12 @@ config ZRAM_MEMORY_TRACKING
|
||||
/sys/kernel/debug/zram/zramX/block_state.
|
||||
|
||||
See Documentation/admin-guide/blockdev/zram.rst for more information.
|
||||
|
||||
config ZRAM_MULTI_COMP
|
||||
bool "Enable multiple compression streams"
|
||||
depends on ZRAM
|
||||
help
|
||||
This will enable multi-compression streams, so that ZRAM can
|
||||
re-compress pages using a potentially slower but more effective
|
||||
compression algorithm. Note that IDLE page recompression
|
||||
requires ZRAM_MEMORY_TRACKING.
|
||||
|
@ -206,7 +206,7 @@ void zcomp_destroy(struct zcomp *comp)
|
||||
* case of allocation error, or any other error potentially
|
||||
* returned by zcomp_init().
|
||||
*/
|
||||
struct zcomp *zcomp_create(const char *compress)
|
||||
struct zcomp *zcomp_create(const char *alg)
|
||||
{
|
||||
struct zcomp *comp;
|
||||
int error;
|
||||
@ -216,14 +216,14 @@ struct zcomp *zcomp_create(const char *compress)
|
||||
* is not loaded yet. We must do it here, otherwise we are about to
|
||||
* call /sbin/modprobe under CPU hot-plug lock.
|
||||
*/
|
||||
if (!zcomp_available_algorithm(compress))
|
||||
if (!zcomp_available_algorithm(alg))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
|
||||
if (!comp)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
comp->name = compress;
|
||||
comp->name = alg;
|
||||
error = zcomp_init(comp);
|
||||
if (error) {
|
||||
kfree(comp);
|
||||
|
@ -27,7 +27,7 @@ int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node);
|
||||
ssize_t zcomp_available_show(const char *comp, char *buf);
|
||||
bool zcomp_available_algorithm(const char *comp);
|
||||
|
||||
struct zcomp *zcomp_create(const char *comp);
|
||||
struct zcomp *zcomp_create(const char *alg);
|
||||
void zcomp_destroy(struct zcomp *comp);
|
||||
|
||||
struct zcomp_strm *zcomp_stream_get(struct zcomp *comp);
|
||||
|
@ -155,6 +155,25 @@ static inline bool is_partial_io(struct bio_vec *bvec)
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void zram_set_priority(struct zram *zram, u32 index, u32 prio)
|
||||
{
|
||||
prio &= ZRAM_COMP_PRIORITY_MASK;
|
||||
/*
|
||||
* Clear the previous priority value first, in case we recompress
|
||||
* further an already recompressed page
|
||||
*/
|
||||
zram->table[index].flags &= ~(ZRAM_COMP_PRIORITY_MASK <<
|
||||
ZRAM_COMP_PRIORITY_BIT1);
|
||||
zram->table[index].flags |= (prio << ZRAM_COMP_PRIORITY_BIT1);
|
||||
}
|
||||
|
||||
/*
 * Extract the compression priority stored in the flags word of slot @index:
 * shift it down from ZRAM_COMP_PRIORITY_BIT1 and mask it with
 * ZRAM_COMP_PRIORITY_MASK.
 */
static inline u32 zram_get_priority(struct zram *zram, u32 index)
{
	return (u32)(zram->table[index].flags >> ZRAM_COMP_PRIORITY_BIT1) &
	       ZRAM_COMP_PRIORITY_MASK;
}
|
||||
|
||||
/*
|
||||
* Check if request is within bounds and aligned on zram logical blocks.
|
||||
*/
|
||||
@ -188,16 +207,13 @@ static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
|
||||
static inline void update_used_max(struct zram *zram,
|
||||
const unsigned long pages)
|
||||
{
|
||||
unsigned long old_max, cur_max;
|
||||
|
||||
old_max = atomic_long_read(&zram->stats.max_used_pages);
|
||||
unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages);
|
||||
|
||||
do {
|
||||
cur_max = old_max;
|
||||
if (pages > cur_max)
|
||||
old_max = atomic_long_cmpxchg(
|
||||
&zram->stats.max_used_pages, cur_max, pages);
|
||||
} while (old_max != cur_max);
|
||||
if (cur_max >= pages)
|
||||
return;
|
||||
} while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages,
|
||||
&cur_max, pages));
|
||||
}
|
||||
|
||||
static inline void zram_fill_page(void *ptr, unsigned long len,
|
||||
@ -629,10 +645,10 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
|
||||
|
||||
#define PAGE_WB_SIG "page_index="
|
||||
|
||||
#define PAGE_WRITEBACK 0
|
||||
#define HUGE_WRITEBACK (1<<0)
|
||||
#define IDLE_WRITEBACK (1<<1)
|
||||
|
||||
#define PAGE_WRITEBACK 0
|
||||
#define HUGE_WRITEBACK (1<<0)
|
||||
#define IDLE_WRITEBACK (1<<1)
|
||||
#define INCOMPRESSIBLE_WRITEBACK (1<<2)
|
||||
|
||||
static ssize_t writeback_store(struct device *dev,
|
||||
struct device_attribute *attr, const char *buf, size_t len)
|
||||
@ -653,6 +669,8 @@ static ssize_t writeback_store(struct device *dev,
|
||||
mode = HUGE_WRITEBACK;
|
||||
else if (sysfs_streq(buf, "huge_idle"))
|
||||
mode = IDLE_WRITEBACK | HUGE_WRITEBACK;
|
||||
else if (sysfs_streq(buf, "incompressible"))
|
||||
mode = INCOMPRESSIBLE_WRITEBACK;
|
||||
else {
|
||||
if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
|
||||
return -EINVAL;
|
||||
@ -715,11 +733,15 @@ static ssize_t writeback_store(struct device *dev,
|
||||
goto next;
|
||||
|
||||
if (mode & IDLE_WRITEBACK &&
|
||||
!zram_test_flag(zram, index, ZRAM_IDLE))
|
||||
!zram_test_flag(zram, index, ZRAM_IDLE))
|
||||
goto next;
|
||||
if (mode & HUGE_WRITEBACK &&
|
||||
!zram_test_flag(zram, index, ZRAM_HUGE))
|
||||
!zram_test_flag(zram, index, ZRAM_HUGE))
|
||||
goto next;
|
||||
if (mode & INCOMPRESSIBLE_WRITEBACK &&
|
||||
!zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
|
||||
goto next;
|
||||
|
||||
/*
|
||||
* Clearing ZRAM_UNDER_WB is duty of caller.
|
||||
* IOW, zram_free_page never clear it.
|
||||
@ -753,8 +775,12 @@ static ssize_t writeback_store(struct device *dev,
|
||||
zram_clear_flag(zram, index, ZRAM_IDLE);
|
||||
zram_slot_unlock(zram, index);
|
||||
/*
|
||||
* Return last IO error unless every IO were
|
||||
* not succeeded.
|
||||
* BIO errors are not fatal, we continue and simply
|
||||
* attempt to writeback the remaining objects (pages).
|
||||
* At the same time we need to signal user-space that
|
||||
* some writes (at least one, but also could be all of
|
||||
* them) were not successful and we do so by returning
|
||||
* the most recent BIO error.
|
||||
*/
|
||||
ret = err;
|
||||
continue;
|
||||
@ -920,13 +946,16 @@ static ssize_t read_block_state(struct file *file, char __user *buf,
|
||||
|
||||
ts = ktime_to_timespec64(zram->table[index].ac_time);
|
||||
copied = snprintf(kbuf + written, count,
|
||||
"%12zd %12lld.%06lu %c%c%c%c\n",
|
||||
"%12zd %12lld.%06lu %c%c%c%c%c%c\n",
|
||||
index, (s64)ts.tv_sec,
|
||||
ts.tv_nsec / NSEC_PER_USEC,
|
||||
zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.',
|
||||
zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.',
|
||||
zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
|
||||
zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.');
|
||||
zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.',
|
||||
zram_get_priority(zram, index) ? 'r' : '.',
|
||||
zram_test_flag(zram, index,
|
||||
ZRAM_INCOMPRESSIBLE) ? 'n' : '.');
|
||||
|
||||
if (count <= copied) {
|
||||
zram_slot_unlock(zram, index);
|
||||
@ -1000,47 +1029,144 @@ static ssize_t max_comp_streams_store(struct device *dev,
|
||||
return len;
|
||||
}
|
||||
|
||||
static ssize_t comp_algorithm_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg)
|
||||
{
|
||||
size_t sz;
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
/* Do not free statically defined compression algorithms */
|
||||
if (zram->comp_algs[prio] != default_compressor)
|
||||
kfree(zram->comp_algs[prio]);
|
||||
|
||||
zram->comp_algs[prio] = alg;
|
||||
}
|
||||
|
||||
static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio, char *buf)
|
||||
{
|
||||
ssize_t sz;
|
||||
|
||||
down_read(&zram->init_lock);
|
||||
sz = zcomp_available_show(zram->compressor, buf);
|
||||
sz = zcomp_available_show(zram->comp_algs[prio], buf);
|
||||
up_read(&zram->init_lock);
|
||||
|
||||
return sz;
|
||||
}
|
||||
|
||||
static ssize_t comp_algorithm_store(struct device *dev,
|
||||
struct device_attribute *attr, const char *buf, size_t len)
|
||||
static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf)
|
||||
{
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
char compressor[ARRAY_SIZE(zram->compressor)];
|
||||
char *compressor;
|
||||
size_t sz;
|
||||
|
||||
strscpy(compressor, buf, sizeof(compressor));
|
||||
sz = strlen(buf);
|
||||
if (sz >= CRYPTO_MAX_ALG_NAME)
|
||||
return -E2BIG;
|
||||
|
||||
compressor = kstrdup(buf, GFP_KERNEL);
|
||||
if (!compressor)
|
||||
return -ENOMEM;
|
||||
|
||||
/* ignore trailing newline */
|
||||
sz = strlen(compressor);
|
||||
if (sz > 0 && compressor[sz - 1] == '\n')
|
||||
compressor[sz - 1] = 0x00;
|
||||
|
||||
if (!zcomp_available_algorithm(compressor))
|
||||
if (!zcomp_available_algorithm(compressor)) {
|
||||
kfree(compressor);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
down_write(&zram->init_lock);
|
||||
if (init_done(zram)) {
|
||||
up_write(&zram->init_lock);
|
||||
kfree(compressor);
|
||||
pr_info("Can't change algorithm for initialized device\n");
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
strcpy(zram->compressor, compressor);
|
||||
comp_algorithm_set(zram, prio, compressor);
|
||||
up_write(&zram->init_lock);
|
||||
return len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t comp_algorithm_show(struct device *dev,
|
||||
struct device_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
|
||||
return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf);
|
||||
}
|
||||
|
||||
static ssize_t comp_algorithm_store(struct device *dev,
|
||||
struct device_attribute *attr,
|
||||
const char *buf,
|
||||
size_t len)
|
||||
{
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
int ret;
|
||||
|
||||
ret = __comp_algorithm_store(zram, ZRAM_PRIMARY_COMP, buf);
|
||||
return ret ? ret : len;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ZRAM_MULTI_COMP
|
||||
static ssize_t recomp_algorithm_show(struct device *dev,
|
||||
struct device_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
ssize_t sz = 0;
|
||||
u32 prio;
|
||||
|
||||
for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
|
||||
if (!zram->comp_algs[prio])
|
||||
continue;
|
||||
|
||||
sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, "#%d: ", prio);
|
||||
sz += __comp_algorithm_show(zram, prio, buf + sz);
|
||||
}
|
||||
|
||||
return sz;
|
||||
}
|
||||
|
||||
static ssize_t recomp_algorithm_store(struct device *dev,
|
||||
struct device_attribute *attr,
|
||||
const char *buf,
|
||||
size_t len)
|
||||
{
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
int prio = ZRAM_SECONDARY_COMP;
|
||||
char *args, *param, *val;
|
||||
char *alg = NULL;
|
||||
int ret;
|
||||
|
||||
args = skip_spaces(buf);
|
||||
while (*args) {
|
||||
args = next_arg(args, ¶m, &val);
|
||||
|
||||
if (!*val)
|
||||
return -EINVAL;
|
||||
|
||||
if (!strcmp(param, "algo")) {
|
||||
alg = val;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!strcmp(param, "priority")) {
|
||||
ret = kstrtoint(val, 10, &prio);
|
||||
if (ret)
|
||||
return ret;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (!alg)
|
||||
return -EINVAL;
|
||||
|
||||
if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS)
|
||||
return -EINVAL;
|
||||
|
||||
ret = __comp_algorithm_store(zram, prio, alg);
|
||||
return ret ? ret : len;
|
||||
}
|
||||
#endif
|
||||
|
||||
static ssize_t compact_store(struct device *dev,
|
||||
struct device_attribute *attr, const char *buf, size_t len)
|
||||
{
|
||||
@ -1210,6 +1336,11 @@ static void zram_free_page(struct zram *zram, size_t index)
|
||||
atomic64_dec(&zram->stats.huge_pages);
|
||||
}
|
||||
|
||||
if (zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
|
||||
zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE);
|
||||
|
||||
zram_set_priority(zram, index, 0);
|
||||
|
||||
if (zram_test_flag(zram, index, ZRAM_WB)) {
|
||||
zram_clear_flag(zram, index, ZRAM_WB);
|
||||
free_block_bdev(zram, zram_get_element(zram, index));
|
||||
@ -1242,32 +1373,37 @@ static void zram_free_page(struct zram *zram, size_t index)
|
||||
~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB));
|
||||
}
|
||||
|
||||
static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
|
||||
struct bio *bio, bool partial_io)
|
||||
/*
|
||||
* Reads a page from the writeback devices. Corresponding ZRAM slot
|
||||
* should be unlocked.
|
||||
*/
|
||||
static int zram_bvec_read_from_bdev(struct zram *zram, struct page *page,
|
||||
u32 index, struct bio *bio, bool partial_io)
|
||||
{
|
||||
struct bio_vec bvec = {
|
||||
.bv_page = page,
|
||||
.bv_len = PAGE_SIZE,
|
||||
.bv_offset = 0,
|
||||
};
|
||||
|
||||
return read_from_bdev(zram, &bvec, zram_get_element(zram, index), bio,
|
||||
partial_io);
|
||||
}
|
||||
|
||||
/*
|
||||
* Reads (decompresses if needed) a page from zspool (zsmalloc).
|
||||
* Corresponding ZRAM slot should be locked.
|
||||
*/
|
||||
static int zram_read_from_zspool(struct zram *zram, struct page *page,
|
||||
u32 index)
|
||||
{
|
||||
struct zcomp_strm *zstrm;
|
||||
unsigned long handle;
|
||||
unsigned int size;
|
||||
void *src, *dst;
|
||||
u32 prio;
|
||||
int ret;
|
||||
|
||||
zram_slot_lock(zram, index);
|
||||
if (zram_test_flag(zram, index, ZRAM_WB)) {
|
||||
struct bio_vec bvec;
|
||||
|
||||
zram_slot_unlock(zram, index);
|
||||
/* A null bio means rw_page was used, we must fallback to bio */
|
||||
if (!bio)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
bvec.bv_page = page;
|
||||
bvec.bv_len = PAGE_SIZE;
|
||||
bvec.bv_offset = 0;
|
||||
return read_from_bdev(zram, &bvec,
|
||||
zram_get_element(zram, index),
|
||||
bio, partial_io);
|
||||
}
|
||||
|
||||
handle = zram_get_handle(zram, index);
|
||||
if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) {
|
||||
unsigned long value;
|
||||
@ -1277,14 +1413,15 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
|
||||
mem = kmap_atomic(page);
|
||||
zram_fill_page(mem, PAGE_SIZE, value);
|
||||
kunmap_atomic(mem);
|
||||
zram_slot_unlock(zram, index);
|
||||
return 0;
|
||||
}
|
||||
|
||||
size = zram_get_obj_size(zram, index);
|
||||
|
||||
if (size != PAGE_SIZE)
|
||||
zstrm = zcomp_stream_get(zram->comp);
|
||||
if (size != PAGE_SIZE) {
|
||||
prio = zram_get_priority(zram, index);
|
||||
zstrm = zcomp_stream_get(zram->comps[prio]);
|
||||
}
|
||||
|
||||
src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
|
||||
if (size == PAGE_SIZE) {
|
||||
@ -1296,20 +1433,43 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
|
||||
dst = kmap_atomic(page);
|
||||
ret = zcomp_decompress(zstrm, src, size, dst);
|
||||
kunmap_atomic(dst);
|
||||
zcomp_stream_put(zram->comp);
|
||||
zcomp_stream_put(zram->comps[prio]);
|
||||
}
|
||||
zs_unmap_object(zram->mem_pool, handle);
|
||||
zram_slot_unlock(zram, index);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Read the contents of slot @index into @page.
 *
 * Slots without ZRAM_WB set are read through zram_read_from_zspool() while
 * the slot lock is held. Slots with ZRAM_WB set are read through
 * zram_bvec_read_from_bdev() after the slot lock has been dropped; that path
 * needs a @bio and returns -EOPNOTSUPP when @bio is NULL.
 *
 * Returns the value produced by the read helper that was used.
 */
static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
			    struct bio *bio, bool partial_io)
{
	int ret;

	zram_slot_lock(zram, index);
	if (!zram_test_flag(zram, index, ZRAM_WB)) {
		/* Slot should be locked throughout the function call */
		ret = zram_read_from_zspool(zram, page, index);
		zram_slot_unlock(zram, index);
	} else {
		/* Slot should be unlocked before the function call */
		zram_slot_unlock(zram, index);

		/* A null bio means rw_page was used, we must fallback to bio */
		if (!bio)
			return -EOPNOTSUPP;

		ret = zram_bvec_read_from_bdev(zram, page, index, bio,
					       partial_io);
	}

	/*
	 * Should NEVER happen. Return bio error if it does.
	 *
	 * Only negative values are reported as failures; a bare WARN_ON(ret)
	 * would also fire on positive return values.
	 * NOTE(review): presumably the backing-device read can return a
	 * positive value for a request that is still in flight - confirm
	 * against read_from_bdev().
	 */
	if (WARN_ON(ret < 0))
		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);

	return ret;
}
|
||||
|
||||
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
|
||||
u32 index, int offset, struct bio *bio)
|
||||
u32 index, int offset, struct bio *bio)
|
||||
{
|
||||
int ret;
|
||||
struct page *page;
|
||||
@ -1363,13 +1523,13 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
|
||||
kunmap_atomic(mem);
|
||||
|
||||
compress_again:
|
||||
zstrm = zcomp_stream_get(zram->comp);
|
||||
zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
|
||||
src = kmap_atomic(page);
|
||||
ret = zcomp_compress(zstrm, src, &comp_len);
|
||||
kunmap_atomic(src);
|
||||
|
||||
if (unlikely(ret)) {
|
||||
zcomp_stream_put(zram->comp);
|
||||
zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
|
||||
pr_err("Compression failed! err=%d\n", ret);
|
||||
zs_free(zram->mem_pool, handle);
|
||||
return ret;
|
||||
@ -1390,19 +1550,19 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
|
||||
* if we have a 'non-null' handle here then we are coming
|
||||
* from the slow path and handle has already been allocated.
|
||||
*/
|
||||
if (IS_ERR((void *)handle))
|
||||
if (IS_ERR_VALUE(handle))
|
||||
handle = zs_malloc(zram->mem_pool, comp_len,
|
||||
__GFP_KSWAPD_RECLAIM |
|
||||
__GFP_NOWARN |
|
||||
__GFP_HIGHMEM |
|
||||
__GFP_MOVABLE);
|
||||
if (IS_ERR((void *)handle)) {
|
||||
zcomp_stream_put(zram->comp);
|
||||
if (IS_ERR_VALUE(handle)) {
|
||||
zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
|
||||
atomic64_inc(&zram->stats.writestall);
|
||||
handle = zs_malloc(zram->mem_pool, comp_len,
|
||||
GFP_NOIO | __GFP_HIGHMEM |
|
||||
__GFP_MOVABLE);
|
||||
if (IS_ERR((void *)handle))
|
||||
if (IS_ERR_VALUE(handle))
|
||||
return PTR_ERR((void *)handle);
|
||||
|
||||
if (comp_len != PAGE_SIZE)
|
||||
@ -1414,14 +1574,14 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
|
||||
* zstrm buffer back. It is necessary that the dereferencing
|
||||
* of the zstrm variable below occurs correctly.
|
||||
*/
|
||||
zstrm = zcomp_stream_get(zram->comp);
|
||||
zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
|
||||
}
|
||||
|
||||
alloced_pages = zs_get_total_pages(zram->mem_pool);
|
||||
update_used_max(zram, alloced_pages);
|
||||
|
||||
if (zram->limit_pages && alloced_pages > zram->limit_pages) {
|
||||
zcomp_stream_put(zram->comp);
|
||||
zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
|
||||
zs_free(zram->mem_pool, handle);
|
||||
return -ENOMEM;
|
||||
}
|
||||
@ -1435,7 +1595,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
|
||||
if (comp_len == PAGE_SIZE)
|
||||
kunmap_atomic(src);
|
||||
|
||||
zcomp_stream_put(zram->comp);
|
||||
zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
|
||||
zs_unmap_object(zram->mem_pool, handle);
|
||||
atomic64_add(comp_len, &zram->stats.compr_data_size);
|
||||
out:
|
||||
@ -1504,6 +1664,274 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ZRAM_MULTI_COMP
|
||||
/*
|
||||
* This function will decompress (unless it's ZRAM_HUGE) the page and then
|
||||
* attempt to compress it using provided compression algorithm priority
|
||||
* (which is potentially more effective).
|
||||
*
|
||||
* Corresponding ZRAM slot should be locked.
|
||||
*/
|
||||
static int zram_recompress(struct zram *zram, u32 index, struct page *page,
|
||||
u32 threshold, u32 prio, u32 prio_max)
|
||||
{
|
||||
struct zcomp_strm *zstrm = NULL;
|
||||
unsigned long handle_old;
|
||||
unsigned long handle_new;
|
||||
unsigned int comp_len_old;
|
||||
unsigned int comp_len_new;
|
||||
unsigned int class_index_old;
|
||||
unsigned int class_index_new;
|
||||
u32 num_recomps = 0;
|
||||
void *src, *dst;
|
||||
int ret;
|
||||
|
||||
handle_old = zram_get_handle(zram, index);
|
||||
if (!handle_old)
|
||||
return -EINVAL;
|
||||
|
||||
comp_len_old = zram_get_obj_size(zram, index);
|
||||
/*
|
||||
* Do not recompress objects that are already "small enough".
|
||||
*/
|
||||
if (comp_len_old < threshold)
|
||||
return 0;
|
||||
|
||||
ret = zram_read_from_zspool(zram, page, index);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old);
|
||||
/*
|
||||
* Iterate the secondary comp algorithms list (in order of priority)
|
||||
* and try to recompress the page.
|
||||
*/
|
||||
for (; prio < prio_max; prio++) {
|
||||
if (!zram->comps[prio])
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Skip if the object is already re-compressed with a higher
|
||||
* priority algorithm (or same algorithm).
|
||||
*/
|
||||
if (prio <= zram_get_priority(zram, index))
|
||||
continue;
|
||||
|
||||
num_recomps++;
|
||||
zstrm = zcomp_stream_get(zram->comps[prio]);
|
||||
src = kmap_atomic(page);
|
||||
ret = zcomp_compress(zstrm, src, &comp_len_new);
|
||||
kunmap_atomic(src);
|
||||
|
||||
if (ret) {
|
||||
zcomp_stream_put(zram->comps[prio]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
class_index_new = zs_lookup_class_index(zram->mem_pool,
|
||||
comp_len_new);
|
||||
|
||||
/* Continue until we make progress */
|
||||
if (class_index_new >= class_index_old ||
|
||||
(threshold && comp_len_new >= threshold)) {
|
||||
zcomp_stream_put(zram->comps[prio]);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Recompression was successful so break out */
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* We did not try to recompress, e.g. when we have only one
|
||||
* secondary algorithm and the page is already recompressed
|
||||
* using that algorithm
|
||||
*/
|
||||
if (!zstrm)
|
||||
return 0;
|
||||
|
||||
if (class_index_new >= class_index_old) {
|
||||
/*
|
||||
* Secondary algorithms failed to re-compress the page
|
||||
* in a way that would save memory, mark the object as
|
||||
* incompressible so that we will not try to compress
|
||||
* it again.
|
||||
*
|
||||
* We need to make sure that all secondary algorithms have
|
||||
* failed, so we test if the number of recompressions matches
|
||||
* the number of active secondary algorithms.
|
||||
*/
|
||||
if (num_recomps == zram->num_active_comps - 1)
|
||||
zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Successful recompression but above threshold */
|
||||
if (threshold && comp_len_new >= threshold)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* No direct reclaim (slow path) for handle allocation and no
|
||||
* re-compression attempt (unlike in __zram_bvec_write()) since
|
||||
* we already have stored that object in zsmalloc. If we cannot
|
||||
* alloc memory for recompressed object then we bail out and
|
||||
* simply keep the old (existing) object in zsmalloc.
|
||||
*/
|
||||
handle_new = zs_malloc(zram->mem_pool, comp_len_new,
|
||||
__GFP_KSWAPD_RECLAIM |
|
||||
__GFP_NOWARN |
|
||||
__GFP_HIGHMEM |
|
||||
__GFP_MOVABLE);
|
||||
if (IS_ERR_VALUE(handle_new)) {
|
||||
zcomp_stream_put(zram->comps[prio]);
|
||||
return PTR_ERR((void *)handle_new);
|
||||
}
|
||||
|
||||
dst = zs_map_object(zram->mem_pool, handle_new, ZS_MM_WO);
|
||||
memcpy(dst, zstrm->buffer, comp_len_new);
|
||||
zcomp_stream_put(zram->comps[prio]);
|
||||
|
||||
zs_unmap_object(zram->mem_pool, handle_new);
|
||||
|
||||
zram_free_page(zram, index);
|
||||
zram_set_handle(zram, index, handle_new);
|
||||
zram_set_obj_size(zram, index, comp_len_new);
|
||||
zram_set_priority(zram, index, prio);
|
||||
|
||||
atomic64_add(comp_len_new, &zram->stats.compr_data_size);
|
||||
atomic64_inc(&zram->stats.pages_stored);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define RECOMPRESS_IDLE (1 << 0)
|
||||
#define RECOMPRESS_HUGE (1 << 1)
|
||||
|
||||
static ssize_t recompress_store(struct device *dev,
|
||||
struct device_attribute *attr,
|
||||
const char *buf, size_t len)
|
||||
{
|
||||
u32 prio = ZRAM_SECONDARY_COMP, prio_max = ZRAM_MAX_COMPS;
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
|
||||
char *args, *param, *val, *algo = NULL;
|
||||
u32 mode = 0, threshold = 0;
|
||||
unsigned long index;
|
||||
struct page *page;
|
||||
ssize_t ret;
|
||||
|
||||
args = skip_spaces(buf);
|
||||
while (*args) {
|
||||
args = next_arg(args, ¶m, &val);
|
||||
|
||||
if (!*val)
|
||||
return -EINVAL;
|
||||
|
||||
if (!strcmp(param, "type")) {
|
||||
if (!strcmp(val, "idle"))
|
||||
mode = RECOMPRESS_IDLE;
|
||||
if (!strcmp(val, "huge"))
|
||||
mode = RECOMPRESS_HUGE;
|
||||
if (!strcmp(val, "huge_idle"))
|
||||
mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!strcmp(param, "threshold")) {
|
||||
/*
|
||||
* We will re-compress only idle objects equal or
|
||||
* greater in size than watermark.
|
||||
*/
|
||||
ret = kstrtouint(val, 10, &threshold);
|
||||
if (ret)
|
||||
return ret;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!strcmp(param, "algo")) {
|
||||
algo = val;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (threshold >= PAGE_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
down_read(&zram->init_lock);
|
||||
if (!init_done(zram)) {
|
||||
ret = -EINVAL;
|
||||
goto release_init_lock;
|
||||
}
|
||||
|
||||
if (algo) {
|
||||
bool found = false;
|
||||
|
||||
for (; prio < ZRAM_MAX_COMPS; prio++) {
|
||||
if (!zram->comp_algs[prio])
|
||||
continue;
|
||||
|
||||
if (!strcmp(zram->comp_algs[prio], algo)) {
|
||||
prio_max = min(prio + 1, ZRAM_MAX_COMPS);
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
ret = -EINVAL;
|
||||
goto release_init_lock;
|
||||
}
|
||||
}
|
||||
|
||||
page = alloc_page(GFP_KERNEL);
|
||||
if (!page) {
|
||||
ret = -ENOMEM;
|
||||
goto release_init_lock;
|
||||
}
|
||||
|
||||
ret = len;
|
||||
for (index = 0; index < nr_pages; index++) {
|
||||
int err = 0;
|
||||
|
||||
zram_slot_lock(zram, index);
|
||||
|
||||
if (!zram_allocated(zram, index))
|
||||
goto next;
|
||||
|
||||
if (mode & RECOMPRESS_IDLE &&
|
||||
!zram_test_flag(zram, index, ZRAM_IDLE))
|
||||
goto next;
|
||||
|
||||
if (mode & RECOMPRESS_HUGE &&
|
||||
!zram_test_flag(zram, index, ZRAM_HUGE))
|
||||
goto next;
|
||||
|
||||
if (zram_test_flag(zram, index, ZRAM_WB) ||
|
||||
zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
|
||||
zram_test_flag(zram, index, ZRAM_SAME) ||
|
||||
zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
|
||||
goto next;
|
||||
|
||||
err = zram_recompress(zram, index, page, threshold,
|
||||
prio, prio_max);
|
||||
next:
|
||||
zram_slot_unlock(zram, index);
|
||||
if (err) {
|
||||
ret = err;
|
||||
break;
|
||||
}
|
||||
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
__free_page(page);
|
||||
|
||||
release_init_lock:
|
||||
up_read(&zram->init_lock);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* zram_bio_discard - handler on discard request
|
||||
* @index: physical block index in PAGE_SIZE units
|
||||
@ -1553,11 +1981,9 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
|
||||
int ret;
|
||||
|
||||
if (!op_is_write(op)) {
|
||||
atomic64_inc(&zram->stats.num_reads);
|
||||
ret = zram_bvec_read(zram, bvec, index, offset, bio);
|
||||
flush_dcache_page(bvec->bv_page);
|
||||
} else {
|
||||
atomic64_inc(&zram->stats.num_writes);
|
||||
ret = zram_bvec_write(zram, bvec, index, offset, bio);
|
||||
}
|
||||
|
||||
@ -1710,6 +2136,21 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void zram_destroy_comps(struct zram *zram)
|
||||
{
|
||||
u32 prio;
|
||||
|
||||
for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) {
|
||||
struct zcomp *comp = zram->comps[prio];
|
||||
|
||||
zram->comps[prio] = NULL;
|
||||
if (!comp)
|
||||
continue;
|
||||
zcomp_destroy(comp);
|
||||
zram->num_active_comps--;
|
||||
}
|
||||
}
|
||||
|
||||
static void zram_reset_device(struct zram *zram)
|
||||
{
|
||||
down_write(&zram->init_lock);
|
||||
@ -1727,11 +2168,11 @@ static void zram_reset_device(struct zram *zram)
|
||||
/* I/O operation under all of CPU are done so let's free */
|
||||
zram_meta_free(zram, zram->disksize);
|
||||
zram->disksize = 0;
|
||||
zram_destroy_comps(zram);
|
||||
memset(&zram->stats, 0, sizeof(zram->stats));
|
||||
zcomp_destroy(zram->comp);
|
||||
zram->comp = NULL;
|
||||
reset_bdev(zram);
|
||||
|
||||
comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
|
||||
up_write(&zram->init_lock);
|
||||
}
|
||||
|
||||
@ -1742,6 +2183,7 @@ static ssize_t disksize_store(struct device *dev,
|
||||
struct zcomp *comp;
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
int err;
|
||||
u32 prio;
|
||||
|
||||
disksize = memparse(buf, NULL);
|
||||
if (!disksize)
|
||||
@ -1760,22 +2202,29 @@ static ssize_t disksize_store(struct device *dev,
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
comp = zcomp_create(zram->compressor);
|
||||
if (IS_ERR(comp)) {
|
||||
pr_err("Cannot initialise %s compressing backend\n",
|
||||
zram->compressor);
|
||||
err = PTR_ERR(comp);
|
||||
goto out_free_meta;
|
||||
}
|
||||
for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) {
|
||||
if (!zram->comp_algs[prio])
|
||||
continue;
|
||||
|
||||
zram->comp = comp;
|
||||
comp = zcomp_create(zram->comp_algs[prio]);
|
||||
if (IS_ERR(comp)) {
|
||||
pr_err("Cannot initialise %s compressing backend\n",
|
||||
zram->comp_algs[prio]);
|
||||
err = PTR_ERR(comp);
|
||||
goto out_free_comps;
|
||||
}
|
||||
|
||||
zram->comps[prio] = comp;
|
||||
zram->num_active_comps++;
|
||||
}
|
||||
zram->disksize = disksize;
|
||||
set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
|
||||
up_write(&zram->init_lock);
|
||||
|
||||
return len;
|
||||
|
||||
out_free_meta:
|
||||
out_free_comps:
|
||||
zram_destroy_comps(zram);
|
||||
zram_meta_free(zram, disksize);
|
||||
out_unlock:
|
||||
up_write(&zram->init_lock);
|
||||
@ -1860,6 +2309,10 @@ static DEVICE_ATTR_WO(writeback);
|
||||
static DEVICE_ATTR_RW(writeback_limit);
|
||||
static DEVICE_ATTR_RW(writeback_limit_enable);
|
||||
#endif
|
||||
#ifdef CONFIG_ZRAM_MULTI_COMP
|
||||
static DEVICE_ATTR_RW(recomp_algorithm);
|
||||
static DEVICE_ATTR_WO(recompress);
|
||||
#endif
|
||||
|
||||
static struct attribute *zram_disk_attrs[] = {
|
||||
&dev_attr_disksize.attr,
|
||||
@ -1883,6 +2336,10 @@ static struct attribute *zram_disk_attrs[] = {
|
||||
&dev_attr_bd_stat.attr,
|
||||
#endif
|
||||
&dev_attr_debug_stat.attr,
|
||||
#ifdef CONFIG_ZRAM_MULTI_COMP
|
||||
&dev_attr_recomp_algorithm.attr,
|
||||
&dev_attr_recompress.attr,
|
||||
#endif
|
||||
NULL,
|
||||
};
|
||||
|
||||
@ -1962,7 +2419,7 @@ static int zram_add(void)
|
||||
if (ret)
|
||||
goto out_cleanup_disk;
|
||||
|
||||
strscpy(zram->compressor, default_compressor, sizeof(zram->compressor));
|
||||
comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
|
||||
|
||||
zram_debugfs_register(zram);
|
||||
pr_info("Added device: %s\n", zram->disk->disk_name);
|
||||
|
@ -40,6 +40,9 @@
|
||||
*/
|
||||
#define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1)
|
||||
|
||||
/* Only 2 bits are allowed for comp priority index */
|
||||
#define ZRAM_COMP_PRIORITY_MASK 0x3
|
||||
|
||||
/* Flags for zram pages (table[page_no].flags) */
|
||||
enum zram_pageflags {
|
||||
/* zram slot is locked */
|
||||
@ -49,6 +52,10 @@ enum zram_pageflags {
|
||||
ZRAM_UNDER_WB, /* page is under writeback */
|
||||
ZRAM_HUGE, /* Incompressible page */
|
||||
ZRAM_IDLE, /* not accessed page since last idle marking */
|
||||
ZRAM_INCOMPRESSIBLE, /* none of the algorithms could compress it */
|
||||
|
||||
ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */
|
||||
ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */
|
||||
|
||||
__NR_ZRAM_PAGEFLAGS,
|
||||
};
|
||||
@ -69,8 +76,6 @@ struct zram_table_entry {
|
||||
|
||||
struct zram_stats {
|
||||
atomic64_t compr_data_size; /* compressed size of pages stored */
|
||||
atomic64_t num_reads; /* failed + successful */
|
||||
atomic64_t num_writes; /* --do-- */
|
||||
atomic64_t failed_reads; /* can happen when memory is too low */
|
||||
atomic64_t failed_writes; /* can happen when memory is too low */
|
||||
atomic64_t invalid_io; /* non-page-aligned I/O requests */
|
||||
@ -89,10 +94,20 @@ struct zram_stats {
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef CONFIG_ZRAM_MULTI_COMP
|
||||
#define ZRAM_PRIMARY_COMP 0U
|
||||
#define ZRAM_SECONDARY_COMP 1U
|
||||
#define ZRAM_MAX_COMPS 4U
|
||||
#else
|
||||
#define ZRAM_PRIMARY_COMP 0U
|
||||
#define ZRAM_SECONDARY_COMP 0U
|
||||
#define ZRAM_MAX_COMPS 1U
|
||||
#endif
|
||||
|
||||
struct zram {
|
||||
struct zram_table_entry *table;
|
||||
struct zs_pool *mem_pool;
|
||||
struct zcomp *comp;
|
||||
struct zcomp *comps[ZRAM_MAX_COMPS];
|
||||
struct gendisk *disk;
|
||||
/* Prevent concurrent execution of device init */
|
||||
struct rw_semaphore init_lock;
|
||||
@ -107,7 +122,8 @@ struct zram {
|
||||
* we can store in a disk.
|
||||
*/
|
||||
u64 disksize; /* bytes */
|
||||
char compressor[CRYPTO_MAX_ALG_NAME];
|
||||
const char *comp_algs[ZRAM_MAX_COMPS];
|
||||
s8 num_active_comps;
|
||||
/*
|
||||
* zram is claimed so open request will be failed
|
||||
*/
|
||||
|
@ -256,7 +256,7 @@ static int amdgpu_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_str
|
||||
* becoming writable and makes is_cow_mapping(vm_flags) false.
|
||||
*/
|
||||
if (is_cow_mapping(vma->vm_flags) &&
|
||||
!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
|
||||
!(vma->vm_flags & VM_ACCESS_FLAGS))
|
||||
vma->vm_flags &= ~VM_MAYWRITE;
|
||||
|
||||
return drm_gem_ttm_mmap(obj, vma);
|
||||
|
@ -643,6 +643,7 @@ static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj)
|
||||
struct page **pvec = NULL;
|
||||
struct etnaviv_gem_userptr *userptr = &etnaviv_obj->userptr;
|
||||
int ret, pinned = 0, npages = etnaviv_obj->base.size >> PAGE_SHIFT;
|
||||
unsigned int gup_flags = FOLL_LONGTERM;
|
||||
|
||||
might_lock_read(&current->mm->mmap_lock);
|
||||
|
||||
@ -653,14 +654,15 @@ static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj)
|
||||
if (!pvec)
|
||||
return -ENOMEM;
|
||||
|
||||
if (!userptr->ro)
|
||||
gup_flags |= FOLL_WRITE;
|
||||
|
||||
do {
|
||||
unsigned num_pages = npages - pinned;
|
||||
uint64_t ptr = userptr->ptr + pinned * PAGE_SIZE;
|
||||
struct page **pages = pvec + pinned;
|
||||
|
||||
ret = pin_user_pages_fast(ptr, num_pages,
|
||||
FOLL_WRITE | FOLL_FORCE | FOLL_LONGTERM,
|
||||
pages);
|
||||
ret = pin_user_pages_fast(ptr, num_pages, gup_flags, pages);
|
||||
if (ret < 0) {
|
||||
unpin_user_pages(pvec, pinned);
|
||||
kvfree(pvec);
|
||||
|
@ -477,7 +477,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct g2d_data *g2d,
|
||||
}
|
||||
|
||||
ret = pin_user_pages_fast(start, npages,
|
||||
FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM,
|
||||
FOLL_WRITE | FOLL_LONGTERM,
|
||||
g2d_userptr->pages);
|
||||
if (ret != npages) {
|
||||
DRM_DEV_ERROR(g2d->dev,
|
||||
|
@ -156,7 +156,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
|
||||
struct mm_struct *mm;
|
||||
unsigned long npages;
|
||||
int pinned, ret;
|
||||
unsigned int gup_flags = FOLL_WRITE;
|
||||
unsigned int gup_flags = FOLL_LONGTERM;
|
||||
|
||||
/*
|
||||
* If the combination of the addr and size requested for this memory
|
||||
@ -210,8 +210,8 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
|
||||
|
||||
cur_base = addr & PAGE_MASK;
|
||||
|
||||
if (!umem->writable)
|
||||
gup_flags |= FOLL_FORCE;
|
||||
if (umem->writable)
|
||||
gup_flags |= FOLL_WRITE;
|
||||
|
||||
while (npages) {
|
||||
cond_resched();
|
||||
@ -219,7 +219,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
|
||||
min_t(unsigned long, npages,
|
||||
PAGE_SIZE /
|
||||
sizeof(struct page *)),
|
||||
gup_flags | FOLL_LONGTERM, page_list);
|
||||
gup_flags, page_list);
|
||||
if (pinned < 0) {
|
||||
ret = pinned;
|
||||
goto umem_release;
|
||||
|
@ -110,7 +110,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages,
|
||||
for (got = 0; got < num_pages; got += ret) {
|
||||
ret = pin_user_pages(start_page + got * PAGE_SIZE,
|
||||
num_pages - got,
|
||||
FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE,
|
||||
FOLL_LONGTERM | FOLL_WRITE,
|
||||
p + got, NULL);
|
||||
if (ret < 0) {
|
||||
mmap_read_unlock(current->mm);
|
||||
|
@ -85,6 +85,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
|
||||
int dmasync, struct usnic_uiom_reg *uiomr)
|
||||
{
|
||||
struct list_head *chunk_list = &uiomr->chunk_list;
|
||||
unsigned int gup_flags = FOLL_LONGTERM;
|
||||
struct page **page_list;
|
||||
struct scatterlist *sg;
|
||||
struct usnic_uiom_chunk *chunk;
|
||||
@ -96,7 +97,6 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
|
||||
int off;
|
||||
int i;
|
||||
dma_addr_t pa;
|
||||
unsigned int gup_flags;
|
||||
struct mm_struct *mm;
|
||||
|
||||
/*
|
||||
@ -131,8 +131,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
|
||||
goto out;
|
||||
}
|
||||
|
||||
gup_flags = FOLL_WRITE;
|
||||
gup_flags |= (writable) ? 0 : FOLL_FORCE;
|
||||
if (writable)
|
||||
gup_flags |= FOLL_WRITE;
|
||||
cur_base = addr & PAGE_MASK;
|
||||
ret = 0;
|
||||
|
||||
@ -140,8 +140,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
|
||||
ret = pin_user_pages(cur_base,
|
||||
min_t(unsigned long, npages,
|
||||
PAGE_SIZE / sizeof(struct page *)),
|
||||
gup_flags | FOLL_LONGTERM,
|
||||
page_list, NULL);
|
||||
gup_flags, page_list, NULL);
|
||||
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
@ -368,7 +368,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
|
||||
struct mm_struct *mm_s;
|
||||
u64 first_page_va;
|
||||
unsigned long mlock_limit;
|
||||
unsigned int foll_flags = FOLL_WRITE;
|
||||
unsigned int foll_flags = FOLL_LONGTERM;
|
||||
int num_pages, num_chunks, i, rv = 0;
|
||||
|
||||
if (!can_do_mlock())
|
||||
@ -391,8 +391,8 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
|
||||
|
||||
mmgrab(mm_s);
|
||||
|
||||
if (!writable)
|
||||
foll_flags |= FOLL_FORCE;
|
||||
if (writable)
|
||||
foll_flags |= FOLL_WRITE;
|
||||
|
||||
mmap_read_lock(mm_s);
|
||||
|
||||
@ -423,8 +423,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
|
||||
while (nents) {
|
||||
struct page **plist = &umem->page_chunk[i].plist[got];
|
||||
|
||||
rv = pin_user_pages(first_page_va, nents,
|
||||
foll_flags | FOLL_LONGTERM,
|
||||
rv = pin_user_pages(first_page_va, nents, foll_flags,
|
||||
plist, NULL);
|
||||
if (rv < 0)
|
||||
goto out_sem_up;
|
||||
|
@ -37,7 +37,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, bool write,
|
||||
struct frame_vector *vec)
|
||||
{
|
||||
int ret;
|
||||
unsigned int gup_flags = FOLL_FORCE | FOLL_LONGTERM;
|
||||
unsigned int gup_flags = FOLL_LONGTERM;
|
||||
|
||||
if (nr_frames == 0)
|
||||
return 0;
|
||||
|
@ -115,7 +115,7 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr,
|
||||
|
||||
/* Pin user pages for DMA Xfer */
|
||||
err = pin_user_pages_unlocked(user_dma.uaddr, user_dma.page_count,
|
||||
dma->map, FOLL_FORCE);
|
||||
dma->map, 0);
|
||||
|
||||
if (user_dma.page_count != err) {
|
||||
IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n",
|
||||
|
@ -63,12 +63,11 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma,
|
||||
|
||||
/* Pin user pages for DMA Xfer */
|
||||
y_pages = pin_user_pages_unlocked(y_dma.uaddr,
|
||||
y_dma.page_count, &dma->map[0], FOLL_FORCE);
|
||||
y_dma.page_count, &dma->map[0], 0);
|
||||
uv_pages = 0; /* silence gcc. value is set and consumed only if: */
|
||||
if (y_pages == y_dma.page_count) {
|
||||
uv_pages = pin_user_pages_unlocked(uv_dma.uaddr,
|
||||
uv_dma.page_count, &dma->map[y_pages],
|
||||
FOLL_FORCE);
|
||||
uv_dma.page_count, &dma->map[y_pages], 0);
|
||||
}
|
||||
|
||||
if (y_pages != y_dma.page_count || uv_pages != uv_dma.page_count) {
|
||||
|
@ -151,17 +151,16 @@ static void videobuf_dma_init(struct videobuf_dmabuf *dma)
|
||||
static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
|
||||
int direction, unsigned long data, unsigned long size)
|
||||
{
|
||||
unsigned int gup_flags = FOLL_LONGTERM;
|
||||
unsigned long first, last;
|
||||
int err, rw = 0;
|
||||
unsigned int flags = FOLL_FORCE;
|
||||
int err;
|
||||
|
||||
dma->direction = direction;
|
||||
switch (dma->direction) {
|
||||
case DMA_FROM_DEVICE:
|
||||
rw = READ;
|
||||
gup_flags |= FOLL_WRITE;
|
||||
break;
|
||||
case DMA_TO_DEVICE:
|
||||
rw = WRITE;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
@ -177,14 +176,11 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
|
||||
if (NULL == dma->pages)
|
||||
return -ENOMEM;
|
||||
|
||||
if (rw == READ)
|
||||
flags |= FOLL_WRITE;
|
||||
|
||||
dprintk(1, "init user [0x%lx+0x%lx => %lu pages]\n",
|
||||
data, size, dma->nr_pages);
|
||||
|
||||
err = pin_user_pages(data & PAGE_MASK, dma->nr_pages,
|
||||
flags | FOLL_LONGTERM, dma->pages, NULL);
|
||||
err = pin_user_pages(data & PAGE_MASK, dma->nr_pages, gup_flags,
|
||||
dma->pages, NULL);
|
||||
|
||||
if (err != dma->nr_pages) {
|
||||
dma->nr_pages = (err >= 0) ? err : 0;
|
||||
|
@ -2312,8 +2312,7 @@ static int get_user_memory(struct hl_device *hdev, u64 addr, u64 size,
|
||||
if (!userptr->pages)
|
||||
return -ENOMEM;
|
||||
|
||||
rc = pin_user_pages_fast(start, npages,
|
||||
FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM,
|
||||
rc = pin_user_pages_fast(start, npages, FOLL_WRITE | FOLL_LONGTERM,
|
||||
userptr->pages);
|
||||
|
||||
if (rc != npages) {
|
||||
|
221
fs/dax.c
221
fs/dax.c
@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry)
|
||||
for (pfn = dax_to_pfn(entry); \
|
||||
pfn < dax_end_pfn(entry); pfn++)
|
||||
|
||||
static inline bool dax_mapping_is_cow(struct address_space *mapping)
|
||||
static inline bool dax_page_is_shared(struct page *page)
|
||||
{
|
||||
return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
|
||||
return page->mapping == PAGE_MAPPING_DAX_SHARED;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
|
||||
* Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
|
||||
* refcount.
|
||||
*/
|
||||
static inline void dax_mapping_set_cow(struct page *page)
|
||||
static inline void dax_page_share_get(struct page *page)
|
||||
{
|
||||
if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
|
||||
if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
|
||||
/*
|
||||
* Reset the index if the page was already mapped
|
||||
* regularly before.
|
||||
*/
|
||||
if (page->mapping)
|
||||
page->index = 1;
|
||||
page->mapping = (void *)PAGE_MAPPING_DAX_COW;
|
||||
page->share = 1;
|
||||
page->mapping = PAGE_MAPPING_DAX_SHARED;
|
||||
}
|
||||
page->index++;
|
||||
page->share++;
|
||||
}
|
||||
|
||||
static inline unsigned long dax_page_share_put(struct page *page)
|
||||
{
|
||||
return --page->share;
|
||||
}
|
||||
|
||||
/*
|
||||
* When it is called in dax_insert_entry(), the cow flag will indicate that
|
||||
* When it is called in dax_insert_entry(), the shared flag will indicate that
|
||||
* whether this entry is shared by multiple files. If so, set the page->mapping
|
||||
* FS_DAX_MAPPING_COW, and use page->index as refcount.
|
||||
* PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
|
||||
*/
|
||||
static void dax_associate_entry(void *entry, struct address_space *mapping,
|
||||
struct vm_area_struct *vma, unsigned long address, bool cow)
|
||||
struct vm_area_struct *vma, unsigned long address, bool shared)
|
||||
{
|
||||
unsigned long size = dax_entry_size(entry), pfn, index;
|
||||
int i = 0;
|
||||
@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
|
||||
for_each_mapped_pfn(entry, pfn) {
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
if (cow) {
|
||||
dax_mapping_set_cow(page);
|
||||
if (shared) {
|
||||
dax_page_share_get(page);
|
||||
} else {
|
||||
WARN_ON_ONCE(page->mapping);
|
||||
page->mapping = mapping;
|
||||
@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
|
||||
if (dax_mapping_is_cow(page->mapping)) {
|
||||
/* keep the CoW flag if this page is still shared */
|
||||
if (page->index-- > 0)
|
||||
if (dax_page_is_shared(page)) {
|
||||
/* keep the shared flag if this page is still shared */
|
||||
if (dax_page_share_put(page) > 0)
|
||||
continue;
|
||||
} else
|
||||
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
|
||||
@ -840,12 +846,6 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
|
||||
(iter->iomap.flags & IOMAP_F_DIRTY);
|
||||
}
|
||||
|
||||
static bool dax_fault_is_cow(const struct iomap_iter *iter)
|
||||
{
|
||||
return (iter->flags & IOMAP_WRITE) &&
|
||||
(iter->iomap.flags & IOMAP_F_SHARED);
|
||||
}
|
||||
|
||||
/*
|
||||
* By this point grab_mapping_entry() has ensured that we have a locked entry
|
||||
* of the appropriate size so we don't have to worry about downgrading PMDs to
|
||||
@ -859,13 +859,14 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
|
||||
{
|
||||
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
|
||||
void *new_entry = dax_make_entry(pfn, flags);
|
||||
bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
|
||||
bool cow = dax_fault_is_cow(iter);
|
||||
bool write = iter->flags & IOMAP_WRITE;
|
||||
bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
|
||||
bool shared = iter->iomap.flags & IOMAP_F_SHARED;
|
||||
|
||||
if (dirty)
|
||||
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
|
||||
|
||||
if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
|
||||
if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
|
||||
unsigned long index = xas->xa_index;
|
||||
/* we are replacing a zero page with block mapping */
|
||||
if (dax_is_pmd_entry(entry))
|
||||
@ -877,12 +878,12 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
|
||||
|
||||
xas_reset(xas);
|
||||
xas_lock_irq(xas);
|
||||
if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
|
||||
if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
|
||||
void *old;
|
||||
|
||||
dax_disassociate_entry(entry, mapping, false);
|
||||
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
|
||||
cow);
|
||||
shared);
|
||||
/*
|
||||
* Only swap our new entry into the page cache if the current
|
||||
* entry is a zero page or an empty entry. If a normal PTE or
|
||||
@ -902,7 +903,7 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
|
||||
if (dirty)
|
||||
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
|
||||
|
||||
if (cow)
|
||||
if (write && shared)
|
||||
xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
|
||||
|
||||
xas_unlock_irq(xas);
|
||||
@ -1086,7 +1087,8 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
|
||||
}
|
||||
|
||||
/**
|
||||
* dax_iomap_cow_copy - Copy the data from source to destination before write
|
||||
* dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
|
||||
* by copying the data before and after the range to be written.
|
||||
* @pos: address to do copy from.
|
||||
* @length: size of copy operation.
|
||||
* @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
|
||||
@ -1095,35 +1097,50 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
|
||||
*
|
||||
* This can be called from two places. Either during DAX write fault (page
|
||||
* aligned), to copy the length size data to daddr. Or, while doing normal DAX
|
||||
* write operation, dax_iomap_actor() might call this to do the copy of either
|
||||
* write operation, dax_iomap_iter() might call this to do the copy of either
|
||||
* start or end unaligned address. In the latter case the rest of the copy of
|
||||
* aligned ranges is taken care by dax_iomap_actor() itself.
|
||||
* aligned ranges is taken care by dax_iomap_iter() itself.
|
||||
* If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
|
||||
* area to make sure no old data remains.
|
||||
*/
|
||||
static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
|
||||
static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
|
||||
const struct iomap *srcmap, void *daddr)
|
||||
{
|
||||
loff_t head_off = pos & (align_size - 1);
|
||||
size_t size = ALIGN(head_off + length, align_size);
|
||||
loff_t end = pos + length;
|
||||
loff_t pg_end = round_up(end, align_size);
|
||||
/* copy_all is usually in page fault case */
|
||||
bool copy_all = head_off == 0 && end == pg_end;
|
||||
/* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
|
||||
bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
|
||||
srcmap->type == IOMAP_UNWRITTEN;
|
||||
void *saddr = 0;
|
||||
int ret = 0;
|
||||
|
||||
ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (!zero_edge) {
|
||||
ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (copy_all) {
|
||||
ret = copy_mc_to_kernel(daddr, saddr, length);
|
||||
return ret ? -EIO : 0;
|
||||
if (zero_edge)
|
||||
memset(daddr, 0, size);
|
||||
else
|
||||
ret = copy_mc_to_kernel(daddr, saddr, length);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Copy the head part of the range */
|
||||
if (head_off) {
|
||||
ret = copy_mc_to_kernel(daddr, saddr, head_off);
|
||||
if (ret)
|
||||
return -EIO;
|
||||
if (zero_edge)
|
||||
memset(daddr, 0, head_off);
|
||||
else {
|
||||
ret = copy_mc_to_kernel(daddr, saddr, head_off);
|
||||
if (ret)
|
||||
return -EIO;
|
||||
}
|
||||
}
|
||||
|
||||
/* Copy the tail part of the range */
|
||||
@ -1131,12 +1148,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
|
||||
loff_t tail_off = head_off + length;
|
||||
loff_t tail_len = pg_end - end;
|
||||
|
||||
ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
|
||||
tail_len);
|
||||
if (ret)
|
||||
return -EIO;
|
||||
if (zero_edge)
|
||||
memset(daddr + tail_off, 0, tail_len);
|
||||
else {
|
||||
ret = copy_mc_to_kernel(daddr + tail_off,
|
||||
saddr + tail_off, tail_len);
|
||||
if (ret)
|
||||
return -EIO;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
out:
|
||||
if (zero_edge)
|
||||
dax_flush(srcmap->dax_dev, daddr, size);
|
||||
return ret ? -EIO : 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1221,6 +1245,58 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
|
||||
}
|
||||
#endif /* CONFIG_FS_DAX_PMD */
|
||||
|
||||
static s64 dax_unshare_iter(struct iomap_iter *iter)
|
||||
{
|
||||
struct iomap *iomap = &iter->iomap;
|
||||
const struct iomap *srcmap = iomap_iter_srcmap(iter);
|
||||
loff_t pos = iter->pos;
|
||||
loff_t length = iomap_length(iter);
|
||||
int id = 0;
|
||||
s64 ret = 0;
|
||||
void *daddr = NULL, *saddr = NULL;
|
||||
|
||||
/* don't bother with blocks that are not shared to start with */
|
||||
if (!(iomap->flags & IOMAP_F_SHARED))
|
||||
return length;
|
||||
/* don't bother with holes or unwritten extents */
|
||||
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
|
||||
return length;
|
||||
|
||||
id = dax_read_lock();
|
||||
ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
|
||||
if (ret < 0)
|
||||
goto out_unlock;
|
||||
|
||||
ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
|
||||
if (ret < 0)
|
||||
goto out_unlock;
|
||||
|
||||
ret = copy_mc_to_kernel(daddr, saddr, length);
|
||||
if (ret)
|
||||
ret = -EIO;
|
||||
|
||||
out_unlock:
|
||||
dax_read_unlock(id);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
|
||||
const struct iomap_ops *ops)
|
||||
{
|
||||
struct iomap_iter iter = {
|
||||
.inode = inode,
|
||||
.pos = pos,
|
||||
.len = len,
|
||||
.flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
|
||||
};
|
||||
int ret;
|
||||
|
||||
while ((ret = iomap_iter(&iter, ops)) > 0)
|
||||
iter.processed = dax_unshare_iter(&iter);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dax_file_unshare);
|
||||
|
||||
static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
|
||||
{
|
||||
const struct iomap *iomap = &iter->iomap;
|
||||
@ -1235,13 +1311,10 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
memset(kaddr + offset, 0, size);
|
||||
if (srcmap->addr != iomap->addr) {
|
||||
ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
|
||||
kaddr);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
|
||||
} else
|
||||
if (iomap->flags & IOMAP_F_SHARED)
|
||||
ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
|
||||
kaddr);
|
||||
else
|
||||
dax_flush(iomap->dax_dev, kaddr + offset, size);
|
||||
return ret;
|
||||
}
|
||||
@ -1258,6 +1331,15 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
|
||||
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
|
||||
return length;
|
||||
|
||||
/*
|
||||
* invalidate the pages whose sharing state is to be changed
|
||||
* because of CoW.
|
||||
*/
|
||||
if (iomap->flags & IOMAP_F_SHARED)
|
||||
invalidate_inode_pages2_range(iter->inode->i_mapping,
|
||||
pos >> PAGE_SHIFT,
|
||||
(pos + length - 1) >> PAGE_SHIFT);
|
||||
|
||||
do {
|
||||
unsigned offset = offset_in_page(pos);
|
||||
unsigned size = min_t(u64, PAGE_SIZE - offset, length);
|
||||
@ -1318,12 +1400,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
|
||||
struct iov_iter *iter)
|
||||
{
|
||||
const struct iomap *iomap = &iomi->iomap;
|
||||
const struct iomap *srcmap = &iomi->srcmap;
|
||||
const struct iomap *srcmap = iomap_iter_srcmap(iomi);
|
||||
loff_t length = iomap_length(iomi);
|
||||
loff_t pos = iomi->pos;
|
||||
struct dax_device *dax_dev = iomap->dax_dev;
|
||||
loff_t end = pos + length, done = 0;
|
||||
bool write = iov_iter_rw(iter) == WRITE;
|
||||
bool cow = write && iomap->flags & IOMAP_F_SHARED;
|
||||
ssize_t ret = 0;
|
||||
size_t xfer;
|
||||
int id;
|
||||
@ -1350,7 +1433,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
|
||||
* into page tables. We have to tear down these mappings so that data
|
||||
* written by write(2) is visible in mmap.
|
||||
*/
|
||||
if (iomap->flags & IOMAP_F_NEW) {
|
||||
if (iomap->flags & IOMAP_F_NEW || cow) {
|
||||
invalidate_inode_pages2_range(iomi->inode->i_mapping,
|
||||
pos >> PAGE_SHIFT,
|
||||
(end - 1) >> PAGE_SHIFT);
|
||||
@ -1384,10 +1467,9 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
|
||||
break;
|
||||
}
|
||||
|
||||
if (write &&
|
||||
srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
|
||||
ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
|
||||
kaddr);
|
||||
if (cow) {
|
||||
ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
|
||||
srcmap, kaddr);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
@ -1532,7 +1614,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
|
||||
struct xa_state *xas, void **entry, bool pmd)
|
||||
{
|
||||
const struct iomap *iomap = &iter->iomap;
|
||||
const struct iomap *srcmap = &iter->srcmap;
|
||||
const struct iomap *srcmap = iomap_iter_srcmap(iter);
|
||||
size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
|
||||
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
|
||||
bool write = iter->flags & IOMAP_WRITE;
|
||||
@ -1563,9 +1645,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
|
||||
|
||||
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
|
||||
|
||||
if (write &&
|
||||
srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
|
||||
err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
|
||||
if (write && iomap->flags & IOMAP_F_SHARED) {
|
||||
err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
|
||||
if (err)
|
||||
return dax_fault_return(err);
|
||||
}
|
||||
@ -1936,15 +2017,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
|
||||
.len = len,
|
||||
.flags = IOMAP_DAX,
|
||||
};
|
||||
int ret;
|
||||
int ret, compared = 0;
|
||||
|
||||
while ((ret = iomap_iter(&src_iter, ops)) > 0) {
|
||||
while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
|
||||
dst_iter.processed = dax_range_compare_iter(&src_iter,
|
||||
&dst_iter, len, same);
|
||||
}
|
||||
if (ret <= 0)
|
||||
src_iter.processed = ret;
|
||||
while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
|
||||
(ret = iomap_iter(&dst_iter, ops)) > 0) {
|
||||
compared = dax_range_compare_iter(&src_iter, &dst_iter, len,
|
||||
same);
|
||||
if (compared < 0)
|
||||
return ret;
|
||||
src_iter.processed = dst_iter.processed = compared;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -345,11 +345,6 @@ static void exfat_readahead(struct readahead_control *rac)
|
||||
mpage_readahead(rac, exfat_get_block);
|
||||
}
|
||||
|
||||
static int exfat_writepage(struct page *page, struct writeback_control *wbc)
|
||||
{
|
||||
return block_write_full_page(page, exfat_get_block, wbc);
|
||||
}
|
||||
|
||||
static int exfat_writepages(struct address_space *mapping,
|
||||
struct writeback_control *wbc)
|
||||
{
|
||||
@ -473,12 +468,12 @@ static const struct address_space_operations exfat_aops = {
|
||||
.invalidate_folio = block_invalidate_folio,
|
||||
.read_folio = exfat_read_folio,
|
||||
.readahead = exfat_readahead,
|
||||
.writepage = exfat_writepage,
|
||||
.writepages = exfat_writepages,
|
||||
.write_begin = exfat_write_begin,
|
||||
.write_end = exfat_write_end,
|
||||
.direct_IO = exfat_direct_IO,
|
||||
.bmap = exfat_aop_bmap
|
||||
.bmap = exfat_aop_bmap,
|
||||
.migrate_folio = buffer_migrate_folio,
|
||||
};
|
||||
|
||||
static inline unsigned long exfat_hash(loff_t i_pos)
|
||||
|
@ -253,6 +253,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
|
||||
{
|
||||
struct inode *orig_inode = file_inode(o_filp);
|
||||
struct page *pagep[2] = {NULL, NULL};
|
||||
struct folio *folio[2] = {NULL, NULL};
|
||||
handle_t *handle;
|
||||
ext4_lblk_t orig_blk_offset, donor_blk_offset;
|
||||
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
|
||||
@ -313,6 +314,13 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
|
||||
* hold page's lock, if it is still the case data copy is not
|
||||
* necessary, just swap data blocks between orig and donor.
|
||||
*/
|
||||
folio[0] = page_folio(pagep[0]);
|
||||
folio[1] = page_folio(pagep[1]);
|
||||
|
||||
VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]);
|
||||
VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]);
|
||||
VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]);
|
||||
|
||||
if (unwritten) {
|
||||
ext4_double_down_write_data_sem(orig_inode, donor_inode);
|
||||
/* If any of extents in range became initialized we have to
|
||||
@ -331,10 +339,10 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
|
||||
ext4_double_up_write_data_sem(orig_inode, donor_inode);
|
||||
goto data_copy;
|
||||
}
|
||||
if ((page_has_private(pagep[0]) &&
|
||||
!try_to_release_page(pagep[0], 0)) ||
|
||||
(page_has_private(pagep[1]) &&
|
||||
!try_to_release_page(pagep[1], 0))) {
|
||||
if ((folio_has_private(folio[0]) &&
|
||||
!filemap_release_folio(folio[0], 0)) ||
|
||||
(folio_has_private(folio[1]) &&
|
||||
!filemap_release_folio(folio[1], 0))) {
|
||||
*err = -EBUSY;
|
||||
goto drop_data_sem;
|
||||
}
|
||||
@ -344,19 +352,21 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
|
||||
block_len_in_page, 1, err);
|
||||
drop_data_sem:
|
||||
ext4_double_up_write_data_sem(orig_inode, donor_inode);
|
||||
goto unlock_pages;
|
||||
goto unlock_folios;
|
||||
}
|
||||
data_copy:
|
||||
*err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
|
||||
*err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size);
|
||||
if (*err)
|
||||
goto unlock_pages;
|
||||
goto unlock_folios;
|
||||
|
||||
/* At this point all buffers in range are uptodate, old mapping layout
|
||||
* is no longer required, try to drop it now. */
|
||||
if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
|
||||
(page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
|
||||
if ((folio_has_private(folio[0]) &&
|
||||
!filemap_release_folio(folio[0], 0)) ||
|
||||
(folio_has_private(folio[1]) &&
|
||||
!filemap_release_folio(folio[1], 0))) {
|
||||
*err = -EBUSY;
|
||||
goto unlock_pages;
|
||||
goto unlock_folios;
|
||||
}
|
||||
ext4_double_down_write_data_sem(orig_inode, donor_inode);
|
||||
replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
|
||||
@ -369,13 +379,13 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
|
||||
replaced_size =
|
||||
block_len_in_page << orig_inode->i_blkbits;
|
||||
} else
|
||||
goto unlock_pages;
|
||||
goto unlock_folios;
|
||||
}
|
||||
/* Perform all necessary steps similar write_begin()/write_end()
|
||||
* but keeping in mind that i_size will not change */
|
||||
if (!page_has_buffers(pagep[0]))
|
||||
create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0);
|
||||
bh = page_buffers(pagep[0]);
|
||||
if (!folio_buffers(folio[0]))
|
||||
create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0);
|
||||
bh = folio_buffers(folio[0]);
|
||||
for (i = 0; i < data_offset_in_page; i++)
|
||||
bh = bh->b_this_page;
|
||||
for (i = 0; i < block_len_in_page; i++) {
|
||||
@ -385,7 +395,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
|
||||
bh = bh->b_this_page;
|
||||
}
|
||||
if (!*err)
|
||||
*err = block_commit_write(pagep[0], from, from + replaced_size);
|
||||
*err = block_commit_write(&folio[0]->page, from, from + replaced_size);
|
||||
|
||||
if (unlikely(*err < 0))
|
||||
goto repair_branches;
|
||||
@ -395,11 +405,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
|
||||
*err = ext4_jbd2_inode_add_write(handle, orig_inode,
|
||||
(loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);
|
||||
|
||||
unlock_pages:
|
||||
unlock_page(pagep[0]);
|
||||
put_page(pagep[0]);
|
||||
unlock_page(pagep[1]);
|
||||
put_page(pagep[1]);
|
||||
unlock_folios:
|
||||
folio_unlock(folio[0]);
|
||||
folio_put(folio[0]);
|
||||
folio_unlock(folio[1]);
|
||||
folio_put(folio[1]);
|
||||
stop_journal:
|
||||
ext4_journal_stop(handle);
|
||||
if (*err == -ENOSPC &&
|
||||
@ -430,7 +440,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
|
||||
*err = -EIO;
|
||||
}
|
||||
replaced_count = 0;
|
||||
goto unlock_pages;
|
||||
goto unlock_folios;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -194,11 +194,6 @@ static int fat_get_block(struct inode *inode, sector_t iblock,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int fat_writepage(struct page *page, struct writeback_control *wbc)
|
||||
{
|
||||
return block_write_full_page(page, fat_get_block, wbc);
|
||||
}
|
||||
|
||||
static int fat_writepages(struct address_space *mapping,
|
||||
struct writeback_control *wbc)
|
||||
{
|
||||
@ -346,12 +341,12 @@ static const struct address_space_operations fat_aops = {
|
||||
.invalidate_folio = block_invalidate_folio,
|
||||
.read_folio = fat_read_folio,
|
||||
.readahead = fat_readahead,
|
||||
.writepage = fat_writepage,
|
||||
.writepages = fat_writepages,
|
||||
.write_begin = fat_write_begin,
|
||||
.write_end = fat_write_end,
|
||||
.direct_IO = fat_direct_IO,
|
||||
.bmap = _fat_bmap
|
||||
.bmap = _fat_bmap,
|
||||
.migrate_folio = buffer_migrate_folio,
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -764,11 +764,11 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
|
||||
return ncpy;
|
||||
}
|
||||
|
||||
static int fuse_check_page(struct page *page)
|
||||
static int fuse_check_folio(struct folio *folio)
|
||||
{
|
||||
if (page_mapcount(page) ||
|
||||
page->mapping != NULL ||
|
||||
(page->flags & PAGE_FLAGS_CHECK_AT_PREP &
|
||||
if (folio_mapped(folio) ||
|
||||
folio->mapping != NULL ||
|
||||
(folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
|
||||
~(1 << PG_locked |
|
||||
1 << PG_referenced |
|
||||
1 << PG_uptodate |
|
||||
@ -778,7 +778,7 @@ static int fuse_check_page(struct page *page)
|
||||
1 << PG_reclaim |
|
||||
1 << PG_waiters |
|
||||
LRU_GEN_MASK | LRU_REFS_MASK))) {
|
||||
dump_page(page, "fuse: trying to steal weird page");
|
||||
dump_page(&folio->page, "fuse: trying to steal weird page");
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
@ -787,11 +787,11 @@ static int fuse_check_page(struct page *page)
|
||||
static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
|
||||
{
|
||||
int err;
|
||||
struct page *oldpage = *pagep;
|
||||
struct page *newpage;
|
||||
struct folio *oldfolio = page_folio(*pagep);
|
||||
struct folio *newfolio;
|
||||
struct pipe_buffer *buf = cs->pipebufs;
|
||||
|
||||
get_page(oldpage);
|
||||
folio_get(oldfolio);
|
||||
err = unlock_request(cs->req);
|
||||
if (err)
|
||||
goto out_put_old;
|
||||
@ -814,35 +814,36 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
|
||||
if (!pipe_buf_try_steal(cs->pipe, buf))
|
||||
goto out_fallback;
|
||||
|
||||
newpage = buf->page;
|
||||
newfolio = page_folio(buf->page);
|
||||
|
||||
if (!PageUptodate(newpage))
|
||||
SetPageUptodate(newpage);
|
||||
if (!folio_test_uptodate(newfolio))
|
||||
folio_mark_uptodate(newfolio);
|
||||
|
||||
ClearPageMappedToDisk(newpage);
|
||||
folio_clear_mappedtodisk(newfolio);
|
||||
|
||||
if (fuse_check_page(newpage) != 0)
|
||||
if (fuse_check_folio(newfolio) != 0)
|
||||
goto out_fallback_unlock;
|
||||
|
||||
/*
|
||||
* This is a new and locked page, it shouldn't be mapped or
|
||||
* have any special flags on it
|
||||
*/
|
||||
if (WARN_ON(page_mapped(oldpage)))
|
||||
if (WARN_ON(folio_mapped(oldfolio)))
|
||||
goto out_fallback_unlock;
|
||||
if (WARN_ON(page_has_private(oldpage)))
|
||||
if (WARN_ON(folio_has_private(oldfolio)))
|
||||
goto out_fallback_unlock;
|
||||
if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
|
||||
if (WARN_ON(folio_test_dirty(oldfolio) ||
|
||||
folio_test_writeback(oldfolio)))
|
||||
goto out_fallback_unlock;
|
||||
if (WARN_ON(PageMlocked(oldpage)))
|
||||
if (WARN_ON(folio_test_mlocked(oldfolio)))
|
||||
goto out_fallback_unlock;
|
||||
|
||||
replace_page_cache_page(oldpage, newpage);
|
||||
replace_page_cache_folio(oldfolio, newfolio);
|
||||
|
||||
get_page(newpage);
|
||||
folio_get(newfolio);
|
||||
|
||||
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
|
||||
lru_cache_add(newpage);
|
||||
folio_add_lru(newfolio);
|
||||
|
||||
/*
|
||||
* Release while we have extra ref on stolen page. Otherwise
|
||||
@ -855,28 +856,28 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
|
||||
if (test_bit(FR_ABORTED, &cs->req->flags))
|
||||
err = -ENOENT;
|
||||
else
|
||||
*pagep = newpage;
|
||||
*pagep = &newfolio->page;
|
||||
spin_unlock(&cs->req->waitq.lock);
|
||||
|
||||
if (err) {
|
||||
unlock_page(newpage);
|
||||
put_page(newpage);
|
||||
folio_unlock(newfolio);
|
||||
folio_put(newfolio);
|
||||
goto out_put_old;
|
||||
}
|
||||
|
||||
unlock_page(oldpage);
|
||||
folio_unlock(oldfolio);
|
||||
/* Drop ref for ap->pages[] array */
|
||||
put_page(oldpage);
|
||||
folio_put(oldfolio);
|
||||
cs->len = 0;
|
||||
|
||||
err = 0;
|
||||
out_put_old:
|
||||
/* Drop ref obtained in this function */
|
||||
put_page(oldpage);
|
||||
folio_put(oldfolio);
|
||||
return err;
|
||||
|
||||
out_fallback_unlock:
|
||||
unlock_page(newpage);
|
||||
folio_unlock(newfolio);
|
||||
out_fallback:
|
||||
cs->pg = buf->page;
|
||||
cs->offset = buf->offset;
|
||||
|
@ -173,12 +173,12 @@ const struct address_space_operations hfs_aops = {
|
||||
.dirty_folio = block_dirty_folio,
|
||||
.invalidate_folio = block_invalidate_folio,
|
||||
.read_folio = hfs_read_folio,
|
||||
.writepage = hfs_writepage,
|
||||
.write_begin = hfs_write_begin,
|
||||
.write_end = generic_write_end,
|
||||
.bmap = hfs_bmap,
|
||||
.direct_IO = hfs_direct_IO,
|
||||
.writepages = hfs_writepages,
|
||||
.migrate_folio = buffer_migrate_folio,
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -170,12 +170,12 @@ const struct address_space_operations hfsplus_aops = {
|
||||
.dirty_folio = block_dirty_folio,
|
||||
.invalidate_folio = block_invalidate_folio,
|
||||
.read_folio = hfsplus_read_folio,
|
||||
.writepage = hfsplus_writepage,
|
||||
.write_begin = hfsplus_write_begin,
|
||||
.write_end = generic_write_end,
|
||||
.bmap = hfsplus_bmap,
|
||||
.direct_IO = hfsplus_direct_IO,
|
||||
.writepages = hfsplus_writepages,
|
||||
.migrate_folio = buffer_migrate_folio,
|
||||
};
|
||||
|
||||
const struct dentry_operations hfsplus_dentry_operations = {
|
||||
|
@ -163,11 +163,6 @@ static int hpfs_read_folio(struct file *file, struct folio *folio)
|
||||
return mpage_read_folio(folio, hpfs_get_block);
|
||||
}
|
||||
|
||||
static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
|
||||
{
|
||||
return block_write_full_page(page, hpfs_get_block, wbc);
|
||||
}
|
||||
|
||||
static void hpfs_readahead(struct readahead_control *rac)
|
||||
{
|
||||
mpage_readahead(rac, hpfs_get_block);
|
||||
@ -248,12 +243,12 @@ const struct address_space_operations hpfs_aops = {
|
||||
.dirty_folio = block_dirty_folio,
|
||||
.invalidate_folio = block_invalidate_folio,
|
||||
.read_folio = hpfs_read_folio,
|
||||
.writepage = hpfs_writepage,
|
||||
.readahead = hpfs_readahead,
|
||||
.writepages = hpfs_writepages,
|
||||
.write_begin = hpfs_write_begin,
|
||||
.write_end = hpfs_write_end,
|
||||
.bmap = _hpfs_bmap
|
||||
.bmap = _hpfs_bmap,
|
||||
.migrate_folio = buffer_migrate_folio,
|
||||
};
|
||||
|
||||
const struct file_operations hpfs_file_ops =
|
||||
|
@ -370,11 +370,11 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static void hugetlb_delete_from_page_cache(struct page *page)
|
||||
static void hugetlb_delete_from_page_cache(struct folio *folio)
|
||||
{
|
||||
ClearPageDirty(page);
|
||||
ClearPageUptodate(page);
|
||||
delete_from_page_cache(page);
|
||||
folio_clear_dirty(folio);
|
||||
folio_clear_uptodate(folio);
|
||||
filemap_remove_folio(folio);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -580,8 +580,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
|
||||
* map could fail. Correspondingly, the subpool and global
|
||||
* reserve usage count can need to be adjusted.
|
||||
*/
|
||||
VM_BUG_ON(HPageRestoreReserve(&folio->page));
|
||||
hugetlb_delete_from_page_cache(&folio->page);
|
||||
VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
|
||||
hugetlb_delete_from_page_cache(folio);
|
||||
ret = true;
|
||||
if (!truncate_op) {
|
||||
if (unlikely(hugetlb_unreserve_pages(inode, index,
|
||||
@ -1097,10 +1097,10 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
|
||||
if (rc != MIGRATEPAGE_SUCCESS)
|
||||
return rc;
|
||||
|
||||
if (hugetlb_page_subpool(&src->page)) {
|
||||
hugetlb_set_page_subpool(&dst->page,
|
||||
hugetlb_page_subpool(&src->page));
|
||||
hugetlb_set_page_subpool(&src->page, NULL);
|
||||
if (hugetlb_folio_subpool(src)) {
|
||||
hugetlb_set_folio_subpool(dst,
|
||||
hugetlb_folio_subpool(src));
|
||||
hugetlb_set_folio_subpool(src, NULL);
|
||||
}
|
||||
|
||||
if (mode != MIGRATE_SYNC_NO_COPY)
|
||||
@ -1279,7 +1279,7 @@ static const struct address_space_operations hugetlbfs_aops = {
|
||||
|
||||
static void init_once(void *foo)
|
||||
{
|
||||
struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
|
||||
struct hugetlbfs_inode_info *ei = foo;
|
||||
|
||||
inode_init_once(&ei->vfs_inode);
|
||||
}
|
||||
@ -1377,7 +1377,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
|
||||
|
||||
case Opt_size:
|
||||
/* memparse() will accept a K/M/G without a digit */
|
||||
if (!isdigit(param->string[0]))
|
||||
if (!param->string || !isdigit(param->string[0]))
|
||||
goto bad_val;
|
||||
ctx->max_size_opt = memparse(param->string, &rest);
|
||||
ctx->max_val_type = SIZE_STD;
|
||||
@ -1387,7 +1387,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
|
||||
|
||||
case Opt_nr_inodes:
|
||||
/* memparse() will accept a K/M/G without a digit */
|
||||
if (!isdigit(param->string[0]))
|
||||
if (!param->string || !isdigit(param->string[0]))
|
||||
goto bad_val;
|
||||
ctx->nr_inodes = memparse(param->string, &rest);
|
||||
return 0;
|
||||
@ -1403,7 +1403,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
|
||||
|
||||
case Opt_min_size:
|
||||
/* memparse() will accept a K/M/G without a digit */
|
||||
if (!isdigit(param->string[0]))
|
||||
if (!param->string || !isdigit(param->string[0]))
|
||||
goto bad_val;
|
||||
ctx->min_size_opt = memparse(param->string, &rest);
|
||||
ctx->min_val_type = SIZE_STD;
|
||||
|
@ -264,11 +264,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int jfs_writepage(struct page *page, struct writeback_control *wbc)
|
||||
{
|
||||
return block_write_full_page(page, jfs_get_block, wbc);
|
||||
}
|
||||
|
||||
static int jfs_writepages(struct address_space *mapping,
|
||||
struct writeback_control *wbc)
|
||||
{
|
||||
@ -355,12 +350,12 @@ const struct address_space_operations jfs_aops = {
|
||||
.invalidate_folio = block_invalidate_folio,
|
||||
.read_folio = jfs_read_folio,
|
||||
.readahead = jfs_readahead,
|
||||
.writepage = jfs_writepage,
|
||||
.writepages = jfs_writepages,
|
||||
.write_begin = jfs_write_begin,
|
||||
.write_end = jfs_write_end,
|
||||
.bmap = jfs_bmap,
|
||||
.direct_IO = jfs_direct_IO,
|
||||
.migrate_folio = buffer_migrate_folio,
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -294,11 +294,6 @@ static void omfs_readahead(struct readahead_control *rac)
|
||||
mpage_readahead(rac, omfs_get_block);
|
||||
}
|
||||
|
||||
static int omfs_writepage(struct page *page, struct writeback_control *wbc)
|
||||
{
|
||||
return block_write_full_page(page, omfs_get_block, wbc);
|
||||
}
|
||||
|
||||
static int
|
||||
omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
|
||||
{
|
||||
@ -375,10 +370,10 @@ const struct address_space_operations omfs_aops = {
|
||||
.invalidate_folio = block_invalidate_folio,
|
||||
.read_folio = omfs_read_folio,
|
||||
.readahead = omfs_readahead,
|
||||
.writepage = omfs_writepage,
|
||||
.writepages = omfs_writepages,
|
||||
.write_begin = omfs_write_begin,
|
||||
.write_end = generic_write_end,
|
||||
.bmap = omfs_bmap,
|
||||
.migrate_folio = buffer_migrate_folio,
|
||||
};
|
||||
|
||||
|
@ -18,7 +18,6 @@
|
||||
#include <linux/capability.h>
|
||||
#include <linux/elf.h>
|
||||
#include <linux/elfcore.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/printk.h>
|
||||
@ -541,25 +540,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
|
||||
fallthrough;
|
||||
case KCORE_VMEMMAP:
|
||||
case KCORE_TEXT:
|
||||
if (kern_addr_valid(start)) {
|
||||
/*
|
||||
* Using bounce buffer to bypass the
|
||||
* hardened user copy kernel text checks.
|
||||
*/
|
||||
if (copy_from_kernel_nofault(buf, (void *)start,
|
||||
tsz)) {
|
||||
if (clear_user(buffer, tsz)) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
if (copy_to_user(buffer, buf, tsz)) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* Using bounce buffer to bypass the
|
||||
* hardened user copy kernel text checks.
|
||||
*/
|
||||
if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
|
||||
if (clear_user(buffer, tsz)) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
if (clear_user(buffer, tsz)) {
|
||||
if (copy_to_user(buffer, buf, tsz)) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
@ -638,10 +629,6 @@ static int __meminit kcore_callback(struct notifier_block *self,
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
static struct notifier_block kcore_callback_nb __meminitdata = {
|
||||
.notifier_call = kcore_callback,
|
||||
.priority = 0,
|
||||
};
|
||||
|
||||
static struct kcore_list kcore_vmalloc;
|
||||
|
||||
@ -694,7 +681,7 @@ static int __init proc_kcore_init(void)
|
||||
add_modules_range();
|
||||
/* Store direct-map area from physical memory map */
|
||||
kcore_update_ram();
|
||||
register_hotmemory_notifier(&kcore_callback_nb);
|
||||
hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -274,6 +274,7 @@ static void show_vma_header_prefix(struct seq_file *m,
|
||||
static void
|
||||
show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
|
||||
{
|
||||
struct anon_vma_name *anon_name = NULL;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct file *file = vma->vm_file;
|
||||
vm_flags_t flags = vma->vm_flags;
|
||||
@ -293,6 +294,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
|
||||
start = vma->vm_start;
|
||||
end = vma->vm_end;
|
||||
show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
|
||||
if (mm)
|
||||
anon_name = anon_vma_name(vma);
|
||||
|
||||
/*
|
||||
* Print the dentry name for named mappings, and a
|
||||
@ -300,7 +303,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
|
||||
*/
|
||||
if (file) {
|
||||
seq_pad(m, ' ');
|
||||
seq_file_path(m, file, "\n");
|
||||
/*
|
||||
* If user named this anon shared memory via
|
||||
* prctl(PR_SET_VMA ..., use the provided name.
|
||||
*/
|
||||
if (anon_name)
|
||||
seq_printf(m, "[anon_shmem:%s]", anon_name->name);
|
||||
else
|
||||
seq_file_path(m, file, "\n");
|
||||
goto done;
|
||||
}
|
||||
|
||||
@ -312,8 +322,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
|
||||
|
||||
name = arch_vma_name(vma);
|
||||
if (!name) {
|
||||
struct anon_vma_name *anon_name;
|
||||
|
||||
if (!mm) {
|
||||
name = "[vdso]";
|
||||
goto done;
|
||||
@ -330,7 +338,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
|
||||
goto done;
|
||||
}
|
||||
|
||||
anon_name = anon_vma_name(vma);
|
||||
if (anon_name) {
|
||||
seq_pad(m, ' ');
|
||||
seq_printf(m, "[anon:%s]", anon_name->name);
|
||||
@ -667,6 +674,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
|
||||
[ilog2(VM_RAND_READ)] = "rr",
|
||||
[ilog2(VM_DONTCOPY)] = "dc",
|
||||
[ilog2(VM_DONTEXPAND)] = "de",
|
||||
[ilog2(VM_LOCKONFAULT)] = "lf",
|
||||
[ilog2(VM_ACCOUNT)] = "ac",
|
||||
[ilog2(VM_NORESERVE)] = "nr",
|
||||
[ilog2(VM_HUGETLB)] = "ht",
|
||||
|
@ -1138,10 +1138,6 @@ xfs_ioctl_setattr_xflags(
|
||||
if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
|
||||
ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
|
||||
|
||||
/* Don't allow us to set DAX mode for a reflinked file for now. */
|
||||
if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
|
||||
return -EINVAL;
|
||||
|
||||
/* diflags2 only valid for v3 inodes. */
|
||||
i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
|
||||
if (i_flags2 && !xfs_has_v3inodes(mp))
|
||||
|
@ -1215,7 +1215,7 @@ xfs_read_iomap_begin(
|
||||
return error;
|
||||
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
|
||||
&nimaps, 0);
|
||||
if (!error && (flags & IOMAP_REPORT))
|
||||
if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode)))
|
||||
error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
|
||||
xfs_iunlock(ip, lockmode);
|
||||
|
||||
@ -1370,7 +1370,7 @@ xfs_zero_range(
|
||||
|
||||
if (IS_DAX(inode))
|
||||
return dax_zero_range(inode, pos, len, did_zero,
|
||||
&xfs_direct_write_iomap_ops);
|
||||
&xfs_dax_write_iomap_ops);
|
||||
return iomap_zero_range(inode, pos, len, did_zero,
|
||||
&xfs_buffered_write_iomap_ops);
|
||||
}
|
||||
@ -1385,7 +1385,7 @@ xfs_truncate_page(
|
||||
|
||||
if (IS_DAX(inode))
|
||||
return dax_truncate_page(inode, pos, did_zero,
|
||||
&xfs_direct_write_iomap_ops);
|
||||
&xfs_dax_write_iomap_ops);
|
||||
return iomap_truncate_page(inode, pos, did_zero,
|
||||
&xfs_buffered_write_iomap_ops);
|
||||
}
|
||||
|
@ -1187,10 +1187,6 @@ xfs_inode_supports_dax(
|
||||
if (!S_ISREG(VFS_I(ip)->i_mode))
|
||||
return false;
|
||||
|
||||
/* Only supported on non-reflinked files. */
|
||||
if (xfs_is_reflink_inode(ip))
|
||||
return false;
|
||||
|
||||
/* Block size must match page size */
|
||||
if (mp->m_sb.sb_blocksize != PAGE_SIZE)
|
||||
return false;
|
||||
|
@ -1693,8 +1693,12 @@ xfs_reflink_unshare(
|
||||
|
||||
inode_dio_wait(inode);
|
||||
|
||||
error = iomap_file_unshare(inode, offset, len,
|
||||
&xfs_buffered_write_iomap_ops);
|
||||
if (IS_DAX(inode))
|
||||
error = dax_file_unshare(inode, offset, len,
|
||||
&xfs_dax_write_iomap_ops);
|
||||
else
|
||||
error = iomap_file_unshare(inode, offset, len,
|
||||
&xfs_buffered_write_iomap_ops);
|
||||
if (error)
|
||||
goto out;
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user