25 hotfixes, mainly for MM. 13 are cc:stable.

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCY9x+swAKCRDdBJ7gKXxA
 joPwAP95XqB7gzy2l1Mc++Ta7Ih0fS34Pj1vTAxwsRQnqzr6rwD/QOt3YU9KgXpy
 D7Fp8NnaQZq6m5o8cvV5+fBqA3uarAM=
 =IIB8
 -----END PGP SIGNATURE-----

Merge tag 'mm-hotfixes-stable-2023-02-02-19-24-2' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "25 hotfixes, mainly for MM.  13 are cc:stable"

* tag 'mm-hotfixes-stable-2023-02-02-19-24-2' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (26 commits)
  mm: memcg: fix NULL pointer in mem_cgroup_track_foreign_dirty_slowpath()
  Kconfig.debug: fix the help description in SCHED_DEBUG
  mm/swapfile: add cond_resched() in get_swap_pages()
  mm: use stack_depot_early_init for kmemleak
  Squashfs: fix handling and sanity checking of xattr_ids count
  sh: define RUNTIME_DISCARD_EXIT
  highmem: round down the address passed to kunmap_flush_on_unmap()
  migrate: hugetlb: check for hugetlb shared PMD in node migration
  mm: hugetlb: proc: check for hugetlb shared PMD in /proc/PID/smaps
  mm/MADV_COLLAPSE: catch !none !huge !bad pmd lookups
  Revert "mm: kmemleak: alloc gray object for reserved region with direct map"
  freevxfs: Kconfig: fix spelling
  maple_tree: should get pivots boundary by type
  .mailmap: update e-mail address for Eugen Hristev
  mm, mremap: fix mremap() expanding for vma's with vm_ops->close()
  squashfs: harden sanity check in squashfs_read_xattr_id_table
  ia64: fix build error due to switch case label appearing next to declaration
  mm: multi-gen LRU: fix crash during cgroup migration
  Revert "mm: add nodes= arg to memory.reclaim"
  zsmalloc: fix a race with deferred_handles storing
  ...
This commit is contained in:
Linus Torvalds 2023-02-03 10:01:57 -08:00
commit 0c272a1d33
31 changed files with 430 additions and 152 deletions

View File

@ -130,6 +130,7 @@ Domen Puncer <domen@coderock.org>
Douglas Gilbert <dougg@torque.net>
Ed L. Cashin <ecashin@coraid.com>
Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com>
Eugen Hristev <eugen.hristev@collabora.com> <eugen.hristev@microchip.com>
Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Ezequiel Garcia <ezequiel@vanguardiasur.com.ar> <ezequiel@collabora.com>
Felipe W Damasio <felipewd@terra.com.br>

View File

@ -1245,13 +1245,17 @@ PAGE_SIZE multiple when read back.
This is a simple interface to trigger memory reclaim in the
target cgroup.
This file accepts a string which contains the number of bytes to
reclaim.
This file accepts a single key, the number of bytes to reclaim.
No nested keys are currently supported.
Example::
echo "1G" > memory.reclaim
The interface can be later extended with nested keys to
configure the reclaim behavior. For example, specify the
type of memory to reclaim from (anon, file, ..).
Please note that the kernel can over or under reclaim from
the target cgroup. If less bytes are reclaimed than the
specified amount, -EAGAIN is returned.
@ -1263,13 +1267,6 @@ PAGE_SIZE multiple when read back.
This means that the networking layer will not adapt based on
reclaim induced by memory.reclaim.
This file also allows the user to specify the nodes to reclaim from,
via the 'nodes=' key, for example::
echo "1G nodes=0,1" > memory.reclaim
The above instructs the kernel to reclaim memory from nodes 0,1.
memory.peak
A read-only single value file which exists on non-root
cgroups.

View File

@ -170,6 +170,9 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u
asmlinkage long
ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp)
{
struct timespec64 rtn_tp;
s64 tick_ns;
/*
* ia64's clock_gettime() syscall is implemented as a vdso call
* fsys_clock_gettime(). Currently it handles only
@ -185,8 +188,8 @@ ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *
switch (which_clock) {
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq);
struct timespec64 rtn_tp = ns_to_timespec64(tick_ns);
tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq);
rtn_tp = ns_to_timespec64(tick_ns);
return put_timespec64(&rtn_tp, tp);
}

View File

@ -4,6 +4,7 @@
* Written by Niibe Yutaka and Paul Mundt
*/
OUTPUT_ARCH(sh)
#define RUNTIME_DISCARD_EXIT
#include <asm/thread_info.h>
#include <asm/cache.h>
#include <asm/vmlinux.lds.h>

View File

@ -26,7 +26,6 @@
#include <linux/serial_core.h>
#include <linux/sysfs.h>
#include <linux/random.h>
#include <linux/kmemleak.h>
#include <asm/setup.h> /* for COMMAND_LINE_SIZE */
#include <asm/page.h>
@ -525,12 +524,9 @@ static int __init __reserved_mem_reserve_reg(unsigned long node,
size = dt_mem_next_cell(dt_root_size_cells, &prop);
if (size &&
early_init_dt_reserve_memory(base, size, nomap) == 0) {
early_init_dt_reserve_memory(base, size, nomap) == 0)
pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M));
if (!nomap)
kmemleak_alloc_phys(base, size, 0);
}
else
pr_err("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M));

View File

@ -8,7 +8,7 @@ config VXFS_FS
of SCO UnixWare (and possibly others) and optionally available
for Sunsoft Solaris, HP-UX and many other operating systems. However
these particular OS implementations of vxfs may differ in on-disk
data endianess and/or superblock offset. The vxfs module has been
data endianness and/or superblock offset. The vxfs module has been
tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.)
Currently only readonly access is supported and VxFX versions
2, 3 and 4. Tests were performed with HP-UX VxFS version 3.

View File

@ -745,9 +745,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
page = pfn_swap_entry_to_page(swpent);
}
if (page) {
int mapcount = page_mapcount(page);
if (mapcount >= 2)
if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
else
mss->private_hugetlb += huge_page_size(hstate_vma(vma));

View File

@ -183,7 +183,7 @@ static inline int squashfs_block_size(__le32 raw)
#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
sizeof(u64))
/* xattr id lookup table defines */
#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
#define SQUASHFS_XATTR_BYTES(A) (((u64) (A)) * sizeof(struct squashfs_xattr_id))
#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
SQUASHFS_METADATA_SIZE)

View File

@ -63,7 +63,7 @@ struct squashfs_sb_info {
long long bytes_used;
unsigned int inodes;
unsigned int fragments;
int xattr_ids;
unsigned int xattr_ids;
unsigned int ids;
bool panic_on_errors;
const struct squashfs_decompressor_thread_ops *thread_ops;

View File

@ -10,12 +10,12 @@
#ifdef CONFIG_SQUASHFS_XATTR
extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
u64 *, int *);
u64 *, unsigned int *);
extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
unsigned int *, unsigned long long *);
#else
static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
u64 start, u64 *xattr_table_start, int *xattr_ids)
u64 start, u64 *xattr_table_start, unsigned int *xattr_ids)
{
struct squashfs_xattr_id_table *id_table;

View File

@ -56,7 +56,7 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
* Read uncompressed xattr id lookup table indexes from disk into memory
*/
__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
u64 *xattr_table_start, int *xattr_ids)
u64 *xattr_table_start, unsigned int *xattr_ids)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
unsigned int len, indexes;
@ -76,7 +76,7 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
/* Sanity check values */
/* there is always at least one xattr id */
if (*xattr_ids == 0)
if (*xattr_ids <= 0)
return ERR_PTR(-EINVAL);
len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);

View File

@ -200,7 +200,7 @@ static inline void *kmap_local_pfn(unsigned long pfn)
static inline void __kunmap_local(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
kunmap_flush_on_unmap(addr);
kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
#endif
}
@ -227,7 +227,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn)
static inline void __kunmap_atomic(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
kunmap_flush_on_unmap(addr);
kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
#endif
pagefault_enable();
if (IS_ENABLED(CONFIG_PREEMPT_RT))

View File

@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/hugetlb_inline.h>
#include <linux/cgroup.h>
#include <linux/page_ref.h>
#include <linux/list.h>
#include <linux/kref.h>
#include <linux/pgtable.h>
@ -1187,6 +1188,18 @@ static inline __init void hugetlb_cma_reserve(int order)
}
#endif
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
return page_count(virt_to_page(pte)) > 1;
}
#else
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
return false;
}
#endif
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE

View File

@ -1666,10 +1666,13 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
struct bdi_writeback *wb)
{
struct mem_cgroup *memcg;
if (mem_cgroup_disabled())
return;
if (unlikely(&folio_memcg(folio)->css != wb->memcg_css))
memcg = folio_memcg(folio);
if (unlikely(memcg && &memcg->css != wb->memcg_css))
mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
}

View File

@ -418,8 +418,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
unsigned int reclaim_options,
nodemask_t *nodemask);
unsigned int reclaim_options);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
pg_data_t *pgdat,

View File

@ -754,6 +754,7 @@ config DEBUG_KMEMLEAK
select KALLSYMS
select CRC32
select STACKDEPOT
select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF
help
Say Y here if you want to enable the memory leak
detector. The memory allocation/freeing is traced in a way
@ -1207,7 +1208,7 @@ config SCHED_DEBUG
depends on DEBUG_KERNEL && PROC_FS
default y
help
If you say Y here, the /proc/sched_debug file will be provided
If you say Y here, the /sys/kernel/debug/sched file will be provided
that can help debug the scheduler. The runtime overhead of this
option is minimal.

View File

@ -670,12 +670,13 @@ static inline unsigned long mte_pivot(const struct maple_enode *mn,
unsigned char piv)
{
struct maple_node *node = mte_to_node(mn);
enum maple_type type = mte_node_type(mn);
if (piv >= mt_pivots[piv]) {
if (piv >= mt_pivots[type]) {
WARN_ON(1);
return 0;
}
switch (mte_node_type(mn)) {
switch (type) {
case maple_arange_64:
return node->ma64.pivot[piv];
case maple_range_64:
@ -4887,7 +4888,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
unsigned long *pivots, *gaps;
void __rcu **slots;
unsigned long gap = 0;
unsigned long max, min, index;
unsigned long max, min;
unsigned char offset;
if (unlikely(mas_is_err(mas)))
@ -4909,8 +4910,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
min = mas_safe_min(mas, pivots, --offset);
max = mas_safe_pivot(mas, pivots, offset, type);
index = mas->index;
while (index <= max) {
while (mas->index <= max) {
gap = 0;
if (gaps)
gap = gaps[offset];
@ -4941,10 +4941,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
min = mas_safe_min(mas, pivots, offset);
}
if (unlikely(index > max)) {
mas_set_err(mas, -EBUSY);
return false;
}
if (unlikely((mas->index > max) || (size - 1 > max - mas->index)))
goto no_space;
if (unlikely(ma_is_leaf(type))) {
mas->offset = offset;
@ -4961,9 +4959,11 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
return false;
ascend:
if (mte_is_root(mas->node))
mas_set_err(mas, -EBUSY);
if (!mte_is_root(mas->node))
return false;
no_space:
mas_set_err(mas, -EBUSY);
return false;
}

View File

@ -2517,6 +2517,91 @@ static noinline void check_bnode_min_spanning(struct maple_tree *mt)
mt_set_non_kernel(0);
}
static noinline void check_empty_area_window(struct maple_tree *mt)
{
unsigned long i, nr_entries = 20;
MA_STATE(mas, mt, 0, 0);
for (i = 1; i <= nr_entries; i++)
mtree_store_range(mt, i*10, i*10 + 9,
xa_mk_value(i), GFP_KERNEL);
/* Create another hole besides the one at 0 */
mtree_store_range(mt, 160, 169, NULL, GFP_KERNEL);
/* Check lower bounds that don't fit */
rcu_read_lock();
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 10) != -EBUSY);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 6, 90, 5) != -EBUSY);
/* Check lower bound that does fit */
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 5) != 0);
MT_BUG_ON(mt, mas.index != 5);
MT_BUG_ON(mt, mas.last != 9);
rcu_read_unlock();
/* Check one gap that doesn't fit and one that does */
rcu_read_lock();
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 217, 9) != 0);
MT_BUG_ON(mt, mas.index != 161);
MT_BUG_ON(mt, mas.last != 169);
/* Check one gap that does fit above the min */
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 3) != 0);
MT_BUG_ON(mt, mas.index != 216);
MT_BUG_ON(mt, mas.last != 218);
/* Check size that doesn't fit any gap */
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 16) != -EBUSY);
/*
* Check size that doesn't fit the lower end of the window but
* does fit the gap
*/
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 167, 200, 4) != -EBUSY);
/*
* Check size that doesn't fit the upper end of the window but
* does fit the gap
*/
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 162, 4) != -EBUSY);
/* Check mas_empty_area forward */
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 9) != 0);
MT_BUG_ON(mt, mas.index != 0);
MT_BUG_ON(mt, mas.last != 8);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 4) != 0);
MT_BUG_ON(mt, mas.index != 0);
MT_BUG_ON(mt, mas.last != 3);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 11) != -EBUSY);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 5, 100, 6) != -EBUSY);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EBUSY);
mas_reset(&mas);
mas_empty_area(&mas, 100, 165, 3);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 100, 163, 6) != -EBUSY);
rcu_read_unlock();
}
static DEFINE_MTREE(tree);
static int maple_tree_seed(void)
{
@ -2765,6 +2850,10 @@ static int maple_tree_seed(void)
check_bnode_min_spanning(&tree);
mtree_destroy(&tree);
mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
check_empty_area_window(&tree);
mtree_destroy(&tree);
#if defined(BENCH)
skip:
#endif

View File

@ -5051,6 +5051,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = huge_pte_clear_uffd_wp(entry);
set_huge_pte_at(dst, addr, dst_pte, entry);
} else if (unlikely(is_pte_marker(entry))) {
/* No swap on hugetlb */
WARN_ON_ONCE(
is_swapin_error_entry(pte_to_swp_entry(entry)));
/*
* We copy the pte marker only if the dst vma has
* uffd-wp enabled.

View File

@ -847,6 +847,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
/*
* See pmd_trans_unstable() for how the result may change out from
* underneath us, even if we hold mmap_lock in read.
*/
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
unsigned long address,
pmd_t **pmd)
@ -865,8 +869,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
#endif
if (pmd_none(pmde))
return SCAN_PMD_NONE;
if (!pmd_present(pmde))
return SCAN_PMD_NULL;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
if (pmd_devmap(pmde))
return SCAN_PMD_NULL;
if (pmd_bad(pmde))
return SCAN_PMD_NULL;
return SCAN_SUCCEED;
@ -1642,7 +1650,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
* has higher cost too. It would also probably require locking
* the anon_vma.
*/
if (vma->anon_vma) {
if (READ_ONCE(vma->anon_vma)) {
result = SCAN_PAGE_ANON;
goto next;
}
@ -1670,6 +1678,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
result = SCAN_PTE_MAPPED_HUGEPAGE;
if ((cc->is_khugepaged || is_target) &&
mmap_write_trylock(mm)) {
/*
* Re-check whether we have an ->anon_vma, because
* collapse_and_free_pmd() requires that either no
* ->anon_vma exists or the anon_vma is locked.
* We already checked ->anon_vma above, but that check
* is racy because ->anon_vma can be populated under the
* mmap lock in read mode.
*/
if (vma->anon_vma) {
result = SCAN_PAGE_ANON;
goto unlock_next;
}
/*
* When a vma is registered with uffd-wp, we can't
* recycle the pmd pgtable because there can be pte

View File

@ -2070,8 +2070,10 @@ static int __init kmemleak_boot_config(char *str)
return -EINVAL;
if (strcmp(str, "off") == 0)
kmemleak_disable();
else if (strcmp(str, "on") == 0)
else if (strcmp(str, "on") == 0) {
kmemleak_skip_disable = 1;
stack_depot_want_early_init();
}
else
return -EINVAL;
return 0;
@ -2093,7 +2095,6 @@ void __init kmemleak_init(void)
if (kmemleak_error)
return;
stack_depot_init();
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);

View File

@ -63,7 +63,6 @@
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include <linux/parser.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@ -2393,8 +2392,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask,
MEMCG_RECLAIM_MAY_SWAP,
NULL);
MEMCG_RECLAIM_MAY_SWAP);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@ -2685,8 +2683,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, reclaim_options,
NULL);
gfp_mask, reclaim_options);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@ -3506,8 +3503,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP,
NULL)) {
memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
ret = -EBUSY;
break;
}
@ -3618,8 +3614,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
MEMCG_RECLAIM_MAY_SWAP,
NULL))
MEMCG_RECLAIM_MAY_SWAP))
nr_retries--;
}
@ -6429,8 +6424,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
NULL);
GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
if (!reclaimed && !nr_retries--)
break;
@ -6479,8 +6473,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
NULL))
GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
nr_reclaims--;
continue;
}
@ -6603,54 +6596,21 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
enum {
MEMORY_RECLAIM_NODES = 0,
MEMORY_RECLAIM_NULL,
};
static const match_table_t if_tokens = {
{ MEMORY_RECLAIM_NODES, "nodes=%s" },
{ MEMORY_RECLAIM_NULL, NULL },
};
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0;
unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
MEMCG_RECLAIM_PROACTIVE;
char *old_buf, *start;
substring_t args[MAX_OPT_ARGS];
int token;
char value[256];
nodemask_t nodemask = NODE_MASK_ALL;
unsigned int reclaim_options;
int err;
buf = strstrip(buf);
err = page_counter_memparse(buf, "", &nr_to_reclaim);
if (err)
return err;
old_buf = buf;
nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
if (buf == old_buf)
return -EINVAL;
buf = strstrip(buf);
while ((start = strsep(&buf, " ")) != NULL) {
if (!strlen(start))
continue;
token = match_token(start, if_tokens, args);
match_strlcpy(value, args, sizeof(value));
switch (token) {
case MEMORY_RECLAIM_NODES:
if (nodelist_parse(value, nodemask) < 0)
return -EINVAL;
break;
default:
return -EINVAL;
}
}
reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
while (nr_reclaimed < nr_to_reclaim) {
unsigned long reclaimed;
@ -6667,8 +6627,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaimed = try_to_free_mem_cgroup_pages(memcg,
nr_to_reclaim - nr_reclaimed,
GFP_KERNEL, reclaim_options,
&nodemask);
GFP_KERNEL, reclaim_options);
if (!reclaimed && !nr_retries--)
return -EAGAIN;

View File

@ -828,12 +828,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return -EBUSY;
return -ENOENT;
} else if (is_pte_marker_entry(entry)) {
/*
* We're copying the pgtable should only because dst_vma has
* uffd-wp enabled, do sanity check.
*/
WARN_ON_ONCE(!userfaultfd_wp(dst_vma));
set_pte_at(dst_mm, addr, dst_pte, pte);
if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma))
set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
if (!userfaultfd_wp(dst_vma))
@ -3629,8 +3625,12 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
/*
* Be careful so that we will only recover a special uffd-wp pte into a
* none pte. Otherwise it means the pte could have changed, so retry.
*
* This should also cover the case where e.g. the pte changed
* quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
* So is_pte_marker() check is not enough to safely drop the pte.
*/
if (is_pte_marker(*vmf->pte))
if (pte_same(vmf->orig_pte, *vmf->pte))
pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;

View File

@ -600,7 +600,8 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
!hugetlb_pmd_shared(pte))) {
if (isolate_hugetlb(page, qp->pagelist) &&
(flags & MPOL_MF_STRICT))
/*

View File

@ -245,7 +245,13 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
newpte = pte_swp_mksoft_dirty(newpte);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
} else if (pte_marker_entry_uffd_wp(entry)) {
} else if (is_pte_marker_entry(entry)) {
/*
* Ignore swapin errors unconditionally,
* because any access should sigbus anyway.
*/
if (is_swapin_error_entry(entry))
continue;
/*
* If this is uffd-wp pte marker and we'd like
* to unprotect it, drop it; the next page

View File

@ -1027,16 +1027,29 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
/*
* Function vma_merge() is called on the extension we are adding to
* the already existing vma, vma_merge() will merge this extension with
* the already existing vma (expand operation itself) and possibly also
* with the next vma if it becomes adjacent to the expanded vma and
* otherwise compatible.
* Function vma_merge() is called on the extension we
* are adding to the already existing vma, vma_merge()
* will merge this extension with the already existing
* vma (expand operation itself) and possibly also with
* the next vma if it becomes adjacent to the expanded
* vma and otherwise compatible.
*
* However, vma_merge() can currently fail due to
* is_mergeable_vma() check for vm_ops->close (see the
* comment there). Yet this should not prevent vma
* expanding, so perform a simple expand for such vma.
* Ideally the check for close op should be only done
* when a vma would be actually removed due to a merge.
*/
vma = vma_merge(mm, vma, extension_start, extension_end,
if (!vma->vm_ops || !vma->vm_ops->close) {
vma = vma_merge(mm, vma, extension_start, extension_end,
vma->vm_flags, vma->anon_vma, vma->vm_file,
extension_pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
} else if (vma_adjust(vma, vma->vm_start, addr + new_len,
vma->vm_pgoff, NULL)) {
vma = NULL;
}
if (!vma) {
vm_unacct_memory(pages);
ret = -ENOMEM;

View File

@ -1100,6 +1100,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
goto check_out;
pr_debug("scan_swap_map of si %d failed to find offset\n",
si->type);
cond_resched();
spin_lock(&swap_avail_lock);
nextsi:

View File

@ -3323,13 +3323,16 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
if (mem_cgroup_disabled())
return;
/* migration can happen before addition */
if (!mm->lru_gen.memcg)
return;
rcu_read_lock();
memcg = mem_cgroup_from_task(task);
rcu_read_unlock();
if (memcg == mm->lru_gen.memcg)
return;
VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
lru_gen_del_mm(mm);
@ -6754,8 +6757,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
unsigned int reclaim_options,
nodemask_t *nodemask)
unsigned int reclaim_options)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
@ -6770,7 +6772,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
.nodemask = nodemask,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put

View File

@ -113,7 +113,23 @@
* have room for two bit at least.
*/
#define OBJ_ALLOCATED_TAG 1
#define OBJ_TAG_BITS 1
#ifdef CONFIG_ZPOOL
/*
* The second least-significant bit in the object's header identifies if the
* value stored at the header is a deferred handle from the last reclaim
* attempt.
*
* As noted above, this is valid because we have room for two bits.
*/
#define OBJ_DEFERRED_HANDLE_TAG 2
#define OBJ_TAG_BITS 2
#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
#else
#define OBJ_TAG_BITS 1
#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
#endif /* CONFIG_ZPOOL */
#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
@ -222,6 +238,12 @@ struct link_free {
* Handle of allocated object.
*/
unsigned long handle;
#ifdef CONFIG_ZPOOL
/*
* Deferred handle of a reclaimed object.
*/
unsigned long deferred_handle;
#endif
};
};
@ -272,8 +294,6 @@ struct zspage {
/* links the zspage to the lru list in the pool */
struct list_head lru;
bool under_reclaim;
/* list of unfreed handles whose objects have been reclaimed */
unsigned long *deferred_handles;
#endif
struct zs_pool *pool;
@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
int tag)
{
unsigned long handle;
struct zspage *zspage = get_zspage(page);
@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
} else
handle = *(unsigned long *)obj;
if (!(handle & OBJ_ALLOCATED_TAG))
if (!(handle & tag))
return false;
*phandle = handle & ~OBJ_ALLOCATED_TAG;
/* Clear all tags before returning the handle */
*phandle = handle & ~OBJ_TAG_MASK;
return true;
}
static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
{
return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
}
#ifdef CONFIG_ZPOOL
static bool obj_stores_deferred_handle(struct page *page, void *obj,
unsigned long *phandle)
{
return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
}
#endif
static void reset_page(struct page *page)
{
__ClearPageMovable(page);
@ -946,22 +981,36 @@ static int trylock_zspage(struct zspage *zspage)
}
#ifdef CONFIG_ZPOOL
static unsigned long find_deferred_handle_obj(struct size_class *class,
struct page *page, int *obj_idx);
/*
* Free all the deferred handles whose objects are freed in zs_free.
*/
static void free_handles(struct zs_pool *pool, struct zspage *zspage)
static void free_handles(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
unsigned long handle = (unsigned long)zspage->deferred_handles;
int obj_idx = 0;
struct page *page = get_first_page(zspage);
unsigned long handle;
while (handle) {
unsigned long nxt_handle = handle_to_obj(handle);
while (1) {
handle = find_deferred_handle_obj(class, page, &obj_idx);
if (!handle) {
page = get_next_page(page);
if (!page)
break;
obj_idx = 0;
continue;
}
cache_free_handle(pool, handle);
handle = nxt_handle;
obj_idx++;
}
}
#else
static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {}
static inline void free_handles(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage) {}
#endif
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(fg != ZS_EMPTY);
/* Free all deferred handles from zs_free */
free_handles(pool, zspage);
free_handles(pool, class, zspage);
next = page = get_first_page(zspage);
do {
@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
#ifdef CONFIG_ZPOOL
INIT_LIST_HEAD(&zspage->lru);
zspage->under_reclaim = false;
zspage->deferred_handles = NULL;
#endif
set_freeobj(zspage, 0);
@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(zs_malloc);
static void obj_free(int class_size, unsigned long obj)
static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
{
struct link_free *link;
struct zspage *zspage;
@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj)
zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page);
/* Insert this object in containing zspage's freelist */
link = (struct link_free *)(vaddr + f_offset);
if (likely(!ZsHugePage(zspage)))
link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
else
f_page->index = 0;
if (handle) {
#ifdef CONFIG_ZPOOL
/* Stores the (deferred) handle in the object's header */
*handle |= OBJ_DEFERRED_HANDLE_TAG;
*handle &= ~OBJ_ALLOCATED_TAG;
if (likely(!ZsHugePage(zspage)))
link->deferred_handle = *handle;
else
f_page->index = *handle;
#endif
} else {
/* Insert this object in containing zspage's freelist */
if (likely(!ZsHugePage(zspage)))
link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
else
f_page->index = 0;
set_freeobj(zspage, f_objidx);
}
kunmap_atomic(vaddr);
set_freeobj(zspage, f_objidx);
mod_zspage_inuse(zspage, -1);
}
@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
zspage = get_zspage(f_page);
class = zspage_class(pool, zspage);
obj_free(class->size, obj);
class_stat_dec(class, OBJ_USED, 1);
#ifdef CONFIG_ZPOOL
@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
* Reclaim needs the handles during writeback. It'll free
* them along with the zspage when it's done with them.
*
* Record current deferred handle at the memory location
* whose address is given by handle.
* Record current deferred handle in the object's header.
*/
record_obj(handle, (unsigned long)zspage->deferred_handles);
zspage->deferred_handles = (unsigned long *)handle;
obj_free(class->size, obj, &handle);
spin_unlock(&pool->lock);
return;
}
#endif
obj_free(class->size, obj, NULL);
fullness = fix_fullness_group(class, zspage);
if (fullness == ZS_EMPTY)
free_zspage(pool, class, zspage);
@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
}
/*
* Find alloced object in zspage from index object and
* Find object with a certain tag in zspage from index object and
* return handle.
*/
static unsigned long find_alloced_obj(struct size_class *class,
struct page *page, int *obj_idx)
static unsigned long find_tagged_obj(struct size_class *class,
struct page *page, int *obj_idx, int tag)
{
unsigned int offset;
int index = *obj_idx;
@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
offset += class->size * index;
while (offset < PAGE_SIZE) {
if (obj_allocated(page, addr + offset, &handle))
if (obj_tagged(page, addr + offset, &handle, tag))
break;
offset += class->size;
@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class,
return handle;
}
/*
* Find alloced object in zspage from index object and
* return handle.
*/
static unsigned long find_alloced_obj(struct size_class *class,
struct page *page, int *obj_idx)
{
return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
}
#ifdef CONFIG_ZPOOL
/*
* Find object storing a deferred handle in header in zspage from index object
* and return handle.
*/
static unsigned long find_deferred_handle_obj(struct size_class *class,
struct page *page, int *obj_idx)
{
return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
}
#endif
struct zs_compact_control {
/* Source spage for migration which could be a subpage of zspage */
struct page *s_page;
@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
record_obj(handle, free_obj);
obj_free(class->size, used_obj);
obj_free(class->size, used_obj, NULL);
}
/* Remember last position in this iteration */
@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool)
EXPORT_SYMBOL_GPL(zs_destroy_pool);
#ifdef CONFIG_ZPOOL
static void restore_freelist(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
unsigned int obj_idx = 0;
unsigned long handle, off = 0; /* off is within-page offset */
struct page *page = get_first_page(zspage);
struct link_free *prev_free = NULL;
void *prev_page_vaddr = NULL;
/* in case no free object found */
set_freeobj(zspage, (unsigned int)(-1UL));
while (page) {
void *vaddr = kmap_atomic(page);
struct page *next_page;
while (off < PAGE_SIZE) {
void *obj_addr = vaddr + off;
/* skip allocated object */
if (obj_allocated(page, obj_addr, &handle)) {
obj_idx++;
off += class->size;
continue;
}
/* free deferred handle from reclaim attempt */
if (obj_stores_deferred_handle(page, obj_addr, &handle))
cache_free_handle(pool, handle);
if (prev_free)
prev_free->next = obj_idx << OBJ_TAG_BITS;
else /* first free object found */
set_freeobj(zspage, obj_idx);
prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
/* if last free object in a previous page, need to unmap */
if (prev_page_vaddr) {
kunmap_atomic(prev_page_vaddr);
prev_page_vaddr = NULL;
}
obj_idx++;
off += class->size;
}
/*
* Handle the last (full or partial) object on this page.
*/
next_page = get_next_page(page);
if (next_page) {
if (!prev_free || prev_page_vaddr) {
/*
* There is no free object in this page, so we can safely
* unmap it.
*/
kunmap_atomic(vaddr);
} else {
/* update prev_page_vaddr since prev_free is on this page */
prev_page_vaddr = vaddr;
}
} else { /* this is the last page */
if (prev_free) {
/*
* Reset OBJ_TAG_BITS bit to last link to tell
* whether it's allocated object or not.
*/
prev_free->next = -1UL << OBJ_TAG_BITS;
}
/* unmap previous page (if not done yet) */
if (prev_page_vaddr) {
kunmap_atomic(prev_page_vaddr);
prev_page_vaddr = NULL;
}
kunmap_atomic(vaddr);
}
page = next_page;
off %= PAGE_SIZE;
}
}
static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
{
int i, obj_idx, ret = 0;
@ -2561,6 +2728,12 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
return 0;
}
/*
* Eviction fails on one of the handles, so we need to restore zspage.
* We need to rebuild its freelist (and free stored deferred handles),
* put it back to the correct size class, and add it to the LRU list.
*/
restore_freelist(pool, class, zspage);
putback_zspage(class, zspage);
list_add(&zspage->lru, &pool->lru);
unlock_zspage(zspage);

View File

View File

@ -17,7 +17,6 @@
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#define __USE_GNU
#include <fcntl.h>
#define MIN_FREE_PAGES 20