mirror of https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
15 hotfixes. 9 are cc:stable and the remainder address post-6.8 issues
or aren't considered suitable for backporting. There are a significant
number of fixups for this cycle's page_owner changes (series
"page_owner: print stacks and their outstanding allocations"). Apart
from that, singleton changes all over, mainly in MM.

-----BEGIN PGP SIGNATURE-----
iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZiGTewAKCRDdBJ7gKXxA
jt1QAP9QxiU/+gUMVjkHyKaMBHSBMD/CWBFjDfRjx+BPqYx55gD+JWxUXwlyVkMo
Z8fqtCGEgatev1VbwpCwByhvnH9bKgw=
=YBZ9
-----END PGP SIGNATURE-----

Merge tag 'mm-hotfixes-stable-2024-04-18-14-41' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "15 hotfixes. 9 are cc:stable and the remainder address post-6.8
  issues or aren't considered suitable for backporting.

  There are a significant number of fixups for this cycle's page_owner
  changes (series "page_owner: print stacks and their outstanding
  allocations"). Apart from that, singleton changes all over, mainly in
  MM"

* tag 'mm-hotfixes-stable-2024-04-18-14-41' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
  nilfs2: fix OOB in nilfs_set_de_type
  MAINTAINERS: update Naoya Horiguchi's email address
  fork: defer linking file vma until vma is fully initialized
  mm/shmem: inline shmem_is_huge() for disabled transparent hugepages
  mm,page_owner: defer enablement of static branch
  Squashfs: check the inode number is not the invalid value of zero
  mm,swapops: update check in is_pfn_swap_entry for hwpoison entries
  mm/memory-failure: fix deadlock when hugetlb_optimize_vmemmap is enabled
  mm/userfaultfd: allow hugetlb change protection upon poison entry
  mm,page_owner: fix printing of stack records
  mm,page_owner: fix accounting of pages when migrating
  mm,page_owner: fix refcount imbalance
  mm,page_owner: update metadata for tail pages
  userfaultfd: change src_folio after ensuring it's unpinned in UFFDIO_MOVE
  mm/madvise: make MADV_POPULATE_(READ|WRITE) handle VM_FAULT_RETRY properly
This commit is contained in:

commit 54c23548e0

.mailmap | 3

@@ -446,7 +446,8 @@ Mythri P K <mythripk@ti.com>
 Nadav Amit <nadav.amit@gmail.com> <namit@vmware.com>
 Nadav Amit <nadav.amit@gmail.com> <namit@cs.technion.ac.il>
 Nadia Yvette Chambers <nyc@holomorphy.com> William Lee Irwin III <wli@holomorphy.com>
-Naoya Horiguchi <naoya.horiguchi@nec.com> <n-horiguchi@ah.jp.nec.com>
+Naoya Horiguchi <nao.horiguchi@gmail.com> <n-horiguchi@ah.jp.nec.com>
+Naoya Horiguchi <nao.horiguchi@gmail.com> <naoya.horiguchi@nec.com>
 Nathan Chancellor <nathan@kernel.org> <natechancellor@gmail.com>
 Neeraj Upadhyay <quic_neeraju@quicinc.com> <neeraju@codeaurora.org>
 Neil Armstrong <neil.armstrong@linaro.org> <narmstrong@baylibre.com>
Documentation/mm/page_owner.rst

@@ -24,10 +24,10 @@ fragmentation statistics can be obtained through gfp flag information of
 each page. It is already implemented and activated if page owner is
 enabled. Other usages are more than welcome.

-It can also be used to show all the stacks and their outstanding
-allocations, which gives us a quick overview of where the memory is going
-without the need to screen through all the pages and match the allocation
-and free operation.
+It can also be used to show all the stacks and their current number of
+allocated base pages, which gives us a quick overview of where the memory
+is going without the need to screen through all the pages and match the
+allocation and free operation.

 page owner is disabled by default. So, if you'd like to use it, you need
 to add "page_owner=on" to your boot cmdline. If the kernel is built

@@ -75,42 +75,45 @@ Usage

 	cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
 	cat stacks.txt
-	prep_new_page+0xa9/0x120
-	get_page_from_freelist+0x7e6/0x2140
-	__alloc_pages+0x18a/0x370
-	new_slab+0xc8/0x580
-	___slab_alloc+0x1f2/0xaf0
-	__slab_alloc.isra.86+0x22/0x40
-	kmem_cache_alloc+0x31b/0x350
-	__khugepaged_enter+0x39/0x100
-	dup_mmap+0x1c7/0x5ce
-	copy_process+0x1afe/0x1c90
-	kernel_clone+0x9a/0x3c0
-	__do_sys_clone+0x66/0x90
-	do_syscall_64+0x7f/0x160
-	entry_SYSCALL_64_after_hwframe+0x6c/0x74
-	stack_count: 234
+	post_alloc_hook+0x177/0x1a0
+	get_page_from_freelist+0xd01/0xd80
+	__alloc_pages+0x39e/0x7e0
+	allocate_slab+0xbc/0x3f0
+	___slab_alloc+0x528/0x8a0
+	kmem_cache_alloc+0x224/0x3b0
+	sk_prot_alloc+0x58/0x1a0
+	sk_alloc+0x32/0x4f0
+	inet_create+0x427/0xb50
+	__sock_create+0x2e4/0x650
+	inet_ctl_sock_create+0x30/0x180
+	igmp_net_init+0xc1/0x130
+	ops_init+0x167/0x410
+	setup_net+0x304/0xa60
+	copy_net_ns+0x29b/0x4a0
+	create_new_namespaces+0x4a1/0x820
+	nr_base_pages: 16
 	...
 	...
 	echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold
 	cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt
 	cat stacks_7000.txt
-	prep_new_page+0xa9/0x120
-	get_page_from_freelist+0x7e6/0x2140
-	__alloc_pages+0x18a/0x370
-	alloc_pages_mpol+0xdf/0x1e0
-	folio_alloc+0x14/0x50
-	filemap_alloc_folio+0xb0/0x100
-	page_cache_ra_unbounded+0x97/0x180
-	filemap_fault+0x4b4/0x1200
-	__do_fault+0x2d/0x110
-	do_pte_missing+0x4b0/0xa30
-	__handle_mm_fault+0x7fa/0xb70
-	handle_mm_fault+0x125/0x300
-	do_user_addr_fault+0x3c9/0x840
-	exc_page_fault+0x68/0x150
-	asm_exc_page_fault+0x22/0x30
-	stack_count: 8248
+	post_alloc_hook+0x177/0x1a0
+	get_page_from_freelist+0xd01/0xd80
+	__alloc_pages+0x39e/0x7e0
+	alloc_pages_mpol+0x22e/0x490
+	folio_alloc+0xd5/0x110
+	filemap_alloc_folio+0x78/0x230
+	page_cache_ra_order+0x287/0x6f0
+	filemap_get_pages+0x517/0x1160
+	filemap_read+0x304/0x9f0
+	xfs_file_buffered_read+0xe6/0x1d0 [xfs]
+	xfs_file_read_iter+0x1f0/0x380 [xfs]
+	__kernel_read+0x3b9/0x730
+	kernel_read_file+0x309/0x4d0
+	__do_sys_finit_module+0x381/0x730
+	do_syscall_64+0x8d/0x150
+	entry_SYSCALL_64_after_hwframe+0x62/0x6a
+	nr_base_pages: 20824
 	...

 	cat /sys/kernel/debug/page_owner > page_owner_full.txt
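The documentation hunk above already demonstrates the debugfs interface with cat and echo. As an illustration only (not part of the merged patches), a small userspace sketch can post-process the show_stacks output and keep just the stacks whose nr_base_pages meets a threshold. The debugfs path and the "nr_base_pages:" record terminator are taken from the documentation; the program itself and its THRESHOLD value are hypothetical.

/*
 * Hypothetical helper: print only page_owner stacks whose nr_base_pages
 * is at least THRESHOLD. Assumes debugfs is mounted at /sys/kernel/debug
 * and the kernel was booted with page_owner=on; the record format
 * ("<frames...>\nnr_base_pages: N") follows the documentation above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SHOW_STACKS "/sys/kernel/debug/page_owner_stacks/show_stacks"
#define THRESHOLD   7000UL

int main(void)
{
	FILE *f = fopen(SHOW_STACKS, "r");
	char line[512], block[16384];
	size_t used = 0;

	if (!f) {
		perror(SHOW_STACKS);
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		unsigned long pages;

		/* Collect the frames belonging to the current record. */
		if (used + strlen(line) < sizeof(block)) {
			memcpy(block + used, line, strlen(line) + 1);
			used += strlen(line);
		}

		/* A record ends with its base-page count; filter on it. */
		if (sscanf(line, " nr_base_pages: %lu", &pages) == 1) {
			if (pages >= THRESHOLD)
				fputs(block, stdout);
			used = 0;
			block[0] = '\0';
		}
	}

	fclose(f);
	return 0;
}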
MAINTAINERS

@@ -10024,7 +10024,7 @@ F:	drivers/media/platform/st/sti/hva

 HWPOISON MEMORY FAILURE HANDLING
 M:	Miaohe Lin <linmiaohe@huawei.com>
-R:	Naoya Horiguchi <naoya.horiguchi@nec.com>
+R:	Naoya Horiguchi <nao.horiguchi@gmail.com>
 L:	linux-mm@kvack.org
 S:	Maintained
 F:	mm/hwpoison-inject.c
fs/nilfs2/dir.c

@@ -240,7 +240,7 @@ nilfs_filetype_table[NILFS_FT_MAX] = {

 #define S_SHIFT 12
 static unsigned char
-nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+nilfs_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
 	[S_IFREG >> S_SHIFT]	= NILFS_FT_REG_FILE,
 	[S_IFDIR >> S_SHIFT]	= NILFS_FT_DIR,
 	[S_IFCHR >> S_SHIFT]	= NILFS_FT_CHRDEV,
fs/squashfs/inode.c

@@ -48,6 +48,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
 	gid_t i_gid;
 	int err;

+	inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
+	if (inode->i_ino == 0)
+		return -EINVAL;
+
 	err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid);
 	if (err)
 		return err;

@@ -58,7 +62,6 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,

 	i_uid_write(inode, i_uid);
 	i_gid_write(inode, i_gid);
-	inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
 	inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0);
 	inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
 	inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);
include/linux/shmem_fs.h

@@ -110,8 +110,17 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 int shmem_unuse(unsigned int type);

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
 			  struct mm_struct *mm, unsigned long vm_flags);
+#else
+static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
+					  struct mm_struct *mm, unsigned long vm_flags)
+{
+	return false;
+}
+#endif
+
 #ifdef CONFIG_SHMEM
 extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
 #else
include/linux/swapops.h

@@ -390,6 +390,35 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry)
 }
 #endif	/* CONFIG_MIGRATION */

+#ifdef CONFIG_MEMORY_FAILURE
+
+/*
+ * Support for hardware poisoned pages
+ */
+static inline swp_entry_t make_hwpoison_entry(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+	return swp_entry(SWP_HWPOISON, page_to_pfn(page));
+}
+
+static inline int is_hwpoison_entry(swp_entry_t entry)
+{
+	return swp_type(entry) == SWP_HWPOISON;
+}
+
+#else
+
+static inline swp_entry_t make_hwpoison_entry(struct page *page)
+{
+	return swp_entry(0, 0);
+}
+
+static inline int is_hwpoison_entry(swp_entry_t swp)
+{
+	return 0;
+}
+#endif
+
 typedef unsigned long pte_marker;

 #define PTE_MARKER_UFFD_WP	BIT(0)

@@ -483,8 +512,9 @@ static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)

 /*
  * A pfn swap entry is a special type of swap entry that always has a pfn stored
- * in the swap offset. They are used to represent unaddressable device memory
- * and to restrict access to a page undergoing migration.
+ * in the swap offset. They can either be used to represent unaddressable device
+ * memory, to restrict access to a page undergoing migration or to represent a
+ * pfn which has been hwpoisoned and unmapped.
  */
 static inline bool is_pfn_swap_entry(swp_entry_t entry)
 {

@@ -492,7 +522,7 @@ static inline bool is_pfn_swap_entry(swp_entry_t entry)
 	BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);

 	return is_migration_entry(entry) || is_device_private_entry(entry) ||
-	       is_device_exclusive_entry(entry);
+	       is_device_exclusive_entry(entry) || is_hwpoison_entry(entry);
 }

 struct page_vma_mapped_walk;

@@ -561,35 +591,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
 }
 #endif	/* CONFIG_ARCH_ENABLE_THP_MIGRATION */

-#ifdef CONFIG_MEMORY_FAILURE
-
-/*
- * Support for hardware poisoned pages
- */
-static inline swp_entry_t make_hwpoison_entry(struct page *page)
-{
-	BUG_ON(!PageLocked(page));
-	return swp_entry(SWP_HWPOISON, page_to_pfn(page));
-}
-
-static inline int is_hwpoison_entry(swp_entry_t entry)
-{
-	return swp_type(entry) == SWP_HWPOISON;
-}
-
-#else
-
-static inline swp_entry_t make_hwpoison_entry(struct page *page)
-{
-	return swp_entry(0, 0);
-}
-
-static inline int is_hwpoison_entry(swp_entry_t swp)
-{
-	return 0;
-}
-#endif
-
 static inline int non_swap_entry(swp_entry_t entry)
 {
 	return swp_type(entry) >= MAX_SWAPFILES;
kernel/fork.c

@@ -714,6 +714,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		} else if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
 		vm_flags_clear(tmp, VM_LOCKED_MASK);
+		/*
+		 * Copy/update hugetlb private vma information.
+		 */
+		if (is_vm_hugetlb_page(tmp))
+			hugetlb_dup_vma_private(tmp);
+
+		/*
+		 * Link the vma into the MT. After using __mt_dup(), memory
+		 * allocation is not necessary here, so it cannot fail.
+		 */
+		vma_iter_bulk_store(&vmi, tmp);
+
+		mm->map_count++;
+
+		if (tmp->vm_ops && tmp->vm_ops->open)
+			tmp->vm_ops->open(tmp);
+
 		file = tmp->vm_file;
 		if (file) {
 			struct address_space *mapping = file->f_mapping;

@@ -730,25 +747,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 			i_mmap_unlock_write(mapping);
 		}

-		/*
-		 * Copy/update hugetlb private vma information.
-		 */
-		if (is_vm_hugetlb_page(tmp))
-			hugetlb_dup_vma_private(tmp);
-
-		/*
-		 * Link the vma into the MT. After using __mt_dup(), memory
-		 * allocation is not necessary here, so it cannot fail.
-		 */
-		vma_iter_bulk_store(&vmi, tmp);
-
-		mm->map_count++;
 		if (!(tmp->vm_flags & VM_WIPEONFORK))
 			retval = copy_page_range(tmp, mpnt);

-		if (tmp->vm_ops && tmp->vm_ops->open)
-			tmp->vm_ops->open(tmp);
-
 		if (retval) {
 			mpnt = vma_next(&vmi);
 			goto loop_out;
mm/gup.c | 54

@@ -1206,6 +1206,22 @@ static long __get_user_pages(struct mm_struct *mm,

 		/* first iteration or cross vma bound */
 		if (!vma || start >= vma->vm_end) {
+			/*
+			 * MADV_POPULATE_(READ|WRITE) wants to handle VMA
+			 * lookups+error reporting differently.
+			 */
+			if (gup_flags & FOLL_MADV_POPULATE) {
+				vma = vma_lookup(mm, start);
+				if (!vma) {
+					ret = -ENOMEM;
+					goto out;
+				}
+				if (check_vma_flags(vma, gup_flags)) {
+					ret = -EINVAL;
+					goto out;
+				}
+				goto retry;
+			}
 			vma = gup_vma_lookup(mm, start);
 			if (!vma && in_gate_area(mm, start)) {
 				ret = get_gate_page(mm, start & PAGE_MASK,

@@ -1685,35 +1701,35 @@ long populate_vma_page_range(struct vm_area_struct *vma,
 }

 /*
- * faultin_vma_page_range() - populate (prefault) page tables inside the
- *			      given VMA range readable/writable
+ * faultin_page_range() - populate (prefault) page tables inside the
+ *			  given range readable/writable
  *
  * This takes care of mlocking the pages, too, if VM_LOCKED is set.
  *
- * @vma: target vma
+ * @mm: the mm to populate page tables in
  * @start: start address
  * @end: end address
  * @write: whether to prefault readable or writable
  * @locked: whether the mmap_lock is still held
  *
- * Returns either number of processed pages in the vma, or a negative error
- * code on error (see __get_user_pages()).
+ * Returns either number of processed pages in the MM, or a negative error
+ * code on error (see __get_user_pages()). Note that this function reports
+ * errors related to VMAs, such as incompatible mappings, as expected by
+ * MADV_POPULATE_(READ|WRITE).
  *
- * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
- * covered by the VMA. If it's released, *@locked will be set to 0.
+ * The range must be page-aligned.
+ *
+ * mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
  */
-long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end, bool write, int *locked)
+long faultin_page_range(struct mm_struct *mm, unsigned long start,
+			unsigned long end, bool write, int *locked)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	unsigned long nr_pages = (end - start) / PAGE_SIZE;
 	int gup_flags;
 	long ret;

 	VM_BUG_ON(!PAGE_ALIGNED(start));
 	VM_BUG_ON(!PAGE_ALIGNED(end));
-	VM_BUG_ON_VMA(start < vma->vm_start, vma);
-	VM_BUG_ON_VMA(end > vma->vm_end, vma);
 	mmap_assert_locked(mm);

 	/*

@@ -1725,19 +1741,13 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
 	 * a poisoned page.
 	 * !FOLL_FORCE: Require proper access permissions.
 	 */
-	gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE;
+	gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
+		    FOLL_MADV_POPULATE;
 	if (write)
 		gup_flags |= FOLL_WRITE;

-	/*
-	 * We want to report -EINVAL instead of -EFAULT for any permission
-	 * problems or incompatible mappings.
-	 */
-	if (check_vma_flags(vma, gup_flags))
-		return -EINVAL;
-
-	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
-			       NULL, locked);
+	ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
+				      gup_flags);
 	lru_add_drain();
 	return ret;
 }
mm/huge_memory.c

@@ -2259,9 +2259,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
 			goto unlock_ptls;
 		}

-		folio_move_anon_rmap(src_folio, dst_vma);
-		WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
-
 		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
 		/* Folio got pinned from under us. Put it back and fail the move. */
 		if (folio_maybe_dma_pinned(src_folio)) {

@@ -2270,6 +2267,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
 			goto unlock_ptls;
 		}

+		folio_move_anon_rmap(src_folio, dst_vma);
+		WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+
 		_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
 		/* Follow mremap() behavior and treat the entry dirty after the move */
 		_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
mm/hugetlb.c | 10

@@ -7044,9 +7044,13 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 			if (!pte_same(pte, newpte))
 				set_huge_pte_at(mm, address, ptep, newpte, psize);
 		} else if (unlikely(is_pte_marker(pte))) {
-			/* No other markers apply for now. */
-			WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
-			if (uffd_wp_resolve)
+			/*
+			 * Do nothing on a poison marker; page is
+			 * corrupted, permissons do not apply. Here
+			 * pte_marker_uffd_wp()==true implies !poison
+			 * because they're mutual exclusive.
+			 */
+			if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)
 				/* Safe to modify directly (non-present->none). */
 				huge_pte_clear(mm, address, ptep, psize);
 		} else if (!huge_pte_none(pte)) {
mm/internal.h

@@ -686,9 +686,8 @@ struct anon_vma *folio_anon_vma(struct folio *folio);
 void unmap_mapping_folio(struct folio *folio);
 extern long populate_vma_page_range(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end, int *locked);
-extern long faultin_vma_page_range(struct vm_area_struct *vma,
-				   unsigned long start, unsigned long end,
-				   bool write, int *locked);
+extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
+			       unsigned long end, bool write, int *locked);
 extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
 			    unsigned long bytes);

@@ -1127,10 +1126,13 @@ enum {
 	FOLL_FAST_ONLY = 1 << 20,
 	/* allow unlocking the mmap lock */
 	FOLL_UNLOCKABLE = 1 << 21,
+	/* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
+	FOLL_MADV_POPULATE = 1 << 22,
 };

 #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
-			    FOLL_FAST_ONLY | FOLL_UNLOCKABLE)
+			    FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
+			    FOLL_MADV_POPULATE)

 /*
  * Indicates for which pages that are write-protected in the page table,
mm/madvise.c | 17

@@ -908,27 +908,14 @@ static long madvise_populate(struct vm_area_struct *vma,
 {
 	const bool write = behavior == MADV_POPULATE_WRITE;
 	struct mm_struct *mm = vma->vm_mm;
-	unsigned long tmp_end;
 	int locked = 1;
 	long pages;

 	*prev = vma;

 	while (start < end) {
-		/*
-		 * We might have temporarily dropped the lock. For example,
-		 * our VMA might have been split.
-		 */
-		if (!vma || start >= vma->vm_end) {
-			vma = vma_lookup(mm, start);
-			if (!vma)
-				return -ENOMEM;
-		}
-
-		tmp_end = min_t(unsigned long, end, vma->vm_end);
 		/* Populate (prefault) page tables readable/writable. */
-		pages = faultin_vma_page_range(vma, start, tmp_end, write,
-					       &locked);
+		pages = faultin_page_range(mm, start, end, write, &locked);
 		if (!locked) {
 			mmap_read_lock(mm);
 			locked = 1;

@@ -949,7 +936,7 @@ static long madvise_populate(struct vm_area_struct *vma,
 			pr_warn_once("%s: unhandled return value: %ld\n",
 				     __func__, pages);
 			fallthrough;
-		case -ENOMEM:
+		case -ENOMEM: /* No VMA or out of memory. */
 			return -ENOMEM;
 		}
 	}
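For context on what the gup and madvise hunks above change from userspace's point of view: MADV_POPULATE_(READ|WRITE) prefaults a range of an existing mapping, and the rework makes the kernel handle VM_FAULT_RETRY (the fault handler dropping mmap_lock) and concurrent VMA changes correctly along that path. Below is a hedged userspace sketch, not part of the patch; the MADV_POPULATE_* fallback values mirror include/uapi/asm-generic/mman-common.h (kernel 5.14+), and everything else is illustrative.

/*
 * Illustrative userspace sketch: prefault a private anonymous mapping with
 * MADV_POPULATE_READ, the operation whose kernel path is reworked above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#ifndef MADV_POPULATE_READ
#define MADV_POPULATE_READ  22	/* from the kernel UAPI headers */
#endif
#ifndef MADV_POPULATE_WRITE
#define MADV_POPULATE_WRITE 23
#endif

int main(void)
{
	size_t len = 64UL << 20;	/* 64 MiB */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Populate page tables up front; the kernel may drop and retake
	 * mmap_lock internally (VM_FAULT_RETRY), which is exactly the case
	 * the madvise_populate()/faultin_page_range() changes handle.
	 */
	if (madvise(p, len, MADV_POPULATE_READ)) {
		perror("madvise(MADV_POPULATE_READ)");
		return 1;
	}

	munmap(p, len);
	return 0;
}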
mm/memory-failure.c

@@ -154,11 +154,23 @@ static int __page_handle_poison(struct page *page)
 {
 	int ret;

-	zone_pcp_disable(page_zone(page));
+	/*
+	 * zone_pcp_disable() can't be used here. It will
+	 * hold pcp_batch_high_lock and dissolve_free_huge_page() might hold
+	 * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
+	 * optimization is enabled. This will break current lock dependency
+	 * chain and leads to deadlock.
+	 * Disabling pcp before dissolving the page was a deterministic
+	 * approach because we made sure that those pages cannot end up in any
+	 * PCP list. Draining PCP lists expels those pages to the buddy system,
+	 * but nothing guarantees that those pages do not get back to a PCP
+	 * queue if we need to refill those.
+	 */
 	ret = dissolve_free_huge_page(page);
-	if (!ret)
+	if (!ret) {
+		drain_all_pages(page_zone(page));
 		ret = take_page_off_buddy(page);
-	zone_pcp_enable(page_zone(page));
+	}

 	return ret;
 }
mm/page_owner.c | 190

@@ -118,7 +118,6 @@ static __init void init_page_owner(void)
 	register_dummy_stack();
 	register_failure_stack();
 	register_early_stack();
-	static_branch_enable(&page_owner_inited);
 	init_early_allocated_pages();
 	/* Initialize dummy and failure stacks and link them to stack_list */
 	dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle);

@@ -129,6 +128,7 @@ static __init void init_page_owner(void)
 	refcount_set(&failure_stack.stack_record->count, 1);
 	dummy_stack.next = &failure_stack;
 	stack_list = &dummy_stack;
+	static_branch_enable(&page_owner_inited);
 }

 struct page_ext_operations page_owner_ops = {

@@ -196,7 +196,8 @@ static void add_stack_record_to_list(struct stack_record *stack_record,
 	spin_unlock_irqrestore(&stack_list_lock, flags);
 }

-static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
+static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
+				   int nr_base_pages)
 {
 	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);

@@ -217,20 +218,74 @@ static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
 		/* Add the new stack_record to our list */
 		add_stack_record_to_list(stack_record, gfp_mask);
 	}
-	refcount_inc(&stack_record->count);
+	refcount_add(nr_base_pages, &stack_record->count);
 }

-static void dec_stack_record_count(depot_stack_handle_t handle)
+static void dec_stack_record_count(depot_stack_handle_t handle,
+				   int nr_base_pages)
 {
 	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);

-	if (stack_record)
-		refcount_dec(&stack_record->count);
+	if (!stack_record)
+		return;
+
+	if (refcount_sub_and_test(nr_base_pages, &stack_record->count))
+		pr_warn("%s: refcount went to 0 for %u handle\n", __func__,
+			handle);
+}
+
+static inline void __update_page_owner_handle(struct page_ext *page_ext,
+					      depot_stack_handle_t handle,
+					      unsigned short order,
+					      gfp_t gfp_mask,
+					      short last_migrate_reason, u64 ts_nsec,
+					      pid_t pid, pid_t tgid, char *comm)
+{
+	int i;
+	struct page_owner *page_owner;
+
+	for (i = 0; i < (1 << order); i++) {
+		page_owner = get_page_owner(page_ext);
+		page_owner->handle = handle;
+		page_owner->order = order;
+		page_owner->gfp_mask = gfp_mask;
+		page_owner->last_migrate_reason = last_migrate_reason;
+		page_owner->pid = pid;
+		page_owner->tgid = tgid;
+		page_owner->ts_nsec = ts_nsec;
+		strscpy(page_owner->comm, comm,
+			sizeof(page_owner->comm));
+		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+		__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+		page_ext = page_ext_next(page_ext);
+	}
+}
+
+static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
+						   depot_stack_handle_t handle,
+						   unsigned short order,
+						   pid_t pid, pid_t tgid,
+						   u64 free_ts_nsec)
+{
+	int i;
+	struct page_owner *page_owner;
+
+	for (i = 0; i < (1 << order); i++) {
+		page_owner = get_page_owner(page_ext);
+		/* Only __reset_page_owner() wants to clear the bit */
+		if (handle) {
+			__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+			page_owner->free_handle = handle;
+		}
+		page_owner->free_ts_nsec = free_ts_nsec;
+		page_owner->free_pid = current->pid;
+		page_owner->free_tgid = current->tgid;
+		page_ext = page_ext_next(page_ext);
+	}
 }

 void __reset_page_owner(struct page *page, unsigned short order)
 {
 	int i;
 	struct page_ext *page_ext;
 	depot_stack_handle_t handle;
 	depot_stack_handle_t alloc_handle;

@@ -245,16 +300,10 @@ void __reset_page_owner(struct page *page, unsigned short order)
 	alloc_handle = page_owner->handle;

 	handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
-	for (i = 0; i < (1 << order); i++) {
-		__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
-		page_owner->free_handle = handle;
-		page_owner->free_ts_nsec = free_ts_nsec;
-		page_owner->free_pid = current->pid;
-		page_owner->free_tgid = current->tgid;
-		page_ext = page_ext_next(page_ext);
-		page_owner = get_page_owner(page_ext);
-	}
+	__update_page_owner_free_handle(page_ext, handle, order, current->pid,
+					current->tgid, free_ts_nsec);
 	page_ext_put(page_ext);

 	if (alloc_handle != early_handle)
 		/*
 		 * early_handle is being set as a handle for all those

@@ -263,39 +312,14 @@ void __reset_page_owner(struct page *page, unsigned short order)
 		 * the machinery is not ready yet, we cannot decrement
 		 * their refcount either.
 		 */
-		dec_stack_record_count(alloc_handle);
-}
-
-static inline void __set_page_owner_handle(struct page_ext *page_ext,
-					   depot_stack_handle_t handle,
-					   unsigned short order, gfp_t gfp_mask)
-{
-	struct page_owner *page_owner;
-	int i;
-	u64 ts_nsec = local_clock();
-
-	for (i = 0; i < (1 << order); i++) {
-		page_owner = get_page_owner(page_ext);
-		page_owner->handle = handle;
-		page_owner->order = order;
-		page_owner->gfp_mask = gfp_mask;
-		page_owner->last_migrate_reason = -1;
-		page_owner->pid = current->pid;
-		page_owner->tgid = current->tgid;
-		page_owner->ts_nsec = ts_nsec;
-		strscpy(page_owner->comm, current->comm,
-			sizeof(page_owner->comm));
-		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
-		__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
-
-		page_ext = page_ext_next(page_ext);
-	}
-}
+		dec_stack_record_count(alloc_handle, 1 << order);
+}

 noinline void __set_page_owner(struct page *page, unsigned short order,
 			       gfp_t gfp_mask)
 {
 	struct page_ext *page_ext;
+	u64 ts_nsec = local_clock();
 	depot_stack_handle_t handle;

 	handle = save_stack(gfp_mask);

@@ -303,9 +327,11 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
 	page_ext = page_ext_get(page);
 	if (unlikely(!page_ext))
 		return;
-	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
+	__update_page_owner_handle(page_ext, handle, order, gfp_mask, -1,
+				   current->pid, current->tgid, ts_nsec,
+				   current->comm);
 	page_ext_put(page_ext);
-	inc_stack_record_count(handle, gfp_mask);
+	inc_stack_record_count(handle, gfp_mask, 1 << order);
 }

 void __set_page_owner_migrate_reason(struct page *page, int reason)

@@ -340,9 +366,12 @@ void __split_page_owner(struct page *page, int old_order, int new_order)

 void __folio_copy_owner(struct folio *newfolio, struct folio *old)
 {
+	int i;
 	struct page_ext *old_ext;
 	struct page_ext *new_ext;
-	struct page_owner *old_page_owner, *new_page_owner;
+	struct page_owner *old_page_owner;
+	struct page_owner *new_page_owner;
+	depot_stack_handle_t migrate_handle;

 	old_ext = page_ext_get(&old->page);
 	if (unlikely(!old_ext))

@@ -356,30 +385,32 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)

 	old_page_owner = get_page_owner(old_ext);
 	new_page_owner = get_page_owner(new_ext);
-	new_page_owner->order = old_page_owner->order;
-	new_page_owner->gfp_mask = old_page_owner->gfp_mask;
-	new_page_owner->last_migrate_reason =
-		old_page_owner->last_migrate_reason;
-	new_page_owner->handle = old_page_owner->handle;
-	new_page_owner->pid = old_page_owner->pid;
-	new_page_owner->tgid = old_page_owner->tgid;
-	new_page_owner->free_pid = old_page_owner->free_pid;
-	new_page_owner->free_tgid = old_page_owner->free_tgid;
-	new_page_owner->ts_nsec = old_page_owner->ts_nsec;
-	new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
-	strcpy(new_page_owner->comm, old_page_owner->comm);
-
+	migrate_handle = new_page_owner->handle;
+	__update_page_owner_handle(new_ext, old_page_owner->handle,
+				   old_page_owner->order, old_page_owner->gfp_mask,
+				   old_page_owner->last_migrate_reason,
+				   old_page_owner->ts_nsec, old_page_owner->pid,
+				   old_page_owner->tgid, old_page_owner->comm);
 	/*
-	 * We don't clear the bit on the old folio as it's going to be freed
-	 * after migration. Until then, the info can be useful in case of
-	 * a bug, and the overall stats will be off a bit only temporarily.
-	 * Also, migrate_misplaced_transhuge_page() can still fail the
-	 * migration and then we want the old folio to retain the info. But
-	 * in that case we also don't need to explicitly clear the info from
-	 * the new page, which will be freed.
+	 * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
+	 * will be freed after migration. Keep them until then as they may be
+	 * useful.
 	 */
-	__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
-	__set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
+	__update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
+					old_page_owner->free_pid,
+					old_page_owner->free_tgid,
+					old_page_owner->free_ts_nsec);
+	/*
+	 * We linked the original stack to the new folio, we need to do the same
+	 * for the new one and the old folio otherwise there will be an imbalance
+	 * when subtracting those pages from the stack.
+	 */
+	for (i = 0; i < (1 << new_page_owner->order); i++) {
+		old_page_owner->handle = migrate_handle;
+		old_ext = page_ext_next(old_ext);
+		old_page_owner = get_page_owner(old_ext);
+	}

 	page_ext_put(new_ext);
 	page_ext_put(old_ext);
 }

@@ -787,8 +818,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 				goto ext_put_continue;

 			/* Found early allocated page */
-			__set_page_owner_handle(page_ext, early_handle,
-						0, 0);
+			__update_page_owner_handle(page_ext, early_handle, 0, 0,
+						   -1, local_clock(), current->pid,
+						   current->tgid, current->comm);
 			count++;
ext_put_continue:
 			page_ext_put(page_ext);

@@ -840,13 +872,11 @@ static void *stack_start(struct seq_file *m, loff_t *ppos)
 		 * value of stack_list.
 		 */
 		stack = smp_load_acquire(&stack_list);
+		m->private = stack;
 	} else {
 		stack = m->private;
-		stack = stack->next;
 	}

-	m->private = stack;
-
 	return stack;
 }

@@ -861,11 +891,11 @@ static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
 	return stack;
 }

-static unsigned long page_owner_stack_threshold;
+static unsigned long page_owner_pages_threshold;

 static int stack_print(struct seq_file *m, void *v)
 {
-	int i, stack_count;
+	int i, nr_base_pages;
 	struct stack *stack = v;
 	unsigned long *entries;
 	unsigned long nr_entries;

@@ -876,14 +906,14 @@ static int stack_print(struct seq_file *m, void *v)

 	nr_entries = stack_record->size;
 	entries = stack_record->entries;
-	stack_count = refcount_read(&stack_record->count) - 1;
+	nr_base_pages = refcount_read(&stack_record->count) - 1;

-	if (stack_count < 1 || stack_count < page_owner_stack_threshold)
+	if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)
 		return 0;

 	for (i = 0; i < nr_entries; i++)
 		seq_printf(m, " %pS\n", (void *)entries[i]);
-	seq_printf(m, "stack_count: %d\n\n", stack_count);
+	seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);

 	return 0;
 }

@@ -913,13 +943,13 @@ static const struct file_operations page_owner_stack_operations = {

 static int page_owner_threshold_get(void *data, u64 *val)
 {
-	*val = READ_ONCE(page_owner_stack_threshold);
+	*val = READ_ONCE(page_owner_pages_threshold);
 	return 0;
 }

 static int page_owner_threshold_set(void *data, u64 val)
 {
-	WRITE_ONCE(page_owner_stack_threshold, val);
+	WRITE_ONCE(page_owner_pages_threshold, val);
 	return 0;
 }
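The page_owner fixes above switch the per-stack refcount from counting allocation events to counting outstanding base pages: __set_page_owner() adds 1 << order and __reset_page_owner() subtracts the same amount, so tail pages and high-order allocations no longer unbalance the counter. A minimal userspace model of that bookkeeping follows; it uses plain C11 atomics rather than the kernel's refcount_t, and all names in it are illustrative rather than kernel code.

/*
 * Userspace model (not kernel code) of the base-page accounting adopted
 * above: each stack's counter tracks outstanding base pages, so an
 * order-N allocation adds and later removes exactly 1 << N.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

struct stack_model {
	atomic_long nr_base_pages;	/* analogous to stack_record->count - 1 */
};

static void model_alloc(struct stack_model *s, unsigned int order)
{
	atomic_fetch_add(&s->nr_base_pages, 1L << order);
}

static void model_free(struct stack_model *s, unsigned int order)
{
	atomic_fetch_sub(&s->nr_base_pages, 1L << order);
}

int main(void)
{
	struct stack_model s = { .nr_base_pages = 0 };

	model_alloc(&s, 9);	/* one order-9 allocation: 512 base pages */
	model_alloc(&s, 0);	/* one single page */
	model_free(&s, 0);
	model_free(&s, 9);

	/* Alloc/free stay balanced regardless of allocation order. */
	assert(atomic_load(&s.nr_base_pages) == 0);
	printf("outstanding base pages: %ld\n",
	       atomic_load(&s.nr_base_pages));
	return 0;
}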
mm/shmem.c

@@ -748,12 +748,6 @@ static long shmem_unused_huge_count(struct super_block *sb,

 #define shmem_huge SHMEM_HUGE_DENY

-bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
-		   struct mm_struct *mm, unsigned long vm_flags)
-{
-	return false;
-}
-
 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 		struct shrink_control *sc, unsigned long nr_to_split)
 {