From def13795928b82ecca4ae3ea16f375114ff88685 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 Nov 2024 15:44:21 -0800 Subject: [PATCH 01/24] fs/proc/vmcore.c: fix warning when CONFIG_MMU=n >> fs/proc/vmcore.c:424:19: warning: 'mmap_vmcore_fault' defined but not used [-Wunused-function] 424 | static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202411140156.2o0nS4fl-lkp@intel.com/ Cc: Qi Xi Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 56 ++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index b4521b096058..3d8a82cee63e 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -414,6 +414,34 @@ static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter) return __read_vmcore(iter, &iocb->ki_pos); } +/** + * vmcore_alloc_buf - allocate buffer in vmalloc memory + * @size: size of buffer + * + * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap + * the buffer to user-space by means of remap_vmalloc_range(). + * + * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is + * disabled and there's no need to allow users to mmap the buffer. + */ +static inline char *vmcore_alloc_buf(size_t size) +{ +#ifdef CONFIG_MMU + return vmalloc_user(size); +#else + return vzalloc(size); +#endif +} + +/* + * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is + * essential for mmap_vmcore() in order to map physically + * non-contiguous objects (ELF header, ELF note segment and memory + * regions in the 1st kernel pointed to by PT_LOAD entries) into + * virtually contiguous user-space in ELF layout. + */ +#ifdef CONFIG_MMU + /* * The vmcore fault handler uses the page cache and fills data using the * standard __read_vmcore() function. @@ -457,34 +485,6 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) #endif } -/** - * vmcore_alloc_buf - allocate buffer in vmalloc memory - * @size: size of buffer - * - * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap - * the buffer to user-space by means of remap_vmalloc_range(). - * - * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is - * disabled and there's no need to allow users to mmap the buffer. - */ -static inline char *vmcore_alloc_buf(size_t size) -{ -#ifdef CONFIG_MMU - return vmalloc_user(size); -#else - return vzalloc(size); -#endif -} - -/* - * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is - * essential for mmap_vmcore() in order to map physically - * non-contiguous objects (ELF header, ELF note segment and memory - * regions in the 1st kernel pointed to by PT_LOAD entries) into - * virtually contiguous user-space in ELF layout. - */ -#ifdef CONFIG_MMU - static const struct vm_operations_struct vmcore_mmap_ops = { .fault = mmap_vmcore_fault, }; From a1268be280d8e484ab3606d7476edd0f14bb9961 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 20 Nov 2024 19:49:33 -0800 Subject: [PATCH 02/24] mm/gup: handle NULL pages in unpin_user_pages() The recent addition of "pofs" (pages or folios) handling to gup has a flaw: it assumes that unpin_user_pages() handles NULL pages in the pages** array. That's not the case, as I discovered when I ran on a new configuration on my test machine. Fix this by skipping NULL pages in unpin_user_pages(), just like unpin_folios() already does. 
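As a rough sketch of the resulting pattern (illustrative only, not part of the patch; the helper name is made up), every walker over the pages[] array now has to tolerate NULL holes, which the migration path can leave behind:

        static void walk_pinned_pages(struct page **pages, unsigned long npages)
        {
                unsigned long i;

                for (i = 0; i < npages; i++) {
                        if (!pages[i])  /* hole: entry already handled elsewhere */
                                continue;
                        /* ... operate on pages[i] ... */
                }
        }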
Details: when booting on x86 with "numa=fake=2 movablecore=4G" on Linux 6.12, and running this: tools/testing/selftests/mm/gup_longterm ...I get the following crash: BUG: kernel NULL pointer dereference, address: 0000000000000008 RIP: 0010:sanity_check_pinned_pages+0x3a/0x2d0 ... Call Trace: ? __die_body+0x66/0xb0 ? page_fault_oops+0x30c/0x3b0 ? do_user_addr_fault+0x6c3/0x720 ? irqentry_enter+0x34/0x60 ? exc_page_fault+0x68/0x100 ? asm_exc_page_fault+0x22/0x30 ? sanity_check_pinned_pages+0x3a/0x2d0 unpin_user_pages+0x24/0xe0 check_and_migrate_movable_pages_or_folios+0x455/0x4b0 __gup_longterm_locked+0x3bf/0x820 ? mmap_read_lock_killable+0x12/0x50 ? __pfx_mmap_read_lock_killable+0x10/0x10 pin_user_pages+0x66/0xa0 gup_test_ioctl+0x358/0xb20 __se_sys_ioctl+0x6b/0xc0 do_syscall_64+0x7b/0x150 entry_SYSCALL_64_after_hwframe+0x76/0x7e Link: https://lkml.kernel.org/r/20241121034933.77502-1-jhubbard@nvidia.com Fixes: 94efde1d1539 ("mm/gup: avoid an unnecessary allocation call for FOLL_LONGTERM cases") Signed-off-by: John Hubbard Acked-by: David Hildenbrand Cc: Oscar Salvador Cc: Vivek Kasireddy Cc: Dave Airlie Cc: Gerd Hoffmann Cc: Matthew Wilcox Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Peter Xu Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Dongwon Kim Cc: Hugh Dickins Cc: Junxiao Chang Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 746070a1d8bf..3b75e631f369 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -52,7 +52,12 @@ static inline void sanity_check_pinned_pages(struct page **pages, */ for (; npages; npages--, pages++) { struct page *page = *pages; - struct folio *folio = page_folio(page); + struct folio *folio; + + if (!page) + continue; + + folio = page_folio(page); if (is_zero_page(page) || !folio_test_anon(folio)) @@ -409,6 +414,10 @@ void unpin_user_pages(struct page **pages, unsigned long npages) sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { + if (!pages[i]) { + nr = 1; + continue; + } folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } From 091c1dd2d4df6edd1beebe0e5863d4034ade9572 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Nov 2024 21:11:51 +0100 Subject: [PATCH 03/24] mm/mempolicy: fix migrate_to_node() assuming there is at least one VMA in a MM We currently assume that there is at least one VMA in a MM, which isn't true. So we might end up having find_vma() return NULL, to then de-reference NULL. So properly handle find_vma() returning NULL. This fixes the report: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 UID: 0 PID: 6021 Comm: syz-executor284 Not tainted 6.12.0-rc7-syzkaller-00187-gf868cd251776 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/30/2024 RIP: 0010:migrate_to_node mm/mempolicy.c:1090 [inline] RIP: 0010:do_migrate_pages+0x403/0x6f0 mm/mempolicy.c:1194 Code: ... 
RSP: 0018:ffffc9000375fd08 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffc9000375fd78 RCX: 0000000000000000 RDX: ffff88807e171300 RSI: dffffc0000000000 RDI: ffff88803390c044 RBP: ffff88807e171428 R08: 0000000000000014 R09: fffffbfff2039ef1 R10: ffffffff901cf78f R11: 0000000000000000 R12: 0000000000000003 R13: ffffc9000375fe90 R14: ffffc9000375fe98 R15: ffffc9000375fdf8 FS: 00005555919e1380(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005555919e1ca8 CR3: 000000007f12a000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: kernel_migrate_pages+0x5b2/0x750 mm/mempolicy.c:1709 __do_sys_migrate_pages mm/mempolicy.c:1727 [inline] __se_sys_migrate_pages mm/mempolicy.c:1723 [inline] __x64_sys_migrate_pages+0x96/0x100 mm/mempolicy.c:1723 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f [akpm@linux-foundation.org: add unlikely()] Link: https://lkml.kernel.org/r/20241120201151.9518-1-david@redhat.com Fixes: 39743889aaf7 ("[PATCH] Swap Migration V5: sys_migrate_pages interface") Signed-off-by: David Hildenbrand Reported-by: syzbot+3511625422f7aa637f0d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/673d2696.050a0220.3c9d61.012f.GAE@google.com/T/ Reviewed-by: Liam R. Howlett Reviewed-by: Christoph Lameter Cc: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bb37cd1a51d8..04f35659717a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1080,6 +1080,10 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, mmap_read_lock(mm); vma = find_vma(mm, 0); + if (unlikely(!vma)) { + mmap_read_unlock(mm); + return 0; + } /* * This does not migrate the range, but isolates all pages that From e30a0361b8515d424c73c67de1a43e45a13b8ba2 Mon Sep 17 00:00:00 2001 From: Jared Kangas Date: Tue, 19 Nov 2024 13:02:34 -0800 Subject: [PATCH 04/24] kasan: make report_lock a raw spinlock If PREEMPT_RT is enabled, report_lock is a sleeping spinlock and must not be locked when IRQs are disabled. However, KASAN reports may be triggered in such contexts. For example: char *s = kzalloc(1, GFP_KERNEL); kfree(s); local_irq_disable(); char c = *s; /* KASAN report here leads to spin_lock() */ local_irq_enable(); Make report_lock a raw spinlock to prevent rescheduling when PREEMPT_RT is enabled. Link: https://lkml.kernel.org/r/20241119210234.1602529-1-jkangas@redhat.com Fixes: 342a93247e08 ("locking/spinlock: Provide RT variant header: <linux/spinlock_rt.h>") Signed-off-by: Jared Kangas Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- mm/kasan/report.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 50fb19ad4388..3fe77a360f1c 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -201,7 +201,7 @@ static inline void fail_non_kasan_kunit_test(void) { } #endif /* CONFIG_KUNIT */ -static DEFINE_SPINLOCK(report_lock); +static DEFINE_RAW_SPINLOCK(report_lock); static void start_report(unsigned long *flags, bool sync) { @@ -212,7 +212,7 @@ static void start_report(unsigned long *flags, bool sync) lockdep_off(); /* Make sure we don't end up in loop.
*/ report_suppress_start(); - spin_lock_irqsave(&report_lock, *flags); + raw_spin_lock_irqsave(&report_lock, *flags); pr_err("==================================================================\n"); } @@ -222,7 +222,7 @@ static void end_report(unsigned long *flags, const void *addr, bool is_write) trace_error_report_end(ERROR_DETECTOR_KASAN, (unsigned long)addr); pr_err("==================================================================\n"); - spin_unlock_irqrestore(&report_lock, *flags); + raw_spin_unlock_irqrestore(&report_lock, *flags); if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) check_panic_on_warn("KASAN"); switch (kasan_arg_fault) { From 985ebec4ab0a28bb5910c3b1481a40fbf7f9e61d Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 20 Nov 2024 02:23:37 +0900 Subject: [PATCH 05/24] nilfs2: fix potential out-of-bounds memory access in nilfs_find_entry() Syzbot reported that when searching for records in a directory where the inode's i_size is corrupted and has a large value, memory access outside the folio/page range may occur, or a use-after-free bug may be detected if KASAN is enabled. This is because nilfs_last_byte(), which is called by nilfs_find_entry() and others to calculate the number of valid bytes of directory data in a page from i_size and the page index, loses the upper 32 bits of the 64-bit size information due to an inappropriate type of local variable to which the i_size value is assigned. This produced a large byte offset, through underflow in the end-address calculation in the caller, nilfs_find_entry(), resulting in memory access beyond the folio/page size. Fix this issue by changing the type of the local variable causing the bit loss from "unsigned int" to "u64". The return value of nilfs_last_byte() is also of type "unsigned int", but it is truncated so as not to exceed PAGE_SIZE and no bit loss occurs, so no change is required. Link: https://lkml.kernel.org/r/20241119172403.9292-1-konishi.ryusuke@gmail.com Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+96d5d14c47d97015c624@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=96d5d14c47d97015c624 Tested-by: syzbot+96d5d14c47d97015c624@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 14e8d82f8629..0a3aea6c416b 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -70,7 +70,7 @@ static inline unsigned int nilfs_chunk_size(struct inode *inode) */ static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr) { - unsigned int last_byte = inode->i_size; + u64 last_byte = inode->i_size; last_byte -= page_nr << PAGE_SHIFT; if (last_byte > PAGE_SIZE) From 965b5dd1894f4525f38c1b5f99b0106a07dbb5db Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 23 Nov 2024 22:28:34 +0900 Subject: [PATCH 06/24] ocfs2: free inode when ocfs2_get_init_inode() fails syzbot is reporting busy inodes after unmount because commit 9c89fe0af826 ("ocfs2: Handle error from dquot_initialize()") forgot to call iput() when new_inode() succeeded and dquot_initialize() failed.
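The rule being restored, shown as a minimal sketch (schematic, not the verbatim ocfs2 code): once new_inode() has returned an inode, every failure exit must drop the reference with iput(), otherwise the inode stays busy past unmount:

        inode = new_inode(dir->i_sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        status = dquot_initialize(inode);
        if (status) {
                iput(inode);    /* the previously missing release */
                return ERR_PTR(status);
        }
        return inode;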
Link: https://lkml.kernel.org/r/e68c0224-b7c6-4784-b4fa-a9fc8c675525@I-love.SAKURA.ne.jp Fixes: 9c89fe0af826 ("ocfs2: Handle error from dquot_initialize()") Signed-off-by: Tetsuo Handa Reported-by: syzbot+0af00f6a2cba2058b5db@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=0af00f6a2cba2058b5db Tested-by: syzbot+0af00f6a2cba2058b5db@syzkaller.appspotmail.com Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 59c92353151a..5550f8afa438 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -200,8 +200,10 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) mode = mode_strip_sgid(&nop_mnt_idmap, dir, mode); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); status = dquot_initialize(inode); - if (status) + if (status) { + iput(inode); return ERR_PTR(status); + } return inode; } From 4ae132c693896b0713db572676c90ffd855a4246 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 27 Nov 2024 16:14:22 +0000 Subject: [PATCH 07/24] selftest: hugetlb_dio: fix test naming The string logged when a test passes or fails is used by the selftest framework to identify which test is being reported. The hugetlb_dio test not only uses the same strings for every test that is run, but it also uses different strings for test passes and failures, which means that test automation is unable to follow what the test is doing at all. Pull the existing duplicated logging of the number of free huge pages before and after the test out of the conditional, and replace the result logging with a single ksft_test_result() call that incorporates the parameters passed into the test into the output. Link: https://lkml.kernel.org/r/20241127-kselftest-mm-hugetlb-dio-names-v1-1-22aab01bf550@kernel.org Fixes: fae1980347bf ("selftests: hugetlb_dio: fixup check for initial conditions to skip in the start") Signed-off-by: Mark Brown Reviewed-by: Muhammad Usama Anjum Cc: Donet Tom Cc: Ritesh Harjani (IBM) Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb_dio.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c index 432d5af15e66..db63abe5ee5e 100644 --- a/tools/testing/selftests/mm/hugetlb_dio.c +++ b/tools/testing/selftests/mm/hugetlb_dio.c @@ -76,19 +76,15 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) /* Get the free huge pages after unmap*/ free_hpage_a = get_free_hugepages(); + ksft_print_msg("No. Free pages before allocation : %d\n", free_hpage_b); + ksft_print_msg("No. Free pages after munmap : %d\n", free_hpage_a); + /* * If the no. of free hugepages before allocation and after unmap does * not match - that means there could still be a page which is pinned. */ - if (free_hpage_a != free_hpage_b) { - ksft_print_msg("No. Free pages before allocation : %d\n", free_hpage_b); - ksft_print_msg("No. Free pages after munmap : %d\n", free_hpage_a); - ksft_test_result_fail(": Huge pages not freed!\n"); - } else { - ksft_print_msg("No. Free pages before allocation : %d\n", free_hpage_b); - ksft_print_msg("No.
Free pages after munmap : %d\n", free_hpage_a); - ksft_test_result_pass(": Huge pages freed successfully !\n"); - } + ksft_test_result(free_hpage_a == free_hpage_b, + "free huge pages from %u-%u\n", start_off, end_off); } int main(void) From a220d6b95b1ae12c7626283d7609f0a1438e6437 Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Wed, 27 Nov 2024 12:08:53 +0000 Subject: [PATCH 08/24] selftests/damon: add _damon_sysfs.py to TEST_FILES When running selftests I encountered the following error message with some damon tests: # Traceback (most recent call last): # File "[...]/damon/./damos_quota.py", line 7, in # import _damon_sysfs # ModuleNotFoundError: No module named '_damon_sysfs' Fix this by adding the _damon_sysfs.py file to TEST_FILES so that it will be available when running the respective damon selftests. Link: https://lkml.kernel.org/r/20241127-picks-visitor-7416685b-mheyne@amazon.de Fixes: 306abb63a8ca ("selftests/damon: implement a python module for test-purpose DAMON sysfs controls") Signed-off-by: Maximilian Heyne Reviewed-by: SeongJae Park Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 5b2a6a5dd1af..812f656260fb 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -6,7 +6,7 @@ TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race TEST_GEN_FILES += debugfs_target_ids_pid_leak TEST_GEN_FILES += access_memory access_memory_even -TEST_FILES = _chk_dependency.sh _debugfs_common.sh +TEST_FILES = _chk_dependency.sh _debugfs_common.sh _damon_sysfs.py # functionality tests TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh From a1268be280d8e484ab3606d7476edd0f14bb9961 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 26 Nov 2024 15:52:08 +0100 Subject: [PATCH 09/24] Revert "readahead: properly shorten readahead when falling back to do_page_cache_ra()" This reverts commit 7c877586da3178974a8a94577b6045a48377ff25. Anders and Philippe have reported that recent kernels occasionally hang in the readahead code when used with NFS. The problem has been bisected to 7c877586da3 ("readahead: properly shorten readahead when falling back to do_page_cache_ra()"). The cause of the problem is that ra->size can be shrunk by a read_pages() call, and we subsequently end up calling do_page_cache_ra() with a negative (read: huge positive) number of pages. Let's revert 7c877586da3 for now until we find a proper way for the logic in read_pages() and page_cache_ra_order() to coexist. This can lead to reduced readahead throughput due to readahead window confusion, but that's better than outright hangs.
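The "negative (read: huge positive)" arithmetic is ordinary unsigned wraparound; a standalone sketch with made-up numbers (not the kernel code) of what the now-reverted fallback could compute:

        unsigned long ra_size = 4;              /* after read_pages() shrank the window */
        unsigned long index = 16, start = 8;    /* readahead already advanced 8 pages */
        unsigned long nr = ra_size - (index - start);
        /* 4 - 8 wraps around: nr == ULONG_MAX - 3, a gigantic page count */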
Link: https://lkml.kernel.org/r/20241126145208.985-1-jack@suse.cz Fixes: 7c877586da31 ("readahead: properly shorten readahead when falling back to do_page_cache_ra()") Reported-by: Anders Blomdell Reported-by: Philippe Troin Signed-off-by: Jan Kara Tested-by: Philippe Troin Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- mm/readahead.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 8f1cf599b572..ea650b8b02fb 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -458,8 +458,7 @@ void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) { struct address_space *mapping = ractl->mapping; - pgoff_t start = readahead_index(ractl); - pgoff_t index = start; + pgoff_t index = readahead_index(ractl); unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; @@ -522,7 +521,7 @@ void page_cache_ra_order(struct readahead_control *ractl, if (!err) return; fallback: - do_page_cache_ra(ractl, ra->size - (index - start), ra->async_size); + do_page_cache_ra(ractl, ra->size, ra->async_size); } static unsigned long ractl_max_pages(struct readahead_control *ractl, From d699440f58ce9bd71103cc7b692e3ab76a20bfcd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 25 Nov 2024 16:52:06 -0800 Subject: [PATCH 10/24] mm: fix vrealloc()'s KASAN poisoning logic When vrealloc() reuses already allocated vmap_area, we need to re-annotate poisoned and unpoisoned portions of underlying memory according to the new size. This results in a KASAN splat recorded at [1]. A KASAN mis-reporting issue where there is none. Note, hard-coding KASAN_VMALLOC_PROT_NORMAL might not be exactly correct, but KASAN flag logic is pretty involved and spread out throughout __vmalloc_node_range_noprof(), so I'm using the bare minimum flag here and leaving the rest to mm people to refactor this logic and reuse it here. Link: https://lkml.kernel.org/r/20241126005206.3457974-1-andrii@kernel.org Link: https://lore.kernel.org/bpf/67450f9b.050a0220.21d33d.0004.GAE@google.com/ [1] Fixes: 3ddc2fefe6f3 ("mm: vmalloc: implement vrealloc()") Signed-off-by: Andrii Nakryiko Cc: Alexei Starovoitov Cc: Christoph Hellwig Cc: Michal Hocko Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7ed39d104201..f009b21705c1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4093,7 +4093,8 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) /* Zero out spare memory. */ if (want_init_on_alloc(flags)) memset((void *)p + size, 0, old_size - size); - + kasan_poison_vmalloc(p + size, old_size - size); + kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL); return (void *)p; } From 4de22b2a6a7477d84d9a01eb6b62a9117309d722 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 20:17:18 +0000 Subject: [PATCH 11/24] mm: open-code PageTail in folio_flags() and const_folio_flags() It is unsafe to call PageTail() in dump_page() as page_is_fake_head() will almost certainly return true when called on a head page that is copied to the stack. That will cause the VM_BUG_ON_PGFLAGS() in const_folio_flags() to trigger when it shouldn't. 
Fortunately, we don't need to call PageTail() here; it's fine to have a pointer to a virtual alias of the page's flag word rather than the real page's flag word. Link: https://lkml.kernel.org/r/20241125201721.2963278-1-willy@infradead.org Fixes: fae7d834c43c ("mm: add __dump_folio()") Signed-off-by: Matthew Wilcox (Oracle) Cc: Kees Cook Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2220bfec278e..cf46ac720802 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -306,7 +306,7 @@ static const unsigned long *const_folio_flags(const struct folio *folio, { const struct page *page = &folio->page; - VM_BUG_ON_PGFLAGS(PageTail(page), page); + VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); return &page[n].flags; } @@ -315,7 +315,7 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; - VM_BUG_ON_PGFLAGS(PageTail(page), page); + VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); return &page[n].flags; } From 6a7de1bf218d75f27f68d6a3f5ae1eb7332b941e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 20:17:19 +0000 Subject: [PATCH 12/24] mm: open-code page_folio() in dump_page() page_folio() calls page_fixed_fake_head() which will misidentify this page as being a fake head and load off the end of 'precise'. We may have a pointer to a fake head, but that's OK because it contains the right information for dump_page(). gcc-15 is smart enough to catch this with -Warray-bounds: In function 'page_fixed_fake_head', inlined from '_compound_head' at ../include/linux/page-flags.h:251:24, inlined from '__dump_page' at ../mm/debug.c:123:11: ../include/asm-generic/rwonce.h:44:26: warning: array subscript 9 is outside +array bounds of 'struct page[1]' [-Warray-bounds=] Link: https://lkml.kernel.org/r/20241125201721.2963278-2-willy@infradead.org Fixes: fae7d834c43c ("mm: add __dump_folio()") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Kees Cook Cc: Signed-off-by: Andrew Morton --- mm/debug.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/debug.c b/mm/debug.c index aa57d3ffd4ed..95b6ab809c0e 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -124,19 +124,22 @@ static void __dump_page(const struct page *page) { struct folio *foliop, folio; struct page precise; + unsigned long head; unsigned long pfn = page_to_pfn(page); unsigned long idx, nr_pages = 1; int loops = 5; again: memcpy(&precise, page, sizeof(*page)); - foliop = page_folio(&precise); - if (foliop == (struct folio *)&precise) { + head = precise.compound_head; + if ((head & 1) == 0) { + foliop = (struct folio *)&precise; idx = 0; if (!folio_test_large(foliop)) goto dump; foliop = (struct folio *)page; } else { + foliop = (struct folio *)(head - 1); idx = folio_page_idx(foliop, page); } From 031e04bdc834cda3b054ef6b698503b2b97e8186 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 22 Nov 2024 16:39:47 +0100 Subject: [PATCH 13/24] stackdepot: fix stack_depot_save_flags() in NMI context Per documentation, stack_depot_save_flags() was meant to be usable from NMI context if STACK_DEPOT_FLAG_CAN_ALLOC is unset. However, it still would try to take the pool_lock in an attempt to save a stack trace in the current pool (if space is available). 
This could result in deadlock if an NMI is handled while pool_lock is already held. To avoid deadlock, only try to take the lock in NMI context and give up if unsuccessful. The documentation is fixed to clearly convey this. Link: https://lkml.kernel.org/r/Z0CcyfbPqmxJ9uJH@elver.google.com Link: https://lkml.kernel.org/r/20241122154051.3914732-1-elver@google.com Fixes: 4434a56ec209 ("stackdepot: make fast paths lock-less again") Signed-off-by: Marco Elver Reported-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 6 +++--- lib/stackdepot.c | 10 +++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index e9ec32fb97d4..2cc21ffcdaf9 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -147,7 +147,7 @@ static inline int stack_depot_early_init(void) { return 0; } * If the provided stack trace comes from the interrupt context, only the part * up to the interrupt entry is saved. * - * Context: Any context, but setting STACK_DEPOT_FLAG_CAN_ALLOC is required if + * Context: Any context, but unsetting STACK_DEPOT_FLAG_CAN_ALLOC is required if * alloc_pages() cannot be used from the current context. Currently * this is the case for contexts where neither %GFP_ATOMIC nor * %GFP_NOWAIT can be used (NMI, raw_spin_lock). @@ -156,7 +156,7 @@ static inline int stack_depot_early_init(void) { return 0; } */ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, unsigned int nr_entries, - gfp_t gfp_flags, + gfp_t alloc_flags, depot_flags_t depot_flags); /** @@ -175,7 +175,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, * Return: Handle of the stack trace stored in depot, 0 on failure */ depot_stack_handle_t stack_depot_save(unsigned long *entries, - unsigned int nr_entries, gfp_t gfp_flags); + unsigned int nr_entries, gfp_t alloc_flags); /** * __stack_depot_get_stack_record - Get a pointer to a stack_record struct diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 5ed34cc963fc..245d5b416699 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -630,7 +630,15 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, prealloc = page_address(page); } - raw_spin_lock_irqsave(&pool_lock, flags); + if (in_nmi()) { + /* We can never allocate in NMI context. */ + WARN_ON_ONCE(can_alloc); + /* Best effort; bail if we fail to take the lock. */ + if (!raw_spin_trylock_irqsave(&pool_lock, flags)) + goto exit; + } else { + raw_spin_lock_irqsave(&pool_lock, flags); + } printk_deferred_enter(); /* Try to find again, to avoid concurrently inserting duplicates. */ From 914eec5e980171bc128e7e24f7a22aa1d803570e Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Tue, 19 Nov 2024 09:45:00 -0800 Subject: [PATCH 14/24] ocfs2: update seq_file index in ocfs2_dlm_seq_next The following INFO-level message was seen: seq_file: buggy .next function ocfs2_dlm_seq_next [ocfs2] did not update position index Fix: update *pos (and thus m->index) to make seq_read_iter() happy, even though the index itself means nothing to ocfs2_dlm_seq_next.
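For context, seq_read_iter() cross-checks m->index against the file position, so every seq_file .next handler must advance *pos even if it iterates by other means; schematically (hypothetical foo_* names):

        static void *foo_seq_next(struct seq_file *m, void *v, loff_t *pos)
        {
                (*pos)++;       /* keep m->index in sync for seq_read_iter() */
                return foo_next_record(v);      /* the real iteration state */
        }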
Link: https://lkml.kernel.org/r/20241119174500.9198-1-wen.gang.wang@oracle.com Signed-off-by: Wengang Wang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/dlmglue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 60df52e4c1f8..764ecbd5ad41 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3110,6 +3110,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) struct ocfs2_lock_res *iter = v; struct ocfs2_lock_res *dummy = &priv->p_iter_res; + (*pos)++; spin_lock(&ocfs2_dlm_tracking_lock); iter = ocfs2_dlm_next_res(iter, priv); list_del_init(&dummy->l_debug_list); From 51f43d5d82ed2ba3f9a3f9a2390c52f28e42af32 Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> Date: Fri, 29 Nov 2024 10:52:13 +0800 Subject: [PATCH 15/24] mm/codetag: swap tags when migrate pages The current solution for adjusting codetag references during page migration is done in 3 steps: 1. sets the codetag reference of the old page as empty (not pointing to any codetag); 2. subtracts counters of the new page to compensate for its own allocation; 3. sets codetag reference of the new page to point to the codetag of the old page. This does not work if CONFIG_MEM_ALLOC_PROFILING_DEBUG=n because set_codetag_empty() becomes a NOOP. Instead, let's simply swap codetag references so that the new page is referencing the old codetag and the old page is referencing the new codetag. This way accounting stays valid and the logic makes more sense. Link: https://lkml.kernel.org/r/20241129025213.34836-1-00107082@163.com Fixes: e0a955bf7f61 ("mm/codetag: add pgalloc_tag_copy()") Signed-off-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20241124074318.399027-1-00107082@163.com/ Acked-by: Suren Baghdasaryan Suggested-by: Suren Baghdasaryan Acked-by: Yu Zhao Cc: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 4 ++-- lib/alloc_tag.c | 36 ++++++++++++++++++++++-------------- mm/migrate.c | 2 +- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 0e43ab653ab6..3469c4b20105 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -231,7 +231,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) } void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); -void pgalloc_tag_copy(struct folio *new, struct folio *old); +void pgalloc_tag_swap(struct folio *new, struct folio *old); void __init alloc_tag_sec_init(void); @@ -245,7 +245,7 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} -static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) {} +static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 2414a7ee7ec7..35f7560a309a 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -189,26 +189,34 @@ void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) } } -void pgalloc_tag_copy(struct folio *new, struct folio *old) +void pgalloc_tag_swap(struct folio *new, struct folio *old) { - union
pgtag_ref_handle handle; - union codetag_ref ref; - struct alloc_tag *tag; + union pgtag_ref_handle handle_old, handle_new; + union codetag_ref ref_old, ref_new; + struct alloc_tag *tag_old, *tag_new; - tag = pgalloc_tag_get(&old->page); - if (!tag) + tag_old = pgalloc_tag_get(&old->page); + if (!tag_old) + return; + tag_new = pgalloc_tag_get(&new->page); + if (!tag_new) return; - if (!get_page_tag_ref(&new->page, &ref, &handle)) + if (!get_page_tag_ref(&old->page, &ref_old, &handle_old)) return; + if (!get_page_tag_ref(&new->page, &ref_new, &handle_new)) { + put_page_tag_ref(handle_old); + return; + } - /* Clear the old ref to the original allocation tag. */ - clear_page_tag_ref(&old->page); - /* Decrement the counters of the tag on get_new_folio. */ - alloc_tag_sub(&ref, folio_size(new)); - __alloc_tag_ref_set(&ref, tag); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); + /* swap tags */ + __alloc_tag_ref_set(&ref_old, tag_new); + update_page_tag_ref(handle_old, &ref_old); + __alloc_tag_ref_set(&ref_new, tag_old); + update_page_tag_ref(handle_new, &ref_new); + + put_page_tag_ref(handle_old); + put_page_tag_ref(handle_new); } static void shutdown_mem_profiling(bool remove_file) diff --git a/mm/migrate.c b/mm/migrate.c index 2ce6b4b814df..cc68583c86f9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -745,7 +745,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_set_readahead(newfolio); folio_copy_owner(newfolio, folio); - pgalloc_tag_copy(newfolio, folio); + pgalloc_tag_swap(newfolio, folio); mem_cgroup_migrate(folio, newfolio); } From 89dd878282881306c38f7e354e7614fca98cb9a6 Mon Sep 17 00:00:00 2001 From: John Sperbeck Date: Thu, 28 Nov 2024 12:39:59 -0800 Subject: [PATCH 16/24] mm: memcg: declare do_memsw_account inline In commit 66d60c428b23 ("mm: memcg: move legacy memcg event code into memcontrol-v1.c"), the static do_memsw_account() function was moved from a .c file to a .h file. Unfortunately, the traditional inline keyword wasn't added. If a file (e.g., a unit test) includes the .h file, but doesn't refer to do_memsw_account(), it will get a warning like: mm/memcontrol-v1.h:41:13: warning: unused function 'do_memsw_account' [-Wunused-function] 41 | static bool do_memsw_account(void) | ^~~~~~~~~~~~~~~~ Link: https://lkml.kernel.org/r/20241128203959.726527-1-jsperbeck@google.com Fixes: 66d60c428b23 ("mm: memcg: move legacy memcg event code into memcontrol-v1.c") Signed-off-by: John Sperbeck Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton --- mm/memcontrol-v1.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index 0e3b82951d91..144d71b65907 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -38,7 +38,7 @@ void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n); iter = mem_cgroup_iter(NULL, iter, NULL)) /* Whether legacy memory+swap accounting is active */ -static bool do_memsw_account(void) +static inline bool do_memsw_account(void) { return !cgroup_subsys_on_dfl(memory_cgrp_subsys); } From 249608ee47132cab3b1adacd9e463548f57bd316 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 18 Nov 2024 13:46:48 -0800 Subject: [PATCH 17/24] mm: respect mmap hint address when aligning for THP Commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") updated __get_unmapped_area() to align the start address for the VMA to a PMD boundary if CONFIG_TRANSPARENT_HUGEPAGE=y. 
It does this by effectively looking up a region of size request_size + PMD_SIZE and aligning the start up to a PMD boundary. Commit 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit") opted out of this for 32-bit due to regressions in mmap base randomization. Commit d4148aeab412 ("mm, mmap: limit THP alignment of anonymous mappings to PMD-aligned sizes") restricted this to mmap sizes that are multiples of PMD_SIZE, due to reported regressions in some performance benchmarks -- which seemed mostly due to the reduced spatial locality of related mappings caused by the forced PMD alignment. Another unintended side effect has emerged: when a user specifies an mmap hint address, the THP alignment logic modifies the behavior, potentially ignoring the hint even if a sufficiently large gap exists at the requested hint location. Example Scenario: Consider the following simplified virtual address (VA) space: ... 0x200000-0x400000 --- VMA A 0x400000-0x600000 --- Hole 0x600000-0x800000 --- VMA B ... A call to mmap() with hint=0x400000 and len=0x200000 behaves differently: - Before THP alignment: The requested region (size 0x200000) fits into the gap at 0x400000, so the hint is respected. - After alignment: The logic searches for a region of size 0x400000 (len + PMD_SIZE) starting at 0x400000. This search fails due to the mapping at 0x600000 (VMA B), and the hint is ignored, falling back to arch_get_unmapped_area[_topdown]() (a userspace sketch of this scenario follows the message body). In general, the hint is effectively ignored if there is any existing mapping in the following range: [mmap_hint + mmap_size, mmap_hint + mmap_size + PMD_SIZE) This changes the semantics of the mmap hint from "Respect the hint if a sufficiently large gap exists at the requested location" to "Respect the hint only if an additional PMD-sized gap exists beyond the requested size". This has performance implications for allocators that allocate their heap using mmap but try to keep it "as contiguous as possible" by using the end of the existing heap as the address hint. With the new behavior it is more likely to get a much less contiguous heap, adding extra fragmentation and performance overhead. To restore the expected behavior, don't use thp_get_unmapped_area_vmflags() when the user provides a hint address for anonymous mappings. Note: as Yang Shi pointed out, the issue still remains for filesystems that use thp_get_unmapped_area() for their get_unmapped_area() op. It is unclear which workloads would regress if we ignored THP alignment when a hint address is provided for such file-backed mappings -- so that fix will be handled separately.
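A userspace sketch of the example scenario (illustrative only; the addresses assume the simplified VA layout above):

        #include <sys/mman.h>

        void *hint = (void *)0x400000;  /* the 2 MiB hole */
        void *p = mmap(hint, 0x200000, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        /* before this patch: p != hint, because the THP path probed for
         * len + PMD_SIZE and collided with VMA B at 0x600000;
         * with this patch: the hint is tried as-is, so p == hint */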
Link: https://lkml.kernel.org/r/20241118214650.3667577-1-kaleshsingh@google.com Fixes: efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") Signed-off-by: Kalesh Singh Reviewed-by: Rik van Riel Reviewed-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Vlastimil Babka Cc: Yang Shi Cc: Rik van Riel Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Minchan Kim Cc: Hans Boehm Cc: Lokesh Gidra Cc: Signed-off-by: Andrew Morton --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mmap.c b/mm/mmap.c index 386429f7db5a..d32b7e701058 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -889,6 +889,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (get_area) { addr = get_area(file, addr, len, pgoff, flags); } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) + && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, From cbb70e45348769172cb24cca2d2b6437f4c9240b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 18 Nov 2024 17:54:14 +0000 Subject: [PATCH 18/24] mm: correct typo in MMAP_STATE() macro We mistakenly refer to len rather than len_ here. The only existing caller passes len to the len_ parameter so this has no impact on the code, but it is obviously incorrect to do this, so fix it. Link: https://lkml.kernel.org/r/20241118175414.390827-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Wei Yang Cc: Jann Horn Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vma.c b/mm/vma.c index 8a454a7bbc80..8e31b7e25aeb 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -35,7 +35,7 @@ struct mmap_state { .mm = mm_, \ .vmi = vmi_, \ .addr = addr_, \ - .end = (addr_) + len, \ + .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ .flags = flags_, \ From d89c8ec0546184267cb211b579514ebaf8916100 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Nov 2024 18:24:06 -0800 Subject: [PATCH 19/24] scatterlist: fix incorrect func name in kernel-doc Fix a kernel-doc warning by making the kernel-doc function description match the function name: include/linux/scatterlist.h:323: warning: expecting prototype for sg_unmark_bus_address(). 
Prototype was for sg_dma_unmark_bus_address() instead Link: https://lkml.kernel.org/r/20241130022406.537973-1-rdunlap@infradead.org Fixes: 42399301203e ("lib/scatterlist: add flag for indicating P2PDMA segments in an SGL") Signed-off-by: Randy Dunlap Cc: Logan Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index c5e2239b550e..d836e7440ee8 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -313,7 +313,7 @@ static inline void sg_dma_mark_bus_address(struct scatterlist *sg) } /** - * sg_unmark_bus_address - Unmark the scatterlist entry as a bus address + * sg_dma_unmark_bus_address - Unmark the scatterlist entry as a bus address * @sg: SG entry * * Description: From 3203b3ab0fcf22132caadd72caebfad47bf0dd2b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 29 Nov 2024 13:53:03 +0100 Subject: [PATCH 20/24] mm/filemap: don't call folio_test_locked() without a reference in next_uptodate_folio() The folio can get freed + buddy-merged + reallocated in the meantime, resulting in us calling folio_test_locked() possibly on a tail page. This makes const_folio_flags VM_BUG_ON_PGFLAGS() when stumbling over the tail page. Could this result in other issues? Doesn't look like it. False positives and false negatives don't really matter, because this folio would get skipped either way when detecting that they have been reallocated in the meantime. Fix it by performing the folio_test_locked() checked after grabbing a reference. If this ever becomes a real problem, we could add a special helper that racily checks if the bit is set even on tail pages ... but let's hope that's not required so we can just handle it cleaner: work on the folio after we hold a reference. Do we really need the folio_test_locked() check if we are going to trylock briefly after? Well, we can at least avoid a xas_reload(). It's a bit unclear which exact change introduced that issue. Likely, ever since we made PG_locked obey to the PF_NO_TAIL policy it could have been triggered in some way. Link: https://lkml.kernel.org/r/20241129125303.4033164-1-david@redhat.com Fixes: 48c935ad88f5 ("page-flags: define PG_locked behavior on compound pages") Signed-off-by: David Hildenbrand Reported-by: syzbot+9f9a7f73fb079b2387a6@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/674184c9.050a0220.1cc393.0001.GAE@google.com/ Acked-by: Kirill A. Shutemov Cc: "Matthew Wilcox (Oracle)" Cc: Hillf Danton Signed-off-by: Andrew Morton --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 7c76a123ba18..f61cf51c2238 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3501,10 +3501,10 @@ static struct folio *next_uptodate_folio(struct xa_state *xas, continue; if (xa_is_value(folio)) continue; - if (folio_test_locked(folio)) - continue; if (!folio_try_get(folio)) continue; + if (folio_test_locked(folio)) + goto skip; /* Has the page moved or been split? */ if (unlikely(folio != xas_reload(xas))) goto skip; From 5c3793604f91123bf49bc792ce697a0bef4c173c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 17 Nov 2024 03:38:13 -0800 Subject: [PATCH 21/24] lib: stackinit: hide never-taken branch from compiler The never-taken branch leads to an invalid bounds condition, which is by design. To avoid the unwanted warning from the compiler, hide the variable from the optimizer. 
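For reference, "hiding" a variable means laundering it through an empty asm statement so the optimizer loses track of its value; the kernel's macro in include/linux/compiler.h is essentially of this shape (quoted from memory, so treat it as a sketch):

        #define OPTIMIZER_HIDE_VAR(var) \
                __asm__ ("" : "=r" (var) : "0" (var))

With ptr hidden this way, the compiler can no longer prove that ptr + 1 indexes past a zero-sized array, and the spurious warning below disappears.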
../lib/stackinit_kunit.c: In function 'do_nothing_u16_zero': ../lib/stackinit_kunit.c:51:49: error: array subscript 1 is outside array bounds of 'u16[0]' {aka 'short unsigned int[]'} [-Werror=array-bounds=] 51 | #define DO_NOTHING_RETURN_SCALAR(ptr) *(ptr) | ^~~~~~ ../lib/stackinit_kunit.c:219:24: note: in expansion of macro 'DO_NOTHING_RETURN_SCALAR' 219 | return DO_NOTHING_RETURN_ ## which(ptr + 1); \ | ^~~~~~~~~~~~~~~~~~ Link: https://lkml.kernel.org/r/20241117113813.work.735-kees@kernel.org Signed-off-by: Kees Cook Cc: Signed-off-by: Andrew Morton --- lib/stackinit_kunit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/stackinit_kunit.c b/lib/stackinit_kunit.c index c14c6f8e6308..c40818ec9c18 100644 --- a/lib/stackinit_kunit.c +++ b/lib/stackinit_kunit.c @@ -212,6 +212,7 @@ static noinline void test_ ## name (struct kunit *test) \ static noinline DO_NOTHING_TYPE_ ## which(var_type) \ do_nothing_ ## name(var_type *ptr) \ { \ + OPTIMIZER_HIDE_VAR(ptr); \ /* Will always be true, but compiler doesn't know. */ \ if ((unsigned long)ptr > 0x2) \ return DO_NOTHING_RETURN_ ## which(ptr); \ From 6535b8669c1a74078098517174e53fc907ce9d56 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 15 Nov 2024 10:20:23 -0800 Subject: [PATCH 22/24] mm/damon: fix order of arguments in damos_before_apply tracepoint Since the order of the scheme_idx and target_idx arguments in TP_ARGS is reversed, they are stored in the trace record in reverse. Link: https://lkml.kernel.org/r/20241115182023.43118-1-sj@kernel.org Link: https://patch.msgid.link/20241112154828.40307-1-akinobu.mita@gmail.com Fixes: c603c630b509 ("mm/damon/core: add a tracepoint for damos apply target regions") Signed-off-by: Akinobu Mita Signed-off-by: SeongJae Park Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 23200aabccac..da4bd9fd1162 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -15,7 +15,7 @@ TRACE_EVENT_CONDITION(damos_before_apply, unsigned int target_idx, struct damon_region *r, unsigned int nr_regions, bool do_trace), - TP_ARGS(context_idx, target_idx, scheme_idx, r, nr_regions, do_trace), + TP_ARGS(context_idx, scheme_idx, target_idx, r, nr_regions, do_trace), TP_CONDITION(do_trace), From 5f1b64e9a9b7ee9cfd32c6b2fab796e29bfed075 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Wed, 13 Nov 2024 18:21:46 +0800 Subject: [PATCH 23/24] sched/numa: fix memory leak due to the overwritten vma->numab_state [Problem Description] When running the hackbench program of LTP, the following memory leak is reported by kmemleak. # /opt/ltp/testcases/bin/hackbench 20 thread 1000 Running with 20*40 (== 800) tasks. # dmesg | grep kmemleak ... kmemleak: 480 new suspected memory leaks (see /sys/kernel/debug/kmemleak) kmemleak: 665 new suspected memory leaks (see /sys/kernel/debug/kmemleak) # cat /sys/kernel/debug/kmemleak unreferenced object 0xffff888cd8ca2c40 (size 64): comm "hackbench", pid 17142, jiffies 4299780315 hex dump (first 32 bytes): ac 74 49 00 01 00 00 00 4c 84 49 00 01 00 00 00 .tI.....L.I..... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc bff18fd4): [] __kmalloc_cache_noprof+0x2f9/0x3f0 [] task_numa_work+0x725/0xa00 [] task_work_run+0x58/0x90 [] syscall_exit_to_user_mode+0x1c8/0x1e0 [] do_syscall_64+0x85/0x150 [] entry_SYSCALL_64_after_hwframe+0x76/0x7e ... 
This issue can be consistently reproduced on three different servers: * a 448-core server * a 256-core server * a 192-core server [Root Cause] Since multiple threads are created by the hackbench program (run with the 'thread' command argument), a shared vma might be accessed by two or more cores simultaneously. When two or more cores observe that vma->numab_state is NULL at the same time, vma->numab_state will be overwritten. Although the current code ensures that only one thread scans the VMAs in a single 'numa_scan_period', another thread may still enter during the next 'numa_scan_period' before the numab_state allocation has completed [1]. Note that the command `/opt/ltp/testcases/bin/hackbench 50 process 1000` cannot reproduce the issue; this was verified with 200+ test runs. [Solution] Use the cmpxchg atomic operation to ensure that only one thread executes the vma->numab_state assignment. [1] https://lore.kernel.org/lkml/1794be3c-358c-4cdc-a43d-a1f841d91ef7@amd.com/ Link: https://lkml.kernel.org/r/20241113102146.2384-1-ahuang12@lenovo.com Fixes: ef6a22b70f6d ("sched/numa: apply the scan delay to every new vma") Signed-off-by: Adrian Huang Reported-by: Jiwei Sun Reviewed-by: Raghavendra K T Reviewed-by: Vlastimil Babka Cc: Ben Segall Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Cc: Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fbdca89c677f..a59ae2e23daf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3399,11 +3399,17 @@ static void task_numa_work(struct callback_head *work) /* Initialise new per-VMA NUMAB state. */ if (!vma->numab_state) { - vma->numab_state = kzalloc(sizeof(struct vma_numab_state), - GFP_KERNEL); - if (!vma->numab_state) + struct vma_numab_state *ptr; + + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + if (!ptr) continue; + if (cmpxchg(&vma->numab_state, NULL, ptr)) { + kfree(ptr); + continue; + } + vma->numab_state->start_scan_seq = mm->numa_scan_seq; vma->numab_state->next_scan = now + From f1ee5483e40881d8ad5a63aa148b753b5c6a839b Mon Sep 17 00:00:00 2001 From: Jakob Hauser Date: Fri, 29 Nov 2024 22:25:07 +0100 Subject: [PATCH 24/24] iio: magnetometer: yas530: use signed integer type for clamp limits In the function yas537_measure() there is a clamp_val() with limits of -BIT(13) and BIT(13) - 1. The input clamp value h[] is of type s32, while BIT() is of type unsigned long due to its definition in include/vdso/bits.h. The lower limit -BIT(13) is recognized as -8192 but expressed as an unsigned long, whose size differs between 32-bit and 64-bit architectures; converting it to type s32 may lead to undesired behavior. Additionally, in the calculation lines for h[0], h[1] and h[2], the unsigned long divisor BIT(13) causes an unsigned division, shifting the left-hand side of the equation back and forth, possibly ending up with large positive values instead of negative values on 32-bit architectures. To solve those two issues, declare a signed integer holding the value of BIT(13). There is another omission in the clamp line: clamp_val() returns a value, and here that value goes nowhere. Self-assign it to h[i] so the clamp actually takes effect.
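A standalone host-side demonstration of the arithmetic hazard (plain C, not driver code): mixing an s32 with the unsigned long BIT(13) converts the signed operand to unsigned, so the division can yield a huge positive value instead of a small negative one:

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                int32_t s = -100;
                unsigned long ubit13 = 1UL << 13;       /* like BIT(13) */
                int half_range = 1 << 13;               /* the signed replacement */

                printf("%ld\n", (long)(128 * s / ubit13));      /* huge positive value */
                printf("%d\n", 128 * s / half_range);           /* -1, as intended */
                return 0;
        }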
Finally, replace the clamp_val() macro with clamp(): now that the limits are plain signed integers rather than unsigned longs, clamp() is the appropriate choice. Link: https://lkml.kernel.org/r/11609b2243c295d65ab4d47e78c239d61ad6be75.1732914810.git.jahau@rocketmail.com Fixes: 65f79b501030 ("iio: magnetometer: yas530: Add YAS537 variant") Signed-off-by: Jakob Hauser Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202411230458.dhZwh3TT-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202411282222.oF0B4110-lkp@intel.com/ Reviewed-by: David Laight Acked-by: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Linus Walleij Signed-off-by: Andrew Morton --- drivers/iio/magnetometer/yamaha-yas530.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/iio/magnetometer/yamaha-yas530.c b/drivers/iio/magnetometer/yamaha-yas530.c index 65011a8598d3..c55a38650c0d 100644 --- a/drivers/iio/magnetometer/yamaha-yas530.c +++ b/drivers/iio/magnetometer/yamaha-yas530.c @@ -372,6 +372,7 @@ static int yas537_measure(struct yas5xx *yas5xx, u16 *t, u16 *x, u16 *y1, u16 *y u8 data[8]; u16 xy1y2[3]; s32 h[3], s[3]; + int half_range = BIT(13); int i, ret; mutex_lock(&yas5xx->lock); @@ -406,13 +407,13 @@ static int yas537_measure(struct yas5xx *yas5xx, u16 *t, u16 *x, u16 *y1, u16 *y /* The second version of YAS537 needs to include calibration coefficients */ if (yas5xx->version == YAS537_VERSION_1) { for (i = 0; i < 3; i++) - s[i] = xy1y2[i] - BIT(13); - h[0] = (c->k * (128 * s[0] + c->a2 * s[1] + c->a3 * s[2])) / BIT(13); - h[1] = (c->k * (c->a4 * s[0] + c->a5 * s[1] + c->a6 * s[2])) / BIT(13); - h[2] = (c->k * (c->a7 * s[0] + c->a8 * s[1] + c->a9 * s[2])) / BIT(13); + s[i] = xy1y2[i] - half_range; + h[0] = (c->k * (128 * s[0] + c->a2 * s[1] + c->a3 * s[2])) / half_range; + h[1] = (c->k * (c->a4 * s[0] + c->a5 * s[1] + c->a6 * s[2])) / half_range; + h[2] = (c->k * (c->a7 * s[0] + c->a8 * s[1] + c->a9 * s[2])) / half_range; for (i = 0; i < 3; i++) { - clamp_val(h[i], -BIT(13), BIT(13) - 1); - xy1y2[i] = h[i] + BIT(13); + h[i] = clamp(h[i], -half_range, half_range - 1); + xy1y2[i] = h[i] + half_range; } }