From aed9a1a4f7106ff99a882ad06318cebfa71016a2 Mon Sep 17 00:00:00 2001 From: Mohamed Ahmed Date: Thu, 9 May 2024 23:43:52 +0300 Subject: [PATCH 001/279] drm/nouveau: use tile_mode and pte_kind for VM_BIND bo allocations Allow PTE kind and tile mode on BO create with VM_BIND, and add a GETPARAM to indicate this change. This is needed to support modifiers in NVK and ensure correctness when dealing with the nouveau GL driver. The userspace modifiers implementation this is for can be found here: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24795 Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI") Signed-off-by: Mohamed Ahmed Reviewed-by: Faith Ekstrand Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240509204352.7597-1-mohamedahmedegypt2001@gmail.com --- drivers/gpu/drm/nouveau/nouveau_abi16.c | 3 ++ drivers/gpu/drm/nouveau/nouveau_bo.c | 44 +++++++++++-------------- include/uapi/drm/nouveau_drm.h | 7 ++++ 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c b/drivers/gpu/drm/nouveau/nouveau_abi16.c index 80f74ee0fc78..47e53e17b4e5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c @@ -272,6 +272,9 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS) getparam->value = (u64)ttm_resource_manager_usage(vram_mgr); break; } + case NOUVEAU_GETPARAM_HAS_VMA_TILEMODE: + getparam->value = 1; + break; default: NV_PRINTK(dbg, cli, "unknown parameter %lld\n", getparam->param); return -EINVAL; diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index db8cbf615112..186add400ea5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -241,28 +241,28 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size, int *align, u32 domain, } nvbo->contig = !(tile_flags & NOUVEAU_GEM_TILE_NONCONTIG); - if (!nouveau_cli_uvmm(cli) || internal) { - /* for BO noVM 
allocs, don't assign kinds */ - if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) { - nvbo->kind = (tile_flags & 0x0000ff00) >> 8; - if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) { - kfree(nvbo); - return ERR_PTR(-EINVAL); - } - nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind; - } else if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) { - nvbo->kind = (tile_flags & 0x00007f00) >> 8; - nvbo->comp = (tile_flags & 0x00030000) >> 16; - if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) { - kfree(nvbo); - return ERR_PTR(-EINVAL); - } - } else { - nvbo->zeta = (tile_flags & 0x00000007); + if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) { + nvbo->kind = (tile_flags & 0x0000ff00) >> 8; + if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) { + kfree(nvbo); + return ERR_PTR(-EINVAL); } - nvbo->mode = tile_mode; + nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind; + } else if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) { + nvbo->kind = (tile_flags & 0x00007f00) >> 8; + nvbo->comp = (tile_flags & 0x00030000) >> 16; + if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) { + kfree(nvbo); + return ERR_PTR(-EINVAL); + } + } else { + nvbo->zeta = (tile_flags & 0x00000007); + } + nvbo->mode = tile_mode; + + if (!nouveau_cli_uvmm(cli) || internal) { /* Determine the desirable target GPU page size for the buffer. */ for (i = 0; i < vmm->page_nr; i++) { /* Because we cannot currently allow VMM maps to fail @@ -304,12 +304,6 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size, int *align, u32 domain, } nvbo->page = vmm->page[pi].shift; } else { - /* reject other tile flags when in VM mode. */ - if (tile_mode) - return ERR_PTR(-EINVAL); - if (tile_flags & ~NOUVEAU_GEM_TILE_NONCONTIG) - return ERR_PTR(-EINVAL); - /* Determine the desirable target GPU page size for the buffer. 
*/ for (i = 0; i < vmm->page_nr; i++) { /* Because we cannot currently allow VMM maps to fail diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h index cd84227f1b42..5402f77ee859 100644 --- a/include/uapi/drm/nouveau_drm.h +++ b/include/uapi/drm/nouveau_drm.h @@ -68,6 +68,13 @@ extern "C" { */ #define NOUVEAU_GETPARAM_VRAM_USED 19 +/* + * NOUVEAU_GETPARAM_HAS_VMA_TILEMODE + * + * Query whether tile mode and PTE kind are accepted with VM allocs or not. + */ +#define NOUVEAU_GETPARAM_HAS_VMA_TILEMODE 20 + struct drm_nouveau_getparam { __u64 param; __u64 value; From 117bbc0e43adc6f76a3fc39a98f75a811a853459 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 29 Feb 2024 10:51:13 +0000 Subject: [PATCH 002/279] drm/buddy: stop using PAGE_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The drm_buddy minimum page-size requirements should be distinct from the CPU PAGE_SIZE. Only restriction is that the minimum page-size is at least 4K. 
Signed-off-by: Matthew Auld Cc: Arunpravin Paneer Selvam Cc: Christian König Cc: Arnd Bergmann Reviewed-by: Arunpravin Paneer Selvam Acked-by: Arnd Bergmann Link: https://patchwork.freedesktop.org/patch/msgid/20240229105112.250077-3-matthew.auld@intel.com Signed-off-by: Christian König --- drivers/gpu/drm/drm_buddy.c | 2 +- include/drm/drm_buddy.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 5ebdd6f8f36e..f999568d69c1 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -102,7 +102,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) if (size < chunk_size) return -EINVAL; - if (chunk_size < PAGE_SIZE) + if (chunk_size < SZ_4K) return -EINVAL; if (!is_power_of_2(chunk_size)) diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index a5b39fc01003..19ed661a32f3 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -53,8 +53,8 @@ struct drm_buddy_block { struct list_head tmp_link; }; -/* Order-zero must be at least PAGE_SIZE */ -#define DRM_BUDDY_MAX_ORDER (63 - PAGE_SHIFT) +/* Order-zero must be at least SZ_4K */ +#define DRM_BUDDY_MAX_ORDER (63 - 12) /* * Binary Buddy System. @@ -82,7 +82,7 @@ struct drm_buddy { unsigned int n_roots; unsigned int max_order; - /* Must be at least PAGE_SIZE */ + /* Must be at least SZ_4K */ u64 chunk_size; u64 size; u64 avail; From 520fb7f183e9b4d0ad7a2f084f3c4987845425e2 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 29 Feb 2024 10:51:14 +0000 Subject: [PATCH 003/279] drm/tests/buddy: stop using PAGE_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gives the wrong impression that min page-size has to be tied to the CPU PAGE_SIZE. 
Signed-off-by: Matthew Auld Cc: Arunpravin Paneer Selvam Cc: Christian König Cc: Arnd Bergmann Reviewed-by: Arunpravin Paneer Selvam Link: https://patchwork.freedesktop.org/patch/msgid/20240229105112.250077-4-matthew.auld@intel.com Signed-off-by: Christian König --- drivers/gpu/drm/tests/drm_buddy_test.c | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c index e48863a44556..f2397696d252 100644 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ b/drivers/gpu/drm/tests/drm_buddy_test.c @@ -329,8 +329,8 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) * Eventually we will have a fully 50% fragmented mm. */ - mm_size = PAGE_SIZE << max_order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, PAGE_SIZE), + mm_size = SZ_4K << max_order; + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); @@ -344,7 +344,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) } for (order = top; order--;) { - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), @@ -358,7 +358,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) } /* There should be one final page for this sub-allocation */ - size = get_size(0, PAGE_SIZE); + size = get_size(0, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM for hole\n"); @@ -368,7 +368,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) list_move_tail(&block->link, &holes); - size = get_size(top, PAGE_SIZE); + size = get_size(top, mm.chunk_size); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc 
unexpectedly succeeded at top-order %d/%d, it should be full!", @@ -379,7 +379,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) /* Nothing larger than blocks of chunk_size now available */ for (order = 1; order <= max_order; order++) { - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded at order %d, it should be full!", @@ -408,14 +408,14 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) * page left. */ - mm_size = PAGE_SIZE << max_order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, PAGE_SIZE), + mm_size = SZ_4K << max_order; + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); for (order = 0; order < max_order; order++) { - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", @@ -428,7 +428,7 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) } /* And now the last remaining block available */ - size = get_size(0, PAGE_SIZE); + size = get_size(0, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM on final alloc\n"); @@ -440,7 +440,7 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) /* Should be completely full! 
*/ for (order = max_order; order--;) { - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded, it should be full!"); @@ -456,7 +456,7 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) list_del(&block->link); drm_buddy_free_block(&mm, block); - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", @@ -471,7 +471,7 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) } /* To confirm, now the whole mm should be available */ - size = get_size(max_order, PAGE_SIZE); + size = get_size(max_order, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc (realloc) hit -ENOMEM with order=%d\n", @@ -502,15 +502,15 @@ static void drm_test_buddy_alloc_optimistic(struct kunit *test) * try to allocate them all. */ - mm_size = PAGE_SIZE * ((1 << (max_order + 1)) - 1); + mm_size = SZ_4K * ((1 << (max_order + 1)) - 1); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, PAGE_SIZE), + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); for (order = 0; order <= max_order; order++) { - size = get_size(order, PAGE_SIZE); + size = get_size(order, mm.chunk_size); KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", @@ -523,7 +523,7 @@ static void drm_test_buddy_alloc_optimistic(struct kunit *test) } /* Should be completely full! 
*/ - size = get_size(0, PAGE_SIZE); + size = get_size(0, mm.chunk_size); KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded, it should be full!"); @@ -540,7 +540,7 @@ static void drm_test_buddy_alloc_limit(struct kunit *test) LIST_HEAD(allocated); struct drm_buddy mm; - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, size, PAGE_SIZE)); + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, size, SZ_4K)); KUNIT_EXPECT_EQ_MSG(test, mm.max_order, DRM_BUDDY_MAX_ORDER, "mm.max_order(%d) != %d\n", mm.max_order, @@ -548,7 +548,7 @@ static void drm_test_buddy_alloc_limit(struct kunit *test) size = mm.chunk_size << mm.max_order; KUNIT_EXPECT_FALSE(test, drm_buddy_alloc_blocks(&mm, start, size, size, - PAGE_SIZE, &allocated, flags)); + mm.chunk_size, &allocated, flags)); block = list_first_entry_or_null(&allocated, struct drm_buddy_block, link); KUNIT_EXPECT_TRUE(test, block); @@ -558,10 +558,10 @@ static void drm_test_buddy_alloc_limit(struct kunit *test) drm_buddy_block_order(block), mm.max_order); KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_size(&mm, block), - BIT_ULL(mm.max_order) * PAGE_SIZE, + BIT_ULL(mm.max_order) * mm.chunk_size, "block size(%llu) != %llu\n", drm_buddy_block_size(&mm, block), - BIT_ULL(mm.max_order) * PAGE_SIZE); + BIT_ULL(mm.max_order) * mm.chunk_size); drm_buddy_free_list(&mm, &allocated); drm_buddy_fini(&mm); From dc21c6cc3d6986d938efbf95de62473982c98dec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 May 2024 13:23:39 +0000 Subject: [PATCH 004/279] netfilter: nfnetlink_queue: acquire rcu_read_lock() in instance_destroy_rcu() syzbot reported that nf_reinject() could be called without rcu_read_lock() : WARNING: suspicious RCU usage 6.9.0-rc7-syzkaller-02060-g5c1672705a1a #0 Not tainted net/netfilter/nfnetlink_queue.c:263 suspicious rcu_dereference_check() usage! 
other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by syz-executor.4/13427: #0: ffffffff8e334f60 (rcu_callback){....}-{0:0}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline] #0: ffffffff8e334f60 (rcu_callback){....}-{0:0}, at: rcu_do_batch kernel/rcu/tree.c:2190 [inline] #0: ffffffff8e334f60 (rcu_callback){....}-{0:0}, at: rcu_core+0xa86/0x1830 kernel/rcu/tree.c:2471 #1: ffff88801ca92958 (&inst->lock){+.-.}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline] #1: ffff88801ca92958 (&inst->lock){+.-.}-{2:2}, at: nfqnl_flush net/netfilter/nfnetlink_queue.c:405 [inline] #1: ffff88801ca92958 (&inst->lock){+.-.}-{2:2}, at: instance_destroy_rcu+0x30/0x220 net/netfilter/nfnetlink_queue.c:172 stack backtrace: CPU: 0 PID: 13427 Comm: syz-executor.4 Not tainted 6.9.0-rc7-syzkaller-02060-g5c1672705a1a #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 lockdep_rcu_suspicious+0x221/0x340 kernel/locking/lockdep.c:6712 nf_reinject net/netfilter/nfnetlink_queue.c:323 [inline] nfqnl_reinject+0x6ec/0x1120 net/netfilter/nfnetlink_queue.c:397 nfqnl_flush net/netfilter/nfnetlink_queue.c:410 [inline] instance_destroy_rcu+0x1ae/0x220 net/netfilter/nfnetlink_queue.c:172 rcu_do_batch kernel/rcu/tree.c:2196 [inline] rcu_core+0xafd/0x1830 kernel/rcu/tree.c:2471 handle_softirqs+0x2d6/0x990 kernel/softirq.c:554 __do_softirq kernel/softirq.c:588 [inline] invoke_softirq kernel/softirq.c:428 [inline] __irq_exit_rcu+0xf4/0x1c0 kernel/softirq.c:637 irq_exit_rcu+0x9/0x30 kernel/softirq.c:649 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1043 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1043 Fixes: 9872bec773c2 ("[NETFILTER]: nfnetlink: use RCU for queue instances hash") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 00f4bd21c59b..f1c31757e496 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -169,7 +169,9 @@ instance_destroy_rcu(struct rcu_head *head) struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, rcu); + rcu_read_lock(); nfqnl_flush(inst, NULL, 0); + rcu_read_unlock(); kfree(inst); module_put(THIS_MODULE); } From c1193d9bbbd379defe9be3c6de566de684de8a6f Mon Sep 17 00:00:00 2001 From: Alexander Maltsev Date: Wed, 17 Apr 2024 18:51:41 +0500 Subject: [PATCH 005/279] netfilter: ipset: Add list flush to cancel_gc Flushing list in cancel_gc drops references to other lists right away, without waiting for RCU to destroy list. Fixes race when referenced ipsets can't be destroyed while referring list is scheduled for destroy. Fixes: 97f7cf1cd80e ("netfilter: ipset: fix performance regression in swap operation") Signed-off-by: Alexander Maltsev Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_list_set.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 6c3f28bc59b3..54e2a1dd7f5f 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -549,6 +549,9 @@ list_set_cancel_gc(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) timer_shutdown_sync(&map->gc); + + /* Flush list to drop references to other ipsets */ + list_set_flush(set); } static const struct ip_set_type_variant set_variant = { From aff5c01fa1284d606f8e7cbdaafeef2511bb46c1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 9 May 2024 23:02:24 +0200 Subject: [PATCH 006/279] netfilter: nft_payload: restore vlan q-in-q match support Revert f6ae9f120dad ("netfilter: nft_payload: add C-VLAN support"). 
f41f72d09ee1 ("netfilter: nft_payload: simplify vlan header handling") already allows matching on inner vlan tags by subtracting from the payload offset the size of the vlan header which has been popped and stored in skbuff metadata fields. Fixes: f6ae9f120dad ("netfilter: nft_payload: add C-VLAN support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 0a689c8e0295..a3cb5dbcb362 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -45,36 +45,27 @@ nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; struct vlan_ethhdr veth; - u8 vlan_hlen = 0; - - if ((skb->protocol == htons(ETH_P_8021AD) || - skb->protocol == htons(ETH_P_8021Q)) && - offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN) - vlan_hlen += VLAN_HLEN; vlanh = (u8 *) &veth; - if (offset < VLAN_ETH_HLEN + vlan_hlen) { + if (offset < VLAN_ETH_HLEN) { u8 ethlen = len; - if (vlan_hlen && - skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0) - return false; - else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) + if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) return false; - if (offset + len > VLAN_ETH_HLEN + vlan_hlen) - ethlen -= offset + len - VLAN_ETH_HLEN - vlan_hlen; + if (offset + len > VLAN_ETH_HLEN) + ethlen -= offset + len - VLAN_ETH_HLEN; - memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen); + memcpy(dst_u8, vlanh + offset, ethlen); len -= ethlen; if (len == 0) return true; dst_u8 += ethlen; - offset = ETH_HLEN + vlan_hlen; + offset = ETH_HLEN; } else { - offset -= VLAN_HLEN + vlan_hlen; + offset -= VLAN_HLEN; } return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; From e8dc41afca161b988e6d462f4d0803d247e22250 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Sat, 11 May 2024 
10:55:25 +0800 Subject: [PATCH 007/279] pmdomain: imx: gpcv2: Add delay after power up handshake AudioMix BLK-CTRL on i.MX8MP encountered an accessing register issue after power up. [ 2.181035] Kernel panic - not syncing: Asynchronous SError Interrupt [ 2.181038] CPU: 1 PID: 48 Comm: kworker/u16:2 Not tainted 6.9.0-rc5-next-20240424-00003-g21cec88845c6 #171 [ 2.181047] Hardware name: NXP i.MX8MPlus EVK board (DT) [ 2.181050] Workqueue: events_unbound deferred_probe_work_func [ 2.181064] Call trace: [...] [ 2.181142] arm64_serror_panic+0x6c/0x78 [ 2.181149] do_serror+0x3c/0x70 [ 2.181157] el1h_64_error_handler+0x30/0x48 [ 2.181164] el1h_64_error+0x64/0x68 [ 2.181171] clk_imx8mp_audiomix_runtime_resume+0x34/0x44 [ 2.181183] __genpd_runtime_resume+0x30/0x80 [ 2.181195] genpd_runtime_resume+0x110/0x244 [ 2.181205] __rpm_callback+0x48/0x1d8 [ 2.181213] rpm_callback+0x68/0x74 [ 2.181224] rpm_resume+0x468/0x6c0 [ 2.181234] __pm_runtime_resume+0x50/0x94 [ 2.181243] pm_runtime_get_suppliers+0x60/0x8c [ 2.181258] __driver_probe_device+0x48/0x12c [ 2.181268] driver_probe_device+0xd8/0x15c [ 2.181278] __device_attach_driver+0xb8/0x134 [ 2.181290] bus_for_each_drv+0x84/0xe0 [ 2.181302] __device_attach+0x9c/0x188 [ 2.181312] device_initial_probe+0x14/0x20 [ 2.181323] bus_probe_device+0xac/0xb0 [ 2.181334] deferred_probe_work_func+0x88/0xc0 [ 2.181344] process_one_work+0x150/0x290 [ 2.181357] worker_thread+0x2f8/0x408 [ 2.181370] kthread+0x110/0x114 [ 2.181381] ret_from_fork+0x10/0x20 [ 2.181391] SMP: stopping secondary CPUs According to comments in power up handshake: /* request the ADB400 to power up */ if (domain->bits.hskreq) { regmap_update_bits(domain->regmap, domain->regs->hsk, domain->bits.hskreq, domain->bits.hskreq); /* * ret = regmap_read_poll_timeout(domain->regmap, domain->regs->hsk, reg_val, * (reg_val & domain->bits.hskack), 0, * USEC_PER_MSEC); * Technically we need the commented code to wait handshake. But that needs * the BLK-CTL module BUS clk-en bit being set. 
* * There is a separate BLK-CTL module and we will have such a driver for it, * that driver will set the BUS clk-en bit and handshake will be triggered * automatically there. Just add a delay and suppose the handshake finish * after that. */ } The BLK-CTL module needs to add delay to wait for a handshake request finished. For some BLK-CTL module (eg. AudioMix on i.MX8MP) doesn't have BUS clk-en bit, it is better to add delay in this driver, as the BLK-CTL module doesn't need to care about how it is powered up. regmap_read_bypassed() is to make sure the above write IO transaction already reaches target before udelay(). Fixes: 1496dd413b2e ("clk: imx: imx8mp: Add pm_runtime support for power saving") Reported-by: Francesco Dolcini Closes: https://lore.kernel.org/all/66293535.170a0220.21fe.a2e7@mx.google.com/ Suggested-by: Frank Li Signed-off-by: Shengjiu Wang Tested-by: Adam Ford Tested-by: Alexander Stein Link: https://lore.kernel.org/r/1715396125-3724-1-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/gpcv2.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/pmdomain/imx/gpcv2.c b/drivers/pmdomain/imx/gpcv2.c index 4b828d74a606..856eaac0ec14 100644 --- a/drivers/pmdomain/imx/gpcv2.c +++ b/drivers/pmdomain/imx/gpcv2.c @@ -393,6 +393,17 @@ static int imx_pgc_power_up(struct generic_pm_domain *genpd) * automatically there. Just add a delay and suppose the handshake finish * after that. */ + + /* + * For some BLK-CTL module (eg. AudioMix on i.MX8MP) doesn't have BUS + * clk-en bit, it is better to add delay here, as the BLK-CTL module + * doesn't need to care about how it is powered up. 
+ * + * regmap_read_bypassed() is to make sure the above write IO transaction + * already reaches target before udelay() + */ + regmap_read_bypassed(domain->regmap, domain->regs->hsk, &reg_val); + udelay(5); } /* Disable reset clocks for all devices in the domain */ From 25460d6f39024cc3b8241b14c7ccf0d6f11a736a Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Mon, 8 Apr 2024 07:10:39 -0700 Subject: [PATCH 008/279] net/9p: fix uninit-value in p9_client_rpc() Syzbot with the help of KMSAN reported the following error: BUG: KMSAN: uninit-value in trace_9p_client_res include/trace/events/9p.h:146 [inline] BUG: KMSAN: uninit-value in p9_client_rpc+0x1314/0x1340 net/9p/client.c:754 trace_9p_client_res include/trace/events/9p.h:146 [inline] p9_client_rpc+0x1314/0x1340 net/9p/client.c:754 p9_client_create+0x1551/0x1ff0 net/9p/client.c:1031 v9fs_session_init+0x1b9/0x28e0 fs/9p/v9fs.c:410 v9fs_mount+0xe2/0x12b0 fs/9p/vfs_super.c:122 legacy_get_tree+0x114/0x290 fs/fs_context.c:662 vfs_get_tree+0xa7/0x570 fs/super.c:1797 do_new_mount+0x71f/0x15e0 fs/namespace.c:3352 path_mount+0x742/0x1f20 fs/namespace.c:3679 do_mount fs/namespace.c:3692 [inline] __do_sys_mount fs/namespace.c:3898 [inline] __se_sys_mount+0x725/0x810 fs/namespace.c:3875 __x64_sys_mount+0xe4/0x150 fs/namespace.c:3875 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was created at: __alloc_pages+0x9d6/0xe70 mm/page_alloc.c:4598 __alloc_pages_node include/linux/gfp.h:238 [inline] alloc_pages_node include/linux/gfp.h:261 [inline] alloc_slab_page mm/slub.c:2175 [inline] allocate_slab mm/slub.c:2338 [inline] new_slab+0x2de/0x1400 mm/slub.c:2391 ___slab_alloc+0x1184/0x33d0 mm/slub.c:3525 __slab_alloc mm/slub.c:3610 [inline] __slab_alloc_node mm/slub.c:3663 [inline] slab_alloc_node mm/slub.c:3835 [inline] kmem_cache_alloc+0x6d3/0xbe0 mm/slub.c:3852 p9_tag_alloc net/9p/client.c:278 [inline] p9_client_prepare_req+0x20a/0x1770 net/9p/client.c:641 p9_client_rpc+0x27e/0x1340 net/9p/client.c:688 
p9_client_create+0x1551/0x1ff0 net/9p/client.c:1031 v9fs_session_init+0x1b9/0x28e0 fs/9p/v9fs.c:410 v9fs_mount+0xe2/0x12b0 fs/9p/vfs_super.c:122 legacy_get_tree+0x114/0x290 fs/fs_context.c:662 vfs_get_tree+0xa7/0x570 fs/super.c:1797 do_new_mount+0x71f/0x15e0 fs/namespace.c:3352 path_mount+0x742/0x1f20 fs/namespace.c:3679 do_mount fs/namespace.c:3692 [inline] __do_sys_mount fs/namespace.c:3898 [inline] __se_sys_mount+0x725/0x810 fs/namespace.c:3875 __x64_sys_mount+0xe4/0x150 fs/namespace.c:3875 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 If p9_check_errors() fails early in p9_client_rpc(), req->rc.tag will not be properly initialized. However, trace_9p_client_res() ends up trying to print it out anyway before p9_client_rpc() finishes. Fix this issue by assigning default values to p9_fcall fields such as 'tag' and (just in case KMSAN unearths something new) 'id' during the tag allocation stage. Reported-and-tested-by: syzbot+ff14db38f56329ef68df@syzkaller.appspotmail.com Fixes: 348b59012e5c ("net/9p: Convert net/9p protocol dumps to tracepoints") Signed-off-by: Nikita Zhandarovich Reviewed-by: Christian Schoenebeck Cc: stable@vger.kernel.org Message-ID: <20240408141039.30428-1-n.zhandarovich@fintech.ru> Signed-off-by: Dominique Martinet --- net/9p/client.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/9p/client.c b/net/9p/client.c index f7e90b4769bb..b05f73c291b4 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -235,6 +235,8 @@ static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, if (!fc->sdata) return -ENOMEM; fc->capacity = alloc_msize; + fc->id = 0; + fc->tag = P9_NOTAG; return 0; } From 39bc27bd688066a63e56f7f64ad34fae03fbe3b8 Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Mon, 20 May 2024 12:05:14 +0200 Subject: [PATCH 009/279] drm/shmem-helper: Fix BUG_ON() on mmap(PROT_WRITE, MAP_PRIVATE) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lack of check for 
copy-on-write (COW) mapping in drm_gem_shmem_mmap allows users to call mmap with PROT_WRITE and MAP_PRIVATE flag causing a kernel panic due to BUG_ON in vmf_insert_pfn_prot: BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); Return -EINVAL early if COW mapping is detected. This bug affects all drm drivers using default shmem helpers. It can be reproduced by this simple example: void *ptr = mmap(0, size, PROT_WRITE, MAP_PRIVATE, fd, mmap_offset); ptr[0] = 0; Fixes: 2194a63a818d ("drm: Add library for shmem backed GEM objects") Cc: Noralf Trønnes Cc: Eric Anholt Cc: Rob Herring Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Cc: # v5.2+ Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20240520100514.925681-1-jacek.lawrynowicz@linux.intel.com --- drivers/gpu/drm/drm_gem_shmem_helper.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index e435f986cd13..1ff0678be7c7 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -610,6 +610,9 @@ int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct return ret; } + if (is_cow_mapping(vma->vm_flags)) + return -EINVAL; + dma_resv_lock(shmem->base.resv, NULL); ret = drm_gem_shmem_get_pages(shmem); dma_resv_unlock(shmem->base.resv); From d3a043733f25d743f3aa617c7f82dbcb5ee2211a Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 16 May 2024 17:43:51 +0530 Subject: [PATCH 010/279] nvme-multipath: find NUMA path only for online numa-node In current native multipath design when a shared namespace is created, we loop through each possible numa-node, calculate the NUMA distance of that node from each nvme controller and then cache the optimal IO path for future reference while sending 
IO. The issue with this design is that we may refer to the NUMA distance table for an offline node which may not be populated at the time and so we may inadvertently end up finding and caching a non-optimal path for IO. Then later, when the corresponding numa-node becomes online and hence the NUMA distance table entry for that node is created, ideally we should re-calculate the multipath node distance for the newly added node; however, that doesn't happen unless we rescan/reset the controller. So essentially, we may keep using a non-optimal IO path for a node which is made online after the namespace is created. This patch helps fix this issue by ensuring that when a shared namespace is created, we calculate the multipath node distance for each online numa-node instead of each possible numa-node. Then later, when a node becomes online and we receive any IO on that newly added node, we would calculate the multipath node distance for the newly added node, but this time the NUMA distance table would have already been populated for the newly added node. Hence we would be able to correctly calculate the multipath node distance and choose the optimal path for the IO. 
Signed-off-by: Nilay Shroff Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d16e976ae1a4..9c1e135b8df3 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -595,7 +595,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) int node, srcu_idx; srcu_idx = srcu_read_lock(&head->srcu); - for_each_node(node) + for_each_online_node(node) __nvme_find_path(head, node); srcu_read_unlock(&head->srcu, srcu_idx); } From 8d00547ea8754afdc4a550af2fb7af2e3ba93cf8 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 16 May 2024 10:09:28 +0800 Subject: [PATCH 011/279] MAINTAINERS: Add myself as reviewer of ARM64 BPF JIT I am working on ARM64 BPF JIT for a while, hence add myself as reviewer. Signed-off-by: Xu Kuohai Acked-by: Hengqi Chen Link: https://lore.kernel.org/r/20240516020928.156125-1-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3fdc3b09c171..5e279e9ff63f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3816,6 +3816,7 @@ BPF JIT for ARM64 M: Daniel Borkmann M: Alexei Starovoitov M: Puranjay Mohan +R: Xu Kuohai L: bpf@vger.kernel.org S: Supported F: arch/arm64/net/ From c898afdc15645efb555acb6d85b484eb40a45409 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Tue, 21 May 2024 21:13:36 +0900 Subject: [PATCH 012/279] 9p: add missing locking around taking dentry fid list Fix a use-after-free on dentry's d_fsdata fid list when a thread looks up a fid through dentry while another thread unlinks it: UAF thread: refcount_t: addition on 0; use-after-free. 
p9_fid_get linux/./include/net/9p/client.h:262 v9fs_fid_find+0x236/0x280 linux/fs/9p/fid.c:129 v9fs_fid_lookup_with_uid linux/fs/9p/fid.c:181 v9fs_fid_lookup+0xbf/0xc20 linux/fs/9p/fid.c:314 v9fs_vfs_getattr_dotl+0xf9/0x360 linux/fs/9p/vfs_inode_dotl.c:400 vfs_statx+0xdd/0x4d0 linux/fs/stat.c:248 Freed by: p9_fid_destroy (inlined) p9_client_clunk+0xb0/0xe0 linux/net/9p/client.c:1456 p9_fid_put linux/./include/net/9p/client.h:278 v9fs_dentry_release+0xb5/0x140 linux/fs/9p/vfs_dentry.c:55 v9fs_remove+0x38f/0x620 linux/fs/9p/vfs_inode.c:518 vfs_unlink+0x29a/0x810 linux/fs/namei.c:4335 The problem is that d_fsdata was not accessed under d_lock, because d_release() normally is only called once the dentry is otherwise no longer accessible but since we also call it explicitly in v9fs_remove that lock is required: move the hlist out of the dentry under lock then unref its fids once they are no longer accessible. Fixes: 154372e67d40 ("fs/9p: fix create-unlink-getattr idiom") Cc: stable@vger.kernel.org Reported-by: Meysam Firouzi Reported-by: Amirmohammad Eftekhar Reviewed-by: Christian Schoenebeck Message-ID: <20240521122947.1080227-1-asmadeus@codewreck.org> Signed-off-by: Dominique Martinet --- fs/9p/vfs_dentry.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index f16f73581634..01338d4c2d9e 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -48,12 +48,17 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry) static void v9fs_dentry_release(struct dentry *dentry) { struct hlist_node *p, *n; + struct hlist_head head; p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n", dentry, dentry); - hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata) + + spin_lock(&dentry->d_lock); + hlist_move_list((struct hlist_head *)&dentry->d_fsdata, &head); + spin_unlock(&dentry->d_lock); + + hlist_for_each_safe(p, n, &head) p9_fid_put(hlist_entry(p, struct p9_fid, dlist)); - dentry->d_fsdata = NULL; 
} static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) From 9f788ba457b45b0ce422943fcec9fa35c4587764 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 22 May 2024 20:09:49 +0300 Subject: [PATCH 013/279] spi: Don't mark message DMA mapped when no transfer in it is There is no need to set the DMA mapped flag of the message if it has no mapped transfers. Moreover, it may give the code a chance to take the wrong paths, i.e. to exercise DMA related APIs on unmapped data. Make __spi_map_msg() to bail earlier on the above mentioned cases. Fixes: 99adef310f68 ("spi: Provide core support for DMA mapping transfers") Signed-off-by: Andy Shevchenko Link: https://msgid.link/r/20240522171018.3362521-2-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index a8966caed841..d40ce0fdb1a8 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1243,6 +1243,7 @@ static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) else rx_dev = ctlr->dev.parent; + ret = -ENOMSG; list_for_each_entry(xfer, &msg->transfers, transfer_list) { /* The sync is done before each transfer. 
*/ unsigned long attrs = DMA_ATTR_SKIP_CPU_SYNC; @@ -1272,6 +1273,9 @@ static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) } } } + /* No transfer has been mapped, bail out with success */ + if (ret) + return 0; ctlr->cur_rx_dma_dev = rx_dev; ctlr->cur_tx_dma_dev = tx_dev; From da560097c05612f8d360f86528f6213629b9c395 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 22 May 2024 20:09:50 +0300 Subject: [PATCH 014/279] spi: Check if transfer is mapped before calling DMA sync APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recent update to remove the orig_nents checks revealed that not all DMA sync backends can cope with the unallocated SG list, while supplying orig_nents == 0 (the commit 861370f49ce4 ("iommu/dma: force bouncing if the size is not cacheline-aligned"), for example, makes that happen for the IOMMU case). It means we have to check if the buffers are DMA mapped before trying to sync them. Re-introduce that check in the form of calling ->can_dma() in the same way as it's done in the DMA mapping loop for the SPI transfers. Reported-by: Nícolas F. R. A. Prado Reported-by: Neil Armstrong Closes: https://lore.kernel.org/r/8ae675b5-fcf9-4c9b-b06a-4462f70e1322@linaro.org Closes: https://lore.kernel.org/all/d3679496-2e4e-4a7c-97ed-f193bd53af1d@notapiano Fixes: 8cc3bad9d9d6 ("spi: Remove unneded check for orig_nents") Suggested-by: Nícolas F. R. A. Prado Tested-by: Nícolas F. R. A. 
Prado Signed-off-by: Andy Shevchenko Link: https://msgid.link/r/20240522171018.3362521-3-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index d40ce0fdb1a8..b18a4c871e21 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1311,7 +1311,7 @@ static int __spi_unmap_msg(struct spi_controller *ctlr, struct spi_message *msg) return 0; } -static void spi_dma_sync_for_device(struct spi_controller *ctlr, +static void spi_dma_sync_for_device(struct spi_controller *ctlr, struct spi_message *msg, struct spi_transfer *xfer) { struct device *rx_dev = ctlr->cur_rx_dma_dev; @@ -1320,11 +1320,14 @@ static void spi_dma_sync_for_device(struct spi_controller *ctlr, if (!ctlr->cur_msg_mapped) return; + if (!ctlr->can_dma(ctlr, msg->spi, xfer)) + return; + dma_sync_sgtable_for_device(tx_dev, &xfer->tx_sg, DMA_TO_DEVICE); dma_sync_sgtable_for_device(rx_dev, &xfer->rx_sg, DMA_FROM_DEVICE); } -static void spi_dma_sync_for_cpu(struct spi_controller *ctlr, +static void spi_dma_sync_for_cpu(struct spi_controller *ctlr, struct spi_message *msg, struct spi_transfer *xfer) { struct device *rx_dev = ctlr->cur_rx_dma_dev; @@ -1333,6 +1336,9 @@ static void spi_dma_sync_for_cpu(struct spi_controller *ctlr, if (!ctlr->cur_msg_mapped) return; + if (!ctlr->can_dma(ctlr, msg->spi, xfer)) + return; + dma_sync_sgtable_for_cpu(rx_dev, &xfer->rx_sg, DMA_FROM_DEVICE); dma_sync_sgtable_for_cpu(tx_dev, &xfer->tx_sg, DMA_TO_DEVICE); } @@ -1350,11 +1356,13 @@ static inline int __spi_unmap_msg(struct spi_controller *ctlr, } static void spi_dma_sync_for_device(struct spi_controller *ctrl, + struct spi_message *msg, struct spi_transfer *xfer) { } static void spi_dma_sync_for_cpu(struct spi_controller *ctrl, + struct spi_message *msg, struct spi_transfer *xfer) { } @@ -1626,10 +1634,10 @@ static int spi_transfer_one_message(struct spi_controller *ctlr, 
reinit_completion(&ctlr->xfer_completion); fallback_pio: - spi_dma_sync_for_device(ctlr, xfer); + spi_dma_sync_for_device(ctlr, msg, xfer); ret = ctlr->transfer_one(ctlr, msg->spi, xfer); if (ret < 0) { - spi_dma_sync_for_cpu(ctlr, xfer); + spi_dma_sync_for_cpu(ctlr, msg, xfer); if (ctlr->cur_msg_mapped && (xfer->error & SPI_TRANS_FAIL_NO_START)) { @@ -1654,7 +1662,7 @@ static int spi_transfer_one_message(struct spi_controller *ctlr, msg->status = ret; } - spi_dma_sync_for_cpu(ctlr, xfer); + spi_dma_sync_for_cpu(ctlr, msg, xfer); } else { if (xfer->len) dev_err(&msg->spi->dev, From a827ad9b3c2fc243e058595533f91ce41a312527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 23 May 2024 12:33:25 +0200 Subject: [PATCH 015/279] spi: stm32: Revert change that enabled controller before asserting CS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On stm32mp157 enabling the controller before asserting CS makes the hardware trigger spurious interrupts in a tight loop and the transfers fail. Revert the commit that swapped the order of enable and CS. This reintroduces the problem that swapping was supposed to fix, which however is less grave. 
Reported-by: Leonard Göhrs Link: https://lore.kernel.org/all/39033ed7-3e57-4339-80b4-fc8919e26aa7@pengutronix.de/ Fixes: 52b62e7a5d4f ("spi: stm32: enable controller before asserting CS") Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/20240523103326.792907-2-u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index 4a68abcdcc35..e4e7ddb7524a 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -1016,8 +1016,10 @@ static irqreturn_t stm32fx_spi_irq_event(int irq, void *dev_id) static irqreturn_t stm32fx_spi_irq_thread(int irq, void *dev_id) { struct spi_controller *ctrl = dev_id; + struct stm32_spi *spi = spi_controller_get_devdata(ctrl); spi_finalize_current_transfer(ctrl); + stm32fx_spi_disable(spi); return IRQ_HANDLED; } @@ -1185,8 +1187,6 @@ static int stm32_spi_prepare_msg(struct spi_controller *ctrl, ~clrb) | setb, spi->base + spi->cfg->regs->cpol.reg); - stm32_spi_enable(spi); - spin_unlock_irqrestore(&spi->lock, flags); return 0; @@ -1204,6 +1204,7 @@ static void stm32fx_spi_dma_tx_cb(void *data) if (spi->cur_comm == SPI_SIMPLEX_TX || spi->cur_comm == SPI_3WIRE_TX) { spi_finalize_current_transfer(spi->ctrl); + stm32fx_spi_disable(spi); } } @@ -1218,6 +1219,7 @@ static void stm32_spi_dma_rx_cb(void *data) struct stm32_spi *spi = data; spi_finalize_current_transfer(spi->ctrl); + spi->cfg->disable(spi); } /** @@ -1305,6 +1307,8 @@ static int stm32fx_spi_transfer_one_irq(struct stm32_spi *spi) stm32_spi_set_bits(spi, STM32FX_SPI_CR2, cr2); + stm32_spi_enable(spi); + /* starting data transfer when buffer is loaded */ if (spi->tx_buf) spi->cfg->write_tx(spi); @@ -1341,6 +1345,8 @@ static int stm32h7_spi_transfer_one_irq(struct stm32_spi *spi) spin_lock_irqsave(&spi->lock, flags); + stm32_spi_enable(spi); + /* Be sure to have data in fifo before starting data transfer */ if 
(spi->tx_buf) stm32h7_spi_write_txfifo(spi); @@ -1372,6 +1378,8 @@ static void stm32fx_spi_transfer_one_dma_start(struct stm32_spi *spi) */ stm32_spi_set_bits(spi, STM32FX_SPI_CR2, STM32FX_SPI_CR2_ERRIE); } + + stm32_spi_enable(spi); } /** @@ -1405,6 +1413,8 @@ static void stm32h7_spi_transfer_one_dma_start(struct stm32_spi *spi) stm32_spi_set_bits(spi, STM32H7_SPI_IER, ier); + stm32_spi_enable(spi); + if (STM32_SPI_HOST_MODE(spi)) stm32_spi_set_bits(spi, STM32H7_SPI_CR1, STM32H7_SPI_CR1_CSTART); } From bb9025f4432f8c158322cf2c04c2b492f23eb511 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sat, 4 May 2024 14:47:01 +0300 Subject: [PATCH 016/279] dma-mapping: benchmark: fix up kthread-related error handling kthread creation failure is invalidly handled inside do_map_benchmark(). The put_task_struct() calls on the error path are supposed to balance the get_task_struct() calls which only happen after all the kthreads are successfully created. Rollback using kthread_stop() for already created kthreads in case of such failure. In normal situation call kthread_stop_put() to gracefully stop kthreads and put their task refcounts. This should be done for all started kthreads. Found by Linux Verification Center (linuxtesting.org). 
Fixes: 65789daa8087 ("dma-mapping: add benchmark support for streaming DMA APIs") Suggested-by: Robin Murphy Signed-off-by: Fedor Pchelkin Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 02205ab53b7e..2478957cf9f8 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -118,6 +118,8 @@ static int do_map_benchmark(struct map_benchmark_data *map) if (IS_ERR(tsk[i])) { pr_err("create dma_map thread failed\n"); ret = PTR_ERR(tsk[i]); + while (--i >= 0) + kthread_stop(tsk[i]); goto out; } @@ -139,13 +141,17 @@ static int do_map_benchmark(struct map_benchmark_data *map) msleep_interruptible(map->bparam.seconds * 1000); - /* wait for the completion of benchmark threads */ + /* wait for the completion of all started benchmark threads */ for (i = 0; i < threads; i++) { - ret = kthread_stop(tsk[i]); - if (ret) - goto out; + int kthread_ret = kthread_stop_put(tsk[i]); + + if (kthread_ret) + ret = kthread_ret; } + if (ret) + goto out; + loops = atomic64_read(&map->loops); if (likely(loops > 0)) { u64 map_variance, unmap_variance; @@ -170,8 +176,6 @@ static int do_map_benchmark(struct map_benchmark_data *map) } out: - for (i = 0; i < threads; i++) - put_task_struct(tsk[i]); put_device(map->dev); kfree(tsk); return ret; From f7c9ccaadffd13066353332c13d7e9bf73b8f92d Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sat, 4 May 2024 14:47:02 +0300 Subject: [PATCH 017/279] dma-mapping: benchmark: avoid needless copy_to_user if benchmark fails If do_map_benchmark() has failed, there is nothing useful to copy back to userspace. 
Suggested-by: Barry Song <21cnbao@gmail.com> Signed-off-by: Fedor Pchelkin Acked-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 2478957cf9f8..a6edb1ef98c8 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -256,6 +256,9 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, * dma_mask changed by benchmark */ dma_set_mask(map->dev, old_dma_mask); + + if (ret) + return ret; break; default: return -EINVAL; From 1ff05e723f7ca30644b8ec3fb093f16312e408ad Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sat, 4 May 2024 14:47:03 +0300 Subject: [PATCH 018/279] dma-mapping: benchmark: fix node id validation While validating node ids in map_benchmark_ioctl(), node_possible() may be provided with invalid argument outside of [0,MAX_NUMNODES-1] range leading to: BUG: KASAN: wild-memory-access in map_benchmark_ioctl (kernel/dma/map_benchmark.c:214) Read of size 8 at addr 1fffffff8ccb6398 by task dma_map_benchma/971 CPU: 7 PID: 971 Comm: dma_map_benchma Not tainted 6.9.0-rc6 #37 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Call Trace: dump_stack_lvl (lib/dump_stack.c:117) kasan_report (mm/kasan/report.c:603) kasan_check_range (mm/kasan/generic.c:189) variable_test_bit (arch/x86/include/asm/bitops.h:227) [inline] arch_test_bit (arch/x86/include/asm/bitops.h:239) [inline] _test_bit at (include/asm-generic/bitops/instrumented-non-atomic.h:142) [inline] node_state (include/linux/nodemask.h:423) [inline] map_benchmark_ioctl (kernel/dma/map_benchmark.c:214) full_proxy_unlocked_ioctl (fs/debugfs/file.c:333) __x64_sys_ioctl (fs/ioctl.c:890) do_syscall_64 (arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Compare node ids with sane bounds first. 
NUMA_NO_NODE is considered a special valid case meaning that benchmarking kthreads won't be bound to a cpuset of a given node. Found by Linux Verification Center (linuxtesting.org). Fixes: 65789daa8087 ("dma-mapping: add benchmark support for streaming DMA APIs") Signed-off-by: Fedor Pchelkin Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index a6edb1ef98c8..9f6c15f3f168 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -212,7 +212,8 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, } if (map->bparam.node != NUMA_NO_NODE && - !node_possible(map->bparam.node)) { + (map->bparam.node < 0 || map->bparam.node >= MAX_NUMNODES || + !node_possible(map->bparam.node))) { pr_err("invalid numa node\n"); return -EINVAL; } From e64746e74f717961250a155e14c156616fcd981f Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sat, 4 May 2024 14:47:04 +0300 Subject: [PATCH 019/279] dma-mapping: benchmark: handle NUMA_NO_NODE correctly cpumask_of_node() can be called for NUMA_NO_NODE inside do_map_benchmark() resulting in the following sanitizer report: UBSAN: array-index-out-of-bounds in ./arch/x86/include/asm/topology.h:72:28 index -1 is out of range for type 'cpumask [64][1]' CPU: 1 PID: 990 Comm: dma_map_benchma Not tainted 6.9.0-rc6 #29 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Call Trace: dump_stack_lvl (lib/dump_stack.c:117) ubsan_epilogue (lib/ubsan.c:232) __ubsan_handle_out_of_bounds (lib/ubsan.c:429) cpumask_of_node (arch/x86/include/asm/topology.h:72) [inline] do_map_benchmark (kernel/dma/map_benchmark.c:104) map_benchmark_ioctl (kernel/dma/map_benchmark.c:246) full_proxy_unlocked_ioctl (fs/debugfs/file.c:333) __x64_sys_ioctl (fs/ioctl.c:890) do_syscall_64 (arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Use 
cpumask_of_node() in place when binding a kernel thread to a cpuset of a particular node. Note that the provided node id is checked inside map_benchmark_ioctl(). It's just a NUMA_NO_NODE case which is not handled properly later. Found by Linux Verification Center (linuxtesting.org). Fixes: 65789daa8087 ("dma-mapping: add benchmark support for streaming DMA APIs") Signed-off-by: Fedor Pchelkin Acked-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 9f6c15f3f168..4950e0b622b1 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -101,7 +101,6 @@ static int do_map_benchmark(struct map_benchmark_data *map) struct task_struct **tsk; int threads = map->bparam.threads; int node = map->bparam.node; - const cpumask_t *cpu_mask = cpumask_of_node(node); u64 loops; int ret = 0; int i; @@ -124,7 +123,7 @@ static int do_map_benchmark(struct map_benchmark_data *map) } if (node != NUMA_NO_NODE) - kthread_bind_mask(tsk[i], cpu_mask); + kthread_bind_mask(tsk[i], cpumask_of_node(node)); } /* clear the old value in the previous benchmark */ From 2fe7b422460d14b33027d8770f7be8d26bcb2639 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 May 2024 09:50:47 -0700 Subject: [PATCH 020/279] nvme: fix multipath batched completion accounting Batched completions were missing the io stats accounting and bio trace events. Move the common code to a helper and call it from the batched and non-batched functions. 
Fixes: d4d957b53d91ee ("nvme-multipath: support io stats on the mpath device") Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 954f850f113a..79cdd34dfa18 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -414,6 +414,14 @@ static inline void nvme_end_req_zoned(struct request *req) } } +static inline void __nvme_end_req(struct request *req) +{ + nvme_end_req_zoned(req); + nvme_trace_bio_complete(req); + if (req->cmd_flags & REQ_NVME_MPATH) + nvme_mpath_end_request(req); +} + static inline void nvme_end_req(struct request *req) { blk_status_t status = nvme_error_status(nvme_req(req)->status); @@ -424,10 +432,7 @@ static inline void nvme_end_req(struct request *req) else nvme_log_error(req); } - nvme_end_req_zoned(req); - nvme_trace_bio_complete(req); - if (req->cmd_flags & REQ_NVME_MPATH) - nvme_mpath_end_request(req); + __nvme_end_req(req); blk_mq_end_request(req, status); } @@ -476,7 +481,7 @@ void nvme_complete_batch_req(struct request *req) { trace_nvme_complete_rq(req); nvme_cleanup_cmd(req); - nvme_end_req_zoned(req); + __nvme_end_req(req); } EXPORT_SYMBOL_GPL(nvme_complete_batch_req); From a2e4c5f5f68dbd206f132bc709b98dea64afc3b8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 May 2024 11:02:28 -0700 Subject: [PATCH 021/279] nvme-multipath: fix io accounting on failover There is io stats accounting that needs to be handled, so don't call blk_mq_end_request() directly. Use the existing nvme_end_req() helper that already handles everything. 
Fixes: d4d957b53d91ee ("nvme-multipath: support io stats on the mpath device") Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/multipath.c | 3 ++- drivers/nvme/host/nvme.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 79cdd34dfa18..7706df237349 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -422,7 +422,7 @@ static inline void __nvme_end_req(struct request *req) nvme_mpath_end_request(req); } -static inline void nvme_end_req(struct request *req) +void nvme_end_req(struct request *req) { blk_status_t status = nvme_error_status(nvme_req(req)->status); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 9c1e135b8df3..1bee176fd850 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -118,7 +118,8 @@ void nvme_failover_req(struct request *req) blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - blk_mq_end_request(req, 0); + nvme_req(req)->status = 0; + nvme_end_req(req); kblockd_schedule_work(&ns->head->requeue_work); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cacc56f4bbf4..fc31bd340a63 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -767,6 +767,7 @@ static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl) } } +void nvme_end_req(struct request *req); void nvme_complete_rq(struct request *req); void nvme_complete_batch_req(struct request *req); From f97914e35fd98b2b18fb8a092e0a0799f73afdfe Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 21 May 2024 23:20:28 +0300 Subject: [PATCH 022/279] nvmet: fix ns enable/disable possible hang When disabling an nvmet namespace, there is a period where the subsys->lock is released, as the ns disable waits for backend IO to complete, and the ns percpu ref to be properly 
killed. The original intent was to avoid taking the subsystem lock for a prolonged period as other processes may need to acquire it (for example new incoming connections). However, it opens up a window where another process may come in and enable the ns, (re)initializing the ns percpu_ref, causing the disable sequence to hang. Solve this by taking the global nvmet_config_sem over the entire configfs enable/disable sequence. Fixes: a07b4970f464 ("nvmet: add a generic NVMe target") Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 7c43a0ad6877..bd87dfd173a4 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -676,10 +676,18 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item, if (kstrtobool(page, &enable)) return -EINVAL; + /* + * take a global nvmet_config_sem because the disable routine has a + * window where it releases the subsys-lock, giving a chance to + * a parallel enable to concurrently execute causing the disable to + * have a misaccounting of the ns percpu_ref. + */ + down_write(&nvmet_config_sem); if (enable) ret = nvmet_ns_enable(ns); else nvmet_ns_disable(ns); + up_write(&nvmet_config_sem); return ret ? ret : count; } From ec58991054e899c9d86f7e3c8a96cb602d4b5938 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Tue, 21 May 2024 15:03:02 +0800 Subject: [PATCH 023/279] drm/amdgpu: correct hbm field in boot status hbm field takes bit 13 and bit 14 in boot status. 
Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index c8980d5f6540..7021c4a66fb5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -46,7 +46,7 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x) AMDGPU_GET_REG_FIELD(x, 7, 7) #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x) AMDGPU_GET_REG_FIELD(x, 10, 8) #define AMDGPU_RAS_GPU_ERR_AID_ID(x) AMDGPU_GET_REG_FIELD(x, 12, 11) -#define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 13, 13) +#define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 14, 13) #define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x) AMDGPU_GET_REG_FIELD(x, 31, 31) #define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 1000 From 8195979d2dd995d60c2663adf54c69c1bf4eadd1 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 8 May 2024 16:45:35 -0500 Subject: [PATCH 024/279] drm/amd/display: Enable colorspace property for MST connectors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MST colorspace property support was disabled due to a series of warnings that came up when the device was plugged in since the properties weren't made at device creation. Create the properties in advance instead. Suggested-by: Ville Syrjälä Fixes: 69a959610229 ("drm/amd/display: Temporary Disable MST DP Colorspace Property"). 
Reported-and-tested-by: Tyler Schneider Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3353 Reviewed-by: Harry Wentland Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 0b03e659fdf3..8b0e997ebdae 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -613,6 +613,9 @@ dm_dp_add_mst_connector(struct drm_dp_mst_topology_mgr *mgr, &connector->base, dev->mode_config.tile_property, 0); + connector->colorspace_property = master->base.colorspace_property; + if (connector->colorspace_property) + drm_connector_attach_colorspace_property(connector); drm_connector_set_path_property(connector, pathprop); From 699646734ab51bf5b1cd4a7a30c20074f6e74f6e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 20 May 2024 22:30:17 -0700 Subject: [PATCH 025/279] uprobes: prevent mutex_lock() under rcu_read_lock() Recent changes made uprobe_cpu_buffer preparation lazy, and moved it deeper into __uprobe_trace_func(). This is problematic because __uprobe_trace_func() is called inside rcu_read_lock()/rcu_read_unlock() block, which then calls prepare_uprobe_buffer() -> uprobe_buffer_get() -> mutex_lock(&ucb->mutex), leading to a splat about using mutex under non-sleepable RCU: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:585 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 98231, name: stress-ng-sigq preempt_count: 0, expected: 0 RCU nest depth: 1, expected: 0 ... Call Trace: dump_stack_lvl+0x3d/0xe0 __might_resched+0x24c/0x270 ? prepare_uprobe_buffer+0xd5/0x1d0 __mutex_lock+0x41/0x820 ? ___perf_sw_event+0x206/0x290 ? __perf_event_task_sched_in+0x54/0x660 ? 
__perf_event_task_sched_in+0x54/0x660 prepare_uprobe_buffer+0xd5/0x1d0 __uprobe_trace_func+0x4a/0x140 uprobe_dispatcher+0x135/0x280 ? uprobe_dispatcher+0x94/0x280 uprobe_notify_resume+0x650/0xec0 ? atomic_notifier_call_chain+0x21/0x110 ? atomic_notifier_call_chain+0xf8/0x110 irqentry_exit_to_user_mode+0xe2/0x1e0 asm_exc_int3+0x35/0x40 RIP: 0033:0x7f7e1d4da390 Code: 33 04 00 0f 1f 80 00 00 00 00 f3 0f 1e fa b9 01 00 00 00 e9 b2 fc ff ff 66 90 f3 0f 1e fa 31 c9 e9 a5 fc ff ff 0f 1f 44 00 00 0f 1e fa b8 27 00 00 00 0f 05 c3 0f 1f 40 00 f3 0f 1e fa b8 6e RSP: 002b:00007ffd2abc3608 EFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000076d325f1 RCX: 0000000000000000 RDX: 0000000076d325f1 RSI: 000000000000000a RDI: 00007ffd2abc3690 RBP: 000000000000000a R08: 00017fb700000000 R09: 00017fb700000000 R10: 00017fb700000000 R11: 0000000000000246 R12: 0000000000017ff2 R13: 00007ffd2abc3610 R14: 0000000000000000 R15: 00007ffd2abc3780 Luckily, it's easy to fix by moving prepare_uprobe_buffer() to be called slightly earlier: into uprobe_trace_func() and uretprobe_trace_func(), outside of RCU locked section. This still keeps this buffer preparation lazy and helps avoid the overhead when it's not needed. E.g., if there is only BPF uprobe handler installed on a given uprobe, buffer won't be initialized. Note, the other user of prepare_uprobe_buffer(), __uprobe_perf_func(), is not affected, as it doesn't prepare buffer under RCU read lock. 
Link: https://lore.kernel.org/all/20240521053017.3708530-1-andrii@kernel.org/ Fixes: 1b8f85defbc8 ("uprobes: prepare uprobe args buffer lazily") Reported-by: Breno Leitao Signed-off-by: Andrii Nakryiko Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_uprobe.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8541fa1494ae..c98e3b3386ba 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -970,19 +970,17 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer **ucbp, + struct uprobe_cpu_buffer *ucb, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; struct trace_event_buffer fbuffer; - struct uprobe_cpu_buffer *ucb; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); WARN_ON(call != trace_file->event_call); - ucb = prepare_uprobe_buffer(tu, regs, ucbp); if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) return; @@ -1014,13 +1012,16 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; + struct uprobe_cpu_buffer *ucb; if (is_ret_probe(tu)) return 0; + ucb = prepare_uprobe_buffer(tu, regs, ucbp); + rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, 0, regs, ucbp, link->file); + __uprobe_trace_func(tu, 0, regs, ucb, link->file); rcu_read_unlock(); return 0; @@ -1031,10 +1032,13 @@ static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; + struct uprobe_cpu_buffer *ucb; + + ucb = prepare_uprobe_buffer(tu, regs, ucbp); rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, func, regs, 
ucbp, link->file); + __uprobe_trace_func(tu, func, regs, ucb, link->file); rcu_read_unlock(); } From 06e785aeb9ea8a43d0a3967c1ba6e69d758e82d4 Mon Sep 17 00:00:00 2001 From: Matt Jan Date: Tue, 14 May 2024 12:10:46 +0800 Subject: [PATCH 026/279] connector: Fix invalid conversion in cn_proc.h The implicit conversion from unsigned int to enum proc_cn_event is invalid, so explicitly cast it for compilation in a C++ compiler. /usr/include/linux/cn_proc.h: In function 'proc_cn_event valid_event(proc_cn_event)': /usr/include/linux/cn_proc.h:72:17: error: invalid conversion from 'unsigned int' to 'proc_cn_event' [-fpermissive] 72 | ev_type &= PROC_EVENT_ALL; | ^ | | | unsigned int Signed-off-by: Matt Jan Signed-off-by: David S. Miller --- include/uapi/linux/cn_proc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h index f2afb7cc4926..18e3745b86cd 100644 --- a/include/uapi/linux/cn_proc.h +++ b/include/uapi/linux/cn_proc.h @@ -69,8 +69,7 @@ struct proc_input { static inline enum proc_cn_event valid_event(enum proc_cn_event ev_type) { - ev_type &= PROC_EVENT_ALL; - return ev_type; + return (enum proc_cn_event)(ev_type & PROC_EVENT_ALL); } /* From 128d54fbcb14b8717ecf596d3dbded327b9980b3 Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Tue, 21 May 2024 08:54:06 +0200 Subject: [PATCH 027/279] net: phy: micrel: set soft_reset callback to genphy_soft_reset for KSZ8061 Following a similar reinstate for the KSZ8081 and KSZ9031. Older kernels would use the genphy_soft_reset if the PHY did not implement a .soft_reset. The KSZ8061 errata described here: https://ww1.microchip.com/downloads/en/DeviceDoc/KSZ8061-Errata-DS80000688B.pdf and worked around with 232ba3a51c ("net: phy: Micrel KSZ8061: link failure after cable connect") is back again without this soft reset. 
Fixes: 6e2d85ec0559 ("net: phy: Stop with excessive soft reset") Tested-by: Karim Ben Houcine Signed-off-by: Mathieu Othacehe Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 13e30ea7eec5..1d769322b059 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -5327,6 +5327,7 @@ static struct phy_driver ksphy_driver[] = { /* PHY_BASIC_FEATURES */ .probe = kszphy_probe, .config_init = ksz8061_config_init, + .soft_reset = genphy_soft_reset, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, .suspend = kszphy_suspend, From 9b038d004ce95551cb35381c49fe896c5bc11ffe Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 May 2024 14:37:43 +0100 Subject: [PATCH 028/279] netfs: Fix io_uring based write-through This can be triggered by mounting a cifs filesystem with a cache=strict mount option and then, using the fsx program from xfstests, doing: ltp/fsx -A -d -N 1000 -S 11463 -P /tmp /cifs-mount/foo \ --replay-ops=gen112-fsxops Where gen112-fsxops holds: fallocate 0x6be7 0x8fc5 0x377d3 copy_range 0x9c71 0x77e8 0x2edaf 0x377d3 write 0x2776d 0x8f65 0x377d3 The problem is that netfs_io_request::len is being used for two purposes and ends up getting set to the amount of data we transferred, not the amount of data the caller asked to be transferred (for various reasons, such as mmap'd writes, we might end up rounding out the data written to the server to include the entire folio at each end). Fix this by keeping the amount we were asked to write in ->len and using ->submitted to track what we issued ops for. Then, when we come to calling ->ki_complete(), ->len is the right size. This also required netfs_cleanup_dio_write() to change since we're no longer advancing wreq->len. Use wreq->transferred instead as we might have done a short read. 
With this, the generic/112 xfstest passes if cifs is forced to put all non-DIO opens into write-through mode. Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Signed-off-by: David Howells Link: https://lore.kernel.org/r/295086.1716298663@warthog.procyon.org.uk cc: Jeff Layton cc: Steve French cc: Enzo Matsumiya cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/direct_write.c | 2 +- fs/netfs/write_collect.c | 7 ++++--- fs/netfs/write_issue.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 608ba6416919..28163516bf03 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -12,7 +12,7 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) { struct inode *inode = wreq->inode; - unsigned long long end = wreq->start + wreq->len; + unsigned long long end = wreq->start + wreq->transferred; if (!wreq->error && i_size_read(inode) < end) { diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 60112e4b2c5e..426cf87aaf2e 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -510,7 +510,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) * stream has a gap that can be jumped. */ if (notes & SOME_EMPTY) { - unsigned long long jump_to = wreq->start + wreq->len; + unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted); for (s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; @@ -690,10 +690,11 @@ void netfs_write_collection_worker(struct work_struct *work) wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); if (wreq->iocb) { - wreq->iocb->ki_pos += wreq->transferred; + size_t written = min(wreq->transferred, wreq->len); + wreq->iocb->ki_pos += written; if (wreq->iocb->ki_complete) wreq->iocb->ki_complete( - wreq->iocb, wreq->error ? 
wreq->error : wreq->transferred); + wreq->iocb, wreq->error ? wreq->error : written); wreq->iocb = VFS_PTR_POISON; } diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index e190043bc0da..86dad7e4202b 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -254,7 +254,7 @@ static void netfs_issue_write(struct netfs_io_request *wreq, stream->construct = NULL; if (subreq->start + subreq->len > wreq->start + wreq->submitted) - wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start; + WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start); netfs_do_issue_write(stream, subreq); } From 2c6b531020f0590db3b6b4950a41c692e9aa4f4a Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 May 2024 14:36:27 +0100 Subject: [PATCH 029/279] netfs: Fix AIO error handling when doing write-through If an error occurs whilst we're doing an AIO write in write-through mode, we may end up calling ->ki_complete() *and* returning an error from ->write_iter(). This can result in either a UAF (the ->ki_complete() func pointer may get overwritten, for example) or a refcount underflow in io_submit() as ->ki_complete is called twice. Fix this by making netfs_end_writethrough() - and thus netfs_perform_write() - unconditionally return -EIOCBQUEUED if we're doing an AIO write and wait for completion if we're not. 
Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Signed-off-by: David Howells Link: https://lore.kernel.org/r/295052.1716298587@warthog.procyon.org.uk cc: Jeff Layton cc: Enzo Matsumiya cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/write_issue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 86dad7e4202b..3aa86e268f40 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -636,7 +636,12 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr mutex_unlock(&ictx->wb_lock); - ret = wreq->error; + if (wreq->iocb) { + ret = -EIOCBQUEUED; + } else { + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); + ret = wreq->error; + } netfs_put_request(wreq, false, netfs_rreq_trace_put_return); return ret; } From 79c137454815ba5554caa8eeb4ad5c94e96e45ce Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 21 May 2024 19:49:38 +0800 Subject: [PATCH 030/279] filemap: add helper mapping_max_folio_size() Add mapping_max_folio_size() to get the maximum folio size for this pagecache mapping. Fixes: 5d8edfb900d5 ("iomap: Copy larger chunks from userspace") Cc: stable@vger.kernel.org Reviewed-by: Darrick J. 
Wong Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20240521114939.2541461-1-xu.yang_2@nxp.com Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3d69589c00a4..ee633712bba0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -346,6 +346,19 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } +/* + * There are some parts of the kernel which assume that PMD entries + * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, + * limit the maximum allocation order to PMD size. I'm not aware of any + * assumptions about maximum order if THP are disabled, but 8 seems like + * a good order (that's 1MB if you're using 4kB pages) + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER +#else +#define MAX_PAGECACHE_ORDER 8 +#endif + /** * mapping_set_large_folios() - Indicate the file supports large folios. * @mapping: The file. @@ -372,6 +385,14 @@ static inline bool mapping_large_folio_support(struct address_space *mapping) test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } +/* Return the maximum folio size for this pagecache mapping, in bytes. */ +static inline size_t mapping_max_folio_size(struct address_space *mapping) +{ + if (mapping_large_folio_support(mapping)) + return PAGE_SIZE << MAX_PAGECACHE_ORDER; + return PAGE_SIZE; +} + static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS @@ -530,19 +551,6 @@ static inline void *detach_page_private(struct page *page) return folio_detach_private(page_folio(page)); } -/* - * There are some parts of the kernel which assume that PMD entries - * are exactly HPAGE_PMD_ORDER. 
Those should be fixed, but until then, - * limit the maximum allocation order to PMD size. I'm not aware of any - * assumptions about maximum order if THP are disabled, but 8 seems like - * a good order (that's 1MB if you're using 4kB pages) - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER -#else -#define MAX_PAGECACHE_ORDER 8 -#endif - #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); #else From 4e527d5841e24623181edc7fd6f6598ffa810e10 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 21 May 2024 19:49:39 +0800 Subject: [PATCH 031/279] iomap: fault in smaller chunks for non-large folio mappings Since commit (5d8edfb900d5 "iomap: Copy larger chunks from userspace"), iomap will try to copy in larger chunks than PAGE_SIZE. However, if the mapping doesn't support large folio, only one page of maximum 4KB will be created and 4KB data will be written to pagecache each time. Then, next 4KB will be handled in next iteration. This will cause potential write performance problem. If chunk is 2MB, total 512 pages need to be handled finally. During this period, fault_in_iov_iter_readable() is called to check iov_iter readable validity. Since only 4KB will be handled each time, below address space will be checked over and over again: start end - buf, buf+2MB buf+4KB, buf+2MB buf+8KB, buf+2MB ... buf+2044KB buf+2MB Obviously the checking size is wrong since only 4KB will be handled each time. So this will get a correct chunk to let iomap work well in non-large folio case. With this change, the write speed will be stable. Tested on ARM64 device.
Before: - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (334 MB/s) - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (278 MB/s) - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (204 MB/s) - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (170 MB/s) - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (150 MB/s) - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (139 MB/s) After: - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (339 MB/s) - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (330 MB/s) - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (332 MB/s) - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (333 MB/s) - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (333 MB/s) - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (333 MB/s) Fixes: 5d8edfb900d5 ("iomap: Copy larger chunks from userspace") Cc: stable@vger.kernel.org Reviewed-by: Darrick J. Wong Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20240521114939.2541461-2-xu.yang_2@nxp.com Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 41c8f0c68ef5..c5802a459334 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -898,11 +898,11 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { loff_t length = iomap_length(iter); - size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; loff_t pos = iter->pos; ssize_t total_written = 0; long status = 0; struct address_space *mapping = iter->inode->i_mapping; + size_t chunk = mapping_max_folio_size(mapping); unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? 
BDP_ASYNC : 0; do { From f826bc9d6fc2f0e089fb8d104415d72e4d2e204c Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Mon, 20 May 2024 12:08:18 +0300 Subject: [PATCH 032/279] signalfd: fix error return code If anon_inode_getfile() fails, return appropriate error code. This looks like a single typo: the similar code changes in timerfd and userfaultfd are okay. Found by Linux Verification Center (linuxtesting.org). Fixes: fbe38120eb1d ("signalfd: convert to ->read_iter()") Signed-off-by: Fedor Pchelkin Link: https://lore.kernel.org/r/20240520090819.76342-1-pchelkin@ispras.ru Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/signalfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/signalfd.c b/fs/signalfd.c index 4a5614442dbf..65fe5eed0be4 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -282,7 +282,7 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); - return ufd; + return PTR_ERR(file); } file->f_mode |= FMODE_NOWAIT; From 65bea9953715b19371164a8bec4f74fdd22c9e5a Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Mon, 20 May 2024 12:08:19 +0300 Subject: [PATCH 033/279] signalfd: drop an obsolete comment Commit fbe38120eb1d ("signalfd: convert to ->read_iter()") removed the call to anon_inode_getfd() by splitting fd setup into two parts. Drop the comment referencing the internal details of that function. Signed-off-by: Fedor Pchelkin Link: https://lore.kernel.org/r/20240520090819.76342-2-pchelkin@ispras.ru Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/signalfd.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/signalfd.c b/fs/signalfd.c index 65fe5eed0be4..ec7b2da2477a 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -286,10 +286,6 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) } file->f_mode |= FMODE_NOWAIT; - /* - * When we call this, the initialization must be complete, since - * anon_inode_getfd() will install the fd. 
- */ fd_install(ufd, file); } else { struct fd f = fdget(ufd); From c596bea1452ddf172ec9b588e4597228e9a1f4d5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 May 2024 16:49:46 +0100 Subject: [PATCH 034/279] netfs: Fix setting of BDP_ASYNC from iocb flags Fix netfs_perform_write() to set BDP_ASYNC if IOCB_NOWAIT is set rather than if IOCB_SYNC is not set. It reflects asynchronicity in the sense of not waiting rather than synchronicity in the sense of not returning until the op is complete. Without this, generic/590 fails on cifs in strict caching mode with a complaint that one of the writes fails with EAGAIN. The test can be distilled down to: mount -t cifs /my/share /mnt -ostuff xfs_io -i -c 'falloc 0 8191M -c fsync -f /mnt/file xfs_io -i -c 'pwrite -b 1M -W 0 8191M' /mnt/file Fixes: c38f4e96e605 ("netfs: Provide func to copy data to pagecache for buffered write") Signed-off-by: David Howells Link: https://lore.kernel.org/r/316306.1716306586@warthog.procyon.org.uk Reviewed-by: Jens Axboe cc: Jeff Layton cc: Enzo Matsumiya cc: Jens Axboe cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 1121601536d1..07bc1fd43530 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -181,7 +181,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct folio *folio, *writethrough = NULL; enum netfs_how_to_modify howto; enum netfs_folio_trace trace; - unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC; + unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? 
BDP_ASYNC : 0; ssize_t written = 0, ret, ret2; loff_t i_size, pos = iocb->ki_pos, from, to; size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; From 712182b67e831912f90259102ae334089e7bccd1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 May 2024 21:00:44 +0200 Subject: [PATCH 035/279] swap: yield device immediately Otherwise we can cause spurious EBUSY issues when trying to mount the rootfs later on. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218845 Reported-by: Petri Kaukasoina Signed-off-by: Christian Brauner --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 5bc04bfe2db1..c6f24d17866d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1600,7 +1600,7 @@ int swsusp_check(bool exclusive) put: if (error) - fput(hib_resume_bdev_file); + bdev_fput(hib_resume_bdev_file); else pr_debug("Image signature found, resuming\n"); } else { From 51ef9305b8f40946d65c40368ffb4c14636d369a Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 22 May 2024 22:26:52 +0300 Subject: [PATCH 036/279] net/mlx5: Lag, do bond only if slaves agree on roce state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the driver does not enforce that lag bond slaves must have matching roce capabilities. Yet, in mlx5_do_bond(), the driver attempts to enable roce on all vports of the bond slaves, causing the following syndrome when one slave has no roce fw support: mlx5_cmd_out_err:809:(pid 25427): MODIFY_NIC_VPORT_CONTEXT(0x755) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0xc1f678), err(-22) Thus, create HW lag only if bond's slaves agree on roce state, either all slaves have roce support resulting in a roce lag bond, or none do, resulting in a raw eth bond.
Fixes: 7907f23adc18 ("net/mlx5: Implement RoCE LAG feature") Signed-off-by: Maher Sanalla Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index f7f0476a4a58..d0871c46b8c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -719,6 +719,7 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) struct mlx5_core_dev *dev; u8 mode; #endif + bool roce_support; int i; for (i = 0; i < ldev->ports; i++) @@ -743,6 +744,11 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) if (mlx5_sriov_is_enabled(ldev->pf[i].dev)) return false; #endif + roce_support = mlx5_get_roce_state(ldev->pf[MLX5_LAG_P1].dev); + for (i = 1; i < ldev->ports; i++) + if (mlx5_get_roce_state(ldev->pf[i].dev) != roce_support) + return false; + return true; } @@ -910,8 +916,10 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) } else if (roce_lag) { dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; mlx5_rescan_drivers_locked(dev0); - for (i = 1; i < ldev->ports; i++) - mlx5_nic_vport_enable_roce(ldev->pf[i].dev); + for (i = 1; i < ldev->ports; i++) { + if (mlx5_get_roce_state(ldev->pf[i].dev)) + mlx5_nic_vport_enable_roce(ldev->pf[i].dev); + } } else if (shared_fdb) { int i; From fca3b4791850b7e2181f0b3195b66d53df83151b Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 22 May 2024 22:26:53 +0300 Subject: [PATCH 037/279] net/mlx5: Do not query MPIR on embedded CPU function A proper query to MPIR needs to set the correct value in the depth field. On embedded CPU this value is not necessarily zero. As there is no real use case for multi-PF netdev on the embedded CPU of the smart NIC, block this option. 
This fixes the following failure: ACCESS_REG(0x805) op_mod(0x1) failed, status bad system state(0x4), syndrome (0x685f19), err(-5) Fixes: 678eb448055a ("net/mlx5: SD, Implement basic query and instantiation") Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index dd5d186dc614..f6deb5a3f820 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -100,10 +100,6 @@ static bool ft_create_alias_supported(struct mlx5_core_dev *dev) static bool mlx5_sd_is_supported(struct mlx5_core_dev *dev, u8 host_buses) { - /* Feature is currently implemented for PFs only */ - if (!mlx5_core_is_pf(dev)) - return false; - /* Honor the SW implementation limit */ if (host_buses > MLX5_SD_MAX_GROUP_SZ) return false; @@ -162,6 +158,14 @@ static int sd_init(struct mlx5_core_dev *dev) bool sdm; int err; + /* Feature is currently implemented for PFs only */ + if (!mlx5_core_is_pf(dev)) + return 0; + + /* Block on embedded CPU PFs */ + if (mlx5_core_is_ecpf(dev)) + return 0; + if (!MLX5_CAP_MCAM_REG(dev, mpir)) return 0; From 1b9f86c6d53245dab087f1b2c05727b5982142ff Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 May 2024 22:26:54 +0300 Subject: [PATCH 038/279] net/mlx5: Fix MTMP register capability offset in MCAM register The MTMP register (0x900a) capability offset is off-by-one, move it to the right place. Fixes: 1f507e80c700 ("net/mlx5: Expose NIC temperature via hardware monitoring kernel API") Signed-off-by: Gal Pressman Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f468763478ae..5df52e15f7d6 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10308,9 +10308,9 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 mfrl[0x1]; u8 regs_39_to_32[0x8]; - u8 regs_31_to_10[0x16]; + u8 regs_31_to_11[0x15]; u8 mtmp[0x1]; - u8 regs_8_to_0[0x9]; + u8 regs_9_to_0[0xa]; }; struct mlx5_ifc_mcam_access_reg_bits1 { From 16d66a4fa81da07bc4ed19f4e53b87263c2f8d38 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Wed, 22 May 2024 22:26:55 +0300 Subject: [PATCH 039/279] net/mlx5: Use mlx5_ipsec_rx_status_destroy to correctly delete status rules rx_create no longer allocates a modify_hdr instance that needs to be cleaned up. The mlx5_modify_header_dealloc call will lead to a NULL pointer dereference. A leak in the rules also previously occurred since there are now two rules populated related to status. BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 109907067 P4D 109907067 PUD 116890067 PMD 0 Oops: 0000 [#1] SMP CPU: 1 PID: 484 Comm: ip Not tainted 6.9.0-rc2-rrameshbabu+ #254 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch Linux 1.16.3-1-1 04/01/2014 RIP: 0010:mlx5_modify_header_dealloc+0xd/0x70 Call Trace: ? show_regs+0x60/0x70 ? __die+0x24/0x70 ? page_fault_oops+0x15f/0x430 ? free_to_partial_list.constprop.0+0x79/0x150 ? do_user_addr_fault+0x2c9/0x5c0 ? exc_page_fault+0x63/0x110 ? asm_exc_page_fault+0x27/0x30 ? mlx5_modify_header_dealloc+0xd/0x70 rx_create+0x374/0x590 rx_add_rule+0x3ad/0x500 ? rx_add_rule+0x3ad/0x500 ? mlx5_cmd_exec+0x2c/0x40 ? 
mlx5_create_ipsec_obj+0xd6/0x200 mlx5e_accel_ipsec_fs_add_rule+0x31/0xf0 mlx5e_xfrm_add_state+0x426/0xc00 Fixes: 94af50c0a9bb ("net/mlx5e: Unify esw and normal IPsec status table creation/destruction") Signed-off-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index 41a2543a52cd..e51b03d4c717 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -750,8 +750,7 @@ static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, err_fs_ft: if (rx->allow_tunnel_mode) mlx5_eswitch_unblock_encap(mdev); - mlx5_del_flow_rules(rx->status.rule); - mlx5_modify_header_dealloc(mdev, rx->status.modify_hdr); + mlx5_ipsec_rx_status_destroy(ipsec, rx); err_add: mlx5_destroy_flow_table(rx->ft.status); err_fs_ft_status: From 9a52f6d44f4521773b4699b4ed34b8e21d5a175c Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Wed, 22 May 2024 22:26:56 +0300 Subject: [PATCH 040/279] net/mlx5e: Fix IPsec tunnel mode offload feature check Remove faulty check disabling checksum offload and GSO for offload of simple IPsec tunnel L4 traffic. Comment previously describing the deleted code incorrectly claimed the check prevented double tunnel (or three layers of ip headers). Fixes: f1267798c980 ("net/mlx5: Fix checksum issue of VXLAN and IPsec crypto offload") Signed-off-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- .../mellanox/mlx5/core/en_accel/ipsec_rxtx.h | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h index 82064614846f..359050f0b54d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h @@ -97,18 +97,11 @@ mlx5e_ipsec_feature_check(struct sk_buff *skb, netdev_features_t features) if (!x || !x->xso.offload_handle) goto out_disable; - if (xo->inner_ipproto) { - /* Cannot support tunnel packet over IPsec tunnel mode - * because we cannot offload three IP header csum - */ - if (x->props.mode == XFRM_MODE_TUNNEL) - goto out_disable; - - /* Only support UDP or TCP L4 checksum */ - if (xo->inner_ipproto != IPPROTO_UDP && - xo->inner_ipproto != IPPROTO_TCP) - goto out_disable; - } + /* Only support UDP or TCP L4 checksum */ + if (xo->inner_ipproto && + xo->inner_ipproto != IPPROTO_UDP && + xo->inner_ipproto != IPPROTO_TCP) + goto out_disable; return features; From f55cd31287e5f77f226c91d2f7756bafa0d583ed Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Wed, 22 May 2024 22:26:57 +0300 Subject: [PATCH 041/279] net/mlx5e: Do not use ptp structure for tx ts stats when not initialized The ptp channel instance is only initialized when ptp traffic is first processed by the driver. This means that there is a window in between when port timestamping is enabled and ptp traffic is sent where the ptp channel instance is not initialized. Accessing statistics during this window will lead to an access violation (NULL + member offset). Check the validity of the instance before attempting to query statistics. 
BUG: unable to handle page fault for address: 0000000000003524 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 109dfc067 P4D 109dfc067 PUD 1064ef067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 420 Comm: ethtool Not tainted 6.9.0-rc2-rrameshbabu+ #245 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch Linux 1.16.3-1-1 04/01/204 RIP: 0010:mlx5e_stats_ts_get+0x4c/0x130 Call Trace: ? show_regs+0x60/0x70 ? __die+0x24/0x70 ? page_fault_oops+0x15f/0x430 ? do_user_addr_fault+0x2c9/0x5c0 ? exc_page_fault+0x63/0x110 ? asm_exc_page_fault+0x27/0x30 ? mlx5e_stats_ts_get+0x4c/0x130 ? mlx5e_stats_ts_get+0x20/0x130 mlx5e_get_ts_stats+0x15/0x20 Fixes: 3579032c08c1 ("net/mlx5e: Implement ethtool hardware timestamping statistics") Signed-off-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index e211c41cec06..e1ed214e8651 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -1186,6 +1186,9 @@ void mlx5e_stats_ts_get(struct mlx5e_priv *priv, ts_stats->err = 0; ts_stats->lost = 0; + if (!ptp) + goto out; + /* Aggregate stats across all TCs */ for (i = 0; i < ptp->num_tc; i++) { struct mlx5e_ptp_cq_stats *stats = @@ -1214,6 +1217,7 @@ void mlx5e_stats_ts_get(struct mlx5e_priv *priv, } } +out: mutex_unlock(&priv->state_lock); } From 5c74195d5dd977e97556e6fa76909b831c241230 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 22 May 2024 22:26:58 +0300 Subject: [PATCH 042/279] net/mlx5e: Use rx_missed_errors instead of rx_dropped for reporting buffer exhaustion Previously, the driver incorrectly used rx_dropped to report device buffer exhaustion. 
According to the documentation, rx_dropped should not be used to count packets dropped due to buffer exhaustion, which is the purpose of rx_missed_errors. Use rx_missed_errors as intended for counting packets dropped due to buffer exhaustion. Fixes: 269e6b3af3bf ("net/mlx5e: Report additional error statistics in get stats ndo") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b758bc72ac36..c53c99dde558 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3886,7 +3886,7 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) mlx5e_fold_sw_stats64(priv, stats); } - stats->rx_dropped = priv->stats.qcnt.rx_out_of_buffer; + stats->rx_missed_errors = priv->stats.qcnt.rx_out_of_buffer; stats->rx_length_errors = PPORT_802_3_GET(pstats, a_in_range_length_errors) + From 83fea49f2711fc90c0d115b0ed04046b45155b65 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 May 2024 22:26:59 +0300 Subject: [PATCH 043/279] net/mlx5e: Fix UDP GSO for encapsulated packets When the skb is encapsulated, adjust the inner UDP header instead of the outer one, and account for UDP header (instead of TCP) in the inline header size calculation. Fixes: 689adf0d4892 ("net/mlx5e: Add UDP GSO support") Reported-by: Jason Baron Closes: https://lore.kernel.org/netdev/c42961cb-50b9-4a9a-bd43-87fe48d88d29@akamai.com/ Signed-off-by: Gal Pressman Reviewed-by: Dragos Tatulea Reviewed-by: Boris Pismenny Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- .../net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h | 8 +++++++- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index caa34b9c161e..33e32584b07f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@ -102,8 +102,14 @@ static inline void mlx5e_udp_gso_handle_tx_skb(struct sk_buff *skb) { int payload_len = skb_shinfo(skb)->gso_size + sizeof(struct udphdr); + struct udphdr *udphdr; - udp_hdr(skb)->len = htons(payload_len); + if (skb->encapsulation) + udphdr = (struct udphdr *)skb_inner_transport_header(skb); + else + udphdr = udp_hdr(skb); + + udphdr->len = htons(payload_len); } struct mlx5e_accel_tx_state { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 099bf1078889..b09e9abd39f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -153,7 +153,11 @@ mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb, int *hopbyhop) *hopbyhop = 0; if (skb->encapsulation) { - ihs = skb_inner_tcp_all_headers(skb); + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + ihs = skb_inner_transport_offset(skb) + + sizeof(struct udphdr); + else + ihs = skb_inner_tcp_all_headers(skb); stats->tso_inner_packets++; stats->tso_inner_bytes += skb->len - ihs; } else { From b794918961516f667b0c745aebdfebbb8a98df39 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 5 May 2024 23:08:31 +0900 Subject: [PATCH 044/279] dma-buf/sw-sync: don't enable IRQ from sync_print_obj() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit a6aa8fca4d79 ("dma-buf/sw-sync: Reduce irqsave/irqrestore from known context") by error 
replaced spin_unlock_irqrestore() with spin_unlock_irq() for both sync_debugfs_show() and sync_print_obj() despite sync_print_obj() is called from sync_debugfs_show(), lockdep complains inconsistent lock state warning. Use plain spin_{lock,unlock}() for sync_print_obj(), for sync_debugfs_show() is already using spin_{lock,unlock}_irq(). Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=a225ee3df7e7f9372dbe Fixes: a6aa8fca4d79 ("dma-buf/sw-sync: Reduce irqsave/irqrestore from known context") Signed-off-by: Tetsuo Handa Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/c2e46020-aaa6-4e06-bf73-f05823f913f0@I-love.SAKURA.ne.jp Signed-off-by: Christian König --- drivers/dma-buf/sync_debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma-buf/sync_debug.c b/drivers/dma-buf/sync_debug.c index 101394f16930..237bce21d1e7 100644 --- a/drivers/dma-buf/sync_debug.c +++ b/drivers/dma-buf/sync_debug.c @@ -110,12 +110,12 @@ static void sync_print_obj(struct seq_file *s, struct sync_timeline *obj) seq_printf(s, "%s: %d\n", obj->name, obj->value); - spin_lock_irq(&obj->lock); + spin_lock(&obj->lock); /* Caller already disabled IRQ. */ list_for_each(pos, &obj->pt_list) { struct sync_pt *pt = container_of(pos, struct sync_pt, link); sync_print_fence(s, &pt->base, false); } - spin_unlock_irq(&obj->lock); + spin_unlock(&obj->lock); } static void sync_print_sync_file(struct seq_file *s, From 44382b3ed6b2787710c8ade06c0e97f5970a47c8 Mon Sep 17 00:00:00 2001 From: Friedrich Vock Date: Tue, 14 May 2024 09:09:31 +0200 Subject: [PATCH 045/279] bpf: Fix potential integer overflow in resolve_btfids err is a 32-bit integer, but elf_update returns an off_t, which is 64-bit at least on 64-bit platforms. If symbols_patch is called on a binary between 2-4GB in size, the result will be negative when cast to a 32-bit integer, which the code assumes means an error occurred. 
This can wrongly trigger build failures when building very large kernel images. Fixes: fbbb68de80a4 ("bpf: Add resolve_btfids tool to resolve BTF IDs in ELF object") Signed-off-by: Friedrich Vock Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240514070931.199694-1-friedrich.vock@gmx.de --- tools/bpf/resolve_btfids/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index d9520cb826b3..af393c7dee1f 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -728,7 +728,7 @@ static int sets_patch(struct object *obj) static int symbols_patch(struct object *obj) { - int err; + off_t err; if (__symbols_patch(obj, &obj->structs) || __symbols_patch(obj, &obj->unions) || From 64e3d02b43b17390f0fa9af6708a4eafdf20ba01 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Fri, 24 May 2024 16:04:48 +0530 Subject: [PATCH 046/279] nvme: remove sgs and sws sgs/sws are unused, so remove these from nvme_ns_head structure. Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index fc31bd340a63..c43a30753d87 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -471,8 +471,6 @@ struct nvme_ns_head { u8 pi_type; u8 pi_offset; u8 guard_type; - u16 sgs; - u32 sws; #ifdef CONFIG_BLK_DEV_ZONED u64 zsze; #endif From 1bd293fcf3af84674e82ed022c049491f3768840 Mon Sep 17 00:00:00 2001 From: Kundan Kumar Date: Thu, 23 May 2024 17:01:49 +0530 Subject: [PATCH 047/279] nvme: adjust multiples of NVME_CTRL_PAGE_SIZE in offset bio_vec start offset may be relatively large particularly when large folio gets added to the bio. 
A bigger offset will result in avoiding the single-segment mapping optimization and end up using expensive mempool_alloc further. Rather than using absolute value, adjust bv_offset by NVME_CTRL_PAGE_SIZE while checking if segment can be fitted into one/two PRP entries. Suggested-by: Christoph Hellwig Signed-off-by: Kundan Kumar Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 710043086dff..102a9fb0c65f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -778,7 +778,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { - if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) + if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); From 29be9100aca2915fab54b5693309bc42956542e5 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Fri, 24 May 2024 17:17:55 +0100 Subject: [PATCH 048/279] afs: Don't cross .backup mountpoint from backup volume Don't cross a mountpoint that explicitly specifies a backup volume (target is .backup) when starting from a backup volume. It is not uncommon to mount a volume's backup directly in the volume itself. This can cause tools that are not paying attention to get into a loop mounting the volume onto itself as they attempt to traverse the tree, leading to a variety of problems. This doesn't prevent the general case of loops in a sequence of mountpoints, but addresses a common special case in the same way as other afs clients.
Reported-by: Jan Henrik Sylvester Link: http://lists.infradead.org/pipermail/linux-afs/2024-May/008454.html Reported-by: Markus Suvanto Link: http://lists.infradead.org/pipermail/linux-afs/2024-February/008074.html Signed-off-by: Marc Dionne Signed-off-by: David Howells Link: https://lore.kernel.org/r/768760.1716567475@warthog.procyon.org.uk Reviewed-by: Jeffrey Altman cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/mntpt.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 97f50e9fd9eb..297487ee8323 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -140,6 +140,11 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) put_page(page); if (ret < 0) return ret; + + /* Don't cross a backup volume mountpoint from a backup volume */ + if (src_as->volume && src_as->volume->type == AFSVL_BACKVOL && + ctx->type == AFSVL_BACKVOL) + return -ENODEV; } return 0; From 46ba0e49b64232adac35a2bc892f1710c5b0fb7f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 May 2024 09:33:57 -0700 Subject: [PATCH 049/279] bpf: fix multi-uprobe PID filtering logic Current implementation of PID filtering logic for multi-uprobes in uprobe_prog_run() is filtering down to exact *thread*, while the intent for PID filtering is to filter by *process* instead. The check in uprobe_prog_run() also differs from the analogous one in uprobe_multi_link_filter() for some reason. The latter is correct, checking task->mm, not the task itself. Fix the check in uprobe_prog_run() to perform the same task->mm check. While doing this, we also update get_pid_task() use to use PIDTYPE_TGID type of lookup, given the intent is to get a representative task of an entire process. This doesn't change behavior, but seems more logical. It would hold task group leader task now, not any random thread task.
Last but not least, given multi-uprobe support is half-broken due to this PID filtering logic (depending on whether PID filtering is important or not), we need to make it easy for user space consumers (including libbpf) to easily detect whether PID filtering logic was already fixed. We do it here by adding an early check on passed pid parameter. If it's negative (and so has no chance of being a valid PID), we return -EINVAL. Previous behavior would eventually return -ESRCH ("No process found"), given there can't be any process with negative PID. This subtle change won't make any practical change in behavior, but will allow applications to detect PID filtering fixes easily. Libbpf fixes take advantage of this in the next patch. Cc: stable@vger.kernel.org Acked-by: Jiri Olsa Fixes: b733eeade420 ("bpf: Add pid filter support for uprobe_multi link") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240521163401.3005045-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 8 ++++---- .../testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f5154c051d2c..1baaeb9ca205 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3295,7 +3295,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, struct bpf_run_ctx *old_run_ctx; int err = 0; - if (link->task && current != link->task) + if (link->task && current->mm != link->task->mm) return 0; if (sleepable) @@ -3396,8 +3396,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); cnt = attr->link_create.uprobe_multi.cnt; + pid = attr->link_create.uprobe_multi.pid; - if (!upath || !uoffsets || !cnt) + if (!upath || !uoffsets || !cnt || pid < 0) return -EINVAL; if (cnt > 
MAX_UPROBE_MULTI_CNT) return -E2BIG; @@ -3421,10 +3422,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error_path_put; } - pid = attr->link_create.uprobe_multi.pid; if (pid) { rcu_read_lock(); - task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); rcu_read_unlock(); if (!task) { err = -ESRCH; diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 8269cdee33ae..38fda42fd70f 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -397,7 +397,7 @@ static void test_attach_api_fails(void) link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); if (!ASSERT_ERR(link_fd, "link_fd")) goto cleanup; - ASSERT_EQ(link_fd, -ESRCH, "pid_is_wrong"); + ASSERT_EQ(link_fd, -EINVAL, "pid_is_wrong"); cleanup: if (link_fd >= 0) From 4a8f635a60540888dab3804992e86410360339c8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 May 2024 09:33:58 -0700 Subject: [PATCH 050/279] bpf: remove unnecessary rcu_read_{lock,unlock}() in multi-uprobe attach logic get_pid_task() internally already calls rcu_read_lock() and rcu_read_unlock(), so there is no point to do this one extra time. This is a drive-by improvement and has no correctness implications. 
Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240521163401.3005045-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1baaeb9ca205..6249dac61701 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3423,9 +3423,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } if (pid) { - rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); - rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; From 04d939a2ab229a3821f04fc81f7c027842f501f1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 May 2024 09:33:59 -0700 Subject: [PATCH 051/279] libbpf: detect broken PID filtering logic for multi-uprobe Libbpf is automatically (and transparently to user) detecting multi-uprobe support in the kernel, and, if supported, uses multi-uprobes to improve USDT attachment speed. USDTs can be attached system-wide or for the specific process by PID. In the latter case, we rely on correct kernel logic of not triggering USDT for unrelated processes. As such, on older kernels that do support multi-uprobes, but still have broken PID filtering logic, we need to fall back to singular uprobes. Unfortunately, whether user is using PID filtering or not is known at the attachment time, which happens after relevant BPF programs were loaded into the kernel. Also unfortunately, we need to make a call whether to use multi-uprobes or singular uprobe for SEC("usdt") programs during BPF object load time, at which point we have no information about possible PID filtering. The distinction between single and multi-uprobes is small, but important for the kernel. 
Multi-uprobes get BPF_TRACE_UPROBE_MULTI attach type, and kernel internally substitutes a different implementation of some BPF helpers (e.g., bpf_get_attach_cookie()) depending on whether uprobe is multi or singular. So, multi-uprobes and singular uprobes cannot be intermixed. All the above implies that we have to make an early and conservative call about the use of multi-uprobes. And so this patch modifies libbpf's existing feature detector for multi-uprobe support to also check correct PID filtering. If PID filtering is not yet fixed, we fall back to singular uprobes for USDTs. This extension to feature detection is simple thanks to kernel's -EINVAL addition for pid < 0. Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240521163401.3005045-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/features.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index a336786a22a3..3df0125ed5fa 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -392,11 +392,40 @@ static int probe_uprobe_multi_link(int token_fd) link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts); err = -errno; /* close() can clobber errno */ + if (link_fd >= 0 || err != -EBADF) { + close(link_fd); + close(prog_fd); + return 0; + } + + /* Initial multi-uprobe support in kernel didn't handle PID filtering + * correctly (it was doing thread filtering, not process filtering). + * So now we'll detect if PID filtering logic was fixed, and, if not, + * we'll pretend multi-uprobes are not supported, if not. + * Multi-uprobes are used in USDT attachment logic, and we need to be + * conservative here, because multi-uprobe selection happens early at + * load time, while the use of PID filtering is known late at + * attachment time, at which point it's too late to undo multi-uprobe + * selection.
+ * + * Creating uprobe with pid == -1 for (invalid) '/' binary will fail + * early with -EINVAL on kernels with fixed PID filtering logic; + * otherwise -ESRCH would be returned if passed correct binary path + * (but we'll just get -BADF, of course). + */ + link_opts.uprobe_multi.pid = -1; /* invalid PID */ + link_opts.uprobe_multi.path = "/"; /* invalid path */ + link_opts.uprobe_multi.offsets = &offset; + link_opts.uprobe_multi.cnt = 1; + + link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts); + err = -errno; /* close() can clobber errno */ + if (link_fd >= 0) close(link_fd); close(prog_fd); - return link_fd < 0 && err == -EBADF; + return link_fd < 0 && err == -EINVAL; } static int probe_kern_bpf_cookie(int token_fd) From 70342420a1cf1173bdec456e5fa574a804e422db Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 May 2024 09:34:00 -0700 Subject: [PATCH 052/279] selftests/bpf: extend multi-uprobe tests with child thread case Extend existing multi-uprobe tests to test that PID filtering works correctly. We already have child *process* tests, but we need also child *thread* tests. This patch adds spawn_thread() helper to start child thread, wait for it to be ready, and then instruct it to trigger desired uprobes. Additionally, we extend BPF-side code to track thread ID, not just process ID. Also we detect whether extraneous triggerings with unexpected process IDs happened, and validate that none of that happened in practice. These changes prove that fixed PID filtering logic for multi-uprobe works as expected. These tests fail on old kernels. 
Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240521163401.3005045-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/uprobe_multi_test.c | 107 ++++++++++++++++-- .../selftests/bpf/progs/uprobe_multi.c | 17 ++- 2 files changed, 115 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 38fda42fd70f..677232d31432 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include "uprobe_multi.skel.h" #include "uprobe_multi_bench.skel.h" @@ -27,7 +28,10 @@ noinline void uprobe_multi_func_3(void) struct child { int go[2]; + int c2p[2]; /* child -> parent channel */ int pid; + int tid; + pthread_t thread; }; static void release_child(struct child *child) @@ -38,6 +42,10 @@ static void release_child(struct child *child) return; close(child->go[1]); close(child->go[0]); + if (child->thread) + pthread_join(child->thread, NULL); + close(child->c2p[0]); + close(child->c2p[1]); if (child->pid > 0) waitpid(child->pid, &child_status, 0); } @@ -63,7 +71,7 @@ static struct child *spawn_child(void) if (pipe(child.go)) return NULL; - child.pid = fork(); + child.pid = child.tid = fork(); if (child.pid < 0) { release_child(&child); errno = EINVAL; @@ -89,6 +97,66 @@ static struct child *spawn_child(void) return &child; } +static void *child_thread(void *ctx) +{ + struct child *child = ctx; + int c = 0, err; + + child->tid = syscall(SYS_gettid); + + /* let parent know we are ready */ + err = write(child->c2p[1], &c, 1); + if (err != 1) + pthread_exit(&err); + + /* wait for parent's kick */ + err = read(child->go[0], &c, 1); + if (err != 1) + pthread_exit(&err); + + uprobe_multi_func_1(); + uprobe_multi_func_2(); + uprobe_multi_func_3(); + + err = 0; + 
pthread_exit(&err); +} + +static struct child *spawn_thread(void) +{ + static struct child child; + int c, err; + + /* pipe to notify child to execute the trigger functions */ + if (pipe(child.go)) + return NULL; + /* pipe to notify parent that child thread is ready */ + if (pipe(child.c2p)) { + close(child.go[0]); + close(child.go[1]); + return NULL; + } + + child.pid = getpid(); + + err = pthread_create(&child.thread, NULL, child_thread, &child); + if (err) { + err = -errno; + close(child.go[0]); + close(child.go[1]); + close(child.c2p[0]); + close(child.c2p[1]); + errno = -err; + return NULL; + } + + err = read(child.c2p[0], &c, 1); + if (!ASSERT_EQ(err, 1, "child_thread_ready")) + return NULL; + + return &child; +} + static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child) { skel->bss->uprobe_multi_func_1_addr = (__u64) uprobe_multi_func_1; @@ -103,15 +171,22 @@ static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child * passed at the probe attach. */ skel->bss->pid = child ? 0 : getpid(); + skel->bss->expect_pid = child ? child->pid : 0; + + /* trigger all probes, if we are testing child *process*, just to make + * sure that PID filtering doesn't let through activations from wrong + * PIDs; when we test child *thread*, we don't want to do this to + * avoid double counting number of triggering events + */ + if (!child || !child->thread) { + uprobe_multi_func_1(); + uprobe_multi_func_2(); + uprobe_multi_func_3(); + } if (child) kick_child(child); - /* trigger all probes */ - uprobe_multi_func_1(); - uprobe_multi_func_2(); - uprobe_multi_func_3(); - /* * There are 2 entry and 2 exit probe called for each uprobe_multi_func_[123] * function and each slepable probe (6) increments uprobe_multi_sleep_result. 
@@ -126,8 +201,12 @@ static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child ASSERT_EQ(skel->bss->uprobe_multi_sleep_result, 6, "uprobe_multi_sleep_result"); - if (child) + ASSERT_FALSE(skel->bss->bad_pid_seen, "bad_pid_seen"); + + if (child) { ASSERT_EQ(skel->bss->child_pid, child->pid, "uprobe_multi_child_pid"); + ASSERT_EQ(skel->bss->child_tid, child->tid, "uprobe_multi_child_tid"); + } } static void test_skel_api(void) @@ -210,6 +289,13 @@ test_attach_api(const char *binary, const char *pattern, struct bpf_uprobe_multi return; __test_attach_api(binary, pattern, opts, child); + + /* pid filter (thread) */ + child = spawn_thread(); + if (!ASSERT_OK_PTR(child, "spawn_thread")) + return; + + __test_attach_api(binary, pattern, opts, child); } static void test_attach_api_pattern(void) @@ -495,6 +581,13 @@ static void test_link_api(void) return; __test_link_api(child); + + /* pid filter (thread) */ + child = spawn_thread(); + if (!ASSERT_OK_PTR(child, "spawn_thread")) + return; + + __test_link_api(child); } static void test_bench_attach_uprobe(void) diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi.c b/tools/testing/selftests/bpf/progs/uprobe_multi.c index 419d9aa28fce..86a7ff5d3726 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi.c @@ -22,6 +22,10 @@ __u64 uprobe_multi_sleep_result = 0; int pid = 0; int child_pid = 0; +int child_tid = 0; + +int expect_pid = 0; +bool bad_pid_seen = false; bool test_cookie = false; void *user_ptr = 0; @@ -36,11 +40,19 @@ static __always_inline bool verify_sleepable_user_copy(void) static void uprobe_multi_check(void *ctx, bool is_return, bool is_sleep) { - child_pid = bpf_get_current_pid_tgid() >> 32; + __u64 cur_pid_tgid = bpf_get_current_pid_tgid(); + __u32 cur_pid; - if (pid && child_pid != pid) + cur_pid = cur_pid_tgid >> 32; + if (pid && cur_pid != pid) return; + if (expect_pid && cur_pid != expect_pid) + bad_pid_seen = true; + + 
child_pid = cur_pid_tgid >> 32; + child_tid = (__u32)cur_pid_tgid; + __u64 cookie = test_cookie ? bpf_get_attach_cookie(ctx) : 0; __u64 addr = bpf_get_func_ip(ctx); @@ -97,5 +109,6 @@ int uretprobe_sleep(struct pt_regs *ctx) SEC("uprobe.multi//proc/self/exe:uprobe_multi_func_*") int uprobe_extra(struct pt_regs *ctx) { + /* we need this one just to mix PID-filtered and global uprobes */ return 0; } From 198034a87dfeb64d5a8359a5089022c6b923646e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 May 2024 09:34:01 -0700 Subject: [PATCH 053/279] selftests/bpf: extend multi-uprobe tests with USDTs Validate libbpf's USDT-over-multi-uprobe logic by adding USDTs to existing multi-uprobe tests. This checks correct libbpf fallback to singular uprobes (when run on older kernels with buggy PID filtering). We reuse already established child process and child thread testing infrastructure, so additions are minimal. These tests fail on either older kernels or older versions of libbpf that don't detect PID filtering problems.
Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240521163401.3005045-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/uprobe_multi_test.c | 25 ++++++++++++++ .../selftests/bpf/progs/uprobe_multi.c | 33 +++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 677232d31432..bf6ca8e3eb13 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -8,6 +8,7 @@ #include "uprobe_multi_usdt.skel.h" #include "bpf/libbpf_internal.h" #include "testing_helpers.h" +#include "../sdt.h" static char test_data[] = "test_data"; @@ -26,6 +27,11 @@ noinline void uprobe_multi_func_3(void) asm volatile (""); } +noinline void usdt_trigger(void) +{ + STAP_PROBE(test, pid_filter_usdt); +} + struct child { int go[2]; int c2p[2]; /* child -> parent channel */ @@ -90,6 +96,7 @@ static struct child *spawn_child(void) uprobe_multi_func_1(); uprobe_multi_func_2(); uprobe_multi_func_3(); + usdt_trigger(); exit(errno); } @@ -117,6 +124,7 @@ static void *child_thread(void *ctx) uprobe_multi_func_1(); uprobe_multi_func_2(); uprobe_multi_func_3(); + usdt_trigger(); err = 0; pthread_exit(&err); @@ -182,6 +190,7 @@ static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child uprobe_multi_func_1(); uprobe_multi_func_2(); uprobe_multi_func_3(); + usdt_trigger(); } if (child) @@ -269,8 +278,24 @@ __test_attach_api(const char *binary, const char *pattern, struct bpf_uprobe_mul if (!ASSERT_OK_PTR(skel->links.uprobe_extra, "bpf_program__attach_uprobe_multi")) goto cleanup; + /* Attach (uprobe-backed) USDTs */ + skel->links.usdt_pid = bpf_program__attach_usdt(skel->progs.usdt_pid, pid, binary, + "test", "pid_filter_usdt", NULL); + if (!ASSERT_OK_PTR(skel->links.usdt_pid, "attach_usdt_pid")) + goto 
cleanup; + + skel->links.usdt_extra = bpf_program__attach_usdt(skel->progs.usdt_extra, -1, binary, + "test", "pid_filter_usdt", NULL); + if (!ASSERT_OK_PTR(skel->links.usdt_extra, "attach_usdt_extra")) + goto cleanup; + uprobe_multi_test_run(skel, child); + ASSERT_FALSE(skel->bss->bad_pid_seen_usdt, "bad_pid_seen_usdt"); + if (child) { + ASSERT_EQ(skel->bss->child_pid_usdt, child->pid, "usdt_multi_child_pid"); + ASSERT_EQ(skel->bss->child_tid_usdt, child->tid, "usdt_multi_child_tid"); + } cleanup: uprobe_multi__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi.c b/tools/testing/selftests/bpf/progs/uprobe_multi.c index 86a7ff5d3726..44190efcdba2 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include "vmlinux.h" #include #include -#include +#include char _license[] SEC("license") = "GPL"; @@ -23,9 +23,12 @@ __u64 uprobe_multi_sleep_result = 0; int pid = 0; int child_pid = 0; int child_tid = 0; +int child_pid_usdt = 0; +int child_tid_usdt = 0; int expect_pid = 0; bool bad_pid_seen = false; +bool bad_pid_seen_usdt = false; bool test_cookie = false; void *user_ptr = 0; @@ -112,3 +115,29 @@ int uprobe_extra(struct pt_regs *ctx) /* we need this one just to mix PID-filtered and global uprobes */ return 0; } + +SEC("usdt") +int usdt_pid(struct pt_regs *ctx) +{ + __u64 cur_pid_tgid = bpf_get_current_pid_tgid(); + __u32 cur_pid; + + cur_pid = cur_pid_tgid >> 32; + if (pid && cur_pid != pid) + return 0; + + if (expect_pid && cur_pid != expect_pid) + bad_pid_seen_usdt = true; + + child_pid_usdt = cur_pid_tgid >> 32; + child_tid_usdt = (__u32)cur_pid_tgid; + + return 0; +} + +SEC("usdt") +int usdt_extra(struct pt_regs *ctx) +{ + /* we need this one just to mix PID-filtered and global USDT probes */ + return 0; +} From dd6a403795f0c7b5c566f86f2ee6b687278d3c1c Mon Sep 17 00:00:00 2001 From: Shahab Vahedi Date: Sat, 25 May 
2024 05:56:28 +0200 Subject: [PATCH 054/279] ARC, bpf: Fix issues reported by the static analyzers Also updated couple of comments along the way. One of the issues reported was indeed a bug in the code: memset(ctx, 0, sizeof(ctx)) // original line memset(ctx, 0, sizeof(*ctx)) // fixed line That was a nice catch. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405222314.UG5F2NHn-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202405232036.Xqoc3b0J-lkp@intel.com/ Signed-off-by: Shahab Vahedi Link: https://lore.kernel.org/r/20240525035628.1026-1-list+bpf@vahedi.org Signed-off-by: Alexei Starovoitov --- arch/arc/net/bpf_jit.h | 2 +- arch/arc/net/bpf_jit_arcv2.c | 10 ++++++---- arch/arc/net/bpf_jit_core.c | 22 +++++++++++----------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/arc/net/bpf_jit.h b/arch/arc/net/bpf_jit.h index ec44873c42d1..495f3023e4c1 100644 --- a/arch/arc/net/bpf_jit.h +++ b/arch/arc/net/bpf_jit.h @@ -39,7 +39,7 @@ /************** Functions that the back-end must provide **************/ /* Extension for 32-bit operations. */ -inline u8 zext(u8 *buf, u8 rd); +u8 zext(u8 *buf, u8 rd); /***** Moves *****/ u8 mov_r32(u8 *buf, u8 rd, u8 rs, u8 sign_ext); u8 mov_r32_i32(u8 *buf, u8 reg, s32 imm); diff --git a/arch/arc/net/bpf_jit_arcv2.c b/arch/arc/net/bpf_jit_arcv2.c index 31bfb6e9ce00..4458e409ca0a 100644 --- a/arch/arc/net/bpf_jit_arcv2.c +++ b/arch/arc/net/bpf_jit_arcv2.c @@ -62,7 +62,7 @@ enum { * If/when we decide to add ARCv2 instructions that do use register pairs, * the mapping, hopefully, doesn't need to be revisited. 
*/ -const u8 bpf2arc[][2] = { +static const u8 bpf2arc[][2] = { /* Return value from in-kernel function, and exit value from eBPF */ [BPF_REG_0] = {ARC_R_8, ARC_R_9}, /* Arguments from eBPF program to in-kernel function */ @@ -1302,7 +1302,7 @@ static u8 arc_b(u8 *buf, s32 offset) /************* Packers (Deal with BPF_REGs) **************/ -inline u8 zext(u8 *buf, u8 rd) +u8 zext(u8 *buf, u8 rd) { if (rd != BPF_REG_FP) return arc_movi_r(buf, REG_HI(rd), 0); @@ -2235,6 +2235,7 @@ u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext) break; default: /* The caller must have handled this. */ + break; } } else { /* @@ -2253,6 +2254,7 @@ u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext) break; default: /* The caller must have handled this. */ + break; } } @@ -2517,7 +2519,7 @@ u8 arc_epilogue(u8 *buf, u32 usage, u16 frame_size) #define JCC64_NR_OF_JMPS 3 /* Number of jumps in jcc64 template. */ #define JCC64_INSNS_TO_END 3 /* Number of insn. inclusive the 2nd jmp to end. */ #define JCC64_SKIP_JMP 1 /* Index of the "skip" jump to "end". */ -const struct { +static const struct { /* * "jit_off" is common between all "jmp[]" and is coupled with * "cond" of each "jmp[]" instance. e.g.: @@ -2883,7 +2885,7 @@ u8 gen_jmp_64(u8 *buf, u8 rd, u8 rs, u8 cond, u32 curr_off, u32 targ_off) * The "ARC_CC_SET" becomes "CC_unequal" because of the "tst" * instruction that precedes the conditional branch. */ -const u8 arcv2_32_jmps[ARC_CC_LAST] = { +static const u8 arcv2_32_jmps[ARC_CC_LAST] = { [ARC_CC_UGT] = CC_great_u, [ARC_CC_UGE] = CC_great_eq_u, [ARC_CC_ULT] = CC_less_u, diff --git a/arch/arc/net/bpf_jit_core.c b/arch/arc/net/bpf_jit_core.c index 6f6b4ffccf2c..e3628922c24a 100644 --- a/arch/arc/net/bpf_jit_core.c +++ b/arch/arc/net/bpf_jit_core.c @@ -159,7 +159,7 @@ static void jit_dump(const struct jit_context *ctx) /* Initialise the context so there's no garbage. 
*/ static int jit_ctx_init(struct jit_context *ctx, struct bpf_prog *prog) { - memset(ctx, 0, sizeof(ctx)); + memset(ctx, 0, sizeof(*ctx)); ctx->orig_prog = prog; @@ -167,7 +167,7 @@ static int jit_ctx_init(struct jit_context *ctx, struct bpf_prog *prog) ctx->prog = bpf_jit_blind_constants(prog); if (IS_ERR(ctx->prog)) return PTR_ERR(ctx->prog); - ctx->blinded = (ctx->prog == ctx->orig_prog ? false : true); + ctx->blinded = (ctx->prog != ctx->orig_prog); /* If the verifier doesn't zero-extend, then we have to do it. */ ctx->do_zext = !ctx->prog->aux->verifier_zext; @@ -1182,12 +1182,12 @@ static int jit_prepare(struct jit_context *ctx) } /* - * All the "handle_*()" functions have been called before by the - * "jit_prepare()". If there was an error, we would know by now. - * Therefore, no extra error checking at this point, other than - * a sanity check at the end that expects the calculated length - * (jit.len) to be equal to the length of generated instructions - * (jit.index). + * jit_compile() is the real compilation phase. jit_prepare() is + * invoked before jit_compile() as a dry-run to make sure everything + * will go OK and allocate the necessary memory. + * + * In the end, jit_compile() checks if it has produced the same number + * of instructions as jit_prepare() would. */ static int jit_compile(struct jit_context *ctx) { @@ -1407,9 +1407,9 @@ static struct bpf_prog *do_extra_pass(struct bpf_prog *prog) /* * This function may be invoked twice for the same stream of BPF - * instructions. The "extra pass" happens, when there are "call"s - * involved that their addresses are not known during the first - * invocation. + * instructions. The "extra pass" happens, when there are + * (re)locations involved that their addresses are not known + * during the first run. 
*/ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { From d6fe532b7499e4575f9647879b7a34625817fe7f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 May 2024 18:36:16 +0200 Subject: [PATCH 055/279] netkit: Fix setting mac address in l2 mode When running Cilium connectivity test suite with netkit in L2 mode, we found that it is expected to be able to specify a custom MAC address for the devices, in particular, cilium-cni obtains the specified MAC address by querying the endpoint and sets the MAC address of the interface inside the Pod. Thus, fix the missing support in netkit for L2 mode. Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device") Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20240524163619.26001-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- drivers/net/netkit.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index a4d2e76a8d58..272894053e2c 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -155,6 +155,16 @@ static void netkit_set_multicast(struct net_device *dev) /* Nothing to do, we receive whatever gets pushed to us! 
*/ } +static int netkit_set_macaddr(struct net_device *dev, void *sa) +{ + struct netkit *nk = netkit_priv(dev); + + if (nk->mode != NETKIT_L2) + return -EOPNOTSUPP; + + return eth_mac_addr(dev, sa); +} + static void netkit_set_headroom(struct net_device *dev, int headroom) { struct netkit *nk = netkit_priv(dev), *nk2; @@ -198,6 +208,7 @@ static const struct net_device_ops netkit_netdev_ops = { .ndo_start_xmit = netkit_xmit, .ndo_set_rx_mode = netkit_set_multicast, .ndo_set_rx_headroom = netkit_set_headroom, + .ndo_set_mac_address = netkit_set_macaddr, .ndo_get_iflink = netkit_get_iflink, .ndo_get_peer_dev = netkit_peer_dev, .ndo_get_stats64 = netkit_get_stats, @@ -300,9 +311,11 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], if (!attr) return 0; - NL_SET_ERR_MSG_ATTR(extack, attr, - "Setting Ethernet address is not supported"); - return -EOPNOTSUPP; + if (nla_len(attr) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(attr))) + return -EADDRNOTAVAIL; + return 0; } static struct rtnl_link_ops netkit_link_ops; @@ -365,6 +378,9 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, strscpy(ifname, "nk%d", IFNAMSIZ); ifname_assign_type = NET_NAME_ENUM; } + if (mode != NETKIT_L2 && + (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS])) + return -EOPNOTSUPP; net = rtnl_link_get_net(src_net, tbp); if (IS_ERR(net)) @@ -379,7 +395,7 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, netif_inherit_tso_max(peer, dev); - if (mode == NETKIT_L2) + if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) eth_hw_addr_random(peer); if (ifmp && dev->ifindex) peer->ifindex = ifmp->ifi_index; @@ -402,7 +418,7 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, if (err < 0) goto err_configure_peer; - if (mode == NETKIT_L2) + if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); if (tb[IFLA_IFNAME]) nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); From 
3998d184267dfcff858aaa84d3de17429253629d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 May 2024 18:36:17 +0200 Subject: [PATCH 056/279] netkit: Fix pkt_type override upon netkit pass verdict When running Cilium connectivity test suite with netkit in L2 mode, we found that compared to tcx a few tests were failing which pushed traffic into an L7 proxy sitting in host namespace. The problem in particular is around the invocation of eth_type_trans() in netkit. In case of tcx, this is run before the tcx ingress is triggered inside host namespace and thus if the BPF program uses the bpf_skb_change_type() helper the newly set type is retained. However, in case of netkit, the late eth_type_trans() invocation overrides the earlier decision from the BPF program which eventually leads to the test failure. Instead of eth_type_trans(), split out the relevant parts, meaning, reset of mac header and call to eth_skb_pkt_type() before the BPF program is run in order to have the same behavior as with tcx, and refactor a small helper called eth_skb_pull_mac() which is run in case it's passed up the stack where the mac header must be pulled. With this all connectivity tests pass. 
Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device") Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20240524163619.26001-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- drivers/net/netkit.c | 4 +++- include/linux/etherdevice.h | 8 ++++++++ net/ethernet/eth.c | 4 +--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 272894053e2c..16789cd446e9 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -55,6 +55,7 @@ static void netkit_prep_forward(struct sk_buff *skb, bool xnet) skb_scrub_packet(skb, xnet); skb->priority = 0; nf_skip_egress(skb, true); + skb_reset_mac_header(skb); } static struct netkit *netkit_priv(const struct net_device *dev) @@ -78,6 +79,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer))); + eth_skb_pkt_type(skb, peer); skb->dev = peer; entry = rcu_dereference(nk->active); if (entry) @@ -85,7 +87,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) switch (ret) { case NETKIT_NEXT: case NETKIT_PASS: - skb->protocol = eth_type_trans(skb, skb->dev); + eth_skb_pull_mac(skb); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) { dev_sw_netstats_tx_add(dev, 1, len); diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2ad1ffa4ccb9..0ed47d00549b 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -636,6 +636,14 @@ static inline void eth_skb_pkt_type(struct sk_buff *skb, } } +static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) +{ + struct ethhdr *eth = (struct ethhdr *)skb->data; + + skb_pull_inline(skb, ETH_HLEN); + return eth; +} + /** * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame * @skb: Buffer to pad diff --git 
a/net/ethernet/eth.c b/net/ethernet/eth.c index 049c3adeb850..4e3651101b86 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -161,9 +161,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb->dev = dev; skb_reset_mac_header(skb); - eth = (struct ethhdr *)skb->data; - skb_pull_inline(skb, ETH_HLEN); - + eth = eth_skb_pull_mac(skb); eth_skb_pkt_type(skb, dev); /* From 998ffeb2738e26f134dc8e63b5dcaece22573957 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 May 2024 18:36:18 +0200 Subject: [PATCH 057/279] selftests/bpf: Add netkit tests for mac address This adds simple tests around setting MAC addresses in the different netkit modes. Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240524163619.26001-3-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/tc_netkit.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c index 15ee7b2fc410..18b2e969a456 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c @@ -73,6 +73,16 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, "up primary"); ASSERT_OK(system("ip addr add dev " netkit_name " 10.0.0.1/24"), "addr primary"); + + if (mode == NETKIT_L3) { + ASSERT_EQ(system("ip link set dev " netkit_name + " addr ee:ff:bb:cc:aa:dd 2> /dev/null"), 512, + "set hwaddress"); + } else { + ASSERT_OK(system("ip link set dev " netkit_name + " addr ee:ff:bb:cc:aa:dd"), + "set hwaddress"); + } if (same_netns) { ASSERT_OK(system("ip link set dev " netkit_peer " up"), "up peer"); From 95348e463eabc803341c67d562f9e0a5f0a48fe6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 May 2024 18:36:19 +0200 Subject: [PATCH 058/279] selftests/bpf: Add netkit test for pkt_type Add a test case to assert that the skb->pkt_type which was 
set from the BPF program is retained from the netkit xmit side to the peer's device at tcx ingress location. # ./vmtest.sh -- ./test_progs -t netkit [...] ./test_progs -t netkit [ 1.140780] bpf_testmod: loading out-of-tree module taints kernel. [ 1.141127] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel [ 1.284601] tsc: Refined TSC clocksource calibration: 3408.006 MHz [ 1.286672] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x311fd9b189d, max_idle_ns: 440795225691 ns [ 1.290384] clocksource: Switched to clocksource tsc #345 tc_netkit_basic:OK #346 tc_netkit_device:OK #347 tc_netkit_multi_links:OK #348 tc_netkit_multi_opts:OK #349 tc_netkit_neigh_links:OK #350 tc_netkit_pkt_type:OK Summary: 6/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240524163619.26001-4-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/tc_netkit.c | 84 +++++++++++++++++++ .../selftests/bpf/progs/test_tc_link.c | 35 +++++++- 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c index 18b2e969a456..b9135720024c 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c @@ -99,6 +99,16 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, return err; } +static void move_netkit(void) +{ + ASSERT_OK(system("ip link set " netkit_peer " netns foo"), + "move peer"); + ASSERT_OK(system("ip netns exec foo ip link set dev " + netkit_peer " up"), "up peer"); + ASSERT_OK(system("ip netns exec foo ip addr add dev " + netkit_peer " 10.0.0.2/24"), "addr peer"); +} + static void destroy_netkit(void) { ASSERT_OK(system("ip link del dev " netkit_name), "del primary"); @@ -695,3 +705,77 @@ void serial_test_tc_netkit_neigh_links(void) 
serial_test_tc_netkit_neigh_links_target(NETKIT_L2, BPF_NETKIT_PRIMARY); serial_test_tc_netkit_neigh_links_target(NETKIT_L3, BPF_NETKIT_PRIMARY); } + +static void serial_test_tc_netkit_pkt_type_mode(int mode) +{ + LIBBPF_OPTS(bpf_netkit_opts, optl_nk); + LIBBPF_OPTS(bpf_tcx_opts, optl_tcx); + int err, ifindex, ifindex2; + struct test_tc_link *skel; + struct bpf_link *link; + + err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, + &ifindex, true); + if (err) + return; + + ifindex2 = if_nametoindex(netkit_peer); + ASSERT_NEQ(ifindex, ifindex2, "ifindex_1_2"); + + skel = test_tc_link__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + goto cleanup; + + ASSERT_EQ(bpf_program__set_expected_attach_type(skel->progs.tc1, + BPF_NETKIT_PRIMARY), 0, "tc1_attach_type"); + ASSERT_EQ(bpf_program__set_expected_attach_type(skel->progs.tc7, + BPF_TCX_INGRESS), 0, "tc7_attach_type"); + + err = test_tc_link__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0); + assert_mprog_count_ifindex(ifindex2, BPF_TCX_INGRESS, 0); + + link = bpf_program__attach_netkit(skel->progs.tc1, ifindex, &optl_nk); + if (!ASSERT_OK_PTR(link, "link_attach")) + goto cleanup; + + skel->links.tc1 = link; + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 1); + assert_mprog_count_ifindex(ifindex2, BPF_TCX_INGRESS, 0); + + link = bpf_program__attach_tcx(skel->progs.tc7, ifindex2, &optl_tcx); + if (!ASSERT_OK_PTR(link, "link_attach")) + goto cleanup; + + skel->links.tc7 = link; + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 1); + assert_mprog_count_ifindex(ifindex2, BPF_TCX_INGRESS, 1); + + move_netkit(); + + tc_skel_reset_all_seen(skel); + skel->bss->set_type = true; + ASSERT_EQ(send_icmp(), 0, "icmp_pkt"); + + ASSERT_EQ(skel->bss->seen_tc1, true, "seen_tc1"); + ASSERT_EQ(skel->bss->seen_tc7, true, "seen_tc7"); + + ASSERT_EQ(skel->bss->seen_host, true, "seen_host"); + ASSERT_EQ(skel->bss->seen_mcast, true,
"seen_mcast"); +cleanup: + test_tc_link__destroy(skel); + + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0); + destroy_netkit(); +} + +void serial_test_tc_netkit_pkt_type(void) +{ + serial_test_tc_netkit_pkt_type_mode(NETKIT_L2); + serial_test_tc_netkit_pkt_type_mode(NETKIT_L3); +} diff --git a/tools/testing/selftests/bpf/progs/test_tc_link.c b/tools/testing/selftests/bpf/progs/test_tc_link.c index 992400acb957..ab3eae3d6af8 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_link.c +++ b/tools/testing/selftests/bpf/progs/test_tc_link.c @@ -4,7 +4,8 @@ #include #include - +#include +#include #include #include @@ -16,7 +17,13 @@ bool seen_tc3; bool seen_tc4; bool seen_tc5; bool seen_tc6; +bool seen_tc7; + +bool set_type; + bool seen_eth; +bool seen_host; +bool seen_mcast; SEC("tc/ingress") int tc1(struct __sk_buff *skb) @@ -28,8 +35,16 @@ int tc1(struct __sk_buff *skb) if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth))) goto out; seen_eth = eth.h_proto == bpf_htons(ETH_P_IP); + seen_host = skb->pkt_type == PACKET_HOST; + if (seen_host && set_type) { + eth.h_dest[0] = 4; + if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0)) + goto fail; + bpf_skb_change_type(skb, PACKET_MULTICAST); + } out: seen_tc1 = true; +fail: return TCX_NEXT; } @@ -67,3 +82,21 @@ int tc6(struct __sk_buff *skb) seen_tc6 = true; return TCX_PASS; } + +SEC("tc/ingress") +int tc7(struct __sk_buff *skb) +{ + struct ethhdr eth = {}; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IP)) + goto out; + if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth))) + goto out; + if (eth.h_dest[0] == 4 && set_type) { + seen_mcast = skb->pkt_type == PACKET_MULTICAST; + bpf_skb_change_type(skb, PACKET_HOST); + } +out: + seen_tc7 = true; + return TCX_PASS; +} From 67ec8cdf29971677b2fb4b6d92871eb5d5e95597 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 22 May 2024 13:37:54 +0800 Subject: [PATCH 059/279] hwrng: core - Remove add_early_randomness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit A potential deadlock was reported with the config file at https://web.archive.org/web/20240522052129/https://0x0.st/XPN_.txt In this particular configuration, the deadlock doesn't exist because the warning triggered at a point before modules were even available. However, the deadlock can be real because any module loaded would invoke async_synchronize_full. The issue is spurious for software crypto algorithms which aren't themselves involved in async probing. However, it would be hard to avoid for a PCI crypto driver using async probing. In this particular call trace, the problem is easily avoided because the only reason the module is being requested during probing is the add_early_randomness call in the hwrng core. This feature is vestigial since there is now a kernel thread dedicated to doing exactly this. So remove add_early_randomness as it is no longer needed. Reported-by: Nícolas F. R. A. Prado Reported-by: Eric Biggers Fixes: 1b6d7f9eb150 ("tpm: add session encryption protection to tpm2_get_random()") Link: https://lore.kernel.org/r/119dc5ed-f159-41be-9dda-1a056f29888d@notapiano/ Signed-off-by: Herbert Xu Reviewed-by: Jarkko Sakkinen Tested-by: Nícolas F. R. A. 
Prado Signed-off-by: Herbert Xu --- drivers/char/hw_random/core.c | 47 +++-------------------------------- 1 file changed, 4 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index f5c71a617a99..4084df65c9fa 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -64,19 +64,6 @@ static size_t rng_buffer_size(void) return RNG_BUFFER_SIZE; } -static void add_early_randomness(struct hwrng *rng) -{ - int bytes_read; - - mutex_lock(&reading_mutex); - bytes_read = rng_get_data(rng, rng_fillbuf, 32, 0); - mutex_unlock(&reading_mutex); - if (bytes_read > 0) { - size_t entropy = bytes_read * 8 * rng->quality / 1024; - add_hwgenerator_randomness(rng_fillbuf, bytes_read, entropy, false); - } -} - static inline void cleanup_rng(struct kref *kref) { struct hwrng *rng = container_of(kref, struct hwrng, ref); @@ -340,13 +327,12 @@ static ssize_t rng_current_store(struct device *dev, const char *buf, size_t len) { int err; - struct hwrng *rng, *old_rng, *new_rng; + struct hwrng *rng, *new_rng; err = mutex_lock_interruptible(&rng_mutex); if (err) return -ERESTARTSYS; - old_rng = current_rng; if (sysfs_streq(buf, "")) { err = enable_best_rng(); } else { @@ -362,11 +348,8 @@ static ssize_t rng_current_store(struct device *dev, new_rng = get_current_rng_nolock(); mutex_unlock(&rng_mutex); - if (new_rng) { - if (new_rng != old_rng) - add_early_randomness(new_rng); + if (new_rng) put_rng(new_rng); - } return err ? 
: len; } @@ -544,7 +527,6 @@ int hwrng_register(struct hwrng *rng) { int err = -EINVAL; struct hwrng *tmp; - bool is_new_current = false; if (!rng->name || (!rng->data_read && !rng->read)) goto out; @@ -573,25 +555,8 @@ int hwrng_register(struct hwrng *rng) err = set_current_rng(rng); if (err) goto out_unlock; - /* to use current_rng in add_early_randomness() we need - * to take a ref - */ - is_new_current = true; - kref_get(&rng->ref); } mutex_unlock(&rng_mutex); - if (is_new_current || !rng->init) { - /* - * Use a new device's input to add some randomness to - * the system. If this rng device isn't going to be - * used right away, its init function hasn't been - * called yet by set_current_rng(); so only use the - * randomness from devices that don't need an init callback - */ - add_early_randomness(rng); - } - if (is_new_current) - put_rng(rng); return 0; out_unlock: mutex_unlock(&rng_mutex); @@ -602,12 +567,11 @@ EXPORT_SYMBOL_GPL(hwrng_register); void hwrng_unregister(struct hwrng *rng) { - struct hwrng *old_rng, *new_rng; + struct hwrng *new_rng; int err; mutex_lock(&rng_mutex); - old_rng = current_rng; list_del(&rng->list); complete_all(&rng->dying); if (current_rng == rng) { @@ -626,11 +590,8 @@ void hwrng_unregister(struct hwrng *rng) } else mutex_unlock(&rng_mutex); - if (new_rng) { - if (old_rng != new_rng) - add_early_randomness(new_rng); + if (new_rng) put_rng(new_rng); - } wait_for_completion(&rng->cleanup_done); } From d509cadc3a48fee394d68757e4b685f7c143ed64 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 May 2024 00:53:25 -0400 Subject: [PATCH 060/279] bcachefs: Fix debug assert Reported-by: syzbot+a8074a75b8d73328751e@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2206a8dee693..df2bea38e83f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -564,7 +564,7 @@ static void 
__bch2_fs_free(struct bch_fs *c) BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); - EBUG_ON(percpu_u64_get(c->online_reserved)); + EBUG_ON(c->online_reserved && percpu_u64_get(c->online_reserved)); free_percpu(c->online_reserved); darray_exit(&c->btree_roots_extra); From 9242a34b760648b722f4958749ad83ef7d0f7525 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 May 2024 12:38:53 -0400 Subject: [PATCH 061/279] bcachefs: Fix sb-downgrade validation Superblock downgrade entries are only two byte aligned, but section sizes are 8 byte aligned, which means we have to be careful about overrun checks; an entry that crosses the end of the section is allowed (and ignored) as long as it has zero errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-downgrade.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 390a1bbd2567..3fb23e399ffb 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -146,10 +146,17 @@ static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, for (const struct bch_sb_field_downgrade_entry *i = e->entries; (void *) i < vstruct_end(&e->field); i = downgrade_entry_next_c(i)) { + /* + * Careful: sb_field_downgrade_entry is only 2 byte aligned, but + * section sizes are 8 byte aligned - an empty entry spanning + * the end of the section is allowed (and ignored): + */ + if ((void *) &i->errors[0] > vstruct_end(&e->field)) + break; + if (flags & BCH_VALIDATE_write && - ((void *) &i->errors[0] > vstruct_end(&e->field) || - (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field))) { - prt_printf(err, "downgrade entry overruns end of superblock section)"); + (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) { + prt_printf(err, "downgrade entry overruns end of superblock section"); return -BCH_ERR_invalid_sb_downgrade; } From 
f94b77709e82242c1101e59a90a7807455c4ab2a Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 23 May 2024 16:22:34 -0700 Subject: [PATCH 062/279] firewire: add missing MODULE_DESCRIPTION() to test modules Fix the 'make W=1' warnings: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/firewire/uapi-test.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/firewire/packet-serdes-test.o Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240523-md-firewire-uapi-test-v1-1-6be5adcc3aed@quicinc.com Signed-off-by: Takashi Sakamoto --- drivers/firewire/packet-serdes-test.c | 1 + drivers/firewire/uapi-test.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/firewire/packet-serdes-test.c b/drivers/firewire/packet-serdes-test.c index f93c966e794d..e83b1fece780 100644 --- a/drivers/firewire/packet-serdes-test.c +++ b/drivers/firewire/packet-serdes-test.c @@ -579,4 +579,5 @@ static struct kunit_suite packet_serdes_test_suite = { }; kunit_test_suite(packet_serdes_test_suite); +MODULE_DESCRIPTION("FireWire packet serialization/deserialization unit test suite"); MODULE_LICENSE("GPL"); diff --git a/drivers/firewire/uapi-test.c b/drivers/firewire/uapi-test.c index 2fcbede4fab1..bc3f10a2e516 100644 --- a/drivers/firewire/uapi-test.c +++ b/drivers/firewire/uapi-test.c @@ -86,4 +86,5 @@ static struct kunit_suite structure_layout_test_suite = { }; kunit_test_suite(structure_layout_test_suite); +MODULE_DESCRIPTION("FireWire UAPI unit test suite"); MODULE_LICENSE("GPL"); From 611b7eb19d0a305d4de00280e4a71a1b15c507fc Mon Sep 17 00:00:00 2001 From: Jim Wylder Date: Thu, 23 May 2024 16:14:36 -0500 Subject: [PATCH 063/279] regmap-i2c: Subtract reg size from max_write Currently, when an adapter defines a max_write_len quirk, the data will be chunked into data sizes equal to the max_write_len quirk value. But the payload will be increased by the size of the register address before transmission. 
The resulting value always ends up larger than the limit set by the quirk. Avoid this error by setting regmap's max_write to the quirk's max_write_len minus the number of bytes for the register and padding. This allows the chunking to work correctly for this limited case without impacting other use-cases. Signed-off-by: Jim Wylder Link: https://msgid.link/r/20240523211437.2839942-1-jwylder@google.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-i2c.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap-i2c.c b/drivers/base/regmap/regmap-i2c.c index 3ec611dc0c09..a905e955bbfc 100644 --- a/drivers/base/regmap/regmap-i2c.c +++ b/drivers/base/regmap/regmap-i2c.c @@ -350,7 +350,8 @@ static const struct regmap_bus *regmap_get_i2c_bus(struct i2c_client *i2c, if (quirks->max_write_len && (bus->max_raw_write == 0 || bus->max_raw_write > quirks->max_write_len)) - max_write = quirks->max_write_len; + max_write = quirks->max_write_len - + (config->reg_bits + config->pad_bits) / BITS_PER_BYTE; if (max_read || max_write) { ret_bus = kmemdup(bus, sizeof(*bus), GFP_KERNEL); From b82b6eeefd30f1ff049bff54da419a30ad9354c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 24 May 2024 20:46:25 +0100 Subject: [PATCH 064/279] bcachefs: Use copy_folio_from_iter_atomic() copy_page_from_iter_atomic() will be removed at some point. Also fixup a comment for folios. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 6b69e5cd68dd..54873ecc635c 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -437,8 +437,8 @@ static void bch2_writepage_io_done(struct bch_write_op *op) */ /* - * PageWriteback is effectively our ref on the inode - fixup i_blocks - * before calling end_page_writeback: + * The writeback flag is effectively our ref on the inode - + * fixup i_blocks before calling folio_end_writeback: */ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); @@ -898,7 +898,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_for_each(fs, fi) { f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; - f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); if (!f_copied) { folios_trunc(&fs, fi); break; From 9ee267a29309233b9ef8f58ee61e0b1c9b5879e8 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 26 May 2024 09:52:48 -0700 Subject: [PATCH 065/279] fs: smb: common: add missing MODULE_DESCRIPTION() macros Fix the 'make W=1' warnings: WARNING: modpost: missing MODULE_DESCRIPTION() in fs/smb/common/cifs_arc4.o WARNING: modpost: missing MODULE_DESCRIPTION() in fs/smb/common/cifs_md4.o Signed-off-by: Jeff Johnson Signed-off-by: Steve French --- fs/smb/common/cifs_arc4.c | 1 + fs/smb/common/cifs_md4.c | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/smb/common/cifs_arc4.c b/fs/smb/common/cifs_arc4.c index 043e4cb839fa..df360ca47826 100644 --- a/fs/smb/common/cifs_arc4.c +++ b/fs/smb/common/cifs_arc4.c @@ -10,6 +10,7 @@ #include #include "arc4.h" +MODULE_DESCRIPTION("ARC4 Cipher Algorithm"); MODULE_LICENSE("GPL"); int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len) diff 
--git a/fs/smb/common/cifs_md4.c b/fs/smb/common/cifs_md4.c index 50f78cfc6ce9..7ee7f4dad90c 100644 --- a/fs/smb/common/cifs_md4.c +++ b/fs/smb/common/cifs_md4.c @@ -24,6 +24,7 @@ #include #include "md4.h" +MODULE_DESCRIPTION("MD4 Message Digest Algorithm (RFC1320)"); MODULE_LICENSE("GPL"); static inline u32 lshift(u32 x, unsigned int s) From 9e2f46cd87473c70d01fcaf8a559809e6d18dd50 Mon Sep 17 00:00:00 2001 From: Jason Nader Date: Tue, 21 May 2024 22:36:24 +0900 Subject: [PATCH 066/279] ata: ahci: Do not apply Intel PCS quirk on Intel Alder Lake Commit b8b8b4e0c052 ("ata: ahci: Add Intel Alder Lake-P AHCI controller to low power chipsets list") added Intel Alder Lake to the ahci_pci_tbl. Because of the way that the Intel PCS quirk was implemented, having an explicit entry in the ahci_pci_tbl caused the Intel PCS quirk to be applied. (The quirk was not being applied if there was no explicit entry.) Thus, entries that were added to the ahci_pci_tbl also got the Intel PCS quirk applied. The quirk was cleaned up in commit 7edbb6059274 ("ahci: clean up intel_pcs_quirk"), such that it is clear which entries actually apply the Intel PCS quirk. Newer Intel AHCI controllers do not need the Intel PCS quirk, and applying it when not needed actually breaks some platforms. Do not apply the Intel PCS quirk for Intel Alder Lake. This is in line with how things worked before commit b8b8b4e0c052 ("ata: ahci: Add Intel Alder Lake-P AHCI controller to low power chipsets list"), such that certain platforms using Intel Alder Lake will work once again.
Cc: stable@vger.kernel.org # 6.7 Fixes: b8b8b4e0c052 ("ata: ahci: Add Intel Alder Lake-P AHCI controller to low power chipsets list") Signed-off-by: Jason Nader Signed-off-by: Niklas Cassel --- drivers/ata/ahci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 6548f10e61d9..07d66d2c5f0d 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -429,7 +429,6 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x02d7), board_ahci_pcs_quirk }, /* Comet Lake PCH RAID */ /* Elkhart Lake IDs 0x4b60 & 0x4b62 https://sata-io.org/product/8803 not tested yet */ { PCI_VDEVICE(INTEL, 0x4b63), board_ahci_pcs_quirk }, /* Elkhart Lake AHCI */ - { PCI_VDEVICE(INTEL, 0x7ae2), board_ahci_pcs_quirk }, /* Alder Lake-P AHCI */ /* JMicron 360/1/3/5/6, match class to avoid IDE function */ { PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, From e8021b94b0412c37bcc79027c2e382086b6ce449 Mon Sep 17 00:00:00 2001 From: Roded Zats Date: Wed, 22 May 2024 10:30:44 +0300 Subject: [PATCH 067/279] enic: Validate length of nl attributes in enic_set_vf_port enic_set_vf_port assumes that the nl attribute IFLA_PORT_PROFILE is of length PORT_PROFILE_MAX and that the nl attributes IFLA_PORT_INSTANCE_UUID, IFLA_PORT_HOST_UUID are of length PORT_UUID_MAX. These attributes are validated (in the function do_setlink in rtnetlink.c) using the nla_policy ifla_port_policy. The policy defines IFLA_PORT_PROFILE as NLA_STRING, IFLA_PORT_INSTANCE_UUID as NLA_BINARY and IFLA_PORT_HOST_UUID as NLA_STRING. That means that the length validation using the policy is for the max size of the attributes and not on exact size so the length of these attributes might be less than the sizes that enic_set_vf_port expects. This might cause an out of bounds read access in the memcpys of the data of these attributes in enic_set_vf_port.
Fixes: f8bd909183ac ("net: Add ndo_{set|get}_vf_port support for enic dynamic vnics") Signed-off-by: Roded Zats Link: https://lore.kernel.org/r/20240522073044.33519-1-rzats@paloaltonetworks.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cisco/enic/enic_main.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index f604119efc80..5f26fc3ad655 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -1117,18 +1117,30 @@ static int enic_set_vf_port(struct net_device *netdev, int vf, pp->request = nla_get_u8(port[IFLA_PORT_REQUEST]); if (port[IFLA_PORT_PROFILE]) { + if (nla_len(port[IFLA_PORT_PROFILE]) != PORT_PROFILE_MAX) { + memcpy(pp, &prev_pp, sizeof(*pp)); + return -EINVAL; + } pp->set |= ENIC_SET_NAME; memcpy(pp->name, nla_data(port[IFLA_PORT_PROFILE]), PORT_PROFILE_MAX); } if (port[IFLA_PORT_INSTANCE_UUID]) { + if (nla_len(port[IFLA_PORT_INSTANCE_UUID]) != PORT_UUID_MAX) { + memcpy(pp, &prev_pp, sizeof(*pp)); + return -EINVAL; + } pp->set |= ENIC_SET_INSTANCE; memcpy(pp->instance_uuid, nla_data(port[IFLA_PORT_INSTANCE_UUID]), PORT_UUID_MAX); } if (port[IFLA_PORT_HOST_UUID]) { + if (nla_len(port[IFLA_PORT_HOST_UUID]) != PORT_UUID_MAX) { + memcpy(pp, &prev_pp, sizeof(*pp)); + return -EINVAL; + } pp->set |= ENIC_SET_HOST; memcpy(pp->host_uuid, nla_data(port[IFLA_PORT_HOST_UUID]), PORT_UUID_MAX); From a4edf675ba3357f60e2ee310acc15eb9cd5a8ae0 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Fri, 17 May 2024 07:49:46 -0700 Subject: [PATCH 068/279] platform/x86: ISST: fix use-after-free in tpmi_sst_dev_remove() In tpmi_sst_dev_remove(), tpmi_sst is dereferenced after being freed. Fix this by reordering the kfree() post the dereference. 
Fixes: 9d1d36268f3d ("platform/x86: ISST: Support partitioned systems") Signed-off-by: Harshit Mogalapalli Reviewed-by: Hans de Goede Acked-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20240517144946.289615-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c index 7bac7841ff0a..7fa360073f6e 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c @@ -1610,8 +1610,8 @@ void tpmi_sst_dev_remove(struct auxiliary_device *auxdev) tpmi_sst->partition_mask_current &= ~BIT(plat_info->partition); /* Free the package instance when the all partitions are removed */ if (!tpmi_sst->partition_mask_current) { - kfree(tpmi_sst); isst_common.sst_inst[tpmi_sst->package_id] = NULL; + kfree(tpmi_sst); } mutex_unlock(&isst_tpmi_dev_lock); } From 4d6ef1be2492a6789ba2b711933625cd72ced39d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 21 May 2024 11:47:41 +0200 Subject: [PATCH 069/279] platform/x86: x86-android-tablets: Add "select LEDS_CLASS" Since the x86-android-tablets now calls devm_led_classdev_register_ext() it needs to select LEDS_CLASS as well as LEDS_CLASS' NEW_LEDS dependency. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405182256.FsKBjIzG-lkp@intel.com/ Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240521094741.273397-1-hdegoede@redhat.com --- drivers/platform/x86/x86-android-tablets/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/x86-android-tablets/Kconfig b/drivers/platform/x86/x86-android-tablets/Kconfig index 6603461d4273..b591419de80c 100644 --- a/drivers/platform/x86/x86-android-tablets/Kconfig +++ b/drivers/platform/x86/x86-android-tablets/Kconfig @@ -6,6 +6,8 @@ config X86_ANDROID_TABLETS tristate "X86 Android tablet support" depends on I2C && SPI && SERIAL_DEV_BUS && ACPI && EFI && GPIOLIB && PMIC_OPREGION + select NEW_LEDS + select LEDS_CLASS help X86 tablets which ship with Android as (part of) the factory image typically have various problems with their DSDTs. The factory kernels From 5d059bf2b1c4d5779a4c09ec418e40eded44a187 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 22 May 2024 07:48:13 -0400 Subject: [PATCH 070/279] platform/x86: thinkpad_acpi: Select INPUT_SPARSEKMAP in Kconfig Now that drivers/platform/x86/thinkpad_acpi.c uses sparse_keymap_report_event(), it must select INPUT_SPARSEKMAP in its Kconfig option otherwise the build fails with: ld: vmlinux.o: in function `tpacpi_input_send_key': thinkpad_acpi.c:(.text+0xd4d27f): undefined reference to `sparse_keymap_report_event' ld: vmlinux.o: in function `hotkey_init': thinkpad_acpi.c:(.init.text+0x66cb6): undefined reference to `sparse_keymap_setup' Fixes: 42f7b965de9d ("platform/x86: thinkpad_acpi: Switch to using sparse-keymap helpers") Signed-off-by: Steven Rostedt (Google) Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240522074813.379b9fc2@gandalf.local.home Signed-off-by: Hans de Goede --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 
0ec952b5d03e..1953317541ea 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -515,6 +515,7 @@ config THINKPAD_ACPI select NVRAM select NEW_LEDS select LEDS_CLASS + select INPUT_SPARSEKMAP help This is a driver for the IBM and Lenovo ThinkPad laptops. It adds support for Fn-Fx key combinations, Bluetooth control, video From 0b178b02673998f5acca5a0365a8858ca45beedb Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 23 May 2024 16:36:01 +0200 Subject: [PATCH 071/279] platform/x86: touchscreen_dmi: Add support for setting touchscreen properties from cmdline On x86/ACPI platforms touchscreens mostly just work without needing any device/model specific configuration. But in some cases (mostly with Silead and Goodix touchscreens) it is still necessary to manually specify various touchscreen-properties on a per model basis. touchscreen_dmi is a special place for DMI quirks for this, but it can be challenging for users to figure out the right property values, especially for Silead touchscreens where none of these can be read back from the touchscreen-controller. ATM users can only test touchscreen properties by editing touchscreen_dmi.c and then building a completely new kernel which makes it unnecessarily difficult for users to test and submit properties when necessary for their laptop / tablet model. Add support for specifying properties on the kernel commandline to allow users to easily figure out the right settings. See the added documentation in kernel-parameters.txt for the commandline syntax.
Cc: Gregor Riepl Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240523143601.47555-1-hdegoede@redhat.com --- .../admin-guide/kernel-parameters.txt | 22 +++++ drivers/platform/x86/touchscreen_dmi.c | 81 ++++++++++++++++++- 2 files changed, 99 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 500cfa776225..b600df82669d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1921,6 +1921,28 @@ Format: , + i2c_touchscreen_props= [HW,ACPI,X86] + Set device-properties for ACPI-enumerated I2C-attached + touchscreen, to e.g. fix coordinates of upside-down + mounted touchscreens. If you need this option please + submit a drivers/platform/x86/touchscreen_dmi.c patch + adding a DMI quirk for this. + + Format: + :=[:prop_name=val][:...] + Where is one of: + Omit "=" entirely Set a boolean device-property + Unsigned number Set a u32 device-property + Anything else Set a string device-property + + Examples (split over multiple lines): + i2c_touchscreen_props=GDIX1001:touchscreen-inverted-x: + touchscreen-inverted-y + + i2c_touchscreen_props=MSSL1680:touchscreen-size-x=1920: + touchscreen-size-y=1080:touchscreen-inverted-y: + firmware-name=gsl1680-vendor-model.fw:silead,home-button + i8042.debug [HW] Toggle i8042 debug mode i8042.unmask_kbd_data [HW] Enable printing of interrupt data from the KBD port diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index c6a10ec2c83f..b021fb9e579e 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -9,10 +9,13 @@ */ #include +#include #include #include #include #include +#include +#include #include #include #include @@ -1817,7 +1820,7 @@ const struct dmi_system_id touchscreen_dmi_table[] = { { } }; -static const struct ts_dmi_data *ts_data; +static struct ts_dmi_data *ts_data; 
static void ts_dmi_add_props(struct i2c_client *client) { @@ -1852,6 +1855,64 @@ static int ts_dmi_notifier_call(struct notifier_block *nb, return 0; } +#define MAX_CMDLINE_PROPS 16 + +static struct property_entry ts_cmdline_props[MAX_CMDLINE_PROPS + 1]; + +static struct ts_dmi_data ts_cmdline_data = { + .properties = ts_cmdline_props, +}; + +static int __init ts_parse_props(char *str) +{ + /* Save the original str to show it on syntax errors */ + char orig_str[256]; + char *name, *value; + u32 u32val; + int i, ret; + + strscpy(orig_str, str, sizeof(orig_str)); + + /* + * str is part of the static_command_line from init/main.c and poking + * holes in that by writing 0 to it is allowed, as is taking long + * lasting references to it. + */ + ts_cmdline_data.acpi_name = strsep(&str, ":"); + + for (i = 0; i < MAX_CMDLINE_PROPS; i++) { + name = strsep(&str, ":"); + if (!name || !name[0]) + break; + + /* Replace '=' with 0 and make value point past '=' or NULL */ + value = name; + strsep(&value, "="); + if (!value) { + ts_cmdline_props[i] = PROPERTY_ENTRY_BOOL(name); + } else if (isdigit(value[0])) { + ret = kstrtou32(value, 0, &u32val); + if (ret) + goto syntax_error; + + ts_cmdline_props[i] = PROPERTY_ENTRY_U32(name, u32val); + } else { + ts_cmdline_props[i] = PROPERTY_ENTRY_STRING(name, value); + } + } + + if (!i || str) + goto syntax_error; + + ts_data = &ts_cmdline_data; + return 1; + +syntax_error: + pr_err("Invalid '%s' value for 'i2c_touchscreen_props='\n", orig_str); + return 1; /* "i2c_touchscreen_props=" is still a known parameter */ +} +__setup("i2c_touchscreen_props=", ts_parse_props); + static struct notifier_block ts_dmi_notifier = { .notifier_call = ts_dmi_notifier_call, }; @@ -1859,13 +1920,25 @@ static struct notifier_block ts_dmi_notifier = { static int __init ts_dmi_init(void) { const struct dmi_system_id *dmi_id; + struct ts_dmi_data *ts_data_dmi; int error; dmi_id = dmi_first_match(touchscreen_dmi_table); - if (!dmi_id) - return 0; /* Not an error 
*/ + ts_data_dmi = dmi_id ? dmi_id->driver_data : NULL; + + if (ts_data) { + /* + * Kernel cmdline provided data takes precedence, copy over + * DMI efi_embedded_fw info if available. + */ + if (ts_data_dmi) + ts_data->embedded_fw = ts_data_dmi->embedded_fw; + } else if (ts_data_dmi) { + ts_data = ts_data_dmi; + } else { + return 0; /* Not an error */ + } - ts_data = dmi_id->driver_data; /* Some dmi table entries only provide an efi_embedded_fw_desc */ if (!ts_data->properties) return 0; From 7c8639aa41343fd7b3dbe09baf6b0791fcc407a1 Mon Sep 17 00:00:00 2001 From: hmtheboy154 Date: Mon, 27 May 2024 11:14:46 +0200 Subject: [PATCH 072/279] platform/x86: touchscreen_dmi: Add info for GlobalSpace SolT IVW 11.6" tablet This is a tablet created by GlobalSpace Technologies Limited which uses an Intel Atom x5-Z8300, 4GB of RAM & 64GB of storage. Link: https://web.archive.org/web/20171102141952/http://globalspace.in/11.6-device.html Signed-off-by: hmtheboy154 Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240527091447.248849-2-hdegoede@redhat.com --- drivers/platform/x86/touchscreen_dmi.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index b021fb9e579e..6c03e7daadd4 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -900,6 +900,22 @@ static const struct ts_dmi_data schneider_sct101ctm_data = { .properties = schneider_sct101ctm_props, }; +static const struct property_entry globalspace_solt_ivw116_props[] = { + PROPERTY_ENTRY_U32("touchscreen-min-x", 7), + PROPERTY_ENTRY_U32("touchscreen-min-y", 22), + PROPERTY_ENTRY_U32("touchscreen-size-x", 1723), + PROPERTY_ENTRY_U32("touchscreen-size-y", 1077), + PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-globalspace-solt-ivw116.fw"), + PROPERTY_ENTRY_U32("silead,max-fingers", 10), + PROPERTY_ENTRY_BOOL("silead,home-button"), + { } +}; 
+ +static const struct ts_dmi_data globalspace_solt_ivw116_data = { + .acpi_name = "MSSL1680:00", + .properties = globalspace_solt_ivw116_props, +}; + static const struct property_entry techbite_arc_11_6_props[] = { PROPERTY_ENTRY_U32("touchscreen-min-x", 5), PROPERTY_ENTRY_U32("touchscreen-min-y", 7), @@ -1627,6 +1643,15 @@ const struct dmi_system_id touchscreen_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "SCT101CTM"), }, }, + { + /* GlobalSpace SoLT IVW 11.6" */ + .driver_data = (void *)&globalspace_solt_ivw116_data, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Globalspace Tech Pvt Ltd"), + DMI_MATCH(DMI_PRODUCT_NAME, "SolTIVW"), + DMI_MATCH(DMI_PRODUCT_SKU, "PN20170413488"), + }, + }, { /* Techbite Arc 11.6 */ .driver_data = (void *)&techbite_arc_11_6_data, From 3050052613790e75b5e4a8536930426b0a8b0774 Mon Sep 17 00:00:00 2001 From: hmtheboy154 Date: Mon, 27 May 2024 11:14:47 +0200 Subject: [PATCH 073/279] platform/x86: touchscreen_dmi: Add info for the EZpad 6s Pro The "EZpad 6s Pro" uses the same touchscreen as the "EZpad 6 Pro B", unlike the "Ezpad 6 Pro" which has its own touchscreen. 
Signed-off-by: hmtheboy154 Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240527091447.248849-3-hdegoede@redhat.com --- drivers/platform/x86/touchscreen_dmi.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index 6c03e7daadd4..2d9ca2292ea1 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -1404,6 +1404,17 @@ const struct dmi_system_id touchscreen_dmi_table[] = { DMI_MATCH(DMI_BIOS_DATE, "04/24/2018"), }, }, + { + /* Jumper EZpad 6s Pro */ + .driver_data = (void *)&jumper_ezpad_6_pro_b_data, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Jumper"), + DMI_MATCH(DMI_PRODUCT_NAME, "Ezpad"), + /* Above matches are too generic, add bios match */ + DMI_MATCH(DMI_BIOS_VERSION, "E.WSA116_8.E1.042.bin"), + DMI_MATCH(DMI_BIOS_DATE, "01/08/2020"), + }, + }, { /* Jumper EZpad 6 m4 */ .driver_data = (void *)&jumper_ezpad_6_m4_data, From 21a22ed618d072a47597e63ee591973c18524880 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 22 May 2024 18:45:04 +0800 Subject: [PATCH 074/279] selftests: hsr: Fix "File exists" errors for hsr_ping The hsr_ping test reports the following errors: INFO: preparing interfaces for HSRv0. INFO: Initial validation ping. INFO: Longer ping test. INFO: Cutting one link. INFO: Delay the link and drop a few packages. INFO: All good. INFO: preparing interfaces for HSRv1. RTNETLINK answers: File exists RTNETLINK answers: File exists RTNETLINK answers: File exists RTNETLINK answers: File exists RTNETLINK answers: File exists RTNETLINK answers: File exists Error: ipv4: Address already assigned. Error: ipv6: address already assigned. Error: ipv4: Address already assigned. Error: ipv6: address already assigned. Error: ipv4: Address already assigned. Error: ipv6: address already assigned. INFO: Initial validation ping. 
That is because the cleanup code for the 2nd round test before "setup_hsr_interfaces 1" is removed incorrectly in commit 680fda4f6714 ("test: hsr: Remove script code already implemented in lib.sh"). This patch fixes it by setting up the namespaces again using the "setup_ns ns1 ns2 ns3" command before "setup_hsr_interfaces 1". It deletes the previous namespaces and creates new ones. Fixes: 680fda4f6714 ("test: hsr: Remove script code already implemented in lib.sh") Reviewed-by: Hangbin Liu Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/6485d3005f467758d49f0f313c8c009759ba6b05.1716374462.git.tanggeliang@kylinos.cn Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/hsr/hsr_ping.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh index 790294c8af83..3684b813b0f6 100755 --- a/tools/testing/selftests/net/hsr/hsr_ping.sh +++ b/tools/testing/selftests/net/hsr/hsr_ping.sh @@ -174,6 +174,8 @@ trap cleanup_all_ns EXIT setup_hsr_interfaces 0 do_complete_ping_test +setup_ns ns1 ns2 ns3 + setup_hsr_interfaces 1 do_complete_ping_test From 97e1db06c7bb948da10ba85acad8030b56886593 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 May 2024 00:40:02 +0900 Subject: [PATCH 075/279] af_unix: Annotate data-race around unix_sk(sk)->addr. Once unix_sk(sk)->addr is assigned under net->unx.table.locks and unix_sk(sk)->bindlock, *(unix_sk(sk)->addr) and unix_sk(sk)->path are fully set up, and unix_sk(sk)->addr is never changed. unix_getname() and unix_copy_addr() access the two fields locklessly, and commit ae3b564179bf ("missing barriers in some of unix_sock ->addr and ->path accesses") added smp_store_release() and smp_load_acquire() pairs. In other functions, we still read unix_sk(sk)->addr locklessly to check if the socket is bound, and KCSAN complains about it.
[0] Given these functions have no dependency for *(unix_sk(sk)->addr) and unix_sk(sk)->path, READ_ONCE() is enough to annotate the data-race. Note that it is safe to access unix_sk(sk)->addr locklessly if the socket is found in the hash table. For example, the lockless read of otheru->addr in unix_stream_connect() is safe. Note also that newu->addr there is of the child socket that is still not accessible from userspace, and smp_store_release() publishes the address in case the socket is accept()ed and unix_getname() / unix_copy_addr() is called. [0]: BUG: KCSAN: data-race in unix_bind / unix_listen write (marked) to 0xffff88805f8d1840 of 8 bytes by task 13723 on cpu 0: __unix_set_addr_hash net/unix/af_unix.c:329 [inline] unix_bind_bsd net/unix/af_unix.c:1241 [inline] unix_bind+0x881/0x1000 net/unix/af_unix.c:1319 __sys_bind+0x194/0x1e0 net/socket.c:1847 __do_sys_bind net/socket.c:1858 [inline] __se_sys_bind net/socket.c:1856 [inline] __x64_sys_bind+0x40/0x50 net/socket.c:1856 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x4f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x46/0x4e read to 0xffff88805f8d1840 of 8 bytes by task 13724 on cpu 1: unix_listen+0x72/0x180 net/unix/af_unix.c:734 __sys_listen+0xdc/0x160 net/socket.c:1881 __do_sys_listen net/socket.c:1890 [inline] __se_sys_listen net/socket.c:1888 [inline] __x64_sys_listen+0x2e/0x40 net/socket.c:1888 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x4f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x46/0x4e value changed: 0x0000000000000000 -> 0xffff88807b5b1b40 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 13724 Comm: syz-executor.4 Not tainted 6.8.0-12822-gcd51db110a7e #12 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: 
https://lore.kernel.org/r/20240522154002.77857-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e4af6616e1df..fe631212a345 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -731,7 +731,7 @@ static int unix_listen(struct socket *sock, int backlog) if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; /* Only stream/seqpacket sockets accept */ err = -EINVAL; - if (!u->addr) + if (!READ_ONCE(u->addr)) goto out; /* No listens on an unbound socket */ unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) @@ -1369,7 +1369,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !unix_sk(sk)->addr) { + !READ_ONCE(unix_sk(sk)->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -1481,7 +1481,8 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { + test_bit(SOCK_PASSPIDFD, &sock->flags)) && + !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -1950,7 +1951,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, } if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { + test_bit(SOCK_PASSPIDFD, &sock->flags)) && + !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; From 51d1b25a720982324871338b1a36b197ec9bd6f0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 May 2024 00:42:18 +0900 Subject: [PATCH 076/279] af_unix: Read sk->sk_hash under bindlock during bind(). syzkaller reported data-race of sk->sk_hash in unix_autobind() [0], and the same ones exist in unix_bind_bsd() and unix_bind_abstract(). 
The three bind() functions prefetch sk->sk_hash locklessly and use it later after validating that unix_sk(sk)->addr is NULL under unix_sk(sk)->bindlock. The prefetched sk->sk_hash is the hash value of unbound socket set in unix_create1() and does not change until bind() completes. There could be a chance that sk->sk_hash changes after the lockless read. However, in such a case, non-NULL unix_sk(sk)->addr is visible under unix_sk(sk)->bindlock, and bind() returns -EINVAL without using the prefetched value. The KCSAN splat is false-positive, but let's silence it by reading sk->sk_hash under unix_sk(sk)->bindlock. [0]: BUG: KCSAN: data-race in unix_autobind / unix_autobind write to 0xffff888034a9fb88 of 4 bytes by task 4468 on cpu 0: __unix_set_addr_hash net/unix/af_unix.c:331 [inline] unix_autobind+0x47a/0x7d0 net/unix/af_unix.c:1185 unix_dgram_connect+0x7e3/0x890 net/unix/af_unix.c:1373 __sys_connect_file+0xd7/0xe0 net/socket.c:2048 __sys_connect+0x114/0x140 net/socket.c:2065 __do_sys_connect net/socket.c:2075 [inline] __se_sys_connect net/socket.c:2072 [inline] __x64_sys_connect+0x40/0x50 net/socket.c:2072 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x4f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x46/0x4e read to 0xffff888034a9fb88 of 4 bytes by task 4465 on cpu 1: unix_autobind+0x28/0x7d0 net/unix/af_unix.c:1134 unix_dgram_connect+0x7e3/0x890 net/unix/af_unix.c:1373 __sys_connect_file+0xd7/0xe0 net/socket.c:2048 __sys_connect+0x114/0x140 net/socket.c:2065 __do_sys_connect net/socket.c:2075 [inline] __se_sys_connect net/socket.c:2072 [inline] __x64_sys_connect+0x40/0x50 net/socket.c:2072 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x4f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x46/0x4e value changed: 0x000000e4 -> 0x000001e3 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 4465 Comm: syz-executor.0 Not tainted 6.8.0-12822-gcd51db110a7e #12 Hardware name: QEMU Standard PC 
(i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Fixes: afd20b9290e1 ("af_unix: Replace the big lock with small locks.") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240522154218.78088-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fe631212a345..25b49efc0926 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1131,8 +1131,8 @@ static struct sock *unix_find_other(struct net *net, static int unix_autobind(struct sock *sk) { - unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct unix_address *addr; u32 lastnum, ordernum; @@ -1155,6 +1155,7 @@ static int unix_autobind(struct sock *sk) addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); + old_hash = sk->sk_hash; ordernum = get_random_u32(); lastnum = ordernum & 0xFFFFF; retry: @@ -1195,8 +1196,8 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, { umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); - unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct mnt_idmap *idmap; struct unix_address *addr; @@ -1234,6 +1235,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, if (u->addr) goto out_unlock; + old_hash = sk->sk_hash; new_hash = unix_bsd_hash(d_backing_inode(dentry)); unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); @@ -1261,8 +1263,8 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { - unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u 
= unix_sk(sk); + unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct unix_address *addr; int err; @@ -1280,6 +1282,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, goto out_mutex; } + old_hash = sk->sk_hash; new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); From 1684842147677a1279bcff95f8adb6de9a656e30 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Thu, 23 May 2024 13:06:26 +0530 Subject: [PATCH 077/279] Octeontx2-pf: Free send queue buffers incase of leaf to inner There are two types of classes. "Leaf classes" that are at the bottom of the class hierarchy. "Inner classes" that are neither the root class nor leaf classes. QoS rules can only specify leaf classes as targets for traffic. Root / \ / \ 1 2 /\ / \ 4 5 classes 1,4 and 5 are leaf classes. class 2 is an inner class. When a leaf class is made an inner class, or vice versa, resources associated with send queue (send queue buffers and transmit schedulers) are not getting freed.
Fixes: 5e6808b4c68d ("octeontx2-pf: Add support for HTB offload") Signed-off-by: Hariprasad Kelam Link: https://lore.kernel.org/r/20240523073626.4114-1-hkelam@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/nic/qos.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c index 070711df612e..edac008099c0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c @@ -1422,7 +1422,10 @@ static int otx2_qos_leaf_to_inner(struct otx2_nic *pfvf, u16 classid, otx2_qos_read_txschq_cfg(pfvf, node, old_cfg); /* delete the txschq nodes allocated for this node */ + otx2_qos_disable_sq(pfvf, qid); + otx2_qos_free_hw_node_schq(pfvf, node); otx2_qos_free_sw_node_schq(pfvf, node); + pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ; /* mark this node as htb inner node */ WRITE_ONCE(node->qid, OTX2_QOS_QID_INNER); @@ -1632,6 +1635,7 @@ static int otx2_qos_leaf_del_last(struct otx2_nic *pfvf, u16 classid, bool force dwrr_del_node = true; /* destroy the leaf node */ + otx2_qos_disable_sq(pfvf, qid); otx2_qos_destroy_node(pfvf, node); pfvf->qos.qid_to_sqmap[qid] = OTX2_QOS_INVALID_SQ; From d7ba701da636afae17d8e1243b3d12eed149abcb Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 2 May 2024 10:08:25 +0000 Subject: [PATCH 078/279] xfs: Clear W=1 warning in xfs_iwalk_run_callbacks() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For CONFIG_XFS_DEBUG unset, xfs_iwalk_run_callbacks() generates the following warning for when building with W=1: fs/xfs/xfs_iwalk.c: In function ‘xfs_iwalk_run_callbacks’: fs/xfs/xfs_iwalk.c:354:42: error: variable ‘irec’ set but not used [-Werror=unused-but-set-variable] 354 | struct xfs_inobt_rec_incore *irec; | ^~~~ cc1: all warnings being treated as errors Drop @irec, as it is only an intermediate variable. 
Suggested-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_iwalk.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 730c8d48da28..86f14ec7c31f 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -351,7 +351,6 @@ xfs_iwalk_run_callbacks( int *has_more) { struct xfs_mount *mp = iwag->mp; - struct xfs_inobt_rec_incore *irec; xfs_agino_t next_agino; int error; @@ -361,8 +360,8 @@ xfs_iwalk_run_callbacks( /* Delete cursor but remember the last record we cached... */ xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0); - irec = &iwag->recs[iwag->nr_recs - 1]; - ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK); + ASSERT(next_agino >= iwag->recs[iwag->nr_recs - 1].ir_startino + + XFS_INODES_PER_CHUNK); if (iwag->drop_trans) { xfs_trans_cancel(iwag->tp); From b33874fb7f28326380562f208d948bab785fbd6f Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 2 May 2024 10:08:26 +0000 Subject: [PATCH 079/279] xfs: Stop using __maybe_unused in xfs_alloc.c In both xfs_alloc_cur_finish() and xfs_alloc_ag_vextent_exact(), local variable @agf is tagged as __maybe_unused. Otherwise an unused variable warning would be generated when building with W=1 and CONFIG_XFS_DEBUG unset. In both cases, the variable is unused as it is only referenced in an ASSERT() call, which is compiled out (in this config). It is generally poor programming style to use __maybe_unused for variables. The ASSERT() call is to verify that agbno of the end of the extent is within bounds for both functions. @agf is used as an intermediate variable to find the AG length. However xfs_verify_agbext() already exists to verify a valid extent range. The arguments for calling xfs_verify_agbext() are already available, so use that instead.
An advantage of using xfs_verify_agbext() is that it verifies that both the start and the end of the extent are within the bounds of the AG and catches overflows. Suggested-by: Dave Chinner Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_alloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 6cb8b2ddc541..6c55a6e88eba 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1008,13 +1008,12 @@ xfs_alloc_cur_finish( struct xfs_alloc_arg *args, struct xfs_alloc_cur *acur) { - struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; int error; ASSERT(acur->cnt && acur->bnolt); ASSERT(acur->bno >= acur->rec_bno); ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); - ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); + ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len)); error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, acur->rec_len, acur->bno, acur->len, 0); @@ -1217,7 +1216,6 @@ STATIC int /* error */ xfs_alloc_ag_vextent_exact( xfs_alloc_arg_t *args) /* allocation argument structure */ { - struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ int error; @@ -1297,7 +1295,7 @@ xfs_alloc_ag_vextent_exact( */ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, args->pag); - ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); + ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); if (error) { From 2b3f004d3d518ec7a392066d935fd85c81412e33 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 21 May 2024 23:01:45 -0700 Subject: [PATCH 080/279] xfs: drop xfarray sortinfo folio on error Chandan Babu reports the following livelock in xfs/708: run fstests xfs/708 at 2024-05-04 15:35:29 XFS (loop16): EXPERIMENTAL online scrub feature in use. Use at your own risk! XFS (loop5): Mounting V5 Filesystem e96086f0-a2f9-4424-a1d5-c75d53d823be XFS (loop5): Ending clean mount XFS (loop5): Quotacheck needed: Please wait. XFS (loop5): Quotacheck: Done. XFS (loop5): EXPERIMENTAL online scrub feature in use. Use at your own risk! INFO: task xfs_io:143725 blocked for more than 122 seconds. Not tainted 6.9.0-rc4+ #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:xfs_io state:D stack:0 pid:143725 tgid:143725 ppid:117661 flags:0x00004006 Call Trace: __schedule+0x69c/0x17a0 schedule+0x74/0x1b0 io_schedule+0xc4/0x140 folio_wait_bit_common+0x254/0x650 shmem_undo_range+0x9d5/0xb40 shmem_evict_inode+0x322/0x8f0 evict+0x24e/0x560 __dentry_kill+0x17d/0x4d0 dput+0x263/0x430 __fput+0x2fc/0xaa0 task_work_run+0x132/0x210 get_signal+0x1a8/0x1910 arch_do_signal_or_restart+0x7b/0x2f0 syscall_exit_to_user_mode+0x1c2/0x200 do_syscall_64+0x72/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e The shmem code is trying to drop all the folios attached to a shmem file and gets stuck on a locked folio after a bnobt repair. It looks like the process has a signal pending, so I started looking for places where we lock an xfile folio and then deal with a fatal signal. I found a bug in xfarray_sort_scan via code inspection. This function is called to set up the scanning phase of a quicksort operation, which may involve grabbing a locked xfile folio. If we exit the function with an error code, the caller does not call xfarray_sort_scan_done to put the xfile folio. If _sort_scan returns an error code while si->folio is set, we leak the reference and never unlock the folio. Therefore, change xfarray_sort to call _scan_done on exit. 
This is safe to call multiple times because it sets si->folio to NULL and ignores a NULL si->folio. Also change _sort_scan to use an intermediate variable so that we never pollute si->folio with an errptr. Fixes: 232ea052775f9 ("xfs: enable sorting of xfile-backed arrays") Reported-by: Chandan Babu R Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfarray.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index 9185ae7088d4..cdd13ed9c569 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -822,12 +822,14 @@ xfarray_sort_scan( /* Grab the first folio that backs this array element. */ if (!si->folio) { + struct folio *folio; loff_t next_pos; - si->folio = xfile_get_folio(si->array->xfile, idx_pos, + folio = xfile_get_folio(si->array->xfile, idx_pos, si->array->obj_size, XFILE_ALLOC); - if (IS_ERR(si->folio)) - return PTR_ERR(si->folio); + if (IS_ERR(folio)) + return PTR_ERR(folio); + si->folio = folio; si->first_folio_idx = xfarray_idx(si->array, folio_pos(si->folio) + si->array->obj_size - 1); @@ -1048,6 +1050,7 @@ xfarray_sort( out_free: trace_xfarray_sort_stats(si, error); + xfarray_sort_scan_done(si); kvfree(si); return error; } From 97835e6866796874571646a1a8ff44f24c0b39f7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 May 2024 23:02:01 -0700 Subject: [PATCH 081/279] xfs: fix xfs_init_attr_trans not handling explicit operation codes When we were converting the attr code to use an explicit operation code instead of keying off of attr->value being null, we forgot to change the code that initializes the transaction reservation. Split the function into two helpers that handle the !remove and remove cases, then fix both callsites to handle this correctly. Fixes: c27411d4c640 ("xfs: make attr removal an explicit operation") Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 38 ++++++++++++++++++-------------------- fs/xfs/libxfs/xfs_attr.h | 3 +-- fs/xfs/xfs_attr_item.c | 17 +++++++++++++++-- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 430cd3244c14..f30bcc64100d 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -329,26 +329,20 @@ xfs_attr_calc_size( return nblks; } -/* Initialize transaction reservation for attr operations */ -void -xfs_init_attr_trans( - struct xfs_da_args *args, - struct xfs_trans_res *tres, - unsigned int *total) +/* Initialize transaction reservation for an xattr set/replace/upsert */ +inline struct xfs_trans_res +xfs_attr_set_resv( + const struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_trans_res ret = { + .tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args->total, + .tr_logcount = XFS_ATTRSET_LOG_COUNT, + .tr_logflags = XFS_TRANS_PERM_LOG_RES, + }; - if (args->value) { - tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * - args->total; - tres->tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres->tr_logflags = XFS_TRANS_PERM_LOG_RES; - *total = args->total; - } else { - *tres = M_RES(mp)->tr_attrrm; - *total = XFS_ATTRRM_SPACE_RES(mp); - } + return ret; } /* @@ -1006,7 +1000,7 @@ xfs_attr_set( struct xfs_trans_res tres; int error, local; int rmt_blks = 0; - unsigned int total; + unsigned int total = 0; ASSERT(!args->trans); @@ -1033,10 +1027,15 @@ xfs_attr_set( if (!local) rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); + + tres = xfs_attr_set_resv(args); + total = args->total; break; case XFS_ATTRUPDATE_REMOVE: XFS_STATS_INC(mp, xs_attr_remove); rmt_blks = xfs_attr3_max_rmt_blocks(mp); + tres = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); break; } @@ 
-1044,7 +1043,6 @@ xfs_attr_set( * Root fork attributes can use reserved data blocks for this * operation if necessary */ - xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 088cb7b30168..0e51d0723f9a 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -565,8 +565,7 @@ bool xfs_attr_check_namespace(unsigned int attr_flags); bool xfs_attr_namecheck(unsigned int attr_flags, const void *name, size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); -void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, - unsigned int *total); +struct xfs_trans_res xfs_attr_set_resv(const struct xfs_da_args *args); /* * Check to see if the attr should be upgraded from non-existent or shortform to diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 2b10ac4c5fce..f683b7a9323f 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -746,7 +746,7 @@ xfs_attr_recover_work( struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error; - int total; + unsigned int total = 0; /* * First check the validity of the attr described by the ATTRI. 
If any @@ -763,7 +763,20 @@ xfs_attr_recover_work( return PTR_ERR(attr); args = attr->xattri_da_args; - xfs_init_attr_trans(args, &resv, &total); + switch (xfs_attr_intent_op(attr)) { + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + resv = xfs_attr_set_resv(args); + total = args->total; + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + case XFS_ATTRI_OP_FLAGS_REMOVE: + resv = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); + break; + } resv = xlog_recover_resv(&resv); error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) From 38de567906d95c397d87f292b892686b7ec6fbc3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 May 2024 23:02:16 -0700 Subject: [PATCH 082/279] xfs: allow symlinks with short remote targets An internal user complained about log recovery failing on a symlink ("Bad dinode after recovery") with the following (excerpted) format: core.magic = 0x494e core.mode = 0120777 core.version = 3 core.format = 2 (extents) core.nlinkv2 = 1 core.nextents = 1 core.size = 297 core.nblocks = 1 core.naextents = 0 core.forkoff = 0 core.aformat = 2 (extents) u3.bmx[0] = [startoff,startblock,blockcount,extentflag] 0:[0,12,1,0] This is a symbolic link with a 297-byte target stored in a disk block, which is to say this is a symlink with a remote target. The forkoff is 0, which is to say that there's 512 - 176 == 336 bytes in the inode core to store the data fork. Eventually, testing of generic/388 failed with the same inode corruption message during inode recovery. In writing a debugging patch to call xfs_dinode_verify on dirty inode log items when we're committing transactions, I observed that xfs/298 can reproduce the problem quite quickly. xfs/298 creates a symbolic link, adds some extended attributes, then deletes them all. 
The test failure occurs when the final removexattr also deletes the attr fork because that does not convert the remote symlink back into a shortform symlink. That is how we trip this test. The only reason why xfs/298 only triggers with the debug patch added is that it deletes the symlink, so the final iflush shows the inode as free. I wrote a quick fstest to emulate the behavior of xfs/298, except that it leaves the symlinks on the filesystem after inducing the "corrupt" state. Kernels going back at least as far as 4.18 have written out symlink inodes in this manner and prior to 1eb70f54c445f they did not object to reading them back in. Because we've been writing out inodes this way for quite some time, the only way to fix this is to relax the check for symbolic links. Directories don't have this problem because di_size is bumped to blocksize during the sf->data conversion. Fixes: 1eb70f54c445f ("xfs: validate inode fork size against fork format") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_inode_buf.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d79002343d0b..e7a7bfbe75b4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -374,17 +374,37 @@ xfs_dinode_verify_fork( /* * For fork types that can contain local data, check that the fork * format matches the size of local data contained within the fork. - * - * For all types, check that when the size says the should be in extent - * or btree format, the inode isn't claiming it is in local format. */ if (whichfork == XFS_DATA_FORK) { - if (S_ISDIR(mode) || S_ISLNK(mode)) { + /* + * A directory small enough to fit in the inode must be stored + * in local format. The directory sf <-> extents conversion + * code updates the directory size accordingly. 
+ */ + if (S_ISDIR(mode)) { if (be64_to_cpu(dip->di_size) <= fork_size && fork_format != XFS_DINODE_FMT_LOCAL) return __this_address; } + /* + * A symlink with a target small enough to fit in the inode can + * be stored in extents format if xattrs were added (thus + * converting the data fork from shortform to remote format) + * and then removed. + */ + if (S_ISLNK(mode)) { + if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_EXTENTS && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + /* + * For all types, check that when the size says the fork should + * be in extent or btree format, the inode isn't claiming to be + * in local format. + */ if (be64_to_cpu(dip->di_size) > fork_size && fork_format == XFS_DINODE_FMT_LOCAL) return __this_address; From 95b19e2f4e0f730c83910e7f5e4d62ec68c6d862 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 May 2024 23:02:32 -0700 Subject: [PATCH 083/279] xfs: don't open-code u64_to_user_ptr Don't open-code what the kernel already provides. Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/scrub.c | 2 +- fs/xfs/xfs_handle.c | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index c013f0ba4f36..4cbcf7a86dbe 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -856,7 +856,7 @@ xfs_ioc_scrubv_metadata( if (vec_bytes > PAGE_SIZE) return -ENOMEM; - uvectors = (void __user *)(uintptr_t)head.svh_vectors; + uvectors = u64_to_user_ptr(head.svh_vectors); vectors = memdup_user(uvectors, vec_bytes); if (IS_ERR(vectors)) return PTR_ERR(vectors); diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index c8785ed59543..a3f16e9b6fe5 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -773,11 +773,6 @@ xfs_getparents_expand_lastrec( trace_xfs_getparents_expand_lastrec(gpx->ip, gp, &gpx->context, gpr); } -static inline void __user *u64_to_uptr(u64 val) -{ - return (void __user *)(uintptr_t)val; -} - /* Retrieve the parent pointers for a given inode. */ STATIC int xfs_getparents( @@ -862,7 +857,7 @@ xfs_getparents( ASSERT(gpx->context.firstu <= gpx->gph.gph_request.gp_bufsize); /* Copy the records to userspace. */ - if (copy_to_user(u64_to_uptr(gpx->gph.gph_request.gp_buffer), + if (copy_to_user(u64_to_user_ptr(gpx->gph.gph_request.gp_buffer), gpx->krecords, gpx->context.firstu)) error = -EFAULT; From 52a2f0608366a629d43dacd3191039c95fef74ba Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Thu, 23 May 2024 14:23:14 +0530 Subject: [PATCH 084/279] net: usb: smsc95xx: fix changing LED_SEL bit value updated from EEPROM LED Select (LED_SEL) bit in the LED General Purpose IO Configuration register is used to determine the functionality of external LED pins (Speed Indicator, Link and Activity Indicator, Full Duplex Link Indicator). The default value for this bit is 0 when no EEPROM is present. 
If an EEPROM is present, the default value is the value of the LED Select bit in the Configuration Flags of the EEPROM. A USB Reset or Lite Reset (LRST) will cause this bit to be restored to the image value last loaded from EEPROM, or to be set to 0 if no EEPROM is present. While configuring the dual purpose GPIO/LED pins to LED outputs in the LED General Purpose IO Configuration register, the LED_SEL bit is changed to 0, and as a result the configured value from the EEPROM is cleared. The issue is fixed by using a read-modify-write approach. Fixes: f293501c61c5 ("smsc95xx: configure LED outputs") Signed-off-by: Parthiban Veerasooran Reviewed-by: Simon Horman Reviewed-by: Woojung Huh Link: https://lore.kernel.org/r/20240523085314.167650-1-Parthiban.Veerasooran@microchip.com Signed-off-by: Paolo Abeni --- drivers/net/usb/smsc95xx.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index cbea24666479..8e82184be5e7 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -879,7 +879,7 @@ static int smsc95xx_start_rx_path(struct usbnet *dev) static int smsc95xx_reset(struct usbnet *dev) { struct smsc95xx_priv *pdata = dev->driver_priv; - u32 read_buf, write_buf, burst_cap; + u32 read_buf, burst_cap; int ret = 0, timeout; netif_dbg(dev, ifup, dev->net, "entering smsc95xx_reset\n"); @@ -1003,10 +1003,13 @@ static int smsc95xx_reset(struct usbnet *dev) return ret; netif_dbg(dev, ifup, dev->net, "ID_REV = 0x%08x\n", read_buf); + ret = smsc95xx_read_reg(dev, LED_GPIO_CFG, &read_buf); + if (ret < 0) + return ret; /* Configure GPIO pins as LED outputs */ - write_buf = LED_GPIO_CFG_SPD_LED | LED_GPIO_CFG_LNK_LED | - LED_GPIO_CFG_FDX_LED; - ret = smsc95xx_write_reg(dev, LED_GPIO_CFG, write_buf); + read_buf |= LED_GPIO_CFG_SPD_LED | LED_GPIO_CFG_LNK_LED | + LED_GPIO_CFG_FDX_LED; + ret = smsc95xx_write_reg(dev, LED_GPIO_CFG, read_buf); if (ret < 0) return ret; From 
779aa4d74785078575ee20d05d49e6942d1f2844 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 24 May 2024 06:48:17 -0700 Subject: [PATCH 085/279] drm/nouveau/nvif: Avoid build error due to potential integer overflows Trying to build parisc:allmodconfig with gcc 12.x or later results in the following build error. drivers/gpu/drm/nouveau/nvif/object.c: In function 'nvif_object_mthd': drivers/gpu/drm/nouveau/nvif/object.c:161:9: error: 'memcpy' accessing 4294967264 or more bytes at offsets 0 and 32 overlaps 6442450881 bytes at offset -2147483617 [-Werror=restrict] 161 | memcpy(data, args->mthd.data, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/nouveau/nvif/object.c: In function 'nvif_object_ctor': drivers/gpu/drm/nouveau/nvif/object.c:298:17: error: 'memcpy' accessing 4294967240 or more bytes at offsets 0 and 56 overlaps 6442450833 bytes at offset -2147483593 [-Werror=restrict] 298 | memcpy(data, args->new.data, size); gcc assumes that 'sizeof(*args) + size' can overflow, which would result in the problem. The problem is not new, only it is now no longer a warning but an error since W=1 has been enabled for the drm subsystem and since Werror is enabled for test builds. Rearrange arithmetic and use check_add_overflow() for validating the allocation size to avoid the overflow. While at it, split assignments out of if conditions. 
Fixes: a61ddb4393ad ("drm: enable (most) W=1 warnings by default across the subsystem") Cc: Javier Martinez Canillas Cc: Jani Nikula Cc: Thomas Zimmermann Cc: Danilo Krummrich Cc: Maxime Ripard Cc: Kees Cook Cc: Christophe JAILLET Cc: Joe Perches Reviewed-by: Kees Cook Signed-off-by: Guenter Roeck Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240524134817.1369993-1-linux@roeck-us.net --- drivers/gpu/drm/nouveau/nvif/object.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvif/object.c b/drivers/gpu/drm/nouveau/nvif/object.c index 4d1aaee8fe15..1d19c87eaec1 100644 --- a/drivers/gpu/drm/nouveau/nvif/object.c +++ b/drivers/gpu/drm/nouveau/nvif/object.c @@ -142,11 +142,16 @@ nvif_object_mthd(struct nvif_object *object, u32 mthd, void *data, u32 size) struct nvif_ioctl_v0 ioctl; struct nvif_ioctl_mthd_v0 mthd; } *args; + u32 args_size; u8 stack[128]; int ret; - if (sizeof(*args) + size > sizeof(stack)) { - if (!(args = kmalloc(sizeof(*args) + size, GFP_KERNEL))) + if (check_add_overflow(sizeof(*args), size, &args_size)) + return -ENOMEM; + + if (args_size > sizeof(stack)) { + args = kmalloc(args_size, GFP_KERNEL); + if (!args) return -ENOMEM; } else { args = (void *)stack; @@ -157,7 +162,7 @@ nvif_object_mthd(struct nvif_object *object, u32 mthd, void *data, u32 size) args->mthd.method = mthd; memcpy(args->mthd.data, data, size); - ret = nvif_object_ioctl(object, args, sizeof(*args) + size, NULL); + ret = nvif_object_ioctl(object, args, args_size, NULL); memcpy(data, args->mthd.data, size); if (args != (void *)stack) kfree(args); @@ -276,7 +281,15 @@ nvif_object_ctor(struct nvif_object *parent, const char *name, u32 handle, object->map.size = 0; if (parent) { - if (!(args = kmalloc(sizeof(*args) + size, GFP_KERNEL))) { + u32 args_size; + + if (check_add_overflow(sizeof(*args), size, &args_size)) { + nvif_object_dtor(object); + return -ENOMEM; + } + + args = 
kmalloc(args_size, GFP_KERNEL); + if (!args) { nvif_object_dtor(object); return -ENOMEM; } @@ -293,8 +306,7 @@ nvif_object_ctor(struct nvif_object *parent, const char *name, u32 handle, args->new.oclass = oclass; memcpy(args->new.data, data, size); - ret = nvif_object_ioctl(parent, args, sizeof(*args) + size, - &object->priv); + ret = nvif_object_ioctl(parent, args, args_size, &object->priv); memcpy(data, args->new.data, size); kfree(args); if (ret == 0) From f89ea63f1c65d3e93b255f14f9d9e05df87955fa Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 May 2024 15:26:11 +0100 Subject: [PATCH 086/279] netfs, 9p: Fix race between umount and async request completion There's a problem in 9p's interaction with netfslib whereby a crash occurs because the 9p_fid structs get forcibly destroyed during client teardown (without paying attention to their refcounts) before netfslib has finished with them. However, it's not a simple case of deferring the clunking that p9_fid_put() does as that requires the p9_client record to still be present. The problem is that netfslib has to unlock pages and clear the IN_PROGRESS flag before destroying the objects involved - including the fid - and, in any case, nothing checks to see if writeback completed barring looking at the page flags. Fix this by keeping a count of outstanding I/O requests (of any type) and waiting for it to quiesce during inode eviction. 
Reported-by: syzbot+df038d463cca332e8414@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000005be0aa061846f8d6@google.com/ Reported-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000b86c5e06130da9c6@google.com/ Reported-by: syzbot+1527696d41a634cc1819@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000041f960618206d7e@google.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/755891.1716560771@warthog.procyon.org.uk Tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Reviewed-by: Dominique Martinet cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Christian Schoenebeck cc: Jeff Layton cc: Steve French cc: Hillf Danton cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Reported-and-tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/9p/vfs_inode.c | 1 + fs/afs/inode.c | 1 + fs/netfs/objects.c | 5 +++++ fs/smb/client/cifsfs.c | 1 + include/linux/netfs.h | 18 ++++++++++++++++++ 5 files changed, 26 insertions(+) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7a3308d77606..fd72fc38c8f5 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -348,6 +348,7 @@ void v9fs_evict_inode(struct inode *inode) __le32 __maybe_unused version; if (!is_bad_inode(inode)) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); version = cpu_to_le32(v9inode->qid.version); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 94fc049aff58..15bb7989c387 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -648,6 +648,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); afs_set_cache_aux(vnode, &aux); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 
c90d482b1650..f4a642727479 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -72,6 +72,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } } + atomic_inc(&ctx->io_count); trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); @@ -124,6 +125,7 @@ static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); @@ -142,6 +144,9 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } + + if (atomic_dec_and_test(&ictx->io_count)) + wake_up_var(&ictx->io_count); call_rcu(&rreq->rcu, netfs_free_request_rcu); } diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index ec5b639f421a..14810ffd15c8 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -431,6 +431,7 @@ cifs_free_inode(struct inode *inode) static void cifs_evict_inode(struct inode *inode) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ca56a4428043..3b22ce0d064c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -68,6 +68,7 @@ struct netfs_inode { loff_t remote_i_size; /* Size of the remote file */ loff_t zero_point; /* Size after which we assume there's no data * on the server */ + atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ @@ -472,6 +473,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, ctx->remote_i_size = i_size_read(&ctx->inode); ctx->zero_point = LLONG_MAX; ctx->flags = 0; + 
atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif @@ -515,4 +517,20 @@ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) #endif } +/** + * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete + * @inode: The netfs inode to wait on + * + * Wait for outstanding I/O requests of any type to complete. This is intended + * to be called from inode eviction routines. This makes sure that any + * resources held by those requests are cleaned up before we let the inode get + * cleaned up. + */ +static inline void netfs_wait_for_outstanding_io(struct inode *inode) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0); +} + #endif /* _LINUX_NETFS_H */ From e569eb34970281438e2b48a3ef11c87459fcfbcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20L=C3=B3pez?= Date: Mon, 27 May 2024 11:43:52 +0200 Subject: [PATCH 087/279] tracing/probes: fix error check in parse_btf_field() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit btf_find_struct_member() might return NULL or an error via the ERR_PTR() macro. However, its caller in parse_btf_field() only checks for the NULL condition. Fix this by using IS_ERR() and returning the error up the stack.
Link: https://lore.kernel.org/all/20240527094351.15687-1-clopez@suse.de/ Fixes: c440adfbe3025 ("tracing/probes: Support BTF based data structure field access") Signed-off-by: Carlos López Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 5e263c141574..39877c80d6cb 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -554,6 +554,10 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type, anon_offs = 0; field = btf_find_struct_member(ctx->btf, type, fieldname, &anon_offs); + if (IS_ERR(field)) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return PTR_ERR(field); + } if (!field) { trace_probe_log_err(ctx->offset, NO_BTF_FIELD); return -ENOENT; From 6cb05d89fd62a76a9b74bd16211fb0930e89fea8 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Wed, 22 May 2024 21:13:08 +0300 Subject: [PATCH 088/279] dma-buf: handle testing kthreads creation failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kthread creation may possibly fail inside race_signal_callback(). In such a case stop the already started threads, put the already taken references to them and return with error code. Found by Linux Verification Center (linuxtesting.org). Fixes: 2989f6451084 ("dma-buf: Add selftests for dma-fence") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Reviewed-by: T.J. 
Mercier Link: https://patchwork.freedesktop.org/patch/msgid/20240522181308.841686-1-pchelkin@ispras.ru Signed-off-by: Christian König --- drivers/dma-buf/st-dma-fence.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/dma-buf/st-dma-fence.c b/drivers/dma-buf/st-dma-fence.c index b7c6f7ea9e0c..6a1bfcd0cc21 100644 --- a/drivers/dma-buf/st-dma-fence.c +++ b/drivers/dma-buf/st-dma-fence.c @@ -540,6 +540,12 @@ static int race_signal_callback(void *arg) t[i].before = pass; t[i].task = kthread_run(thread_signal_callback, &t[i], "dma-fence:%d", i); + if (IS_ERR(t[i].task)) { + ret = PTR_ERR(t[i].task); + while (--i >= 0) + kthread_stop_put(t[i].task); + return ret; + } get_task_struct(t[i].task); } From 82d71b53d7e732ede6028591342bdc80fabfa29f Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 27 May 2024 15:13:14 +0200 Subject: [PATCH 089/279] Documentation/core-api: correct reference to SWIOTLB_DYNAMIC Commit c93f261dfc39 ("Documentation/core-api: add swiotlb documentation") accidentally refers to CONFIG_DYNAMIC_SWIOTLB in one place, while the config is actually called CONFIG_SWIOTLB_DYNAMIC. Correct the reference to the intended config option. Signed-off-by: Lukas Bulwahn Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig --- Documentation/core-api/swiotlb.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/core-api/swiotlb.rst b/Documentation/core-api/swiotlb.rst index 5ad2c9ca85bc..cf06bae44ff8 100644 --- a/Documentation/core-api/swiotlb.rst +++ b/Documentation/core-api/swiotlb.rst @@ -192,7 +192,7 @@ alignment larger than PAGE_SIZE. Dynamic swiotlb --------------- -When CONFIG_DYNAMIC_SWIOTLB is enabled, swiotlb can do on-demand expansion of +When CONFIG_SWIOTLB_DYNAMIC is enabled, swiotlb can do on-demand expansion of the amount of memory available for allocation as bounce buffers. 
If a bounce buffer request fails due to lack of available space, an asynchronous background task is kicked off to allocate memory from general system memory and turn it From 80e4e17ac9e05adba3f1f0e1793398086e6d4007 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 May 2024 15:16:06 -0700 Subject: [PATCH 090/279] block: remove blk_queue_max_integrity_segments This is unused now that all the atomic queue limit conversions are merged. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240521221606.393040-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index e253e7bd0d17..7428cb43952d 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -66,12 +66,6 @@ blk_integrity_queue_supports_integrity(struct request_queue *q) return q->integrity.profile; } -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ - q->limits.max_integrity_segments = segs; -} - static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { @@ -151,10 +145,6 @@ static inline void blk_integrity_register(struct gendisk *d, static inline void blk_integrity_unregister(struct gendisk *d) { } -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ -} static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { From d9780064b163b91c28e4d44ec3115599db65b7fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 May 2024 14:36:18 +0200 Subject: [PATCH 091/279] dm: move setting zoned_enabled to dm_table_set_restrictions Keep it together with the rest of the zoned code. 
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240527123634.1116952-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 3 --- drivers/md/dm-zone.c | 8 +++++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index cc66a27c363a..e291b78b307b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2040,9 +2040,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, r = dm_set_zones_restrictions(t, q); if (r) return r; - if (blk_queue_is_zoned(q) && - !static_key_enabled(&zoned_enabled.key)) - static_branch_enable(&zoned_enabled); } dm_update_crypto_profile(q, t); diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 8e6bcb0d786a..3103360ce7f0 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -287,7 +287,13 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) queue_emulates_zone_append(q) ? "emulated" : "native"); } - return dm_revalidate_zones(md, t); + ret = dm_revalidate_zones(md, t); + if (ret < 0) + return ret; + + if (!static_key_enabled(&zoned_enabled.key)) + static_branch_enable(&zoned_enabled); + return 0; } /* From 5e7a4bbcc33d7df6bcc8565a8938c196285e5423 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 May 2024 14:36:19 +0200 Subject: [PATCH 092/279] dm: remove dm_check_zoned Fold it into the only caller in preparation for changes in the queue limits setup.
Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240527123634.1116952-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-zone.c | 59 +++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 3103360ce7f0..0ee22494857d 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -160,37 +160,6 @@ static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, return 0; } -static int dm_check_zoned(struct mapped_device *md, struct dm_table *t) -{ - struct gendisk *disk = md->disk; - unsigned int nr_conv_zones = 0; - int ret; - - /* Count conventional zones */ - md->zone_revalidate_map = t; - ret = dm_blk_report_zones(disk, 0, UINT_MAX, - dm_check_zoned_cb, &nr_conv_zones); - md->zone_revalidate_map = NULL; - if (ret < 0) { - DMERR("Check zoned failed %d", ret); - return ret; - } - - /* - * If we only have conventional zones, expose the mapped device as - * a regular device. - */ - if (nr_conv_zones >= ret) { - disk->queue->limits.max_open_zones = 0; - disk->queue->limits.max_active_zones = 0; - disk->queue->limits.zoned = false; - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - disk->nr_zones = 0; - } - - return 0; -} - /* * Revalidate the zones of a mapped device to initialize resource necessary * for zone append emulation. Note that we cannot simply use the block layer @@ -254,6 +223,8 @@ static bool dm_table_supports_zone_append(struct dm_table *t) int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) { struct mapped_device *md = t->md; + struct gendisk *disk = md->disk; + unsigned int nr_conv_zones = 0; int ret; /* @@ -272,14 +243,30 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) return 0; /* - * Check that the mapped device will indeed be zoned, that is, that it - * has sequential write required zones. 
+ * Count conventional zones to check that the mapped device will indeed + * have sequential write required zones. */ - ret = dm_check_zoned(md, t); - if (ret) + md->zone_revalidate_map = t; + ret = dm_blk_report_zones(disk, 0, UINT_MAX, + dm_check_zoned_cb, &nr_conv_zones); + md->zone_revalidate_map = NULL; + if (ret < 0) { + DMERR("Check zoned failed %d", ret); return ret; - if (!blk_queue_is_zoned(q)) + } + + /* + * If we only have conventional zones, expose the mapped device as + * a regular device. + */ + if (nr_conv_zones >= ret) { + disk->queue->limits.max_open_zones = 0; + disk->queue->limits.max_active_zones = 0; + disk->queue->limits.zoned = false; + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + disk->nr_zones = 0; return 0; + } if (!md->disk->nr_zones) { DMINFO("%s using %s zone append", From c8c1f7012b807ca4da0136eacab96961b56f25d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 May 2024 14:36:20 +0200 Subject: [PATCH 093/279] dm: make dm_set_zones_restrictions work on the queue limits Don't stuff the values directly into the queue without any synchronization, but instead delay applying the queue limits in the caller and let dm_set_zones_restrictions work on the limit structure. 
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240527123634.1116952-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 12 ++++++------ drivers/md/dm-zone.c | 11 ++++++----- drivers/md/dm.h | 3 ++- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e291b78b307b..b2d5246cff21 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1981,10 +1981,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_secure_erase(t)) limits->max_secure_erase_sectors = 0; - r = queue_limits_set(q, limits); - if (r) - return r; - if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) @@ -2036,12 +2032,16 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * For a zoned target, setup the zones related queue attributes * and resources necessary for zone append emulation if necessary. 
*/ - if (blk_queue_is_zoned(q)) { - r = dm_set_zones_restrictions(t, q); + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && limits->zoned) { + r = dm_set_zones_restrictions(t, q, limits); if (r) return r; } + r = queue_limits_set(q, limits); + if (r) + return r; + dm_update_crypto_profile(q, t); /* diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 0ee22494857d..5d66d916730e 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -220,7 +220,8 @@ static bool dm_table_supports_zone_append(struct dm_table *t) return true; } -int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *lim) { struct mapped_device *md = t->md; struct gendisk *disk = md->disk; @@ -236,7 +237,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); } else { set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - blk_queue_max_zone_append_sectors(q, 0); + lim->max_zone_append_sectors = 0; } if (!get_capacity(md->disk)) @@ -260,9 +261,9 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) * a regular device. */ if (nr_conv_zones >= ret) { - disk->queue->limits.max_open_zones = 0; - disk->queue->limits.max_active_zones = 0; - disk->queue->limits.zoned = false; + lim->max_open_zones = 0; + lim->max_active_zones = 0; + lim->zoned = false; clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); disk->nr_zones = 0; return 0; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index e0c57f19839b..53ef8207fe2c 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -101,7 +101,8 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); /* * Zoned targets related functions. 
*/ -int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q); +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *lim); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, From b0c6bcd58d44b1b843d1b7218db5a1efe917d27e Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Tue, 7 May 2024 15:06:55 +0530 Subject: [PATCH 094/279] xfs: Add cond_resched to block unmap range and reflink remap path An async dio write to a sparse file can generate a lot of extents and when we unlink this file (using rm), the kernel can be busy unmapping and freeing those extents as part of transaction processing. Similarly, the xfs reflink remapping path can also iterate over a million extent entries in xfs_reflink_remap_blocks(). Since we can busy loop in these two functions, let's add cond_resched() to avoid softlockup messages like these. watchdog: BUG: soft lockup - CPU#1 stuck for 22s! 
[kworker/1:0:82435] CPU: 1 PID: 82435 Comm: kworker/1:0 Tainted: G S L 6.9.0-rc5-0-default #1 Workqueue: xfs-inodegc/sda2 xfs_inodegc_worker NIP [c000000000beea10] xfs_extent_busy_trim+0x100/0x290 LR [c000000000bee958] xfs_extent_busy_trim+0x48/0x290 Call Trace: xfs_alloc_get_rec+0x54/0x1b0 (unreliable) xfs_alloc_compute_aligned+0x5c/0x144 xfs_alloc_ag_vextent_size+0x238/0x8d4 xfs_alloc_fix_freelist+0x540/0x694 xfs_free_extent_fix_freelist+0x84/0xe0 __xfs_free_extent+0x74/0x1ec xfs_extent_free_finish_item+0xcc/0x214 xfs_defer_finish_one+0x194/0x388 xfs_defer_finish_noroll+0x1b4/0x5c8 xfs_defer_finish+0x2c/0xc4 xfs_bunmapi_range+0xa4/0x100 xfs_itruncate_extents_flags+0x1b8/0x2f4 xfs_inactive_truncate+0xe0/0x124 xfs_inactive+0x30c/0x3e0 xfs_inodegc_worker+0x140/0x234 process_scheduled_works+0x240/0x57c worker_thread+0x198/0x468 kthread+0x138/0x140 start_kernel_thread+0x14/0x18 run fstests generic/175 at 2024-02-02 04:40:21 [ C17] watchdog: BUG: soft lockup - CPU#17 stuck for 23s! [xfs_io:7679] watchdog: BUG: soft lockup - CPU#17 stuck for 23s! 
[xfs_io:7679] CPU: 17 PID: 7679 Comm: xfs_io Kdump: loaded Tainted: G X 6.4.0 NIP [c008000005e3ec94] xfs_rmapbt_diff_two_keys+0x54/0xe0 [xfs] LR [c008000005e08798] xfs_btree_get_leaf_keys+0x110/0x1e0 [xfs] Call Trace: 0xc000000014107c00 (unreliable) __xfs_btree_updkeys+0x8c/0x2c0 [xfs] xfs_btree_update_keys+0x150/0x170 [xfs] xfs_btree_lshift+0x534/0x660 [xfs] xfs_btree_make_block_unfull+0x19c/0x240 [xfs] xfs_btree_insrec+0x4e4/0x630 [xfs] xfs_btree_insert+0x104/0x2d0 [xfs] xfs_rmap_insert+0xc4/0x260 [xfs] xfs_rmap_map_shared+0x228/0x630 [xfs] xfs_rmap_finish_one+0x2d4/0x350 [xfs] xfs_rmap_update_finish_item+0x44/0xc0 [xfs] xfs_defer_finish_noroll+0x2e4/0x740 [xfs] __xfs_trans_commit+0x1f4/0x400 [xfs] xfs_reflink_remap_extent+0x2d8/0x650 [xfs] xfs_reflink_remap_blocks+0x154/0x320 [xfs] xfs_file_remap_range+0x138/0x3a0 [xfs] do_clone_file_range+0x11c/0x2f0 vfs_clone_file_range+0x60/0x1c0 ioctl_file_clone+0x78/0x140 sys_ioctl+0x934/0x1270 system_call_exception+0x158/0x320 system_call_vectored_common+0x15c/0x2ec Cc: Ojaswin Mujoo Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Darrick J. 
Wong Tested-by: Disha Goel Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 1 + fs/xfs/xfs_reflink.c | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 3b3206d312d6..c101cf266bc4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6383,6 +6383,7 @@ xfs_bunmapi_range( error = xfs_defer_finish(tpp); if (error) goto out; + cond_resched(); } out: return error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 063a2e00d169..265a2a418bc7 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1387,6 +1387,7 @@ xfs_reflink_remap_blocks( destoff += imap.br_blockcount; len -= imap.br_blockcount; remapped_len += imap.br_blockcount; + cond_resched(); } if (error) From 98e948fb60d41447fd8d2d0c3b8637fc6b6dc26d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 27 May 2024 13:20:07 +0200 Subject: [PATCH 095/279] bpf: Allow delete from sockmap/sockhash only if update is allowed We have seen an influx of syzkaller reports where a BPF program attached to a tracepoint triggers a locking rule violation by performing a map_delete on a sockmap/sockhash. We don't intend to support this artificial use scenario. Extend the existing verifier allowed-program-type check for updating sockmap/sockhash to also cover deleting from a map. From now on only BPF programs which were previously allowed to update sockmap/sockhash can delete from these map types. 
Fixes: ff9105993240 ("bpf, sockmap: Prevent lock inversion deadlock in map delete elem") Reported-by: Tetsuo Handa Reported-by: syzbot+ec941d6e24f633a59172@syzkaller.appspotmail.com Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Tested-by: syzbot+ec941d6e24f633a59172@syzkaller.appspotmail.com Acked-by: John Fastabend Closes: https://syzkaller.appspot.com/bug?extid=ec941d6e24f633a59172 Link: https://lore.kernel.org/bpf/20240527-sockmap-verify-deletes-v1-1-944b372f2101@cloudflare.com --- kernel/bpf/verifier.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 77da1f438bec..48f3a9acdef3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8882,7 +8882,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) enum bpf_attach_type eatype = env->prog->expected_attach_type; enum bpf_prog_type type = resolve_prog_type(env->prog); - if (func_id != BPF_FUNC_map_update_elem) + if (func_id != BPF_FUNC_map_update_elem && + func_id != BPF_FUNC_map_delete_elem) return false; /* It's not possible to get access to a locked struct sock in these @@ -8893,6 +8894,11 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) if (eatype == BPF_TRACE_ITER) return true; break; + case BPF_PROG_TYPE_SOCK_OPS: + /* map_update allowed only via dedicated helpers with event type checks */ + if (func_id == BPF_FUNC_map_delete_elem) + return true; + break; case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: @@ -8988,7 +8994,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && - func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && func_id != BPF_FUNC_sk_select_reuseport && func_id != BPF_FUNC_map_lookup_elem && @@ -8998,7 +9003,6 @@ static int 
check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKHASH: if (func_id != BPF_FUNC_sk_redirect_hash && func_id != BPF_FUNC_sock_hash_update && - func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && func_id != BPF_FUNC_sk_select_reuseport && func_id != BPF_FUNC_map_lookup_elem && From 3b9ce0491a43e9af7f108b2f1bced7cd35931660 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 27 May 2024 13:20:08 +0200 Subject: [PATCH 096/279] Revert "bpf, sockmap: Prevent lock inversion deadlock in map delete elem" This reverts commit ff91059932401894e6c86341915615c5eb0eca48. This check is no longer needed. BPF programs attached to tracepoints are now rejected by the verifier when they attempt to delete from sockmap/sockhash maps. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240527-sockmap-verify-deletes-v1-2-944b372f2101@cloudflare.com --- net/core/sock_map.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 9402889840bf..63c016b4c169 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -423,9 +423,6 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock *sk; int err = 0; - if (irqs_disabled()) - return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ - spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) @@ -948,9 +945,6 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) struct bpf_shtab_elem *elem; int ret = -ENOENT; - if (irqs_disabled()) - return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ - hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); From a63bf556160fb19591183383da6757f52119981d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 27 May 2024 13:20:09 +0200 Subject: [PATCH 097/279] selftests/bpf: Cover verifier checks for mutating sockmap/sockhash Verifier 
enforces that only certain program types can mutate sock{map,hash} maps, that is update it or delete from it. Add test coverage for these checks so we don't regress. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240527-sockmap-verify-deletes-v1-3-944b372f2101@cloudflare.com --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../bpf/progs/verifier_sockmap_mutate.c | 187 ++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index c60db8beeb73..1c9c4ec1be11 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -67,6 +67,7 @@ #include "verifier_search_pruning.skel.h" #include "verifier_sock.skel.h" #include "verifier_sock_addr.skel.h" +#include "verifier_sockmap_mutate.skel.h" #include "verifier_spill_fill.skel.h" #include "verifier_spin_lock.skel.h" #include "verifier_stack_ptr.skel.h" @@ -183,6 +184,7 @@ void test_verifier_sdiv(void) { RUN(verifier_sdiv); } void test_verifier_search_pruning(void) { RUN(verifier_search_pruning); } void test_verifier_sock(void) { RUN(verifier_sock); } void test_verifier_sock_addr(void) { RUN(verifier_sock_addr); } +void test_verifier_sockmap_mutate(void) { RUN(verifier_sockmap_mutate); } void test_verifier_spill_fill(void) { RUN(verifier_spill_fill); } void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } diff --git a/tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c b/tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c new file mode 100644 index 000000000000..fe4b123187b8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_sockmap_mutate.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 + 
+#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#include "bpf_misc.h" + +#define __always_unused __attribute__((unused)) + +char _license[] SEC("license") = "GPL"; + +struct sock { +} __attribute__((preserve_access_index)); + +struct bpf_iter__sockmap { + union { + struct sock *sk; + }; +} __attribute__((preserve_access_index)); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sockhash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sockmap SEC(".maps"); + +enum { CG_OK = 1 }; + +int zero = 0; + +static __always_inline void test_sockmap_delete(void) +{ + bpf_map_delete_elem(&sockmap, &zero); + bpf_map_delete_elem(&sockhash, &zero); +} + +static __always_inline void test_sockmap_update(void *sk) +{ + if (sk) { + bpf_map_update_elem(&sockmap, &zero, sk, BPF_ANY); + bpf_map_update_elem(&sockhash, &zero, sk, BPF_ANY); + } +} + +static __always_inline void test_sockmap_lookup_and_update(void) +{ + struct bpf_sock *sk = bpf_map_lookup_elem(&sockmap, &zero); + + if (sk) { + test_sockmap_update(sk); + bpf_sk_release(sk); + } +} + +static __always_inline void test_sockmap_mutate(void *sk) +{ + test_sockmap_delete(); + test_sockmap_update(sk); +} + +static __always_inline void test_sockmap_lookup_and_mutate(void) +{ + test_sockmap_delete(); + test_sockmap_lookup_and_update(); +} + +SEC("action") +__success +int test_sched_act(struct __sk_buff *skb) +{ + test_sockmap_mutate(skb->sk); + return 0; +} + +SEC("classifier") +__success +int test_sched_cls(struct __sk_buff *skb) +{ + test_sockmap_mutate(skb->sk); + return 0; +} + +SEC("flow_dissector") +__success +int test_flow_dissector_delete(struct __sk_buff *skb __always_unused) +{ + test_sockmap_delete(); + return 0; +} + +SEC("flow_dissector") +__failure __msg("program of this type cannot use helper bpf_sk_release") +int test_flow_dissector_update(struct __sk_buff *skb 
__always_unused) +{ + test_sockmap_lookup_and_update(); /* no access to skb->sk */ + return 0; +} + +SEC("iter/sockmap") +__success +int test_trace_iter(struct bpf_iter__sockmap *ctx) +{ + test_sockmap_mutate(ctx->sk); + return 0; +} + +SEC("raw_tp/kfree") +__failure __msg("cannot update sockmap in this context") +int test_raw_tp_delete(const void *ctx __always_unused) +{ + test_sockmap_delete(); + return 0; +} + +SEC("raw_tp/kfree") +__failure __msg("cannot update sockmap in this context") +int test_raw_tp_update(const void *ctx __always_unused) +{ + test_sockmap_lookup_and_update(); + return 0; +} + +SEC("sk_lookup") +__success +int test_sk_lookup(struct bpf_sk_lookup *ctx) +{ + test_sockmap_mutate(ctx->sk); + return 0; +} + +SEC("sk_reuseport") +__success +int test_sk_reuseport(struct sk_reuseport_md *ctx) +{ + test_sockmap_mutate(ctx->sk); + return 0; +} + +SEC("socket") +__success +int test_socket_filter(struct __sk_buff *skb) +{ + test_sockmap_mutate(skb->sk); + return 0; +} + +SEC("sockops") +__success +int test_sockops_delete(struct bpf_sock_ops *ctx __always_unused) +{ + test_sockmap_delete(); + return CG_OK; +} + +SEC("sockops") +__failure __msg("cannot update sockmap in this context") +int test_sockops_update(struct bpf_sock_ops *ctx) +{ + test_sockmap_update(ctx->sk); + return CG_OK; +} + +SEC("sockops") +__success +int test_sockops_update_dedicated(struct bpf_sock_ops *ctx) +{ + bpf_sock_map_update(ctx, &sockmap, &zero, BPF_ANY); + bpf_sock_hash_update(ctx, &sockhash, &zero, BPF_ANY); + return CG_OK; +} + +SEC("xdp") +__success +int test_xdp(struct xdp_md *ctx __always_unused) +{ + test_sockmap_lookup_and_mutate(); + return XDP_PASS; +} From d9ff882b54f99f96787fa3df7cd938966843c418 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 27 May 2024 13:34:45 +0900 Subject: [PATCH 098/279] null_blk: Fix return value of nullb_device_power_store() When powering on a null_blk device that is not already on, the return value ret that is initialized to be 
count is reused to check the return value of null_add_dev(), leading to nullb_device_power_store() to return null_add_dev() return value (0 on success) instead of "count". So make sure to set ret to be equal to count when there are no errors. Fixes: a2db328b0839 ("null_blk: fix null-ptr-dereference while configuring 'power' and 'submit_queues'") Signed-off-by: Damien Le Moal Reviewed-by: Yu Kuai Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20240527043445.235267-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index eb023d267369..631dca2e4e84 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -494,6 +494,7 @@ static ssize_t nullb_device_power_store(struct config_item *item, set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; + ret = count; } else if (dev->power && !newp) { if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { dev->power = newp; From 30a0e3135f9aa14ba745f767375183b3112d7440 Mon Sep 17 00:00:00 2001 From: hexue Date: Mon, 27 May 2024 16:45:33 +0800 Subject: [PATCH 099/279] block: delete redundant function declaration blk_stats_alloc_enable was used for block hybrid poll, the related function definition was removed by patch: commit 54bdd67d0f88 ("blk-mq: remove hybrid polling") but the function declaration was not deleted. 
Signed-off-by: hexue Link: https://lore.kernel.org/r/20240527084533.1485210-1-xue01.he@samsung.com Signed-off-by: Jens Axboe --- block/blk-stat.h | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-stat.h b/block/blk-stat.h index 17e1eb4ec7e2..5d7f18ba436d 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -64,7 +64,6 @@ struct blk_stat_callback { struct blk_queue_stats *blk_alloc_queue_stats(void); void blk_free_queue_stats(struct blk_queue_stats *); -bool blk_stats_alloc_enable(struct request_queue *q); void blk_stat_add(struct request *rq, u64 now); From 7b05ab85e28f615e70520d24c075249b4512044e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 23 May 2024 14:02:57 +0300 Subject: [PATCH 100/279] ipv4: Fix address dump when IPv4 is disabled on an interface Cited commit started returning an error when user space requests to dump the interface's IPv4 addresses and IPv4 is disabled on the interface. Restore the previous behavior and do not return an error. Before cited commit: # ip address show dev dummy1 10: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether e2:40:68:98:d0:18 brd ff:ff:ff:ff:ff:ff inet6 fe80::e040:68ff:fe98:d018/64 scope link proto kernel_ll valid_lft forever preferred_lft forever # ip link set dev dummy1 mtu 67 # ip address show dev dummy1 10: dummy1: mtu 67 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether e2:40:68:98:d0:18 brd ff:ff:ff:ff:ff:ff After cited commit: # ip address show dev dummy1 10: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 32:2d:69:f2:9c:99 brd ff:ff:ff:ff:ff:ff inet6 fe80::302d:69ff:fef2:9c99/64 scope link proto kernel_ll valid_lft forever preferred_lft forever # ip link set dev dummy1 mtu 67 # ip address show dev dummy1 RTNETLINK answers: No such device Dump terminated With this patch: # ip address show dev dummy1 10: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether de:17:56:bb:57:c0 brd 
ff:ff:ff:ff:ff:ff inet6 fe80::dc17:56ff:febb:57c0/64 scope link proto kernel_ll valid_lft forever preferred_lft forever # ip link set dev dummy1 mtu 67 # ip address show dev dummy1 10: dummy1: mtu 67 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether de:17:56:bb:57:c0 brd ff:ff:ff:ff:ff:ff I fixed the exact same issue for IPv6 in commit c04f7dfe6ec2 ("ipv6: Fix address dump when IPv6 is disabled on an interface"), but noted [1] that I am not doing the change for IPv4 because I am not aware of a way to disable IPv4 on an interface other than unregistering it. I clearly missed the above case. [1] https://lore.kernel.org/netdev/20240321173042.2151756-1-idosch@nvidia.com/ Fixes: cdb2f80f1c10 ("inet: use xa_array iterator to implement inet_dump_ifaddr()") Reported-by: Carolina Jubran Reported-by: Yamen Safadi Tested-by: Carolina Jubran Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240523110257.334315-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/devinet.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 96accde527da..e827da128c5f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1887,10 +1887,11 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) goto done; if (fillargs.ifindex) { - err = -ENODEV; dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); - if (!dev) + if (!dev) { + err = -ENODEV; goto done; + } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto done; From be008726d0ac338a6bb19c2da2853e3e2112b055 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 23 May 2024 10:13:45 -0400 Subject: [PATCH 101/279] net: gro: initialize network_offset in network layer Syzkaller was able to trigger kernel BUG at net/core/gro.c:424 ! 
RIP: 0010:gro_pull_from_frag0 net/core/gro.c:424 [inline] RIP: 0010:gro_try_pull_from_frag0 net/core/gro.c:446 [inline] RIP: 0010:dev_gro_receive+0x242f/0x24b0 net/core/gro.c:571 Due to using an incorrect NAPI_GRO_CB(skb)->network_offset. The referenced commit sets this offset to 0 in skb_gro_reset_offset. That matches the expected case in dev_gro_receive: pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, &gro_list->list, skb); But syzkaller injected an skb with protocol ETH_P_TEB into an ip6gre device (by writing the IP6GRE encapsulated version to a TAP device). The result was a first call to eth_gro_receive, and thus an extra ETH_HLEN in network_offset that should not be there. First issue hit is when computing offset from network header in ipv6_gro_pull_exthdrs. Initialize both offsets in the network layer gro_receive. This pairs with all reads in gro_receive, which use skb_gro_receive_network_offset(). Fixes: 186b1ea73ad8 ("net: gro: use cb instead of skb->network_header") Reported-by: syzkaller Signed-off-by: Willem de Bruijn CC: Richard Gobert Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240523141434.1752483-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/af_inet.c | 2 +- net/ipv6/ip6_offload.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e03ba4a21c39..b24d74616637 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1532,7 +1532,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) } NAPI_GRO_CB(skb)->flush |= flush; - NAPI_GRO_CB(skb)->inner_network_offset = off; + NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; /* Note : No need to call skb_gro_postpull_rcsum() here, * as we already checked checksum over ipv4 header was 0 diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index bd5aff97d8b1..9822163428b0 100644 --- 
a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -236,7 +236,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, if (unlikely(!iph)) goto out; - NAPI_GRO_CB(skb)->inner_network_offset = off; + NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; flush += ntohs(iph->payload_len) != skb->len - hlen; From f4dca95fc0f6350918f2e6727e35b41f7f86fcce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 May 2024 13:05:27 +0000 Subject: [PATCH 102/279] tcp: reduce accepted window in NEW_SYN_RECV state Jason commit made checks against ACK sequence less strict and can be exploited by attackers to establish spoofed flows with less probes. Innocent users might use tcp_rmem[1] == 1,000,000,000, or something more reasonable. An attacker can use a regular TCP connection to learn the server initial tp->rcv_wnd, and use it to optimize the attack. If we make sure that only the announced window (smaller than 65535) is used for ACK validation, we force an attacker to use 65537 packets to complete the 3WHS (assuming server ISN is unknown) Fixes: 378979e94e95 ("tcp: remove 64 KByte limit for initial tp->rcv_wnd value") Link: https://datatracker.ietf.org/meeting/119/materials/slides-119-tcpm-ghost-acks-00 Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20240523130528.60376-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/request_sock.h | 12 ++++++++++++ net/ipv4/tcp_ipv4.c | 7 +------ net/ipv4/tcp_minisocks.c | 7 +++++-- net/ipv6/tcp_ipv6.c | 7 +------ 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/include/net/request_sock.h b/include/net/request_sock.h index d88c0dfc2d46..ebcb8896bffc 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -285,4 +285,16 @@ static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) return atomic_read(&queue->young); } +/* RFC 7323 2.3 Using the Window Scale 
Option + * The window field (SEG.WND) of every outgoing segment, with the + * exception of <SYN> segments, MUST be right-shifted by + * Rcv.Wind.Shift bits. + * + * This means the SEG.WND carried in SYNACK can not exceed 65535. + * We use this property to harden TCP stack while in NEW_SYN_RECV state. + */ +static inline u32 tcp_synack_window(const struct request_sock *req) +{ + return min(req->rsk_rcv_wnd, 65535U); +} #endif /* _REQUEST_SOCK_H */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 30ef0c8f5e92..b710958393e6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1144,14 +1144,9 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, #endif } - /* RFC 7323 2.3 - * The window field (SEG.WND) of every outgoing segment, with the - * exception of <SYN> segments, MUST be right-shifted by - * Rcv.Wind.Shift bits: - */ tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, - req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), 0, &key, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b93619b2384b..538c06f95918 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -783,8 +783,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ - if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { + if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + tcp_rsk(req)->rcv_nxt, + tcp_rsk(req)->rcv_nxt + + tcp_synack_window(req))) { /* Out of window: send ACK and drop. 
*/ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4c3605485b68..8c577b651bfc 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1272,15 +1272,10 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - /* RFC 7323 2.3 - * The window field (SEG.WND) of every outgoing segment, with the - * exception of <SYN> segments, MUST be right-shifted by - * Rcv.Wind.Shift bits: - */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, - req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, From 0fe53c0ab018b3399b8d4be95f32fd017c9719e1 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 23 May 2024 12:17:31 -0500 Subject: [PATCH 103/279] dt-bindings: net: pse-pd: microchip,pd692x0: Fix missing "additionalProperties" constraints The child nodes are missing "additionalProperties" constraints which means any undocumented properties or child nodes are allowed. Add the constraints, and fix the fallout of wrong manager node regex and missing properties. 
Fixes: 9c1de033afad ("dt-bindings: net: pse-pd: Add bindings for PD692x0 PSE controller") Signed-off-by: Rob Herring (Arm) Acked-by: Kory Maincent Link: https://lore.kernel.org/r/20240523171732.2836880-1-robh@kernel.org Signed-off-by: Jakub Kicinski --- .../bindings/net/pse-pd/microchip,pd692x0.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/net/pse-pd/microchip,pd692x0.yaml b/Documentation/devicetree/bindings/net/pse-pd/microchip,pd692x0.yaml index 828439398fdf..fd4244fceced 100644 --- a/Documentation/devicetree/bindings/net/pse-pd/microchip,pd692x0.yaml +++ b/Documentation/devicetree/bindings/net/pse-pd/microchip,pd692x0.yaml @@ -24,6 +24,7 @@ properties: managers: type: object + additionalProperties: false description: List of the PD69208T4/PD69204T4/PD69208M PSE managers. Each manager have 4 or 8 physical ports according to the chip version. No need to @@ -47,8 +48,9 @@ properties: - "#size-cells" patternProperties: - "^manager@0[0-9a-b]$": + "^manager@[0-9a-b]$": type: object + additionalProperties: false description: PD69208T4/PD69204T4/PD69208M PSE manager exposing 4 or 8 physical ports. @@ -69,9 +71,14 @@ properties: patternProperties: '^port@[0-7]$': type: object + additionalProperties: false + + properties: + reg: + maxItems: 1 + required: - reg - additionalProperties: false required: - reg From 12f86b9af96a8b09969e4392311602f787b40834 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 23 May 2024 12:17:50 -0500 Subject: [PATCH 104/279] dt-bindings: net: pse-pd: ti,tps23881: Fix missing "additionalProperties" constraints The child nodes are missing "additionalProperties" constraints which means any undocumented properties or child nodes are allowed. Add the constraints and all the undocumented properties exposed by the fix. 
Fixes: f562202fedad ("dt-bindings: net: pse-pd: Add bindings for TPS23881 PSE controller") Signed-off-by: Rob Herring (Arm) Acked-by: Kory Maincent Link: https://lore.kernel.org/r/20240523171750.2837331-1-robh@kernel.org Signed-off-by: Jakub Kicinski --- .../bindings/net/pse-pd/ti,tps23881.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/devicetree/bindings/net/pse-pd/ti,tps23881.yaml b/Documentation/devicetree/bindings/net/pse-pd/ti,tps23881.yaml index 4147adb11e10..6992d56832bf 100644 --- a/Documentation/devicetree/bindings/net/pse-pd/ti,tps23881.yaml +++ b/Documentation/devicetree/bindings/net/pse-pd/ti,tps23881.yaml @@ -29,13 +29,31 @@ properties: of the ports conversion matrix that establishes relationship between the logical ports and the physical channels. type: object + additionalProperties: false + + properties: + "#address-cells": + const: 1 + + "#size-cells": + const: 0 patternProperties: '^channel@[0-7]$': type: object + additionalProperties: false + + properties: + reg: + maxItems: 1 + required: - reg + required: + - "#address-cells" + - "#size-cells" + unevaluatedProperties: false required: From bf0497f53c8535f99b72041529d3f7708a6e2c0d Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Fri, 24 May 2024 13:05:28 +0800 Subject: [PATCH 105/279] net:fec: Add fec_enet_deinit() When fec_probe() fails or fec_drv_remove() needs to release the fec queue and remove a NAPI context, therefore add a function corresponding to fec_enet_init() and call fec_enet_deinit() which does the opposite to release memory and remove a NAPI context. 
Fixes: 59d0f7465644 ("net: fec: init multi queue date structure") Signed-off-by: Xiaolei Wang Reviewed-by: Wei Fang Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240524050528.4115581-1-xiaolei.wang@windriver.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index a72d8a2eb0b3..881ece735dcf 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -4130,6 +4130,14 @@ static int fec_enet_init(struct net_device *ndev) return ret; } +static void fec_enet_deinit(struct net_device *ndev) +{ + struct fec_enet_private *fep = netdev_priv(ndev); + + netif_napi_del(&fep->napi); + fec_enet_free_queue(ndev); +} + #ifdef CONFIG_OF static int fec_reset_phy(struct platform_device *pdev) { @@ -4524,6 +4532,7 @@ fec_probe(struct platform_device *pdev) fec_enet_mii_remove(fep); failed_mii_init: failed_irq: + fec_enet_deinit(ndev); failed_init: fec_ptp_stop(pdev); failed_reset: @@ -4587,6 +4596,7 @@ fec_drv_remove(struct platform_device *pdev) pm_runtime_put_noidle(&pdev->dev); pm_runtime_disable(&pdev->dev); + fec_enet_deinit(ndev); free_netdev(ndev); } From 4fb679040d9f758eeb3b4d01bbde6405bf20e64e Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Fri, 24 May 2024 10:53:50 +0200 Subject: [PATCH 106/279] net: micrel: Fix lan8841_config_intr after getting out of sleep mode When the interrupt is enabled, the function lan8841_config_intr tries to clear any pending interrupts by reading the interrupt status, then checks the return value for errors and then continue to enable the interrupt. It has been seen that once the system gets out of sleep mode, the interrupt status has the value 0x400 meaning that the PHY detected that the link was in low power. That is correct value but the problem is that the check is wrong. 
We try to check for errors but we return an error also in this case which is not an error. Therefore fix this by returning only when there is an error. Fixes: a8f1a19d27ef ("net: micrel: Add support for lan8841 PHY") Signed-off-by: Horatiu Vultur Reviewed-by: Suman Ghosh Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20240524085350.359812-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 1d769322b059..2b8f8b7f1517 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -4029,7 +4029,7 @@ static int lan8841_config_intr(struct phy_device *phydev) if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { err = phy_read(phydev, LAN8814_INTS); - if (err) + if (err < 0) return err; /* Enable / disable interrupts. It is OK to enable PTP interrupt @@ -4045,6 +4045,14 @@ static int lan8841_config_intr(struct phy_device *phydev) return err; err = phy_read(phydev, LAN8814_INTS); + if (err < 0) + return err; + + /* Getting a positive value doesn't mean that is an error, it + * just indicates what was the status. Therefore make sure to + * clear the value and say that there is no error. + */ + err = 0; } return err; From 266aa3b4812e97942a8ce5c7aafa7da059f7b5b8 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 24 May 2024 13:28:59 +0200 Subject: [PATCH 107/279] page_pool: fix &page_pool_params kdoc issues After the tagged commit, @netdev got documented twice and the kdoc script didn't notice that. Remove the second description added later and move the initial one according to the field position. After merging commit 5f8e4007c10d ("kernel-doc: fix struct_group_tagged() parsing"), kdoc requires to describe struct groups as well. &page_pool_params has 2 struct groups which generated new warnings, describe them to resolve this. 
Fixes: 403f11ac9ab7 ("page_pool: don't use driver-set flags field directly") Signed-off-by: Alexander Lobakin Link: https://lore.kernel.org/r/20240524112859.2757403-1-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index b088d131aeb0..7e8477057f3d 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -45,16 +45,17 @@ struct pp_alloc_cache { /** * struct page_pool_params - page pool parameters + * @fast: params accessed frequently on hotpath * @order: 2^order pages on allocation * @pool_size: size of the ptr_ring * @nid: NUMA node id to allocate from pages from * @dev: device, for DMA pre-mapping purposes - * @netdev: netdev this pool will serve (leave as NULL if none or multiple) * @napi: NAPI which is the sole consumer of pages, otherwise NULL * @dma_dir: DMA mapping direction * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV - * @netdev: corresponding &net_device for Netlink introspection + * @slow: params with slowpath access only (initialization and Netlink) + * @netdev: netdev this pool will serve (leave as NULL if none or multiple) * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_SYSTEM_POOL */ struct page_pool_params { From d514c8b54209de7a95ab37259fe32c7406976bd9 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 23 May 2024 10:45:29 -0700 Subject: [PATCH 108/279] idpf: don't enable NAPI and interrupts prior to allocating Rx buffers Currently, idpf enables NAPI and interrupts prior to allocating Rx buffers. This may lead to frame loss (there are no buffers to place incoming frames) and even crashes on quick ifup-ifdown. Interrupts must be enabled only after all the resources are here and available. 
Split interrupt init into two phases: initialization and enabling, and perform the second only after the queues are fully initialized. Note that we can't just move interrupt initialization down the init process, as the queues must have a correct ::q_vector pointer set and NAPI already added in order to allocate buffers correctly. Also, during the deinit process, disable HW interrupts first and only then disable NAPI. Otherwise, there can be a HW event leading to napi_schedule(), but the NAPI will already be unavailable. Fixes: d4d558718266 ("idpf: initialize interrupts and enable vport") Reported-by: Michal Kubiak Reviewed-by: Wojciech Drewek Signed-off-by: Alexander Lobakin Reviewed-by: Simon Horman Tested-by: Krishneil Singh Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240523-net-2024-05-23-intel-net-fixes-v1-1-17a923e0bb5f@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_lib.c | 1 + drivers/net/ethernet/intel/idpf/idpf_txrx.c | 12 +++++++----- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index 52ceda6306a3..f1ee5584e8fa 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -1394,6 +1394,7 @@ static int idpf_vport_open(struct idpf_vport *vport, bool alloc_res) } idpf_rx_init_buf_tail(vport); + idpf_vport_intr_ena(vport); err = idpf_send_config_queues_msg(vport); if (err) { diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 285da2177ee4..b023704bbbda 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3746,9 +3746,9 @@ static void idpf_vport_intr_ena_irq_all(struct idpf_vport *vport) */ void idpf_vport_intr_deinit(struct idpf_vport *vport) { + idpf_vport_intr_dis_irq_all(vport);
idpf_vport_intr_napi_dis_all(vport); idpf_vport_intr_napi_del_all(vport); - idpf_vport_intr_dis_irq_all(vport); idpf_vport_intr_rel_irq(vport); } @@ -4179,7 +4179,6 @@ int idpf_vport_intr_init(struct idpf_vport *vport) idpf_vport_intr_map_vector_to_qs(vport); idpf_vport_intr_napi_add_all(vport); - idpf_vport_intr_napi_ena_all(vport); err = vport->adapter->dev_ops.reg_ops.intr_reg_init(vport); if (err) @@ -4193,17 +4192,20 @@ int idpf_vport_intr_init(struct idpf_vport *vport) if (err) goto unroll_vectors_alloc; - idpf_vport_intr_ena_irq_all(vport); - return 0; unroll_vectors_alloc: - idpf_vport_intr_napi_dis_all(vport); idpf_vport_intr_napi_del_all(vport); return err; } +void idpf_vport_intr_ena(struct idpf_vport *vport) +{ + idpf_vport_intr_napi_ena_all(vport); + idpf_vport_intr_ena_irq_all(vport); +} + /** * idpf_config_rss - Send virtchnl messages to configure RSS * @vport: virtual port diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 3d046b81e507..551391e20464 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -990,6 +990,7 @@ int idpf_vport_intr_alloc(struct idpf_vport *vport); void idpf_vport_intr_update_itr_ena_irq(struct idpf_q_vector *q_vector); void idpf_vport_intr_deinit(struct idpf_vport *vport); int idpf_vport_intr_init(struct idpf_vport *vport); +void idpf_vport_intr_ena(struct idpf_vport *vport); enum pkt_hash_types idpf_ptype_to_htype(const struct idpf_rx_ptype_decoded *decoded); int idpf_config_rss(struct idpf_vport *vport); int idpf_init_rss(struct idpf_vport *vport); From 82617b9a04649e83ee8731918aeadbb6e6d7cbc7 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 23 May 2024 10:45:30 -0700 Subject: [PATCH 109/279] ice: fix accounting if a VLAN already exists The ice_vsi_add_vlan() function is used to add a VLAN filter for the target VSI. This function prepares a filter in the switch table for the given VSI. 
If it succeeds, the vsi->num_vlan counter is incremented. It is not considered an error to add a VLAN which already exists in the switch table, so the function explicitly checks and ignores -EEXIST. The vsi->num_vlan counter is still incremented. This seems incorrect, as it means we can double-count in the case where the same VLAN is added twice by the caller. The actual table will have one less filter than the count. The ice_vsi_del_vlan() function similarly checks and handles the -ENOENT condition when deleting a filter that doesn't exist. This flow only decrements the vsi->num_vlan if it actually deleted a filter. The vsi->num_vlan counter is used only in a few places, primarily related to tracking the number of non-zero VLANs. If vsi->num_vlan gets out of sync, then ice_vsi_num_non_zero_vlans() will incorrectly report more VLANs than are present, and ice_vsi_has_non_zero_vlans() could return true potentially in cases where there are only VLAN 0 filters left. Fix this by only incrementing the vsi->num_vlan in the case where we actually added an entry, and not in the case where the entry already existed.
Fixes: a1ffafb0b4a4 ("ice: Support configuring the device to Double VLAN Mode") Signed-off-by: Jacob Keller Tested-by: Pucha Himasekhar Reddy Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240523-net-2024-05-23-intel-net-fixes-v1-2-17a923e0bb5f@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c index 2e9ad27cb9d1..6e8f2aab6080 100644 --- a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c @@ -45,14 +45,15 @@ int ice_vsi_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) return -EINVAL; err = ice_fltr_add_vlan(vsi, vlan); - if (err && err != -EEXIST) { + if (!err) + vsi->num_vlan++; + else if (err == -EEXIST) + err = 0; + else dev_err(ice_pf_to_dev(vsi->back), "Failure Adding VLAN %d on VSI %i, status %d\n", vlan->vid, vsi->vsi_num, err); - return err; - } - vsi->num_vlan++; - return 0; + return err; } /** From 5597613fb3cf0e36d26cfd8fb2a63196da249333 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:56 +0200 Subject: [PATCH 110/279] selftests: mptcp: lib: support flaky subtests Some subtests can be unstable, failing once every X runs. Fixing them can take time: there could be an issue in the kernel or in the subtest, and it is then important to do a proper analysis, not to hide real bugs. To avoid creating noises on the different CIs, it is important to have a simple way to mark subtests as flaky, and ignore the errors. This is what this patch introduces: subtests can be marked as flaky by setting MPTCP_LIB_SUBTEST_FLAKY env var to 1, e.g. MPTCP_LIB_SUBTEST_FLAKY=1 The subtest will be executed, and errors (if any) will be ignored. 
It is still good to run these subtests, as it exercises code, and the results can still be useful for the on-going investigations. Note that the MPTCP CI will continue to track these flaky subtests by setting SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY env var to 1, and a ticket has to be created before marking subtests as flaky. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-1-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_lib.sh | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index ad2ebda5cb64..6ffa9b7a3260 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -21,6 +21,7 @@ declare -rx MPTCP_LIB_AF_INET6=10 MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 +MPTCP_LIB_SUBTEST_FLAKY=0 MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" MPTCP_LIB_IP_MPTCP=0 @@ -41,6 +42,16 @@ else readonly MPTCP_LIB_COLOR_RESET= fi +# SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY env var can be set not to ignore errors +# from subtests marked as flaky +mptcp_lib_override_flaky() { + [ "${SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY:-}" = 1 ] +} + +mptcp_lib_subtest_is_flaky() { + [ "${MPTCP_LIB_SUBTEST_FLAKY}" = 1 ] && ! 
mptcp_lib_override_flaky +} + # $1: color, $2: text mptcp_lib_print_color() { echo -e "${MPTCP_LIB_START_PRINT:-}${*}${MPTCP_LIB_COLOR_RESET}" @@ -72,7 +83,16 @@ mptcp_lib_pr_skip() { } mptcp_lib_pr_fail() { - mptcp_lib_print_err "[FAIL]${1:+ ${*}}" + local title cmt + + if mptcp_lib_subtest_is_flaky; then + title="IGNO" + cmt=" (flaky)" + else + title="FAIL" + fi + + mptcp_lib_print_err "[${title}]${cmt}${1:+ ${*}}" } mptcp_lib_pr_info() { @@ -208,7 +228,13 @@ mptcp_lib_result_pass() { # $1: test name mptcp_lib_result_fail() { - __mptcp_lib_result_add "not ok" "${1}" + if mptcp_lib_subtest_is_flaky; then + # It might sound better to use 'not ok # TODO' or 'ok # SKIP', + # but some CIs don't understand 'TODO' and treat SKIP as errors. + __mptcp_lib_result_add "ok" "${1} # IGNORE Flaky" + else + __mptcp_lib_result_add "not ok" "${1}" + fi } # $1: test name From cc73a6577ae64247898269d138dee6b73ff710cc Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:57 +0200 Subject: [PATCH 111/279] selftests: mptcp: simult flows: mark 'unbalanced' tests as flaky These tests are flaky since their introduction. This might be less or not visible depending on the CI running the tests, especially if it is also busy doing other tasks in parallel. A first analysis shown that the transfer can be slowed down when there are some re-injections at the MPTCP level. Such re-injections can of course happen, and disturb the transfer, but it looks strange to have them in this lab. That could be caused by the kernel having access to less CPU cycles -- e.g. when other activities are executed in parallel -- or by a misinterpretation on the MPTCP packet scheduler side. While this is being investigated, the tests are marked as flaky not to create noises in other CIs. 
Fixes: 219d04992b68 ("mptcp: push pending frames when subflow has free space") Link: https://github.com/multipath-tcp/mptcp_net-next/issues/475 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-2-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/simult_flows.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 4b14b4412166..f74e1c3c126d 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -244,7 +244,7 @@ run_test() do_transfer $small $large $time lret=$? mptcp_lib_result_code "${lret}" "${msg}" - if [ $lret -ne 0 ]; then + if [ $lret -ne 0 ] && ! mptcp_lib_subtest_is_flaky; then ret=$lret [ $bail -eq 0 ] || exit $ret fi @@ -254,7 +254,7 @@ run_test() do_transfer $large $small $time lret=$? mptcp_lib_result_code "${lret}" "${msg}" - if [ $lret -ne 0 ]; then + if [ $lret -ne 0 ] && ! mptcp_lib_subtest_is_flaky; then ret=$lret [ $bail -eq 0 ] || exit $ret fi @@ -290,7 +290,7 @@ run_test 10 10 0 0 "balanced bwidth" run_test 10 10 1 25 "balanced bwidth with unbalanced delay" # we still need some additional infrastructure to pass the following test-cases -run_test 10 3 0 0 "unbalanced bwidth" +MPTCP_LIB_SUBTEST_FLAKY=1 run_test 10 3 0 0 "unbalanced bwidth" run_test 10 3 1 25 "unbalanced bwidth with unbalanced delay" run_test 10 3 25 1 "unbalanced bwidth with opposed, unbalanced delay" From 8c06ac2178a9dee887929232226e35a5cdda1793 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:58 +0200 Subject: [PATCH 112/279] selftests: mptcp: join: mark 'fastclose' tests as flaky These tests are flaky since their introduction. 
This might be less or not visible depending on the CI running the tests, especially if it is also busy doing other tasks in parallel, and if a debug kernel config is being used. It looks like this issue is often present with the NetDev CI. While this is being investigated, the tests are marked as flaky not to create noises on such CIs. Fixes: 01542c9bf9ab ("selftests: mptcp: add fastclose testcase") Link: https://github.com/multipath-tcp/mptcp_net-next/issues/324 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-3-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index fefa9173bdaa..b869b46823d7 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -261,6 +261,8 @@ reset() TEST_NAME="${1}" + MPTCP_LIB_SUBTEST_FLAKY=0 # reset if modified + if skip_test; then MPTCP_LIB_TEST_COUNTER=$((MPTCP_LIB_TEST_COUNTER+1)) last_test_ignored=1 @@ -448,7 +450,9 @@ reset_with_tcp_filter() # $1: err msg fail_test() { - ret=${KSFT_FAIL} + if ! 
mptcp_lib_subtest_is_flaky; then + ret=${KSFT_FAIL} + fi if [ ${#} -gt 0 ]; then print_fail "${@}" @@ -3069,6 +3073,7 @@ fullmesh_tests() fastclose_tests() { if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=client \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 @@ -3077,6 +3082,7 @@ fastclose_tests() fi if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 0 0 0 1 From 38af56e6668b455f7dd0a8e2d9afe74100068e17 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:59 +0200 Subject: [PATCH 113/279] selftests: mptcp: join: mark 'fail' tests as flaky These tests are rarely unstable. It depends on the CI running the tests, especially if it is also busy doing other tasks in parallel, and if a debug kernel config is being used. It looks like this issue is sometimes present with the NetDev CI. While this is being investigated, the tests are marked as flaky not to create noises on such CIs. 
Fixes: b6e074e171bc ("selftests: mptcp: add infinite map testcase") Link: https://github.com/multipath-tcp/mptcp_net-next/issues/491 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-4-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index b869b46823d7..2b66c5fa71eb 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3101,6 +3101,7 @@ fail_tests() { # single subflow if reset_with_fail "Infinite map" 1; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=128 \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)" @@ -3109,6 +3110,7 @@ fail_tests() # multiple subflows if reset_with_fail "MP_FAIL MP_RST" 2; then + MPTCP_LIB_SUBTEST_FLAKY=1 tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5ms pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 From c519cf9b7434183bb56ed1e200ac577a5fd34d9b Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 27 May 2024 12:36:19 +0200 Subject: [PATCH 114/279] docs: netdev: Fix typo in Signed-off-by tag s/of/off/ Signed-off-by: Thorsten Blum Fixes: e110ba659271 ("docs: netdev: add note about Changes Requested and revising commit messages") Link: https://lore.kernel.org/r/20240527103618.265801-2-thorsten.blum@toblux.com Signed-off-by: Jakub Kicinski --- Documentation/process/maintainer-netdev.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/maintainer-netdev.rst b/Documentation/process/maintainer-netdev.rst index fd96e4a3cef9..5e1fcfad1c4c 100644 --- a/Documentation/process/maintainer-netdev.rst +++ b/Documentation/process/maintainer-netdev.rst @@ -227,7 +227,7 @@ preferably including 
links to previous postings, for example:: The amount of mooing will depend on packet rate so should match the diurnal cycle quite well. - Signed-of-by: Joe Defarmer + Signed-off-by: Joe Defarmer --- v3: - add a note about time-of-day mooing fluctuation to the commit message From 77b79df0268bee3ef38fd5e76e86a076ce02995d Mon Sep 17 00:00:00 2001 From: Himal Prasad Ghimiray Date: Wed, 8 May 2024 20:52:15 +0530 Subject: [PATCH 115/279] drm/xe: Change pcode timeout to 50msec while polling again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Polling is initially attempted with timeout_base_ms enabled for preemption, and if it exceeds this timeframe, another attempt is made without preemption, allowing an additional 50 ms before timing out. v2 - Rebase v3 - Move warnings to separate patch (Lucas) Cc: Lucas De Marchi Cc: Rodrigo Vivi Signed-off-by: Himal Prasad Ghimiray Fixes: 7dc9b92dcfef ("drm/xe: Remove i915_utils dependency from xe_pcode.") Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240508152216.3263109-2-himal.prasad.ghimiray@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit c81858eb52266b3d6ba28ca4f62a198231a10cdc) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_pcode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_pcode.c b/drivers/gpu/drm/xe/xe_pcode.c index c010ef16fbf5..a5e7da8cf944 100644 --- a/drivers/gpu/drm/xe/xe_pcode.c +++ b/drivers/gpu/drm/xe/xe_pcode.c @@ -191,7 +191,7 @@ int xe_pcode_request(struct xe_gt *gt, u32 mbox, u32 request, drm_WARN_ON_ONCE(&gt_to_xe(gt)->drm, timeout_base_ms > 1); preempt_disable(); ret = pcode_try_request(gt, mbox, request, reply_mask, reply, &status, - true, timeout_base_ms * 1000, true); + true, 50 * 1000, true); preempt_enable(); out: From c8ea2c31f5ea437199b239d76ad5db27343edb0c Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 15 Apr 2024 12:04:53 -0700 Subject: [PATCH
116/279] drm/xe: Only use reserved BCS instances for usm migrate exec queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GuC context scheduling queue is 2 entries deep, thus it is possible for a migration job to be stuck behind a fault if migration exec queue shares engines with user jobs. This can deadlock as the migrate exec queue is required to service page faults. Avoid deadlock by only using reserved BCS instances for usm migrate exec queue. Fixes: a043fbab7af5 ("drm/xe/pvc: Use fast copy engines as migrate engine on PVC") Cc: Matt Roper Cc: Niranjana Vishwanathapura Signed-off-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240415190453.696553-2-matthew.brost@intel.com Reviewed-by: Brian Welty (cherry picked from commit 04f4a70a183a688a60fe3882d6e4236ea02cfc67) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_migrate.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 9f6e9b7f11c8..65e5a3f4c340 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -34,7 +34,6 @@ #include "xe_sync.h" #include "xe_trace.h" #include "xe_vm.h" -#include "xe_wa.h" /** * struct xe_migrate - migrate context. @@ -300,10 +299,6 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, } /* - * Due to workaround 16017236439, odd instance hardware copy engines are - * faster than even instance ones. - * This function returns the mask involving all fast copy engines and the - * reserved copy engine to be used as logical mask for migrate engine. * Including the reserved copy engine is required to avoid deadlocks due to * migrate jobs servicing the faults gets stuck behind the job that faulted.
*/ @@ -317,8 +312,7 @@ static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt) if (hwe->class != XE_ENGINE_CLASS_COPY) continue; - if (!XE_WA(gt, 16017236439) || - xe_gt_is_usm_hwe(gt, hwe) || hwe->instance & 1) + if (xe_gt_is_usm_hwe(gt, hwe)) logical_mask |= BIT(hwe->logical_instance); } @@ -369,6 +363,10 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile) if (!hwe || !logical_mask) return ERR_PTR(-EINVAL); + /* + * XXX: Currently only reserving 1 (likely slow) BCS instance on + * PVC, may want to revisit if performance is needed. + */ m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe, EXEC_QUEUE_FLAG_KERNEL | EXEC_QUEUE_FLAG_PERMANENT | From 6c5cd0807c79eb4c0cda70b48f6be668a241d584 Mon Sep 17 00:00:00 2001 From: Niranjana Vishwanathapura Date: Tue, 21 May 2024 13:17:11 -0700 Subject: [PATCH 117/279] drm/xe: Properly handle alloc_guc_id() failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Release the submission_state lock if alloc_guc_id() fails. 
v2: Add Fixes tag and CC stable kernel Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: # v6.8+ Signed-off-by: Niranjana Vishwanathapura Reviewed-by: Nirmoy Das Reviewed-by: Matthew Brost Signed-off-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20240521201711.4934-1-niranjana.vishwanathapura@intel.com (cherry picked from commit 40672b792a36894aff3a337b695f6136ee6ac5d4) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc_submit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index c7d38469fb46..e4e3658e6a13 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1240,6 +1240,7 @@ static int guc_exec_queue_init(struct xe_exec_queue *q) return 0; err_entity: + mutex_unlock(&guc->submission_state.lock); xe_sched_entity_fini(&ge->entity); err_sched: xe_sched_fini(&ge->sched); From 195aba96b854dd664768f382cd1db375d8181f88 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Wed, 22 May 2024 15:06:40 +0300 Subject: [PATCH 118/279] tpm_tis_spi: Account for SPI header when allocating TPM SPI xfer buffer The TPM SPI transfer mechanism uses MAX_SPI_FRAMESIZE for computing the maximum transfer length and the size of the transfer buffer. As such, it does not account for the 4 bytes of header that prepends the SPI data frame. This can result in out-of-bounds accesses and was confirmed with KASAN. Introduce SPI_HDRSIZE to account for the header and use to allocate the transfer buffer. Fixes: a86a42ac2bd6 ("tpm_tis_spi: Add hardware wait polling") Signed-off-by: Matthew R. 
Ochs Tested-by: Carol Soto Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_spi_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm_tis_spi_main.c b/drivers/char/tpm/tpm_tis_spi_main.c index 3f9eaf27b41b..c9eca24bbad4 100644 --- a/drivers/char/tpm/tpm_tis_spi_main.c +++ b/drivers/char/tpm/tpm_tis_spi_main.c @@ -37,6 +37,7 @@ #include "tpm_tis_spi.h" #define MAX_SPI_FRAMESIZE 64 +#define SPI_HDRSIZE 4 /* * TCG SPI flow control is documented in section 6.4 of the spec[1]. In short, @@ -247,7 +248,7 @@ static int tpm_tis_spi_write_bytes(struct tpm_tis_data *data, u32 addr, int tpm_tis_spi_init(struct spi_device *spi, struct tpm_tis_spi_phy *phy, int irq, const struct tpm_tis_phy_ops *phy_ops) { - phy->iobuf = devm_kmalloc(&spi->dev, MAX_SPI_FRAMESIZE, GFP_KERNEL); + phy->iobuf = devm_kmalloc(&spi->dev, SPI_HDRSIZE + MAX_SPI_FRAMESIZE, GFP_KERNEL); if (!phy->iobuf) return -ENOMEM; From f3d7ba9e1bc0c9080834f263d4887bd9c9ea491f Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 27 May 2024 13:56:27 +0300 Subject: [PATCH 119/279] tpm: Open code tpm_buf_parameters() With only single call site, this makes no sense (slipped out of the radar during the review). Open code and document the action directly to the site, to make it more readable. 
Fixes: 1b6d7f9eb150 ("tpm: add session encryption protection to tpm2_get_random()") Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-buf.c | 26 -------------------------- drivers/char/tpm/tpm2-cmd.c | 10 +++++++++- include/linux/tpm.h | 2 -- 3 files changed, 9 insertions(+), 29 deletions(-) diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c index 647c6ca92ac3..cad0048bcc3c 100644 --- a/drivers/char/tpm/tpm-buf.c +++ b/drivers/char/tpm/tpm-buf.c @@ -223,30 +223,4 @@ u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset) } EXPORT_SYMBOL_GPL(tpm_buf_read_u32); -static u16 tpm_buf_tag(struct tpm_buf *buf) -{ - struct tpm_header *head = (struct tpm_header *)buf->data; - return be16_to_cpu(head->tag); -} - -/** - * tpm_buf_parameters - return the TPM response parameters area of the tpm_buf - * @buf: tpm_buf to use - * - * Where the parameters are located depends on the tag of a TPM - * command (it's immediately after the header for TPM_ST_NO_SESSIONS - * or 4 bytes after for TPM_ST_SESSIONS). Evaluate this and return a - * pointer to the first byte of the parameters area. 
- * - * @return: pointer to parameters area - */ -u8 *tpm_buf_parameters(struct tpm_buf *buf) -{ - int offset = TPM_HEADER_SIZE; - - if (tpm_buf_tag(buf) == TPM2_ST_SESSIONS) - offset += 4; - - return &buf->data[offset]; -} diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 0cdf892ec2a7..1e856259219e 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -281,6 +281,7 @@ struct tpm2_get_random_out { int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) { struct tpm2_get_random_out *out; + struct tpm_header *head; struct tpm_buf buf; u32 recd; u32 num_bytes = max; @@ -288,6 +289,7 @@ int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) int total = 0; int retries = 5; u8 *dest_ptr = dest; + off_t offset; if (!num_bytes || max > TPM_MAX_RNG_DATA) return -EINVAL; @@ -320,7 +322,13 @@ int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) goto out; } - out = (struct tpm2_get_random_out *)tpm_buf_parameters(&buf); + head = (struct tpm_header *)buf.data; + offset = TPM_HEADER_SIZE; + /* Skip the parameter size field: */ + if (be16_to_cpu(head->tag) == TPM2_ST_SESSIONS) + offset += 4; + + out = (struct tpm2_get_random_out *)&buf.data[offset]; recd = min_t(u32, be16_to_cpu(out->size), num_bytes); if (tpm_buf_length(&buf) < TPM_HEADER_SIZE + diff --git a/include/linux/tpm.h b/include/linux/tpm.h index c17e4efbb2e5..b3217200df28 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -437,8 +437,6 @@ u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset); u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset); u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset); -u8 *tpm_buf_parameters(struct tpm_buf *buf); - /* * Check if TPM device is in the firmware upgrade mode. 
*/ From 4b4647add7d3c8530493f7247d11e257ee425bf0 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 24 May 2024 11:47:02 -0300 Subject: [PATCH 120/279] sock_map: avoid race between sock_map_close and sk_psock_put sk_psock_get will return NULL if the refcount of psock has gone to 0, which will happen when the last call of sk_psock_put is done. However, sk_psock_drop may not have finished yet, so the close callback will still point to sock_map_close despite psock being NULL. This can be reproduced with a thread deleting an element from the sock map, while the second one creates a socket, adds it to the map and closes it. That will trigger the WARN_ON_ONCE: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 7220 at net/core/sock_map.c:1701 sock_map_close+0x2a2/0x2d0 net/core/sock_map.c:1701 Modules linked in: CPU: 1 PID: 7220 Comm: syz-executor380 Not tainted 6.9.0-syzkaller-07726-g3c999d1ae3c7 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 RIP: 0010:sock_map_close+0x2a2/0x2d0 net/core/sock_map.c:1701 Code: df e8 92 29 88 f8 48 8b 1b 48 89 d8 48 c1 e8 03 42 80 3c 20 00 74 08 48 89 df e8 79 29 88 f8 4c 8b 23 eb 89 e8 4f 15 23 f8 90 <0f> 0b 90 48 83 c4 08 5b 41 5c 41 5d 41 5e 41 5f 5d e9 13 26 3d 02 RSP: 0018:ffffc9000441fda8 EFLAGS: 00010293 RAX: ffffffff89731ae1 RBX: ffffffff94b87540 RCX: ffff888029470000 RDX: 0000000000000000 RSI: ffffffff8bcab5c0 RDI: ffffffff8c1faba0 RBP: 0000000000000000 R08: ffffffff92f9b61f R09: 1ffffffff25f36c3 R10: dffffc0000000000 R11: fffffbfff25f36c4 R12: ffffffff89731840 R13: ffff88804b587000 R14: ffff88804b587000 R15: ffffffff89731870 FS: 000055555e080380(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000207d4000 CR4: 0000000000350ef0 Call Trace: unix_release+0x87/0xc0 net/unix/af_unix.c:1048 __sock_release net/socket.c:659 [inline] sock_close+0xbe/0x240 net/socket.c:1421 
__fput+0x42b/0x8a0 fs/file_table.c:422 __do_sys_close fs/open.c:1556 [inline] __se_sys_close fs/open.c:1541 [inline] __x64_sys_close+0x7f/0x110 fs/open.c:1541 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fb37d618070 Code: 00 00 48 c7 c2 b8 ff ff ff f7 d8 64 89 02 b8 ff ff ff ff eb d4 e8 10 2c 00 00 80 3d 31 f0 07 00 00 74 17 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 48 c3 0f 1f 80 00 00 00 00 48 83 ec 18 89 7c RSP: 002b:00007ffcd4a525d8 EFLAGS: 00000202 ORIG_RAX: 0000000000000003 RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007fb37d618070 RDX: 0000000000000010 RSI: 00000000200001c0 RDI: 0000000000000004 RBP: 0000000000000000 R08: 0000000100000000 R09: 0000000100000000 R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Use sk_psock, which will only check that the pointer is not been set to NULL yet, which should only happen after the callbacks are restored. If, then, a reference can still be gotten, we may call sk_psock_stop and cancel psock->work. As suggested by Paolo Abeni, reorder the condition so the control flow is less convoluted. After that change, the reproducer does not trigger the WARN_ON_ONCE anymore. 
Suggested-by: Paolo Abeni Reported-by: syzbot+07a2e4a1a57118ef7355@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=07a2e4a1a57118ef7355 Fixes: aadb2bb83ff7 ("sock_map: Fix a potential use-after-free in sock_map_close()") Fixes: 5b4a79ba65a1 ("bpf, sockmap: Don't let sock_map_{close,destroy,unhash} call itself") Cc: stable@vger.kernel.org Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20240524144702.1178377-1-cascardo@igalia.com Signed-off-by: Paolo Abeni --- net/core/sock_map.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 63c016b4c169..d3dbb92153f2 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1674,19 +1674,23 @@ void sock_map_close(struct sock *sk, long timeout) lock_sock(sk); rcu_read_lock(); - psock = sk_psock_get(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - saved_close = READ_ONCE(sk->sk_prot)->close; - } else { + psock = sk_psock(sk); + if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); + psock = sk_psock_get(sk); + if (unlikely(!psock)) + goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); + } else { + saved_close = READ_ONCE(sk->sk_prot)->close; +no_psock: + rcu_read_unlock(); + release_sock(sk); } /* Make sure we do not recurse. This is a bug. From f09fc6cee0dcfc38148ee6b6dd04f93e353d22f2 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 28 May 2024 12:52:21 +0300 Subject: [PATCH 121/279] tpm: Rename TPM2_OA_TMPL to TPM2_OA_NULL_KEY and make it local Rename and document TPM2_OA_TMPL, as originally requested in the patch set review, but left unaddressed without any appropriate reasoning. The new name is TPM2_OA_NULL_KEY, has a documentation and is local only to tpm2-sessions.c. 
Link: https://lore.kernel.org/linux-integrity/ddbeb8111f48a8ddb0b8fca248dff6cc9d7079b2.camel@HansenPartnership.com/ Link: https://lore.kernel.org/linux-integrity/CZCKTWU6ZCC9.2UTEQPEVICYHL@suppilovahvero/ Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm2-sessions.c | 21 +++++++++++++++++++-- include/linux/tpm.h | 15 --------------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c index ea8860661876..907ac9956a78 100644 --- a/drivers/char/tpm/tpm2-sessions.c +++ b/drivers/char/tpm/tpm2-sessions.c @@ -80,6 +80,9 @@ /* maximum number of names the TPM must remember for authorization */ #define AUTH_MAX_NAMES 3 +#define AES_KEY_BYTES AES_KEYSIZE_128 +#define AES_KEY_BITS (AES_KEY_BYTES*8) + static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy, u32 *handle, u8 *name); @@ -954,6 +957,20 @@ int tpm2_start_auth_session(struct tpm_chip *chip) } EXPORT_SYMBOL(tpm2_start_auth_session); +/* + * A mask containing the object attributes for the kernel held null primary key + * used in HMAC encryption. For more information on specific attributes look up + * to "8.3 TPMA_OBJECT (Object Attributes)". 
+ */ +#define TPM2_OA_NULL_KEY ( \ + TPM2_OA_NO_DA | \ + TPM2_OA_FIXED_TPM | \ + TPM2_OA_FIXED_PARENT | \ + TPM2_OA_SENSITIVE_DATA_ORIGIN | \ + TPM2_OA_USER_WITH_AUTH | \ + TPM2_OA_DECRYPT | \ + TPM2_OA_RESTRICTED) + /** * tpm2_parse_create_primary() - parse the data returned from TPM_CC_CREATE_PRIMARY * @@ -1018,7 +1035,7 @@ static int tpm2_parse_create_primary(struct tpm_chip *chip, struct tpm_buf *buf, val = tpm_buf_read_u32(buf, &offset_t); /* object properties */ - if (val != TPM2_OA_TMPL) + if (val != TPM2_OA_NULL_KEY) return -EINVAL; /* auth policy (empty) */ @@ -1178,7 +1195,7 @@ static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy, tpm_buf_append_u16(&template, TPM_ALG_SHA256); /* object properties */ - tpm_buf_append_u32(&template, TPM2_OA_TMPL); + tpm_buf_append_u32(&template, TPM2_OA_NULL_KEY); /* sauth policy (empty) */ tpm_buf_append_u16(&template, 0); diff --git a/include/linux/tpm.h b/include/linux/tpm.h index b3217200df28..21a67dc9efe8 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -394,21 +394,6 @@ enum tpm2_object_attributes { TPM2_OA_SIGN = BIT(18), }; -/* - * definitions for the canonical template. These are mandated - * by the TCG key template documents - */ - -#define AES_KEY_BYTES AES_KEYSIZE_128 -#define AES_KEY_BITS (AES_KEY_BYTES*8) -#define TPM2_OA_TMPL (TPM2_OA_NO_DA | \ - TPM2_OA_FIXED_TPM | \ - TPM2_OA_FIXED_PARENT | \ - TPM2_OA_SENSITIVE_DATA_ORIGIN | \ - TPM2_OA_USER_WITH_AUTH | \ - TPM2_OA_DECRYPT | \ - TPM2_OA_RESTRICTED) - enum tpm2_session_attributes { TPM2_SA_CONTINUE_SESSION = BIT(0), TPM2_SA_AUDIT_EXCLUSIVE = BIT(1), From d3e43a8fa43effdbb62c7edc206df7ac67772205 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 28 May 2024 12:58:41 +0300 Subject: [PATCH 122/279] tpm: Enable TCG_TPM2_HMAC by default only for X86_64 Given the not fully root caused performance issues on non-x86 platforms, enable the feature by default only for x86-64. 
That is the platform it brings the most value and has gone most of the QA. Can be reconsidered later and can be obviously opt-in enabled too on any arch. Link: https://lore.kernel.org/linux-integrity/bf67346ef623ff3c452c4f968b7d900911e250c3.camel@gmail.com/#t Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index e63a6a17793c..cf0be8a7939d 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -29,7 +29,7 @@ if TCG_TPM config TCG_TPM2_HMAC bool "Use HMAC and encrypted transactions on the TPM bus" - default y + default X86_64 select CRYPTO_ECDH select CRYPTO_LIB_AESCFB select CRYPTO_LIB_SHA256 From 72b6a2d6506843375c7b91197f49ef38ca0c6d0f Mon Sep 17 00:00:00 2001 From: Alina Yu Date: Tue, 28 May 2024 14:01:13 +0800 Subject: [PATCH 123/279] regulator: rtq2208: Fix invalid memory access when devm_of_regulator_put_matches is called In this patch, a software bug has been fixed. rtq2208_ldo_match is no longer a local variable. It prevents invalid memory access when devm_of_regulator_put_matches is called. 
Signed-off-by: Alina Yu Link: https://msgid.link/r/4ce8c4f16f1cf3aa4e5f36c0694dd3c5ccf3cd1c.1716870419.git.alina_yu@richtek.com Signed-off-by: Mark Brown --- drivers/regulator/rtq2208-regulator.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/regulator/rtq2208-regulator.c b/drivers/regulator/rtq2208-regulator.c index b90e53d922d6..c31b6dc3229c 100644 --- a/drivers/regulator/rtq2208-regulator.c +++ b/drivers/regulator/rtq2208-regulator.c @@ -228,6 +228,11 @@ static const struct regulator_ops rtq2208_regulator_ldo_ops = { .set_suspend_disable = rtq2208_set_suspend_disable, }; +static struct of_regulator_match rtq2208_ldo_match[] = { + {.name = "ldo2", }, + {.name = "ldo1", }, +}; + static unsigned int rtq2208_of_map_mode(unsigned int mode) { switch (mode) { @@ -322,8 +327,7 @@ static irqreturn_t rtq2208_irq_handler(int irqno, void *devid) return IRQ_HANDLED; } -static int rtq2208_of_get_fixed_voltage(struct device *dev, - struct of_regulator_match *rtq2208_ldo_match, int n_fixed) +static int rtq2208_of_get_ldo_dvs_ability(struct device *dev) { struct device_node *np; struct of_regulator_match *match; @@ -338,14 +342,14 @@ static int rtq2208_of_get_fixed_voltage(struct device *dev, if (!np) np = dev->of_node; - ret = of_regulator_match(dev, np, rtq2208_ldo_match, n_fixed); + ret = of_regulator_match(dev, np, rtq2208_ldo_match, ARRAY_SIZE(rtq2208_ldo_match)); of_node_put(np); if (ret < 0) return ret; - for (i = 0; i < n_fixed; i++) { + for (i = 0; i < ARRAY_SIZE(rtq2208_ldo_match); i++) { match = rtq2208_ldo_match + i; init_data = match->init_data; rdesc = (struct rtq2208_regulator_desc *)match->driver_data; @@ -388,8 +392,7 @@ static const struct linear_range rtq2208_vout_range[] = { REGULATOR_LINEAR_RANGE(1310000, 181, 255, 10000), }; -static void rtq2208_init_regulator_desc(struct rtq2208_regulator_desc *rdesc, int mtp_sel, - int idx, struct of_regulator_match *rtq2208_ldo_match, int *ldo_idx) +static void 
rtq2208_init_regulator_desc(struct rtq2208_regulator_desc *rdesc, int mtp_sel, int idx) { struct regulator_desc *desc; static const struct { @@ -461,8 +464,7 @@ static void rtq2208_init_regulator_desc(struct rtq2208_regulator_desc *rdesc, in static int rtq2208_parse_regulator_dt_data(int n_regulator, const unsigned int *regulator_idx_table, struct rtq2208_regulator_desc *rdesc[RTQ2208_LDO_MAX], struct device *dev) { - struct of_regulator_match rtq2208_ldo_match[2]; - int mtp_sel, ret, i, idx, ldo_idx = 0; + int mtp_sel, i, idx, ret; /* get mtp_sel0 or mtp_sel1 */ mtp_sel = device_property_read_bool(dev, "richtek,mtp-sel-high"); @@ -474,7 +476,7 @@ static int rtq2208_parse_regulator_dt_data(int n_regulator, const unsigned int * if (!rdesc[i]) return -ENOMEM; - rtq2208_init_regulator_desc(rdesc[i], mtp_sel, idx, rtq2208_ldo_match, &ldo_idx); + rtq2208_init_regulator_desc(rdesc[i], mtp_sel, idx); /* init ldo dvs ability */ if (idx >= RTQ2208_LDO2) @@ -482,7 +484,7 @@ static int rtq2208_parse_regulator_dt_data(int n_regulator, const unsigned int * } /* init ldo fixed_uV */ - ret = rtq2208_of_get_fixed_voltage(dev, rtq2208_ldo_match, ldo_idx); + ret = rtq2208_of_get_ldo_dvs_ability(dev); if (ret) return dev_err_probe(dev, ret, "Failed to get ldo fixed_uV\n"); From 233e27b4d21c3e44eb863f03e566d3a22e81a7ae Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 28 May 2024 15:28:52 +0900 Subject: [PATCH 124/279] null_blk: Print correct max open zones limit in null_init_zoned_dev() When changing the maximum number of open zones, print that number instead of the total number of zones. 
Fixes: dc4d137ee3b7 ("null_blk: add support for max open/active zone limit for zoned devices") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240528062852.437599-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 5b5a63adacc1..79c8e5e99f7f 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -108,7 +108,7 @@ int null_init_zoned_dev(struct nullb_device *dev, if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { dev->zone_max_open = dev->zone_max_active; pr_info("changed the maximum number of open zones to %u\n", - dev->nr_zones); + dev->zone_max_open); } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { dev->zone_max_open = 0; pr_info("zone_max_open limit disabled, limit >= zone count\n"); From bafea1c58b24be594d97841ced1b7ae0347bf6e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 May 2024 20:26:13 +0200 Subject: [PATCH 125/279] sd: also set max_user_sectors when setting max_sectors sd can set a max_sectors value that is lower than the max_hw_sectors limit based on the block limits VPD page. While this is rather unusual, it used to work until the max_user_sectors field was split out to cleanly deal with conflicting hardware and user limits when the hardware limit changes. Also set max_user_sectors to ensure the limit can properly be stacked. Fixes: 4f563a64732d ("block: add a max_user_discard_sectors queue limit") Reported-by: Mike Snitzer Signed-off-by: Christoph Hellwig Acked-by: Mike Snitzer Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20240523182618.602003-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 332eb9dac22d..f6c822c9cbd2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3700,8 +3700,10 @@ static int sd_revalidate_disk(struct gendisk *disk) */ if (sdkp->first_scan || q->limits.max_sectors > q->limits.max_dev_sectors || - q->limits.max_sectors > q->limits.max_hw_sectors) + q->limits.max_sectors > q->limits.max_hw_sectors) { q->limits.max_sectors = rw_max; + q->limits.max_user_sectors = rw_max; + } sdkp->first_scan = 0; From e528bede6f4e6822afdf0fa80be46ea9199f0911 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 May 2024 20:26:14 +0200 Subject: [PATCH 126/279] block: stack max_user_sectors The max_user_sectors is one of the three factors determining the actual max_sectors limit for READ/WRITE requests. Because of that it needs to be stacked at least for the device mapper multi-path case where requests are directly inserted on the lower device. For SCSI disks this is important because the sd driver actually sets its own advisory limit that is lower than max_hw_sectors based on the block limits VPD page. While this is a bit odd and unusual, the same effect can happen if a user or udev script tweaks the value manually. Fixes: 4f563a64732d ("block: add a max_user_discard_sectors queue limit") Reported-by: Mike Snitzer Signed-off-by: Christoph Hellwig Acked-by: Mike Snitzer Reviewed-by: Martin K.
Petersen Link: https://lore.kernel.org/r/20240523182618.602003-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index a7fe8e90240a..7a672021daee 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -611,6 +611,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, unsigned int top, bottom, alignment, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); + t->max_user_sectors = min_not_zero(t->max_user_sectors, + b->max_user_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, From e993db2d6e5207f1ae061c2ac554ab1f714c741d Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 24 May 2024 12:46:51 +0200 Subject: [PATCH 127/279] block: check for max_hw_sectors underflow The logical block size needs to be no larger than the max_hw_sectors setting, otherwise we can't even transfer a single LBA.
Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 7a672021daee..effeb9a639bb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -104,6 +104,7 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) static int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; + unsigned int logical_block_sectors; /* * Unless otherwise specified, default to 512 byte logical blocks and a @@ -134,8 +135,11 @@ static int blk_validate_limits(struct queue_limits *lim) lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS)) return -EINVAL; + logical_block_sectors = lim->logical_block_size >> SECTOR_SHIFT; + if (WARN_ON_ONCE(logical_block_sectors > lim->max_hw_sectors)) + return -EINVAL; lim->max_hw_sectors = round_down(lim->max_hw_sectors, - lim->logical_block_size >> SECTOR_SHIFT); + logical_block_sectors); /* * The actual max_sectors value is a complex beast and also takes the @@ -153,7 +157,7 @@ static int blk_validate_limits(struct queue_limits *lim) lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); } lim->max_sectors = round_down(lim->max_sectors, - lim->logical_block_size >> SECTOR_SHIFT); + logical_block_sectors); /* * Random default for the maximum number of segments. Driver should not From a14a68b76954e73031ca6399abace17dcb77c17a Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 28 May 2024 20:09:12 +0800 Subject: [PATCH 128/279] bcache: allow allocator to invalidate bucket in gc Currently, if the gc is running when the allocator finds free_inc empty, the allocator has to wait for the gc to finish. Before that, the IO is blocked. But actually, there may be some buckets that are reclaimable before gc, and gc will never mark this kind of bucket to be unreclaimable.
So we can put these buckets into free_inc in gc running to avoid IO being blocked. Signed-off-by: Dongsheng Yang Signed-off-by: Mingzhe Zou Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20240528120914.28705-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 13 +++++-------- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/btree.c | 7 ++++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ce13c272c387..32a46343097d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -129,12 +129,9 @@ static inline bool can_inc_bucket_gen(struct bucket *b) bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) { - BUG_ON(!ca->set->gc_mark_valid); - - return (!GC_MARK(b) || - GC_MARK(b) == GC_MARK_RECLAIMABLE) && - !atomic_read(&b->pin) && - can_inc_bucket_gen(b); + return (ca->set->gc_mark_valid || b->reclaimable_in_gc) && + ((!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) && + !atomic_read(&b->pin) && can_inc_bucket_gen(b)); } void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) @@ -148,6 +145,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) bch_inc_gen(ca, b); b->prio = INITIAL_PRIO; atomic_inc(&b->pin); + b->reclaimable_in_gc = 0; } static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) @@ -352,8 +350,7 @@ static int bch_allocator_thread(void *arg) */ retry_invalidate: - allocator_wait(ca, ca->set->gc_mark_valid && - !ca->invalidate_needs_gc); + allocator_wait(ca, !ca->invalidate_needs_gc); invalidate_buckets(ca); /* diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 4e6afa89921f..1d33e40d26ea 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -200,6 +200,7 @@ struct bucket { uint8_t gen; uint8_t last_gc; /* Most out of date gen in the btree */ uint16_t gc_mark; /* Bitfield used by GC. 
See below for field */ + uint16_t reclaimable_in_gc:1; }; /* diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index d011a7154d33..4e6ccf2c8a0b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1741,18 +1741,20 @@ static void btree_gc_start(struct cache_set *c) mutex_lock(&c->bucket_lock); - c->gc_mark_valid = 0; c->gc_done = ZERO_KEY; ca = c->cache; for_each_bucket(b, ca) { b->last_gc = b->gen; + if (bch_can_invalidate_bucket(ca, b)) + b->reclaimable_in_gc = 1; if (!atomic_read(&b->pin)) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); } } + c->gc_mark_valid = 0; mutex_unlock(&c->bucket_lock); } @@ -1809,6 +1811,9 @@ static void bch_btree_gc_finish(struct cache_set *c) for_each_bucket(b, ca) { c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + if (b->reclaimable_in_gc) + b->reclaimable_in_gc = 0; + if (atomic_read(&b->pin)) continue; From 05356938a4be356adde4eab4425c6822f3c7d706 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 28 May 2024 20:09:13 +0800 Subject: [PATCH 129/279] bcache: call force_wake_up_gc() if necessary in check_should_bypass() If extremely heavy write I/O continuously hits a relatively small cache device (512GB in my testing), it is possible to make counter c->gc_stats.in_use continue to increase and exceed CUTOFF_CACHE_ADD. If 'c->gc_stats.in_use > CUTOFF_CACHE_ADD' happens, all following write requests will bypass the cache device because check_should_bypass() returns 'true'. Because all writes bypass the cache device, counter c->sectors_to_gc has no chance to become negative, and the garbage collection thread won't be woken up even when the whole cache becomes clean after writeback is accomplished. The aftermath is that all write I/Os go directly into the backing device even when the cache device is clean. To avoid the above situation, this patch uses a quite conservative fix: if 'c->gc_stats.in_use > CUTOFF_CACHE_ADD' happens, only wake up the garbage collection thread when the whole cache device is clean.
Before the fix, the writes-always-bypass situation happens after 10+ hours write I/O pressure on 512GB Intel optane memory which acts as cache device. After this fix, such situation doesn't happen after 36+ hours testing. Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20240528120914.28705-3-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 83d112bd2b1c..af345dc6fde1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -369,10 +369,24 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) struct io *i; if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || (bio_op(bio) == REQ_OP_DISCARD)) goto skip; + if (c->gc_stats.in_use > CUTOFF_CACHE_ADD) { + /* + * If cached buckets are all clean now, 'true' will be + * returned and all requests will bypass the cache device. + * Then c->sectors_to_gc has no chance to be negative, and + * gc thread won't wake up and caching won't work forever. + * Here call force_wake_up_gc() to avoid such aftermath. + */ + if (BDEV_STATE(&dc->sb) == BDEV_STATE_CLEAN && + c->gc_mark_valid) + force_wake_up_gc(c); + + goto skip; + } + if (mode == CACHE_MODE_NONE || (mode == CACHE_MODE_WRITEAROUND && op_is_write(bio_op(bio)))) From 74d4ce92e08d5669d66fd890403724faa4286c21 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 28 May 2024 20:09:14 +0800 Subject: [PATCH 130/279] bcache: code cleanup in __bch_bucket_alloc_set() In __bch_bucket_alloc_set() the lines after label 'err:' indeed do nothing useful after multiple cache devices are removed from bcache code. This cleanup patch drops the useless code to save a few CPU cycles.
Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20240528120914.28705-4-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 32a46343097d..48ce750bf70a 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -498,8 +498,8 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, ca = c->cache; b = bch_bucket_alloc(ca, reserve, wait); - if (b == -1) - goto err; + if (b < 0) + return -1; k->ptr[0] = MAKE_PTR(ca->buckets[b].gen, bucket_to_sector(c, b), @@ -508,10 +508,6 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, SET_KEY_PTRS(k, 1); return 0; -err: - bch_bucket_free(c, k); - bkey_put(c, k); - return -1; } int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, From 56a5cf538c3f2d935b0d81040a8303b6e7fc5fd8 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Mon, 27 May 2024 12:00:15 +0530 Subject: [PATCH 131/279] net: ti: icssg-prueth: Fix start counter for ft1 filter The start counter for FT1 filter is wrongly set to 0 in the driver. FT1 is used for source address violation (SAV) check and source address starts at Byte 6 not Byte 0. Fix this by changing start counter to ETH_ALEN in icssg_ft1_set_mac_addr(). 
Fixes: e9b4ece7d74b ("net: ti: icssg-prueth: Add Firmware config and classification APIs.") Signed-off-by: MD Danish Anwar Link: https://lore.kernel.org/r/20240527063015.263748-1-danishanwar@ti.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/icssg/icssg_classifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_classifier.c b/drivers/net/ethernet/ti/icssg/icssg_classifier.c index 79ba47bb3602..f7d21da1a0fb 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_classifier.c +++ b/drivers/net/ethernet/ti/icssg/icssg_classifier.c @@ -455,7 +455,7 @@ void icssg_ft1_set_mac_addr(struct regmap *miig_rt, int slice, u8 *mac_addr) { const u8 mask_addr[] = { 0, 0, 0, 0, 0, 0, }; - rx_class_ft1_set_start_len(miig_rt, slice, 0, 6); + rx_class_ft1_set_start_len(miig_rt, slice, ETH_ALEN, ETH_ALEN); rx_class_ft1_set_da(miig_rt, slice, 0, mac_addr); rx_class_ft1_set_da_mask(miig_rt, slice, 0, mask_addr); rx_class_ft1_cfg_set_type(miig_rt, slice, 0, FT1_CFG_TYPE_EQ); From 1292bc2ebf63e705ae18bbaaf9cea21b68d37ee6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 May 2024 13:59:34 -0400 Subject: [PATCH 132/279] bcachefs: Plumb bkey into __btree_err() It can be useful to know the exact byte offset within a btree node where an error occurred.
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 85 ++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index cbf8f5d90602..829c1b91477d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -519,7 +519,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca, - struct btree *b, struct bset *i, + struct btree *b, struct bset *i, struct bkey_packed *k, unsigned offset, int write) { prt_printf(out, bch2_log_msg(c, "%s"), @@ -537,15 +537,20 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, b->written, btree_ptr_sectors_written(&b->key)); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + if (k) + prt_printf(out, " bset byte offset %lu", + (unsigned long)(void *)k - + ((unsigned long)(void *)i & ~511UL)); prt_str(out, ": "); } -__printf(9, 10) +__printf(10, 11) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, + struct bkey_packed *k, int write, bool have_retry, enum bch_sb_error_id err_type, @@ -555,7 +560,7 @@ static int __btree_err(int ret, bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; va_list args; - btree_err_msg(&out, c, ca, b, i, b->written, write); + btree_err_msg(&out, c, ca, b, i, k, b->written, write); va_start(args, fmt); prt_vprintf(&out, fmt, args); @@ -611,9 +616,9 @@ static int __btree_err(int ret, return ret; } -#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \ +#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) 
\ ({ \ - int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \ + int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \ BCH_FSCK_ERR_##_err_type, \ msg, ##__VA_ARGS__); \ \ @@ -690,7 +695,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_version_compatible(version), -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_unsupported_version, "unsupported bset version %u.%u", BCH_VERSION_MAJOR(version), @@ -698,7 +703,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(version < c->sb.version_min, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, NULL, btree_node_bset_older_than_sb_min, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { @@ -711,7 +716,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(BCH_VERSION_MAJOR(version) > BCH_VERSION_MAJOR(c->sb.version), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, NULL, btree_node_bset_newer_than_sb, "bset version %u newer than superblock version %u", version, c->sb.version)) { @@ -723,13 +728,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(BSET_SEPARATE_WHITEOUTS(i), -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > btree_sectors(c), -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_past_end_of_btree_node, "bset past end of btree node")) { i->u64s = 0; @@ -739,13 +744,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(offset && !i->u64s, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_empty, "empty bset"); btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + 
c, ca, b, i, NULL, bset_wrong_sector_offset, "bset at wrong sector offset"); @@ -761,20 +766,20 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, bset_bad_seq, "incorrect sequence number (wrong btree node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_btree, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_level, "incorrect level"); @@ -793,7 +798,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_min_key, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), @@ -804,7 +809,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_max_key, "incorrect max key %s", (printbuf_reset(&buf1), @@ -816,7 +821,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), -BCH_ERR_btree_node_read_err_bad_node, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_format, "invalid bkey format: %s\n %s", buf1.buf, (printbuf_reset(&buf2), @@ -883,7 +888,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err_on(bkey_p_next(k) > vstruct_last(i), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_past_bset_end, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); @@ -892,14 +897,14 @@ static int validate_bset_keys(struct 
bch_fs *c, struct btree *b, if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_bad_format, "invalid bkey format %u", k->format)) goto drop_this_key; if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_bad_u64s, "bad k->u64s %u (min %u max %zu)", k->u64s, bkeyp_key_u64s(&b->format, k), @@ -921,7 +926,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bad_bkey, "invalid bkey: %s", buf.buf); goto drop_this_key; @@ -942,7 +947,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_to_text(&buf, u.k); if (btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_out_of_order, "%s", buf.buf)) goto drop_this_key; @@ -1011,13 +1016,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (bch2_meta_read_fault("btree")) btree_err(-BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_fault_injected, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_magic, "bad magic: want %llx, got %llx", bset_magic(c), le64_to_cpu(b->data->magic)); @@ -1032,7 +1037,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(b->data->keys.seq != bp->seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_seq, "got wrong btree node: got\n%s", (printbuf_reset(&buf), @@ -1041,7 +1046,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, } else { btree_err_on(!b->data->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, 
NULL, + c, ca, b, NULL, NULL, btree_node_bad_seq, "bad btree header: seq 0\n%s", (printbuf_reset(&buf), @@ -1060,7 +1065,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_unknown_csum, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -1073,7 +1078,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(csum_bad, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_bad_csum, "%s", (printbuf_reset(&buf), @@ -1088,7 +1093,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), -BCH_ERR_btree_node_read_err_incompatible, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_unsupported_version, "btree node does not have NEW_EXTENT_OVERWRITE set"); @@ -1102,7 +1107,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_unknown_csum, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -1114,7 +1119,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(csum_bad, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_bad_csum, "%s", (printbuf_reset(&buf), @@ -1152,14 +1157,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(blacklisted && first, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_blacklisted_journal_seq, "first btree node bset has blacklisted journal seq (%llu)", le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, 
first_bset_blacklisted_journal_seq, "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", le64_to_cpu(i->journal_seq), @@ -1178,7 +1183,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (ptr_written) { btree_err_on(b->written < ptr_written, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_data_missing, "btree node data missing: expected %u sectors, found %u", ptr_written, b->written); @@ -1191,7 +1196,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, le64_to_cpu(bne->keys.journal_seq), true), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bset_after_end, "found bset signature after last bset"); } @@ -1235,7 +1240,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bad_bkey, "%s", buf.buf); @@ -1471,18 +1476,18 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) written2 = btree_node_sectors_written(c, ra->buf[i]); if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_replicas_sectors_written_mismatch, "btree node sectors written mismatch: %u != %u", written, written2) || btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_bset_after_end, "found bset signature after last bset") || btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_replicas_data_mismatch, "btree node replicas content mismatch")) dump_bset_maps = true; From 9e1a66e66870ebeebea9f674550118df3c12eaf6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 May 2024 13:24:31 -0400 
Subject: [PATCH 133/279] bcachefs: Fix lookup_first_inode() when inode_generations are present This function is used for finding the hash seed (which is the same in all versions of an inode in different snapshots): if an inode has been deleted in a child snapshot we need to iterate until we find a live version. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c8f57465131c..4cd28db9bad8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -77,21 +77,17 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - POS(0, inode_nr), - BTREE_ITER_all_snapshots); - k = bch2_btree_iter_peek(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { - ret = -BCH_ERR_ENOENT_inode; - goto err; + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode_nr) + break; + if (!bkey_is_inode(k.k)) + continue; + ret = bch2_inode_unpack(k, inode); + goto found; } - - ret = bch2_inode_unpack(k, inode); -err: + ret = -BCH_ERR_ENOENT_inode; +found: bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); return ret; From 218e5e0c2a3acdb29ccbdfbfdd5e2def27d3aae2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 May 2024 02:40:06 -0400 Subject: [PATCH 134/279] bcachefs: Fix locking assert We now track whether a transaction is locked, and verify that we don't have nodes locked when the transaction isn't locked; reorder relocks to not pop the new assert.
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 75f5e6fe4634..34056aaece00 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -424,18 +424,18 @@ static int btree_key_cache_fill(struct btree_trans *trans, goto err; } + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_k); + goto err; + } + if (!bch2_btree_node_relock(trans, ck_path, 0)) { kfree(new_k); trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); goto err; } - - ret = bch2_trans_relock(trans); - if (ret) { - kfree(new_k); - goto err; - } } } From 82af5ceb5d9d1f0613be3b9161ec1104b85f00b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 May 2024 22:22:30 -0400 Subject: [PATCH 135/279] bcachefs: Refactor delete_dead_snapshots() Consolidate per-key work into delete_dead_snapshots_process_key(), so we now walk all keys once, not twice. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 65 ++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 40 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 629900a5e641..466fa3e6a4b6 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1351,7 +1351,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -static int snapshot_delete_key(struct btree_trans *trans, +static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, snapshot_id_list *deleted, @@ -1360,26 +1360,26 @@ static int snapshot_delete_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ + return 0; if (!bkey_eq(k.k->p, *last_pos)) equiv_seen->nr = 0; - *last_pos = k.k->p; - if (snapshot_list_has_id(deleted, k.k->p.snapshot) || - snapshot_list_has_id(equiv_seen, equiv)) { + if (snapshot_list_has_id(deleted, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - } else { - return snapshot_list_add(c, equiv_seen, equiv); - } -} -static int move_key_to_correct_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + if (!bpos_eq(*last_pos, k.k->p) && + snapshot_list_has_id(equiv_seen, equiv)) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); + + *last_pos = k.k->p; + + int ret = snapshot_list_add_nodup(c, equiv_seen, equiv); + if (ret) + return ret; /* * When we have a linear chain of snapshot nodes, we consider @@ -1389,21 +1389,20 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans, * * If there are multiple keys in different snapshots at the 
same * position, we're only going to keep the one in the newest - * snapshot - the rest have been overwritten and are redundant, - * and for the key we're going to keep we need to move it to the - * equivalance class ID if it's not there already. + * snapshot (we delete the others above) - the rest have been + * overwritten and are redundant, and for the key we're going to keep we + * need to move it to the equivalance class ID if it's not there + * already. */ if (equiv != k.k->p.snapshot) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - struct btree_iter new_iter; - int ret; - - ret = PTR_ERR_OR_ZERO(new); + int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; new->k.p.snapshot = equiv; + struct btree_iter new_iter; bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, BTREE_ITER_all_snapshots| BTREE_ITER_cached| @@ -1538,7 +1537,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) struct btree_trans *trans; snapshot_id_list deleted = { 0 }; snapshot_id_list deleted_interior = { 0 }; - u32 id; int ret = 0; if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) @@ -1585,33 +1583,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (ret) goto err; - for (id = 0; id < BTREE_ID_NR; id++) { + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { struct bpos last_pos = POS_MIN; snapshot_id_list equiv_seen = { 0 }; struct disk_reservation res = { 0 }; - if (!btree_type_has_snapshots(id)) - continue; - - /* - * deleted inodes btree is maintained by a trigger on the inodes - * btree - no work for us to do here, and it's not safe to scan - * it because we'll see out of date keys due to the btree write - * buffer: - */ - if (id == BTREE_ID_deleted_inodes) + if (!btree_type_has_snapshots(btree)) continue; ret = for_each_btree_key_commit(trans, iter, - id, POS_MIN, + btree, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, - snapshot_delete_key(trans, &iter, k, &deleted, 
&equiv_seen, &last_pos)) ?: - for_each_btree_key_commit(trans, iter, - id, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, - move_key_to_correct_snapshot(trans, &iter, k)); + delete_dead_snapshots_process_key(trans, &iter, k, &deleted, + &equiv_seen, &last_pos)); bch2_disk_reservation_put(c, &res); darray_exit(&equiv_seen); From 08f50005e09f3bf74a7cb5fd86335d3c4077df51 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 May 2024 12:38:30 -0400 Subject: [PATCH 136/279] bcachefs: Run check_key_has_snapshot in snapshot_delete_keys() delete_dead_snapshots now runs before the main fsck.c passes which check for keys for invalid snapshots; thus, it needs those checks as well. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 27 ++++----------------------- fs/bcachefs/snapshot.c | 25 ++++++++++++++++++++++++- fs/bcachefs/snapshot.h | 1 + 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 4cd28db9bad8..fd277bd58ed3 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -766,25 +766,6 @@ static int get_visible_inodes(struct btree_trans *trans, return ret; } -static int check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, - bkey_in_missing_snapshot, - "key in missing snapshot: %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; -fsck_err: - printbuf_exit(&buf); - return ret; -} - static int hash_redo_key(struct btree_trans *trans, const struct bch_hash_desc desc, struct bch_hash_info *hash_info, @@ -979,7 +960,7 @@ static int check_inode(struct btree_trans *trans, bool do_update = false; int ret; - ret = check_key_has_snapshot(trans, iter, k); + ret = 
bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) goto err; if (ret) @@ -1483,7 +1464,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct printbuf buf = PRINTBUF; int ret = 0; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? ret : 0; goto out; @@ -2006,7 +1987,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct printbuf buf = PRINTBUF; int ret = 0; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? ret : 0; goto out; @@ -2161,7 +2142,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker_entry *i; int ret; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) return ret; if (ret) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 466fa3e6a4b6..51918acfd726 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1042,6 +1042,25 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) return ret; } +int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, + bkey_in_missing_snapshot, + "key in missing snapshot %s, delete?", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; +fsck_err: + printbuf_exit(&buf); + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ @@ -1358,6 +1377,10 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, snapshot_id_list *equiv_seen, struct bpos *last_pos) { + int ret = bch2_check_key_has_snapshot(trans, iter, k); + if (ret) + return ret < 0 ? 
ret : 0; + struct bch_fs *c = trans->c; u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ @@ -1377,7 +1400,7 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, *last_pos = k.k->p; - int ret = snapshot_list_add_nodup(c, equiv_seen, equiv); + ret = snapshot_list_add_nodup(c, equiv_seen, equiv); if (ret) return ret; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index ab13d8f4b41e..31b0ee03e962 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -242,6 +242,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); int bch2_reconstruct_snapshots(struct bch_fs *); +int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); void bch2_delete_dead_snapshots_work(struct work_struct *); From 247c056bde2ebc9fad2fc62332dc7cc99b58d720 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 May 2024 16:30:19 -0400 Subject: [PATCH 137/279] bcachefs: Fix setting of downgrade recovery passes/errors bch2_check_version_downgrade() was setting c->sb.version, which bch2_sb_set_downgrade() expects to be at the previous version; and it shouldn't even have been set directly because c->sb.version is updated by write_super(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f1bee6c5222d..d73a0222f709 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1132,18 +1132,12 @@ bool bch2_check_version_downgrade(struct bch_fs *c) * c->sb will be checked before we write the superblock, so update it as * well: */ - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) { + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - c->sb.version_upgrade_complete = bcachefs_metadata_version_current; - } - if (c->sb.version > bcachefs_metadata_version_current) { + if (c->sb.version > bcachefs_metadata_version_current) c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - c->sb.version = bcachefs_metadata_version_current; - } - if (c->sb.version_min > bcachefs_metadata_version_current) { + if (c->sb.version_min > bcachefs_metadata_version_current) c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); - c->sb.version_min = bcachefs_metadata_version_current; - } c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); return ret; } From b4131076c16fdd2bc6cb09cfa7e0cfe278aa49a1 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 27 May 2024 12:01:18 -0700 Subject: [PATCH 138/279] bcachefs: add missing MODULE_DESCRIPTION() Fix the 'make W=1' warning: WARNING: modpost: missing MODULE_DESCRIPTION() in fs/bcachefs/mean_and_variance_test.o Signed-off-by: Jeff Johnson Signed-off-by: Kent Overstreet --- fs/bcachefs/mean_and_variance_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index 4c298e74723d..e9d9c0212e44 100644 --- a/fs/bcachefs/mean_and_variance_test.c 
+++ b/fs/bcachefs/mean_and_variance_test.c @@ -217,4 +217,5 @@ static struct kunit_suite mean_and_variance_test_suite = { kunit_test_suite(mean_and_variance_test_suite); MODULE_AUTHOR("Daniel B. Hill"); +MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests"); MODULE_LICENSE("GPL"); From 088d0de81220a74d7d553febb81656927f10bb16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 May 2024 18:40:50 -0400 Subject: [PATCH 139/279] bcachefs: btree_gc can now handle unknown btrees Compatibility fix - we no longer have a separate table for which order gc walks btrees in, and special case the stripes btree directly. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 44 +----------------------------------- fs/bcachefs/btree_gc.c | 9 ++++---- fs/bcachefs/btree_gc.h | 44 ++++++++++++++++-------------------- fs/bcachefs/btree_gc_types.h | 29 ++++++++++++++++++++++++ fs/bcachefs/ec.c | 2 +- 5 files changed, 55 insertions(+), 73 deletions(-) create mode 100644 fs/bcachefs/btree_gc_types.h diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index bc0ea2c4efef..2a538eb2af11 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -457,6 +457,7 @@ enum bch_time_stats { }; #include "alloc_types.h" +#include "btree_gc_types.h" #include "btree_types.h" #include "btree_node_scan_types.h" #include "btree_write_buffer_types.h" @@ -488,49 +489,6 @@ enum bch_time_stats { struct btree; -enum gc_phase { - GC_PHASE_NOT_RUNNING, - GC_PHASE_START, - GC_PHASE_SB, - - GC_PHASE_BTREE_stripes, - GC_PHASE_BTREE_extents, - GC_PHASE_BTREE_inodes, - GC_PHASE_BTREE_dirents, - GC_PHASE_BTREE_xattrs, - GC_PHASE_BTREE_alloc, - GC_PHASE_BTREE_quotas, - GC_PHASE_BTREE_reflink, - GC_PHASE_BTREE_subvolumes, - GC_PHASE_BTREE_snapshots, - GC_PHASE_BTREE_lru, - GC_PHASE_BTREE_freespace, - GC_PHASE_BTREE_need_discard, - GC_PHASE_BTREE_backpointers, - GC_PHASE_BTREE_bucket_gens, - GC_PHASE_BTREE_snapshot_trees, - GC_PHASE_BTREE_deleted_inodes, - 
GC_PHASE_BTREE_logged_ops, - GC_PHASE_BTREE_rebalance_work, - GC_PHASE_BTREE_subvolume_children, - - GC_PHASE_PENDING_DELETE, -}; - -struct gc_pos { - enum gc_phase phase; - u16 level; - struct bpos pos; -}; - -struct reflink_gc { - u64 offset; - u32 size; - u32 refcount; -}; - -typedef GENRADIX(struct reflink_gc) reflink_gc_table; - struct io_count { u64 sectors[2][BCH_DATA_NR]; }; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8035c8b797ab..e9e901feda29 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -673,8 +673,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) { - return (int) btree_id_to_gc_phase(l) - - (int) btree_id_to_gc_phase(r); + return cmp_int(gc_btree_order(l), gc_btree_order(r)); } static int bch2_gc_btrees(struct bch_fs *c) @@ -711,7 +710,7 @@ static int bch2_gc_btrees(struct bch_fs *c) static int bch2_mark_superblocks(struct bch_fs *c) { mutex_lock(&c->sb_lock); - gc_pos_set(c, gc_phase(GC_PHASE_SB)); + gc_pos_set(c, gc_phase(GC_PHASE_sb)); int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); mutex_unlock(&c->sb_lock); @@ -1209,7 +1208,7 @@ int bch2_check_allocations(struct bch_fs *c) if (ret) goto out; - gc_pos_set(c, gc_phase(GC_PHASE_START)); + gc_pos_set(c, gc_phase(GC_PHASE_start)); ret = bch2_mark_superblocks(c); BUG_ON(ret); @@ -1231,7 +1230,7 @@ int bch2_check_allocations(struct bch_fs *c) percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ - __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); bch2_gc_free(c); percpu_up_write(&c->mark_lock); diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 1b6489d8e0f4..876d81e2017d 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -3,6 +3,7 @@ #define _BCACHEFS_BTREE_GC_H #include "bkey.h" +#include "btree_gc_types.h" #include 
"btree_types.h" int bch2_check_topology(struct bch_fs *); @@ -32,36 +33,15 @@ int bch2_check_allocations(struct bch_fs *); /* Position of (the start of) a gc phase: */ static inline struct gc_pos gc_phase(enum gc_phase phase) { - return (struct gc_pos) { - .phase = phase, - .level = 0, - .pos = POS_MIN, - }; -} - -static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -{ - return cmp_int(l.phase, r.phase) ?: - -cmp_int(l.level, r.level) ?: - bpos_cmp(l.pos, r.pos); -} - -static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) -{ - switch (id) { -#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; - BCH_BTREE_IDS() -#undef x - default: - BUG(); - } + return (struct gc_pos) { .phase = phase, }; } static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, struct bpos pos) { return (struct gc_pos) { - .phase = btree_id_to_gc_phase(btree), + .phase = GC_PHASE_btree, + .btree = btree, .level = level, .pos = pos, }; @@ -76,6 +56,22 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b) return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p); } +static inline int gc_btree_order(enum btree_id btree) +{ + if (btree == BTREE_ID_stripes) + return -1; + return btree; +} + +static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) +{ + return cmp_int(l.phase, r.phase) ?: + cmp_int(gc_btree_order(l.btree), + gc_btree_order(r.btree)) ?: + -cmp_int(l.level, r.level) ?: + bpos_cmp(l.pos, r.pos); +} + static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) { unsigned seq; diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h new file mode 100644 index 000000000000..b82c24bcc088 --- /dev/null +++ b/fs/bcachefs/btree_gc_types.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_GC_TYPES_H +#define _BCACHEFS_BTREE_GC_TYPES_H + +#include + +enum gc_phase { + GC_PHASE_not_running, + GC_PHASE_start, + GC_PHASE_sb, + GC_PHASE_btree, +}; + +struct 
gc_pos { + enum gc_phase phase:8; + enum btree_id btree:8; + u16 level; + struct bpos pos; +}; + +struct reflink_gc { + u64 offset; + u32 size; + u32 refcount; +}; + +typedef GENRADIX(struct reflink_gc) reflink_gc_table; + +#endif /* _BCACHEFS_BTREE_GC_TYPES_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index b26dc7424662..d8b9beca3776 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -908,7 +908,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && + if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; From f1d4fed13fa0c0f8c99cdbbc27460f1e1f46fd4e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 May 2024 19:17:09 -0400 Subject: [PATCH 140/279] bcachefs: Better fsck error message for key version Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e9e901feda29..dc97991bcd6a 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -585,16 +585,17 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, bkey_version_in_future, - "key version number higher than recorded: %llu > %llu", - k.k->version.lo, - atomic64_read(&c->key_version))) + "key version number higher than recorded %llu\n %s", + atomic64_read(&c->key_version), + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) atomic64_set(&c->key_version, k.k->version.lo); } if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), c, btree_bitmap_not_marked, "btree ptr not marked in member info btree allocated bitmap\n %s", - (bch2_bkey_val_to_text(&buf, c, k), + (printbuf_reset(&buf), + 
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { mutex_lock(&c->sb_lock); bch2_dev_btree_bitmap_mark(c, k); From be647e2c76b27f409cdd520f66c95be888b553a3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 May 2024 06:41:45 -0700 Subject: [PATCH 141/279] nvme: use srcu for iterating namespace list The nvme pci driver synchronizes with all the namespace queues during a reset to ensure that there's no pending timeout work. Meanwhile the timeout work potentially iterates those same namespaces to freeze their queues. Each of those namespace iterations use the same read lock. If a write lock should somehow get between the synchronize and freeze steps, then forward progress is deadlocked. We had been relying on the nvme controller state machine to ensure the reset work wouldn't conflict with timeout work. That guarantee may be a bit fragile to rely on, so iterate the namespace lists without taking potentially circular locks, as reported by lockdep. Link: https://lore.kernel.org/all/20220930001943.zdbvolc3gkekfmcv@shindev/ Reported-by: Shinichiro Kawasaki Tested-by: Shinichiro Kawasaki Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 99 +++++++++++++++++++++-------------- drivers/nvme/host/ioctl.c | 15 +++--- drivers/nvme/host/multipath.c | 21 ++++---- drivers/nvme/host/nvme.h | 4 +- 4 files changed, 83 insertions(+), 56 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7706df237349..f5d150c62955 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -678,7 +678,7 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } -static inline bool nvme_get_ns(struct nvme_ns *ns) +bool nvme_get_ns(struct nvme_ns *ns) { return kref_get_unless_zero(&ns->kref); } @@ -3684,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns, *ret = 
NULL; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { if (ns->head->ns_id == nsid) { if (!nvme_get_ns(ns)) continue; @@ -3696,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (ns->head->ns_id > nsid) break; } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return ret; } EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); @@ -3710,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) { if (tmp->head->ns_id < ns->head->ns_id) { - list_add(&ns->list, &tmp->list); + list_add_rcu(&ns->list, &tmp->list); return; } } @@ -3776,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) if (nvme_update_ns_info(ns, info)) goto out_unlink_ns; - down_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); /* * Ensure that no namespaces are added to the ctrl list after the queues * are frozen, thereby avoiding a deadlock between scan and reset. 
*/ if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) { - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); goto out_unlink_ns; } nvme_ns_add_to_ctrl_list(ns); - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); nvme_get_ctrl(ctrl); if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups)) @@ -3809,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) out_cleanup_ns_from_list: nvme_put_ctrl(ctrl); - down_write(&ctrl->namespaces_rwsem); - list_del_init(&ns->list); - up_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); + list_del_rcu(&ns->list); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); @@ -3861,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_cdev_del(&ns->cdev, &ns->cdev_device); del_gendisk(ns->disk); - down_write(&ns->ctrl->namespaces_rwsem); - list_del_init(&ns->list); - up_write(&ns->ctrl->namespaces_rwsem); + mutex_lock(&ns->ctrl->namespaces_lock); + list_del_rcu(&ns->list); + mutex_unlock(&ns->ctrl->namespaces_lock); + synchronize_srcu(&ns->ctrl->srcu); if (last_path) nvme_mpath_shutdown_disk(ns->head); @@ -3953,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, struct nvme_ns *ns, *next; LIST_HEAD(rm_list); - down_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { if (ns->head->ns_id > nsid) - list_move_tail(&ns->list, &rm_list); + list_splice_init_rcu(&ns->list, &rm_list, + synchronize_rcu); } - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); list_for_each_entry_safe(ns, next, &rm_list, list) nvme_ns_remove(ns); - } static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) @@ -4132,9 +4137,10 @@ void nvme_remove_namespaces(struct 
nvme_ctrl *ctrl) /* this is a no-op when called from the controller reset handler */ nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); - down_write(&ctrl->namespaces_rwsem); - list_splice_init(&ctrl->namespaces, &ns_list); - up_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); + list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); list_for_each_entry_safe(ns, next, &ns_list, list) nvme_ns_remove(ns); @@ -4582,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev) key_put(ctrl->tls_key); nvme_free_cels(ctrl); nvme_mpath_uninit(ctrl); + cleanup_srcu_struct(&ctrl->srcu); nvme_auth_stop(ctrl); nvme_auth_free(ctrl); __free_page(ctrl->discard_page); @@ -4614,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->passthru_err_log_enabled = false; clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); spin_lock_init(&ctrl->lock); + mutex_init(&ctrl->namespaces_lock); + + ret = init_srcu_struct(&ctrl->srcu); + if (ret) + return ret; + mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); xa_init(&ctrl->cels); - init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; ctrl->quirks = quirks; @@ -4697,6 +4709,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, out: if (ctrl->discard_page) __free_page(ctrl->discard_page); + cleanup_srcu_struct(&ctrl->srcu); return ret; } EXPORT_SYMBOL_GPL(nvme_init_ctrl); @@ -4705,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl); void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mark_disk_dead(ns->disk); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead); void 
nvme_unfreeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mq_unfreeze_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); clear_bit(NVME_CTRL_FROZEN, &ctrl->flags); } EXPORT_SYMBOL_GPL(nvme_unfreeze); @@ -4728,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze); int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); if (timeout <= 0) break; } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return timeout; } EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); @@ -4743,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); void nvme_wait_freeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mq_freeze_queue_wait(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_wait_freeze); void nvme_start_freeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; set_bit(NVME_CTRL_FROZEN, &ctrl->flags); - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_freeze_queue_start(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_start_freeze); 
@@ -4802,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue); void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_sync_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_sync_io_queues); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 499a8bb7cac7..9d9d2a127c4e 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -789,15 +789,15 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, bool open_for_write) { struct nvme_ns *ns; - int ret; + int ret, srcu_idx; - down_read(&ctrl->namespaces_rwsem); + srcu_idx = srcu_read_lock(&ctrl->srcu); if (list_empty(&ctrl->namespaces)) { ret = -ENOTTY; goto out_unlock; } - ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list); if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { dev_warn(ctrl->device, "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); @@ -807,15 +807,18 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, dev_warn(ctrl->device, "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); - kref_get(&ns->kref); - up_read(&ctrl->namespaces_rwsem); + if (!nvme_get_ns(ns)) { + ret = -ENXIO; + goto out_unlock; + } + srcu_read_unlock(&ctrl->srcu, srcu_idx); ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write); nvme_put_ns(ns); return ret; out_unlock: - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return ret; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1bee176fd850..d8b6b4648eaf 100644 --- a/drivers/nvme/host/multipath.c +++ 
b/drivers/nvme/host/multipath.c @@ -151,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq) void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { if (!ns->head->disk) continue; kblockd_schedule_work(&ns->head->requeue_work); if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE) disk_uevent(ns->head->disk, KOBJ_CHANGE); } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } static const char *nvme_ana_state_names[] = { @@ -194,13 +195,14 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns) void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { nvme_mpath_clear_current_path(ns); kblockd_schedule_work(&ns->head->requeue_work); } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } void nvme_mpath_revalidate_paths(struct nvme_ns *ns) @@ -681,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; unsigned *nr_change_groups = data; struct nvme_ns *ns; + int srcu_idx; dev_dbg(ctrl->device, "ANA group %d: %s.\n", le32_to_cpu(desc->grpid), @@ -692,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (!nr_nsids) return 0; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { unsigned nsid; again: nsid = le32_to_cpu(desc->nsids[n]); @@ -706,7 +709,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (ns->head->ns_id > nsid) goto again; } 
- up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return 0; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c43a30753d87..f3a41133ac3f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -282,7 +282,8 @@ struct nvme_ctrl { struct blk_mq_tag_set *tagset; struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; - struct rw_semaphore namespaces_rwsem; + struct mutex namespaces_lock; + struct srcu_struct srcu; struct device ctrl_device; struct device *device; /* char device */ #ifdef CONFIG_NVME_HWMON @@ -1160,6 +1161,7 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects, struct nvme_command *cmd, int status); struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); +bool nvme_get_ns(struct nvme_ns *ns); void nvme_put_ns(struct nvme_ns *ns); static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) From c758b77d4a0a0ed3a1292b3fd7a2aeccd1a169a4 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 27 May 2024 22:38:52 +0300 Subject: [PATCH 142/279] nvmet: fix a possible leak when destroy a ctrl during qp establishment In nvmet_sq_destroy we capture sq->ctrl early and if it is non-NULL we know that a ctrl was allocated (in the admin connect request handler) and we need to release pending AERs, clear ctrl->sqs and sq->ctrl (for nvme-loop primarily), and drop the final reference on the ctrl. However, a small window is possible where nvmet_sq_destroy starts (as a result of the client giving up and disconnecting) concurrently with the nvme admin connect cmd (which may be in an early stage). But *before* kill_and_confirm of sq->ref (i.e. the admin connect managed to get an sq live reference). In this case, sq->ctrl was allocated however after it was captured in a local variable in nvmet_sq_destroy. This prevented the final reference drop on the ctrl. 
Solve this by re-capturing sq->ctrl after all inflight requests have completed, at which point the sq->ctrl reference is guaranteed to be final, and move forward based on that. This issue was observed in an environment with many hosts connecting multiple ctrls simultaneously, creating a delay in allocating a ctrl leading up to this race window. Reported-by: Alex Turin Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 2fde22323622..06f0c587f343 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -818,6 +818,15 @@ void nvmet_sq_destroy(struct nvmet_sq *sq) percpu_ref_exit(&sq->ref); nvmet_auth_sq_free(sq); + /* + * we must reference the ctrl again after waiting for inflight IO + * to complete. Because admin connect may have sneaked in after we + * store sq->ctrl locally, but before we killed the percpu_ref. the + * admin connect allocates and assigns sq->ctrl, which now needs a + * final ref put, as this ctrl is going away. 
+ */ + ctrl = sq->ctrl; + if (ctrl) { /* * The teardown flow may take some time, and the host may not From 016c22e410c6eabfc5164f514d2a0ad06eddf3ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 16:47:34 -0400 Subject: [PATCH 143/279] bcachefs: split out sb-members_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 102 +---------------------------- fs/bcachefs/sb-members_format.h | 110 ++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 101 deletions(-) create mode 100644 fs/bcachefs/sb-members_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d801e19cb489..c4d10b528b54 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -513,6 +513,7 @@ struct bch_sb_field { #include "snapshot_format.h" #include "subvolume_format.h" #include "sb-counters_format.h" +#include "sb-members_format.h" enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -545,106 +546,6 @@ struct bch_sb_field_journal_v2 { } d[]; }; -/* BCH_SB_FIELD_members_v1: */ - -#define BCH_MIN_NR_NBUCKETS (1 << 6) - -#define BCH_IOPS_MEASUREMENTS() \ - x(seqread, 0) \ - x(seqwrite, 1) \ - x(randread, 2) \ - x(randwrite, 3) - -enum bch_iops_measurement { -#define x(t, n) BCH_IOPS_##t = n, - BCH_IOPS_MEASUREMENTS() -#undef x - BCH_IOPS_NR -}; - -#define BCH_MEMBER_ERROR_TYPES() \ - x(read, 0) \ - x(write, 1) \ - x(checksum, 2) - -enum bch_member_error_type { -#define x(t, n) BCH_MEMBER_ERROR_##t = n, - BCH_MEMBER_ERROR_TYPES() -#undef x - BCH_MEMBER_ERROR_NR -}; - -struct bch_member { - __uuid_t uuid; - __le64 nbuckets; /* device size */ - __le16 first_bucket; /* index of first bucket used */ - __le16 bucket_size; /* sectors */ - __u8 btree_bitmap_shift; - __u8 pad[3]; - __le64 last_mount; /* time_t */ - - __le64 flags; - __le32 iops[4]; - __le64 errors[BCH_MEMBER_ERROR_NR]; - __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; - __le64 errors_reset_time; - __le64 seq; - __le64 
btree_allocated_bitmap; - /* - * On recovery from a clean shutdown we don't normally read the journal, - * but we still want to resume writing from where we left off so we - * don't overwrite more than is necessary, for list journal debugging: - */ - __le32 last_journal_bucket; - __le32 last_journal_bucket_offset; -}; - -/* - * This limit comes from the bucket_gens array - it's a single allocation, and - * kernel allocation are limited to INT_MAX - */ -#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) - -#define BCH_MEMBER_V1_BYTES 56 - -LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) -/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) -LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) -LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) -LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) -LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, - struct bch_member, flags, 30, 31) - -#if 0 -LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -#endif - -#define BCH_MEMBER_STATES() \ - x(rw, 0) \ - x(ro, 1) \ - x(failed, 2) \ - x(spare, 3) - -enum bch_member_state { -#define x(t, n) BCH_MEMBER_STATE_##t = n, - BCH_MEMBER_STATES() -#undef x - BCH_MEMBER_STATE_NR -}; - -struct bch_sb_field_members_v1 { - struct bch_sb_field field; - struct bch_member _members[]; //Members are now variable size -}; - -struct bch_sb_field_members_v2 { - struct bch_sb_field field; - __le16 member_bytes; //size of single member entry - u8 pad[6]; - struct bch_member _members[]; -}; /* BCH_SB_FIELD_crypt: */ @@ -909,7 +810,6 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) #define BCH_SB_SECTOR 8 -#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ #define 
BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */ diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h new file mode 100644 index 000000000000..e2630548c0f6 --- /dev/null +++ b/fs/bcachefs/sb-members_format.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H +#define _BCACHEFS_SB_MEMBERS_FORMAT_H + +/* + * We refer to members with bitmasks in various places - but we need to get rid + * of this limit: + */ +#define BCH_SB_MEMBERS_MAX 64 + +#define BCH_MIN_NR_NBUCKETS (1 << 6) + +#define BCH_IOPS_MEASUREMENTS() \ + x(seqread, 0) \ + x(seqwrite, 1) \ + x(randread, 2) \ + x(randwrite, 3) + +enum bch_iops_measurement { +#define x(t, n) BCH_IOPS_##t = n, + BCH_IOPS_MEASUREMENTS() +#undef x + BCH_IOPS_NR +}; + +#define BCH_MEMBER_ERROR_TYPES() \ + x(read, 0) \ + x(write, 1) \ + x(checksum, 2) + +enum bch_member_error_type { +#define x(t, n) BCH_MEMBER_ERROR_##t = n, + BCH_MEMBER_ERROR_TYPES() +#undef x + BCH_MEMBER_ERROR_NR +}; + +struct bch_member { + __uuid_t uuid; + __le64 nbuckets; /* device size */ + __le16 first_bucket; /* index of first bucket used */ + __le16 bucket_size; /* sectors */ + __u8 btree_bitmap_shift; + __u8 pad[3]; + __le64 last_mount; /* time_t */ + + __le64 flags; + __le32 iops[4]; + __le64 errors[BCH_MEMBER_ERROR_NR]; + __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; + __le64 errors_reset_time; + __le64 seq; + __le64 btree_allocated_bitmap; + /* + * On recovery from a clean shutdown we don't normally read the journal, + * but we still want to resume writing from where we left off so we + * don't overwrite more than is necessary, for list journal debugging: + */ + __le32 last_journal_bucket; + __le32 last_journal_bucket_offset; +}; + +/* + * This limit comes from the bucket_gens array - it's a single allocation, and + * kernel allocation are limited to INT_MAX + */ +#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) + +#define BCH_MEMBER_V1_BYTES 56 + +LE64_BITMASK(BCH_MEMBER_STATE, struct 
bch_member, flags, 0, 4) +/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) +LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags, 30, 31) + +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); +#endif + +#define BCH_MEMBER_STATES() \ + x(rw, 0) \ + x(ro, 1) \ + x(failed, 2) \ + x(spare, 3) + +enum bch_member_state { +#define x(t, n) BCH_MEMBER_STATE_##t = n, + BCH_MEMBER_STATES() +#undef x + BCH_MEMBER_STATE_NR +}; + +struct bch_sb_field_members_v1 { + struct bch_sb_field field; + struct bch_member _members[]; //Members are now variable size +}; + +struct bch_sb_field_members_v2 { + struct bch_sb_field field; + __le16 member_bytes; //size of single member entry + u8 pad[6]; + struct bch_member _members[]; +}; + +#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */ From 4c5eef0c50ccc71e225f320835dc7cd51f64f961 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 16:54:29 -0400 Subject: [PATCH 144/279] bcachefs: split out sb-downgrade_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 14 +------------- fs/bcachefs/sb-downgrade_format.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 13 deletions(-) create mode 100644 fs/bcachefs/sb-downgrade_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c4d10b528b54..c966eb7e37b8 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -513,6 +513,7 @@ struct bch_sb_field { #include "snapshot_format.h" #include "subvolume_format.h" #include "sb-counters_format.h" +#include "sb-downgrade_format.h" #include 
"sb-members_format.h" enum bch_sb_field_type { @@ -546,7 +547,6 @@ struct bch_sb_field_journal_v2 { } d[]; }; - /* BCH_SB_FIELD_crypt: */ struct nonce { @@ -738,18 +738,6 @@ struct bch_sb_field_ext { __le64 btrees_lost_data; }; -struct bch_sb_field_downgrade_entry { - __le16 version; - __le64 recovery_passes[2]; - __le16 nr_errors; - __le16 errors[] __counted_by(nr_errors); -} __packed __aligned(2); - -struct bch_sb_field_downgrade { - struct bch_sb_field field; - struct bch_sb_field_downgrade_entry entries[]; -}; - /* Superblock: */ /* diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h new file mode 100644 index 000000000000..cffd932be3ec --- /dev/null +++ b/fs/bcachefs/sb-downgrade_format.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H +#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H + +struct bch_sb_field_downgrade_entry { + __le16 version; + __le64 recovery_passes[2]; + __le16 nr_errors; + __le16 errors[] __counted_by(nr_errors); +} __packed __aligned(2); + +struct bch_sb_field_downgrade { + struct bch_sb_field field; + struct bch_sb_field_downgrade_entry entries[]; +}; + +#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */ From 1cdcc6e3c2cb69a235bd32eb0da51205e432f77f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 17:06:51 -0400 Subject: [PATCH 145/279] bcachefs: Split out disk_groups_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 19 +------------------ fs/bcachefs/disk_groups_format.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 18 deletions(-) create mode 100644 fs/bcachefs/disk_groups_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c966eb7e37b8..b7690fc3ffc6 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -507,6 +507,7 @@ struct bch_sb_field { #include "ec_format.h" #include "inode_format.h" #include "dirent_format.h" +#include 
"disk_groups_format.h" #include "xattr_format.h" #include "quota_format.h" #include "logged_ops_format.h" @@ -665,24 +666,6 @@ struct bch_sb_field_replicas { struct bch_replicas_entry_v1 entries[]; } __packed __aligned(8); -/* BCH_SB_FIELD_disk_groups: */ - -#define BCH_SB_LABEL_SIZE 32 - -struct bch_disk_group { - __u8 label[BCH_SB_LABEL_SIZE]; - __le64 flags[2]; -} __packed __aligned(8); - -LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) -LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) - -struct bch_sb_field_disk_groups { - struct bch_sb_field field; - struct bch_disk_group entries[]; -} __packed __aligned(8); - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h new file mode 100644 index 000000000000..698990bbf1d2 --- /dev/null +++ b/fs/bcachefs/disk_groups_format.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H +#define _BCACHEFS_DISK_GROUPS_FORMAT_H + +#define BCH_SB_LABEL_SIZE 32 + +struct bch_disk_group { + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 flags[2]; +} __packed __aligned(8); + +LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) +LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) +LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) + +struct bch_sb_field_disk_groups { + struct bch_sb_field field; + struct bch_disk_group entries[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */ From 24998050b69ac1f4caa41b66dbd245f4de366b3c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 17:13:52 -0400 Subject: [PATCH 146/279] bcachefs: Split out replicas_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 37 
+++++------------------------------ fs/bcachefs/replicas_format.h | 31 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 32 deletions(-) create mode 100644 fs/bcachefs/replicas_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b7690fc3ffc6..66df1e5c2a87 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -503,19 +503,20 @@ struct bch_sb_field { #include "alloc_background_format.h" #include "extents_format.h" -#include "reflink_format.h" #include "ec_format.h" -#include "inode_format.h" #include "dirent_format.h" #include "disk_groups_format.h" -#include "xattr_format.h" -#include "quota_format.h" +#include "inode_format.h" #include "logged_ops_format.h" +#include "quota_format.h" +#include "reflink_format.h" +#include "replicas_format.h" #include "snapshot_format.h" #include "subvolume_format.h" #include "sb-counters_format.h" #include "sb-downgrade_format.h" #include "sb-members_format.h" +#include "xattr_format.h" enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -596,8 +597,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -/* BCH_SB_FIELD_replicas: */ - #define BCH_DATA_TYPES() \ x(free, 0) \ x(sb, 1) \ @@ -640,32 +639,6 @@ static inline bool data_type_is_hidden(enum bch_data_type type) } } -struct bch_replicas_entry_v0 { - __u8 data_type; - __u8 nr_devs; - __u8 devs[]; -} __packed; - -struct bch_sb_field_replicas_v0 { - struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[]; -} __packed __aligned(8); - -struct bch_replicas_entry_v1 { - __u8 data_type; - __u8 nr_devs; - __u8 nr_required; - __u8 devs[]; -} __packed; - -#define replicas_entry_bytes(_i) \ - (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) - -struct bch_sb_field_replicas { - struct bch_sb_field field; 
- struct bch_replicas_entry_v1 entries[]; -} __packed __aligned(8); - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h new file mode 100644 index 000000000000..b97208195d06 --- /dev/null +++ b/fs/bcachefs/replicas_format.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_FORMAT_H +#define _BCACHEFS_REPLICAS_FORMAT_H + +struct bch_replicas_entry_v0 { + __u8 data_type; + __u8 nr_devs; + __u8 devs[]; +} __packed; + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; + struct bch_replicas_entry_v0 entries[]; +} __packed __aligned(8); + +struct bch_replicas_entry_v1 { + __u8 data_type; + __u8 nr_devs; + __u8 nr_required; + __u8 devs[]; +} __packed; + +struct bch_sb_field_replicas { + struct bch_sb_field field; + struct bch_replicas_entry_v1 entries[]; +} __packed __aligned(8); + +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + +#endif /* _BCACHEFS_REPLICAS_FORMAT_H */ From 5c16c5748894652e8013dbcb27e54e8319c53b00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 17:20:40 -0400 Subject: [PATCH 147/279] bcachefs: Split out journal_seq_blacklist_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 11 +---------- fs/bcachefs/journal_seq_blacklist_format.h | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 10 deletions(-) create mode 100644 fs/bcachefs/journal_seq_blacklist_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 66df1e5c2a87..89ac286f17f0 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -507,6 +507,7 @@ struct bch_sb_field { #include "dirent_format.h" #include "disk_groups_format.h" #include "inode_format.h" +#include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" #include "quota_format.h" #include "reflink_format.h" @@ 
-666,16 +667,6 @@ struct bch_sb_field_clean { __u64 _data[]; }; -struct journal_seq_blacklist_entry { - __le64 start; - __le64 end; -}; - -struct bch_sb_field_journal_seq_blacklist { - struct bch_sb_field field; - struct journal_seq_blacklist_entry start[]; -}; - struct bch_sb_field_errors { struct bch_sb_field field; struct bch_sb_field_error_entry { diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h new file mode 100644 index 000000000000..2566b12dbc04 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist_format.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H +#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H + +struct journal_seq_blacklist_entry { + __le64 start; + __le64 end; +}; + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; + struct journal_seq_blacklist_entry start[]; +}; + +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */ From 759bb4eabc727077145f3173f8ef6c2ac745c3d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 17:23:01 -0400 Subject: [PATCH 148/279] bcachefs: Split out sb-errors_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 12 +- fs/bcachefs/sb-errors_format.h | 296 +++++++++++++++++++++++++++++++++ fs/bcachefs/sb-errors_types.h | 281 ------------------------------- 3 files changed, 297 insertions(+), 292 deletions(-) create mode 100644 fs/bcachefs/sb-errors_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 89ac286f17f0..90c12fe2a2cd 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -516,6 +516,7 @@ struct bch_sb_field { #include "subvolume_format.h" #include "sb-counters_format.h" #include "sb-downgrade_format.h" +#include "sb-errors_format.h" #include "sb-members_format.h" #include "xattr_format.h" @@ -667,17 +668,6 @@ struct bch_sb_field_clean { __u64 _data[]; }; -struct 
bch_sb_field_errors { - struct bch_sb_field field; - struct bch_sb_field_error_entry { - __le64 v; - __le64 last_error_time; - } entries[]; -}; - -LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); -LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); - struct bch_sb_field_ext { struct bch_sb_field field; __le64 recovery_passes_required[2]; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h new file mode 100644 index 000000000000..84d2763bd597 --- /dev/null +++ b/fs/bcachefs/sb-errors_format.h @@ -0,0 +1,296 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H +#define _BCACHEFS_SB_ERRORS_FORMAT_H + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0) \ + x(dirty_but_no_journal_entries, 1) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ + x(sb_clean_journal_seq_mismatch, 3) \ + x(sb_clean_btree_root_mismatch, 4) \ + x(sb_clean_missing, 5) \ + x(jset_unsupported_version, 6) \ + x(jset_unknown_csum, 7) \ + x(jset_last_seq_newer_than_seq, 8) \ + x(jset_past_bucket_end, 9) \ + x(jset_seq_blacklisted, 10) \ + x(journal_entries_missing, 11) \ + x(journal_entry_replicas_not_marked, 12) \ + x(journal_entry_past_jset_end, 13) \ + x(journal_entry_replicas_data_mismatch, 14) \ + x(journal_entry_bkey_u64s_0, 15) \ + x(journal_entry_bkey_past_end, 16) \ + x(journal_entry_bkey_bad_format, 17) \ + x(journal_entry_bkey_invalid, 18) \ + x(journal_entry_btree_root_bad_size, 19) \ + x(journal_entry_blacklist_bad_size, 20) \ + x(journal_entry_blacklist_v2_bad_size, 21) \ + x(journal_entry_blacklist_v2_start_past_end, 22) \ + x(journal_entry_usage_bad_size, 23) \ + x(journal_entry_data_usage_bad_size, 24) \ + x(journal_entry_clock_bad_size, 25) \ + x(journal_entry_clock_bad_rw, 26) \ + x(journal_entry_dev_usage_bad_size, 27) \ + x(journal_entry_dev_usage_bad_dev, 28) \ + x(journal_entry_dev_usage_bad_pad, 29) \ + x(btree_node_unreadable, 30) \ + 
x(btree_node_fault_injected, 31) \ + x(btree_node_bad_magic, 32) \ + x(btree_node_bad_seq, 33) \ + x(btree_node_unsupported_version, 34) \ + x(btree_node_bset_older_than_sb_min, 35) \ + x(btree_node_bset_newer_than_sb, 36) \ + x(btree_node_data_missing, 37) \ + x(btree_node_bset_after_end, 38) \ + x(btree_node_replicas_sectors_written_mismatch, 39) \ + x(btree_node_replicas_data_mismatch, 40) \ + x(bset_unknown_csum, 41) \ + x(bset_bad_csum, 42) \ + x(bset_past_end_of_btree_node, 43) \ + x(bset_wrong_sector_offset, 44) \ + x(bset_empty, 45) \ + x(bset_bad_seq, 46) \ + x(bset_blacklisted_journal_seq, 47) \ + x(first_bset_blacklisted_journal_seq, 48) \ + x(btree_node_bad_btree, 49) \ + x(btree_node_bad_level, 50) \ + x(btree_node_bad_min_key, 51) \ + x(btree_node_bad_max_key, 52) \ + x(btree_node_bad_format, 53) \ + x(btree_node_bkey_past_bset_end, 54) \ + x(btree_node_bkey_bad_format, 55) \ + x(btree_node_bad_bkey, 56) \ + x(btree_node_bkey_out_of_order, 57) \ + x(btree_root_bkey_invalid, 58) \ + x(btree_root_read_error, 59) \ + x(btree_root_bad_min_key, 60) \ + x(btree_root_bad_max_key, 61) \ + x(btree_node_read_error, 62) \ + x(btree_node_topology_bad_min_key, 63) \ + x(btree_node_topology_bad_max_key, 64) \ + x(btree_node_topology_overwritten_by_prev_node, 65) \ + x(btree_node_topology_overwritten_by_next_node, 66) \ + x(btree_node_topology_interior_node_empty, 67) \ + x(fs_usage_hidden_wrong, 68) \ + x(fs_usage_btree_wrong, 69) \ + x(fs_usage_data_wrong, 70) \ + x(fs_usage_cached_wrong, 71) \ + x(fs_usage_reserved_wrong, 72) \ + x(fs_usage_persistent_reserved_wrong, 73) \ + x(fs_usage_nr_inodes_wrong, 74) \ + x(fs_usage_replicas_wrong, 75) \ + x(dev_usage_buckets_wrong, 76) \ + x(dev_usage_sectors_wrong, 77) \ + x(dev_usage_fragmented_wrong, 78) \ + x(dev_usage_buckets_ec_wrong, 79) \ + x(bkey_version_in_future, 80) \ + x(bkey_u64s_too_small, 81) \ + x(bkey_invalid_type_for_btree, 82) \ + x(bkey_extent_size_zero, 83) \ + x(bkey_extent_size_greater_than_offset, 
84) \ + x(bkey_size_nonzero, 85) \ + x(bkey_snapshot_nonzero, 86) \ + x(bkey_snapshot_zero, 87) \ + x(bkey_at_pos_max, 88) \ + x(bkey_before_start_of_btree_node, 89) \ + x(bkey_after_end_of_btree_node, 90) \ + x(bkey_val_size_nonzero, 91) \ + x(bkey_val_size_too_small, 92) \ + x(alloc_v1_val_size_bad, 93) \ + x(alloc_v2_unpack_error, 94) \ + x(alloc_v3_unpack_error, 95) \ + x(alloc_v4_val_size_bad, 96) \ + x(alloc_v4_backpointers_start_bad, 97) \ + x(alloc_key_data_type_bad, 98) \ + x(alloc_key_empty_but_have_data, 99) \ + x(alloc_key_dirty_sectors_0, 100) \ + x(alloc_key_data_type_inconsistency, 101) \ + x(alloc_key_to_missing_dev_bucket, 102) \ + x(alloc_key_cached_inconsistency, 103) \ + x(alloc_key_cached_but_read_time_zero, 104) \ + x(alloc_key_to_missing_lru_entry, 105) \ + x(alloc_key_data_type_wrong, 106) \ + x(alloc_key_gen_wrong, 107) \ + x(alloc_key_dirty_sectors_wrong, 108) \ + x(alloc_key_cached_sectors_wrong, 109) \ + x(alloc_key_stripe_wrong, 110) \ + x(alloc_key_stripe_redundancy_wrong, 111) \ + x(bucket_sector_count_overflow, 112) \ + x(bucket_metadata_type_mismatch, 113) \ + x(need_discard_key_wrong, 114) \ + x(freespace_key_wrong, 115) \ + x(freespace_hole_missing, 116) \ + x(bucket_gens_val_size_bad, 117) \ + x(bucket_gens_key_wrong, 118) \ + x(bucket_gens_hole_wrong, 119) \ + x(bucket_gens_to_invalid_dev, 120) \ + x(bucket_gens_to_invalid_buckets, 121) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ + x(need_discard_freespace_key_bad, 124) \ + x(backpointer_bucket_offset_wrong, 125) \ + x(backpointer_to_missing_device, 126) \ + x(backpointer_to_missing_alloc, 127) \ + x(backpointer_to_missing_ptr, 128) \ + x(lru_entry_at_time_0, 129) \ + x(lru_entry_to_invalid_bucket, 130) \ + x(lru_entry_bad, 131) \ + x(btree_ptr_val_too_big, 132) \ + x(btree_ptr_v2_val_too_big, 133) \ + x(btree_ptr_has_non_ptr, 134) \ + x(extent_ptrs_invalid_entry, 135) \ + x(extent_ptrs_no_ptrs, 136) \ + 
x(extent_ptrs_too_many_ptrs, 137) \ + x(extent_ptrs_redundant_crc, 138) \ + x(extent_ptrs_redundant_stripe, 139) \ + x(extent_ptrs_unwritten, 140) \ + x(extent_ptrs_written_and_unwritten, 141) \ + x(ptr_to_invalid_device, 142) \ + x(ptr_to_duplicate_device, 143) \ + x(ptr_after_last_bucket, 144) \ + x(ptr_before_first_bucket, 145) \ + x(ptr_spans_multiple_buckets, 146) \ + x(ptr_to_missing_backpointer, 147) \ + x(ptr_to_missing_alloc_key, 148) \ + x(ptr_to_missing_replicas_entry, 149) \ + x(ptr_to_missing_stripe, 150) \ + x(ptr_to_incorrect_stripe, 151) \ + x(ptr_gen_newer_than_bucket_gen, 152) \ + x(ptr_too_stale, 153) \ + x(stale_dirty_ptr, 154) \ + x(ptr_bucket_data_type_mismatch, 155) \ + x(ptr_cached_and_erasure_coded, 156) \ + x(ptr_crc_uncompressed_size_too_small, 157) \ + x(ptr_crc_csum_type_unknown, 158) \ + x(ptr_crc_compression_type_unknown, 159) \ + x(ptr_crc_redundant, 160) \ + x(ptr_crc_uncompressed_size_too_big, 161) \ + x(ptr_crc_nonce_mismatch, 162) \ + x(ptr_stripe_redundant, 163) \ + x(reservation_key_nr_replicas_invalid, 164) \ + x(reflink_v_refcount_wrong, 165) \ + x(reflink_p_to_missing_reflink_v, 166) \ + x(stripe_pos_bad, 167) \ + x(stripe_val_size_bad, 168) \ + x(stripe_sector_count_wrong, 169) \ + x(snapshot_tree_pos_bad, 170) \ + x(snapshot_tree_to_missing_snapshot, 171) \ + x(snapshot_tree_to_missing_subvol, 172) \ + x(snapshot_tree_to_wrong_subvol, 173) \ + x(snapshot_tree_to_snapshot_subvol, 174) \ + x(snapshot_pos_bad, 175) \ + x(snapshot_parent_bad, 176) \ + x(snapshot_children_not_normalized, 177) \ + x(snapshot_child_duplicate, 178) \ + x(snapshot_child_bad, 179) \ + x(snapshot_skiplist_not_normalized, 180) \ + x(snapshot_skiplist_bad, 181) \ + x(snapshot_should_not_have_subvol, 182) \ + x(snapshot_to_bad_snapshot_tree, 183) \ + x(snapshot_bad_depth, 184) \ + x(snapshot_bad_skiplist, 185) \ + x(subvol_pos_bad, 186) \ + x(subvol_not_master_and_not_snapshot, 187) \ + x(subvol_to_missing_root, 188) \ + x(subvol_root_wrong_bi_subvol, 
189) \ + x(bkey_in_missing_snapshot, 190) \ + x(inode_pos_inode_nonzero, 191) \ + x(inode_pos_blockdev_range, 192) \ + x(inode_unpack_error, 193) \ + x(inode_str_hash_invalid, 194) \ + x(inode_v3_fields_start_bad, 195) \ + x(inode_snapshot_mismatch, 196) \ + x(inode_unlinked_but_clean, 197) \ + x(inode_unlinked_but_nlink_nonzero, 198) \ + x(inode_checksum_type_invalid, 199) \ + x(inode_compression_type_invalid, 200) \ + x(inode_subvol_root_but_not_dir, 201) \ + x(inode_i_size_dirty_but_clean, 202) \ + x(inode_i_sectors_dirty_but_clean, 203) \ + x(inode_i_sectors_wrong, 204) \ + x(inode_dir_wrong_nlink, 205) \ + x(inode_dir_multiple_links, 206) \ + x(inode_multiple_links_but_nlink_0, 207) \ + x(inode_wrong_backpointer, 208) \ + x(inode_wrong_nlink, 209) \ + x(inode_unreachable, 210) \ + x(deleted_inode_but_clean, 211) \ + x(deleted_inode_missing, 212) \ + x(deleted_inode_is_dir, 213) \ + x(deleted_inode_not_unlinked, 214) \ + x(extent_overlapping, 215) \ + x(extent_in_missing_inode, 216) \ + x(extent_in_non_reg_inode, 217) \ + x(extent_past_end_of_inode, 218) \ + x(dirent_empty_name, 219) \ + x(dirent_val_too_big, 220) \ + x(dirent_name_too_long, 221) \ + x(dirent_name_embedded_nul, 222) \ + x(dirent_name_dot_or_dotdot, 223) \ + x(dirent_name_has_slash, 224) \ + x(dirent_d_type_wrong, 225) \ + x(inode_bi_parent_wrong, 226) \ + x(dirent_in_missing_dir_inode, 227) \ + x(dirent_in_non_dir_inode, 228) \ + x(dirent_to_missing_inode, 229) \ + x(dirent_to_missing_subvol, 230) \ + x(dirent_to_itself, 231) \ + x(quota_type_invalid, 232) \ + x(xattr_val_size_too_small, 233) \ + x(xattr_val_size_too_big, 234) \ + x(xattr_invalid_type, 235) \ + x(xattr_name_invalid_chars, 236) \ + x(xattr_in_missing_inode, 237) \ + x(root_subvol_missing, 238) \ + x(root_dir_missing, 239) \ + x(root_inode_not_dir, 240) \ + x(dir_loop, 241) \ + x(hash_table_key_duplicate, 242) \ + x(hash_table_key_wrong_offset, 243) \ + x(unlinked_inode_not_on_deleted_list, 244) \ + x(reflink_p_front_pad_bad, 
245) \ + x(journal_entry_dup_same_device, 246) \ + x(inode_bi_subvol_missing, 247) \ + x(inode_bi_subvol_wrong, 248) \ + x(inode_points_to_missing_dirent, 249) \ + x(inode_points_to_wrong_dirent, 250) \ + x(inode_bi_parent_nonzero, 251) \ + x(dirent_to_missing_parent_subvol, 252) \ + x(dirent_not_visible_in_parent_subvol, 253) \ + x(subvol_fs_path_parent_wrong, 254) \ + x(subvol_root_fs_path_parent_nonzero, 255) \ + x(subvol_children_not_set, 256) \ + x(subvol_children_bad, 257) \ + x(subvol_loop, 258) \ + x(subvol_unreachable, 259) \ + x(btree_node_bkey_bad_u64s, 260) \ + x(btree_node_topology_empty_interior_node, 261) \ + x(btree_ptr_v2_min_key_bad, 262) \ + x(btree_root_unreadable_and_scan_found_nothing, 263) \ + x(snapshot_node_missing, 264) \ + x(dup_backpointer_to_bad_csum_extent, 265) \ + x(btree_bitmap_not_marked, 266) \ + x(sb_clean_entry_overrun, 267) \ + x(btree_ptr_v2_written_0, 268) \ + x(subvol_snapshot_bad, 269) \ + x(subvol_inode_bad, 270) + +enum bch_sb_error_id { +#define x(t, n) BCH_FSCK_ERR_##t = n, + BCH_SB_ERRS() +#undef x + BCH_SB_ERR_MAX +}; + +struct bch_sb_field_errors { + struct bch_sb_field field; + struct bch_sb_field_error_entry { + __le64 v; + __le64 last_error_time; + } entries[]; +}; + +LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); +LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); + +#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 666599d3fb9d..40325239c3b0 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -4,286 +4,6 @@ #include "darray.h" -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0) \ - x(dirty_but_no_journal_entries, 1) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ - x(sb_clean_journal_seq_mismatch, 3) \ - x(sb_clean_btree_root_mismatch, 4) \ - x(sb_clean_missing, 5) \ - x(jset_unsupported_version, 6) \ - x(jset_unknown_csum, 7) 
\ - x(jset_last_seq_newer_than_seq, 8) \ - x(jset_past_bucket_end, 9) \ - x(jset_seq_blacklisted, 10) \ - x(journal_entries_missing, 11) \ - x(journal_entry_replicas_not_marked, 12) \ - x(journal_entry_past_jset_end, 13) \ - x(journal_entry_replicas_data_mismatch, 14) \ - x(journal_entry_bkey_u64s_0, 15) \ - x(journal_entry_bkey_past_end, 16) \ - x(journal_entry_bkey_bad_format, 17) \ - x(journal_entry_bkey_invalid, 18) \ - x(journal_entry_btree_root_bad_size, 19) \ - x(journal_entry_blacklist_bad_size, 20) \ - x(journal_entry_blacklist_v2_bad_size, 21) \ - x(journal_entry_blacklist_v2_start_past_end, 22) \ - x(journal_entry_usage_bad_size, 23) \ - x(journal_entry_data_usage_bad_size, 24) \ - x(journal_entry_clock_bad_size, 25) \ - x(journal_entry_clock_bad_rw, 26) \ - x(journal_entry_dev_usage_bad_size, 27) \ - x(journal_entry_dev_usage_bad_dev, 28) \ - x(journal_entry_dev_usage_bad_pad, 29) \ - x(btree_node_unreadable, 30) \ - x(btree_node_fault_injected, 31) \ - x(btree_node_bad_magic, 32) \ - x(btree_node_bad_seq, 33) \ - x(btree_node_unsupported_version, 34) \ - x(btree_node_bset_older_than_sb_min, 35) \ - x(btree_node_bset_newer_than_sb, 36) \ - x(btree_node_data_missing, 37) \ - x(btree_node_bset_after_end, 38) \ - x(btree_node_replicas_sectors_written_mismatch, 39) \ - x(btree_node_replicas_data_mismatch, 40) \ - x(bset_unknown_csum, 41) \ - x(bset_bad_csum, 42) \ - x(bset_past_end_of_btree_node, 43) \ - x(bset_wrong_sector_offset, 44) \ - x(bset_empty, 45) \ - x(bset_bad_seq, 46) \ - x(bset_blacklisted_journal_seq, 47) \ - x(first_bset_blacklisted_journal_seq, 48) \ - x(btree_node_bad_btree, 49) \ - x(btree_node_bad_level, 50) \ - x(btree_node_bad_min_key, 51) \ - x(btree_node_bad_max_key, 52) \ - x(btree_node_bad_format, 53) \ - x(btree_node_bkey_past_bset_end, 54) \ - x(btree_node_bkey_bad_format, 55) \ - x(btree_node_bad_bkey, 56) \ - x(btree_node_bkey_out_of_order, 57) \ - x(btree_root_bkey_invalid, 58) \ - x(btree_root_read_error, 59) \ - 
x(btree_root_bad_min_key, 60) \ - x(btree_root_bad_max_key, 61) \ - x(btree_node_read_error, 62) \ - x(btree_node_topology_bad_min_key, 63) \ - x(btree_node_topology_bad_max_key, 64) \ - x(btree_node_topology_overwritten_by_prev_node, 65) \ - x(btree_node_topology_overwritten_by_next_node, 66) \ - x(btree_node_topology_interior_node_empty, 67) \ - x(fs_usage_hidden_wrong, 68) \ - x(fs_usage_btree_wrong, 69) \ - x(fs_usage_data_wrong, 70) \ - x(fs_usage_cached_wrong, 71) \ - x(fs_usage_reserved_wrong, 72) \ - x(fs_usage_persistent_reserved_wrong, 73) \ - x(fs_usage_nr_inodes_wrong, 74) \ - x(fs_usage_replicas_wrong, 75) \ - x(dev_usage_buckets_wrong, 76) \ - x(dev_usage_sectors_wrong, 77) \ - x(dev_usage_fragmented_wrong, 78) \ - x(dev_usage_buckets_ec_wrong, 79) \ - x(bkey_version_in_future, 80) \ - x(bkey_u64s_too_small, 81) \ - x(bkey_invalid_type_for_btree, 82) \ - x(bkey_extent_size_zero, 83) \ - x(bkey_extent_size_greater_than_offset, 84) \ - x(bkey_size_nonzero, 85) \ - x(bkey_snapshot_nonzero, 86) \ - x(bkey_snapshot_zero, 87) \ - x(bkey_at_pos_max, 88) \ - x(bkey_before_start_of_btree_node, 89) \ - x(bkey_after_end_of_btree_node, 90) \ - x(bkey_val_size_nonzero, 91) \ - x(bkey_val_size_too_small, 92) \ - x(alloc_v1_val_size_bad, 93) \ - x(alloc_v2_unpack_error, 94) \ - x(alloc_v3_unpack_error, 95) \ - x(alloc_v4_val_size_bad, 96) \ - x(alloc_v4_backpointers_start_bad, 97) \ - x(alloc_key_data_type_bad, 98) \ - x(alloc_key_empty_but_have_data, 99) \ - x(alloc_key_dirty_sectors_0, 100) \ - x(alloc_key_data_type_inconsistency, 101) \ - x(alloc_key_to_missing_dev_bucket, 102) \ - x(alloc_key_cached_inconsistency, 103) \ - x(alloc_key_cached_but_read_time_zero, 104) \ - x(alloc_key_to_missing_lru_entry, 105) \ - x(alloc_key_data_type_wrong, 106) \ - x(alloc_key_gen_wrong, 107) \ - x(alloc_key_dirty_sectors_wrong, 108) \ - x(alloc_key_cached_sectors_wrong, 109) \ - x(alloc_key_stripe_wrong, 110) \ - x(alloc_key_stripe_redundancy_wrong, 111) \ - 
x(bucket_sector_count_overflow, 112) \ - x(bucket_metadata_type_mismatch, 113) \ - x(need_discard_key_wrong, 114) \ - x(freespace_key_wrong, 115) \ - x(freespace_hole_missing, 116) \ - x(bucket_gens_val_size_bad, 117) \ - x(bucket_gens_key_wrong, 118) \ - x(bucket_gens_hole_wrong, 119) \ - x(bucket_gens_to_invalid_dev, 120) \ - x(bucket_gens_to_invalid_buckets, 121) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ - x(need_discard_freespace_key_bad, 124) \ - x(backpointer_bucket_offset_wrong, 125) \ - x(backpointer_to_missing_device, 126) \ - x(backpointer_to_missing_alloc, 127) \ - x(backpointer_to_missing_ptr, 128) \ - x(lru_entry_at_time_0, 129) \ - x(lru_entry_to_invalid_bucket, 130) \ - x(lru_entry_bad, 131) \ - x(btree_ptr_val_too_big, 132) \ - x(btree_ptr_v2_val_too_big, 133) \ - x(btree_ptr_has_non_ptr, 134) \ - x(extent_ptrs_invalid_entry, 135) \ - x(extent_ptrs_no_ptrs, 136) \ - x(extent_ptrs_too_many_ptrs, 137) \ - x(extent_ptrs_redundant_crc, 138) \ - x(extent_ptrs_redundant_stripe, 139) \ - x(extent_ptrs_unwritten, 140) \ - x(extent_ptrs_written_and_unwritten, 141) \ - x(ptr_to_invalid_device, 142) \ - x(ptr_to_duplicate_device, 143) \ - x(ptr_after_last_bucket, 144) \ - x(ptr_before_first_bucket, 145) \ - x(ptr_spans_multiple_buckets, 146) \ - x(ptr_to_missing_backpointer, 147) \ - x(ptr_to_missing_alloc_key, 148) \ - x(ptr_to_missing_replicas_entry, 149) \ - x(ptr_to_missing_stripe, 150) \ - x(ptr_to_incorrect_stripe, 151) \ - x(ptr_gen_newer_than_bucket_gen, 152) \ - x(ptr_too_stale, 153) \ - x(stale_dirty_ptr, 154) \ - x(ptr_bucket_data_type_mismatch, 155) \ - x(ptr_cached_and_erasure_coded, 156) \ - x(ptr_crc_uncompressed_size_too_small, 157) \ - x(ptr_crc_csum_type_unknown, 158) \ - x(ptr_crc_compression_type_unknown, 159) \ - x(ptr_crc_redundant, 160) \ - x(ptr_crc_uncompressed_size_too_big, 161) \ - x(ptr_crc_nonce_mismatch, 162) \ - x(ptr_stripe_redundant, 163) \ - 
x(reservation_key_nr_replicas_invalid, 164) \ - x(reflink_v_refcount_wrong, 165) \ - x(reflink_p_to_missing_reflink_v, 166) \ - x(stripe_pos_bad, 167) \ - x(stripe_val_size_bad, 168) \ - x(stripe_sector_count_wrong, 169) \ - x(snapshot_tree_pos_bad, 170) \ - x(snapshot_tree_to_missing_snapshot, 171) \ - x(snapshot_tree_to_missing_subvol, 172) \ - x(snapshot_tree_to_wrong_subvol, 173) \ - x(snapshot_tree_to_snapshot_subvol, 174) \ - x(snapshot_pos_bad, 175) \ - x(snapshot_parent_bad, 176) \ - x(snapshot_children_not_normalized, 177) \ - x(snapshot_child_duplicate, 178) \ - x(snapshot_child_bad, 179) \ - x(snapshot_skiplist_not_normalized, 180) \ - x(snapshot_skiplist_bad, 181) \ - x(snapshot_should_not_have_subvol, 182) \ - x(snapshot_to_bad_snapshot_tree, 183) \ - x(snapshot_bad_depth, 184) \ - x(snapshot_bad_skiplist, 185) \ - x(subvol_pos_bad, 186) \ - x(subvol_not_master_and_not_snapshot, 187) \ - x(subvol_to_missing_root, 188) \ - x(subvol_root_wrong_bi_subvol, 189) \ - x(bkey_in_missing_snapshot, 190) \ - x(inode_pos_inode_nonzero, 191) \ - x(inode_pos_blockdev_range, 192) \ - x(inode_unpack_error, 193) \ - x(inode_str_hash_invalid, 194) \ - x(inode_v3_fields_start_bad, 195) \ - x(inode_snapshot_mismatch, 196) \ - x(inode_unlinked_but_clean, 197) \ - x(inode_unlinked_but_nlink_nonzero, 198) \ - x(inode_checksum_type_invalid, 199) \ - x(inode_compression_type_invalid, 200) \ - x(inode_subvol_root_but_not_dir, 201) \ - x(inode_i_size_dirty_but_clean, 202) \ - x(inode_i_sectors_dirty_but_clean, 203) \ - x(inode_i_sectors_wrong, 204) \ - x(inode_dir_wrong_nlink, 205) \ - x(inode_dir_multiple_links, 206) \ - x(inode_multiple_links_but_nlink_0, 207) \ - x(inode_wrong_backpointer, 208) \ - x(inode_wrong_nlink, 209) \ - x(inode_unreachable, 210) \ - x(deleted_inode_but_clean, 211) \ - x(deleted_inode_missing, 212) \ - x(deleted_inode_is_dir, 213) \ - x(deleted_inode_not_unlinked, 214) \ - x(extent_overlapping, 215) \ - x(extent_in_missing_inode, 216) \ - 
x(extent_in_non_reg_inode, 217) \ - x(extent_past_end_of_inode, 218) \ - x(dirent_empty_name, 219) \ - x(dirent_val_too_big, 220) \ - x(dirent_name_too_long, 221) \ - x(dirent_name_embedded_nul, 222) \ - x(dirent_name_dot_or_dotdot, 223) \ - x(dirent_name_has_slash, 224) \ - x(dirent_d_type_wrong, 225) \ - x(inode_bi_parent_wrong, 226) \ - x(dirent_in_missing_dir_inode, 227) \ - x(dirent_in_non_dir_inode, 228) \ - x(dirent_to_missing_inode, 229) \ - x(dirent_to_missing_subvol, 230) \ - x(dirent_to_itself, 231) \ - x(quota_type_invalid, 232) \ - x(xattr_val_size_too_small, 233) \ - x(xattr_val_size_too_big, 234) \ - x(xattr_invalid_type, 235) \ - x(xattr_name_invalid_chars, 236) \ - x(xattr_in_missing_inode, 237) \ - x(root_subvol_missing, 238) \ - x(root_dir_missing, 239) \ - x(root_inode_not_dir, 240) \ - x(dir_loop, 241) \ - x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) \ - x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) \ - x(journal_entry_dup_same_device, 246) \ - x(inode_bi_subvol_missing, 247) \ - x(inode_bi_subvol_wrong, 248) \ - x(inode_points_to_missing_dirent, 249) \ - x(inode_points_to_wrong_dirent, 250) \ - x(inode_bi_parent_nonzero, 251) \ - x(dirent_to_missing_parent_subvol, 252) \ - x(dirent_not_visible_in_parent_subvol, 253) \ - x(subvol_fs_path_parent_wrong, 254) \ - x(subvol_root_fs_path_parent_nonzero, 255) \ - x(subvol_children_not_set, 256) \ - x(subvol_children_bad, 257) \ - x(subvol_loop, 258) \ - x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) \ - x(btree_node_topology_empty_interior_node, 261) \ - x(btree_ptr_v2_min_key_bad, 262) \ - x(btree_root_unreadable_and_scan_found_nothing, 263) \ - x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) \ - x(btree_bitmap_not_marked, 266) \ - x(sb_clean_entry_overrun, 267) \ - x(btree_ptr_v2_written_0, 268) \ - x(subvol_snapshot_bad, 269) \ - x(subvol_inode_bad, 270) - -enum bch_sb_error_id { -#define x(t, 
n) BCH_FSCK_ERR_##t = n, - BCH_SB_ERRS() -#undef x - BCH_SB_ERR_MAX -}; - struct bch_sb_error_entry_cpu { u64 id:16, nr:48; @@ -293,4 +13,3 @@ struct bch_sb_error_entry_cpu { typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; #endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ - From 8528bde1b66bab9a0abc2f521523abd00049c81b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 18:20:52 -0400 Subject: [PATCH 149/279] bcachefs: Fix uninitialized var warning Can't actually be used uninitialized, but gcc was being silly. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- fs/bcachefs/buckets.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 692b1c7d5018..4321f9fb73bd 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -690,7 +690,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos; + struct bpos bucket_pos = POS_MIN; struct bch_backpointer bp; if (p.ptr.cached) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b469586517a8..ed97712d0db1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1134,7 +1134,7 @@ static int __trigger_extent(struct btree_trans *trans, r.e.nr_required = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors; + s64 disk_sectors = 0; ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); if (ret < 0) return ret; From 33c563ebf8d3deed7d8addd20d77398ac737ef9a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 8 May 2024 22:50:34 +0200 Subject: [PATCH 150/279] netfilter: nft_payload: skbuff vlan metadata mangle support Userspace assumes vlan header is present at a given offset, but vlan offload allows to store this in metadata fields of the skbuff. Hence mangling vlan results in a garbled packet. 
Handle this transparently by adding a parser to the kernel. If vlan metadata is present and payload offset is over 12 bytes (source and destination mac address fields), then subtract vlan header present in vlan metadata, otherwise mangle vlan metadata based on offset and length, extracting data from the source register. This is similar to: 8cfd23e67401 ("netfilter: nft_payload: work around vlan header stripping") to deal with vlan payload mangling. Fixes: 7ec3f7b47b8d ("netfilter: nft_payload: add packet mangling support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 72 +++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index a3cb5dbcb362..0c43d748e23a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -145,12 +145,12 @@ int nft_payload_inner_offset(const struct nft_pktinfo *pkt) return pkt->inneroff; } -static bool nft_payload_need_vlan_copy(const struct nft_payload *priv) +static bool nft_payload_need_vlan_adjust(u32 offset, u32 len) { - unsigned int len = priv->offset + priv->len; + unsigned int boundary = offset + len; /* data past ether src/dst requested, copy needed */ - if (len > offsetof(struct ethhdr, h_proto)) + if (boundary > offsetof(struct ethhdr, h_proto)) return true; return false; @@ -174,7 +174,7 @@ void nft_payload_eval(const struct nft_expr *expr, goto err; if (skb_vlan_tag_present(skb) && - nft_payload_need_vlan_copy(priv)) { + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { if (!nft_payload_copy_vlan(dest, skb, priv->offset, priv->len)) goto err; @@ -801,21 +801,79 @@ struct nft_payload_set { u8 csum_flags; }; +/* This is not struct vlan_hdr. 
*/ +struct nft_payload_vlan_hdr { + __be16 h_vlan_proto; + __be16 h_vlan_TCI; +}; + +static bool +nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len, + int *vlan_hlen) +{ + struct nft_payload_vlan_hdr *vlanh; + __be16 vlan_proto; + u16 vlan_tci; + + if (offset >= offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto)) { + *vlan_hlen = VLAN_HLEN; + return true; + } + + switch (offset) { + case offsetof(struct vlan_ethhdr, h_vlan_proto): + if (len == 2) { + vlan_proto = nft_reg_load_be16(src); + skb->vlan_proto = vlan_proto; + } else if (len == 4) { + vlanh = (struct nft_payload_vlan_hdr *)src; + __vlan_hwaccel_put_tag(skb, vlanh->h_vlan_proto, + ntohs(vlanh->h_vlan_TCI)); + } else { + return false; + } + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI): + if (len != 2) + return false; + + vlan_tci = ntohs(nft_reg_load_be16(src)); + skb->vlan_tci = vlan_tci; + break; + default: + return false; + } + + return true; +} + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_payload_set *priv = nft_expr_priv(expr); - struct sk_buff *skb = pkt->skb; const u32 *src = &regs->data[priv->sreg]; - int offset, csum_offset; + int offset, csum_offset, vlan_hlen = 0; + struct sk_buff *skb = pkt->skb; __wsum fsum, tsum; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) goto err; - offset = skb_mac_header(skb) - skb->data; + + if (skb_vlan_tag_present(skb) && + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { + if (!nft_payload_set_vlan(src, skb, + priv->offset, priv->len, + &vlan_hlen)) + goto err; + + if (!vlan_hlen) + return; + } + + offset = skb_mac_header(skb) - skb->data - vlan_hlen; break; case NFT_PAYLOAD_NETWORK_HEADER: offset = skb_network_offset(skb); From 21a673bddc8fd4873c370caf9ae70ffc6d47e8d3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 13 May 2024 12:27:15 +0200 Subject: [PATCH 151/279] netfilter:
tproxy: bail out if IP has been disabled on the device syzbot reports: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] [..] RIP: 0010:nf_tproxy_laddr4+0xb7/0x340 net/ipv4/netfilter/nf_tproxy_ipv4.c:62 Call Trace: nft_tproxy_eval_v4 net/netfilter/nft_tproxy.c:56 [inline] nft_tproxy_eval+0xa9a/0x1a00 net/netfilter/nft_tproxy.c:168 __in_dev_get_rcu() can return NULL, so check for this. Reported-and-tested-by: syzbot+b94a6818504ea90d7661@syzkaller.appspotmail.com Fixes: cc6eb4338569 ("tproxy: use the interface primary IP address as a default value for --on-ip") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_tproxy_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 69e331799604..73e66a088e25 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -58,6 +58,8 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) laddr = 0; indev = __in_dev_get_rcu(skb->dev); + if (!indev) + return daddr; in_dev_for_each_ifa_rcu(ifa, indev) { if (ifa->ifa_flags & IFA_F_SECONDARY) From e8ded22ef0f4831279c363c264cd41cd9d59ca9e Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Tue, 21 May 2024 10:25:05 -0400 Subject: [PATCH 152/279] netfilter: nft_fib: allow from forward/input without iif selector This removes the restriction of needing iif selector in the forward/input hooks for fib lookups when requested result is oif/oifname. Removing this restriction allows "loose" lookups from the forward hooks. 
Fixes: be8be04e5ddb ("netfilter: nft_fib: reverse path filter for policy-based routing on iif") Signed-off-by: Eric Garver Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_fib.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 37cfe6dd712d..b58f62195ff3 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -35,11 +35,9 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, switch (priv->result) { case NFT_FIB_RESULT_OIF: case NFT_FIB_RESULT_OIFNAME: - hooks = (1 << NF_INET_PRE_ROUTING); - if (priv->flags & NFTA_FIB_F_IIF) { - hooks |= (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_FORWARD); - } + hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); break; case NFT_FIB_RESULT_ADDRTYPE: if (priv->flags & NFTA_FIB_F_IIF) From 83208cbf2f08c270033003e10f3e7351de64a5c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2024 19:21:59 -0400 Subject: [PATCH 153/279] bcachefs: Don't return -EROFS from mount on inconsistency error We were accidentally returning -EROFS during recovery on filesystem inconsistency - since this is what the journal returns on emergency shutdown. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 96040a95cf46..cd388f1702dc 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1939,8 +1939,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) { ret = PTR_ERR(sb); - ret = bch2_err_class(ret); - return ERR_PTR(ret); + goto err; } c = sb->s_fs_info; @@ -2016,6 +2015,15 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, err_put_super: __bch2_fs_stop(c); deactivate_locked_super(sb); +err: + /* + * On an inconsistency error in recovery we might see an -EROFS derived + * errorcode (from the journal), but we don't want to return that to + * userspace as that causes util-linux to retry the mount RO - which is + * confusing: + */ + if (bch2_err_matches(ret, EROFS) && ret != -EROFS) + ret = -EIO; return ERR_PTR(bch2_err_class(ret)); } From e634134180885574d1fe7aa162777ba41e7fcd5b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 27 May 2024 18:39:54 +0300 Subject: [PATCH 154/279] net/sched: taprio: make q->picos_per_byte available to fill_sched_entry() In commit b5b73b26b3ca ("taprio: Fix allowing too small intervals"), a comparison of user input against length_to_duration(q, ETH_ZLEN) was introduced, to avoid RCU stalls due to frequent hrtimers. The implementation of length_to_duration() depends on q->picos_per_byte being set for the link speed. The blamed commit in the Fixes: tag has moved this too late, so the checks introduced above are ineffective. The q->picos_per_byte is zero at parse_taprio_schedule() -> parse_sched_list() -> parse_sched_entry() -> fill_sched_entry() time. Move the taprio_set_picos_per_byte() call as one of the first things in taprio_change(), before the bulk of the netlink attribute parsing is done. That's because it is needed there. Add a selftest to make sure the issue doesn't get reintroduced. 
Fixes: 09dbdf28f9f9 ("net/sched: taprio: fix calculation of maximum gate durations") Signed-off-by: Vladimir Oltean Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240527153955.553333-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 4 +++- .../tc-testing/tc-tests/qdiscs/taprio.json | 22 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 1ab17e8a7260..118915055360 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1848,6 +1848,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, } q->flags = taprio_flags; + /* Needed for length_to_duration() during netlink attribute parsing */ + taprio_set_picos_per_byte(dev, q); + err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; @@ -1907,7 +1910,6 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) goto free_sched; - taprio_set_picos_per_byte(dev, q); taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json index 12da0a939e3e..8f12f00a4f57 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json @@ -132,6 +132,28 @@ "echo \"1\" > /sys/bus/netdevsim/del_device" ] }, + { + "id": "6f62", + "name": "Add taprio Qdisc with too short interval", + "category": [ + "qdisc", + "taprio" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 8\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: taprio num_tc 2 queues 1@0 1@1 sched-entry S 01 300 sched-entry S 02 1700 clockid CLOCK_TAI", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc taprio 1: root 
refcnt", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, { "id": "3e1e", "name": "Add taprio Qdisc with an invalid cycle-time", From fb66df20a7201e60f2b13d7f95d031b31a8831d3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 27 May 2024 18:39:55 +0300 Subject: [PATCH 155/279] net/sched: taprio: extend minimum interval restriction to entire cycle too It is possible for syzbot to side-step the restriction imposed by the blamed commit in the Fixes: tag, because the taprio UAPI permits a cycle-time different from (and potentially shorter than) the sum of entry intervals. We need one more restriction, which is that the cycle time itself must be larger than N * ETH_ZLEN bit times, where N is the number of schedule entries. This restriction needs to apply regardless of whether the cycle time came from the user or was the implicit, auto-calculated value, so we move the existing "cycle == 0" check outside the "if (!new->cycle_time)" branch. This way the check covers both conditions and scenarios. Add a selftest which illustrates the issue triggered by syzbot.
Fixes: b5b73b26b3ca ("taprio: Fix allowing too small intervals") Reported-by: syzbot+a7d2b1d5d1af83035567@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/0000000000007d66bc06196e7c66@google.com/ Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20240527153955.553333-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 10 ++++----- .../tc-testing/tc-tests/qdiscs/taprio.json | 22 +++++++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 118915055360..937a0c513c17 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1151,11 +1151,6 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); - if (!cycle) { - NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0"); - return -EINVAL; - } - if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; @@ -1164,6 +1159,11 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, new->cycle_time = cycle; } + if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) { + NL_SET_ERR_MSG(extack, "'cycle_time' is too small"); + return -EINVAL; + } + taprio_calculate_gate_durations(q, new); return 0; diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json index 8f12f00a4f57..557fb074acf0 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json @@ -154,6 +154,28 @@ "echo \"1\" > /sys/bus/netdevsim/del_device" ] }, + { + "id": "831f", + "name": "Add taprio Qdisc with too short cycle-time", + "category": [ + "qdisc", + "taprio" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 8\" > 
/sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: taprio num_tc 2 queues 1@0 1@1 sched-entry S 01 200000 sched-entry S 02 200000 cycle-time 100 clockid CLOCK_TAI", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc taprio 1: root refcnt", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, { "id": "3e1e", "name": "Add taprio Qdisc with an invalid cycle-time", From e57f2187ccc125f1f14f6d2c83da80831fc3ce9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Larumbe?= Date: Thu, 23 May 2024 12:32:17 +0100 Subject: [PATCH 156/279] drm/panfrost: Fix dma_resv deadlock at drm object pin time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Panfrost must pin an object for which a dma-buf attachment is being prepared on behalf of another driver, the core drm gem object pinning code already takes a lock on the object's dma reservation. However, the Panfrost GEM object's pinning callback would eventually try taking the lock on the same dma reservation when delegating pinning of the object onto the shmem subsystem, which led to a deadlock. This can be shown by enabling CONFIG_DEBUG_WW_MUTEX_SLOWPATH, which throws the following recursive locking situation: weston/3440 is trying to acquire lock: ffff000000e235a0 (reservation_ww_class_mutex){+.+.}-{3:3}, at: drm_gem_shmem_pin+0x34/0xb8 [drm_shmem_helper] but task is already holding lock: ffff000000e235a0 (reservation_ww_class_mutex){+.+.}-{3:3}, at: drm_gem_pin+0x2c/0x80 [drm] Fix it by replacing drm_gem_shmem_pin with its locked version, as the lock had already been taken by drm_gem_pin().
Cc: Thomas Zimmermann Cc: Dmitry Osipenko Cc: Boris Brezillon Cc: Steven Price Fixes: a78027847226 ("drm/gem: Acquire reservation lock in drm_gem_{pin/unpin}()") Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Signed-off-by: Boris Brezillon Link: https://patchwork.freedesktop.org/patch/msgid/20240523113236.432585-2-adrian.larumbe@collabora.com --- drivers/gpu/drm/panfrost/panfrost_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.c b/drivers/gpu/drm/panfrost/panfrost_gem.c index d47b40b82b0b..8e0ff3efede7 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem.c @@ -192,7 +192,7 @@ static int panfrost_gem_pin(struct drm_gem_object *obj) if (bo->is_heap) return -EINVAL; - return drm_gem_shmem_pin(&bo->base); + return drm_gem_shmem_pin_locked(&bo->base); } static enum drm_gem_object_status panfrost_gem_status(struct drm_gem_object *obj) From 8c2f5dd0c362ec036f0217da1d413ce2b8361080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Larumbe?= Date: Thu, 23 May 2024 12:32:18 +0100 Subject: [PATCH 157/279] drm/lima: Fix dma_resv deadlock at drm object pin time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a78027847226 ("drm/gem: Acquire reservation lock in drm_gem_{pin/unpin}()") moved locking the DRM object's dma reservation to drm_gem_pin(), but Lima's pin callback kept calling drm_gem_shmem_pin, which also tries to lock the same dma_resv, leading to a double lock situation. As was already done for Panfrost in the previous commit, fix it by replacing drm_gem_shmem_pin() with its locked variant. 
Cc: Thomas Zimmermann Cc: Dmitry Osipenko Cc: Boris Brezillon Cc: Steven Price Fixes: a78027847226 ("drm/gem: Acquire reservation lock in drm_gem_{pin/unpin}()") Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Tested-by: Val Packett Signed-off-by: Boris Brezillon Link: https://patchwork.freedesktop.org/patch/msgid/20240523113236.432585-3-adrian.larumbe@collabora.com --- drivers/gpu/drm/lima/lima_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/lima/lima_gem.c b/drivers/gpu/drm/lima/lima_gem.c index 7ea244d876ca..9bb997dbb4b9 100644 --- a/drivers/gpu/drm/lima/lima_gem.c +++ b/drivers/gpu/drm/lima/lima_gem.c @@ -185,7 +185,7 @@ static int lima_gem_pin(struct drm_gem_object *obj) if (bo->heap_size) return -EINVAL; - return drm_gem_shmem_pin(&bo->base); + return drm_gem_shmem_pin_locked(&bo->base); } static int lima_gem_vmap(struct drm_gem_object *obj, struct iosys_map *map) From 3b8407e81ed76c0d84d710c2a177a8fe24292702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Larumbe?= Date: Thu, 23 May 2024 12:32:19 +0100 Subject: [PATCH 158/279] drm/gem-shmem: Add import attachment warning to locked pin function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit ec144244a43f ("drm/gem-shmem: Acquire reservation lock in GEM pin/unpin callbacks") moved locking DRM object's dma reservation to drm_gem_shmem_object_pin, and made drm_gem_shmem_pin_locked public, so we need to make sure the not-imported check warning is also added to the latter. 
Cc: Thomas Zimmermann Cc: Dmitry Osipenko Cc: Boris Brezillon Fixes: a78027847226 ("drm/gem: Acquire reservation lock in drm_gem_{pin/unpin}()") Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Signed-off-by: Boris Brezillon Link: https://patchwork.freedesktop.org/patch/msgid/20240523113236.432585-4-adrian.larumbe@collabora.com --- drivers/gpu/drm/drm_gem_shmem_helper.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index 885a62c2e1be..53c003983ad1 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -233,6 +233,8 @@ int drm_gem_shmem_pin_locked(struct drm_gem_shmem_object *shmem) dma_resv_assert_held(shmem->base.resv); + drm_WARN_ON(shmem->base.dev, shmem->base.import_attach); + ret = drm_gem_shmem_get_pages(shmem); return ret; From a607468b521cc99ca64f19947cb7a40f8c814730 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 19 May 2024 13:24:30 +0900 Subject: [PATCH 159/279] kconfig: remove unused expr_is_no() This has not been used since commit e911503085ae ("Kconfig: Remove bad inference rules expr_eliminate_dups2()"). 
Signed-off-by: Masahiro Yamada --- scripts/kconfig/expr.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/scripts/kconfig/expr.h b/scripts/kconfig/expr.h index d965e427753e..fa50fc45622e 100644 --- a/scripts/kconfig/expr.h +++ b/scripts/kconfig/expr.h @@ -302,11 +302,6 @@ static inline int expr_is_yes(struct expr *e) return !e || (e->type == E_SYMBOL && e->left.sym == &symbol_yes); } -static inline int expr_is_no(struct expr *e) -{ - return e && (e->type == E_SYMBOL && e->left.sym == &symbol_no); -} - #ifdef __cplusplus } #endif From aabdc960a283ba78086b0bf66ee74326f49e218e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 19 May 2024 18:22:27 +0900 Subject: [PATCH 160/279] kconfig: fix comparison to constant symbols, 'm', 'n' Currently, comparisons to 'm' or 'n' result in incorrect output. [Test Code] config MODULES def_bool y modules config A def_tristate m config B def_bool A > n CONFIG_B is unset, while CONFIG_B=y is expected. The reason for the issue is because Kconfig compares the tristate values as strings. Currently, the .type fields in the constant symbol definitions, symbol_{yes,mod,no} are unspecified, i.e., S_UNKNOWN. When expr_calc_value() evaluates 'A > n', it checks the types of 'A' and 'n' to determine how to compare them. The left-hand side, 'A', is a tristate symbol with a value of 'm', which corresponds to a numeric value of 1. (Internally, 'y', 'm', and 'n' are represented as 2, 1, and 0, respectively.) The right-hand side, 'n', has an unknown type, so it is treated as the string "n" during the comparison. expr_calc_value() compares two values numerically only when both can have numeric values. Otherwise, they are compared as strings. symbol numeric value ASCII code ------------------------------------- y 2 0x79 m 1 0x6d n 0 0x6e 'm' is greater than 'n' if compared numerically (since 1 is greater than 0), but smaller than 'n' if compared as strings (since the ASCII code 0x6d is smaller than 0x6e). 
Specifying .type=S_TRISTATE for symbol_{yes,mod,no} fixes the above test code. Doing so, however, would cause a regression to the following test code. [Test Code 2] config MODULES def_bool n modules config A def_tristate n config B def_bool A = m You would get CONFIG_B=y, while CONFIG_B should not be set. The reason is that sym_get_string_value() turns 'm' into 'n' when the module feature is disabled. Consequently, expr_calc_value() evaluates 'A = n' instead of 'A = m'. This oddity has been hidden because the type of 'm' was previously S_UNKNOWN instead of S_TRISTATE. sym_get_string_value() should not tweak the string because the tristate value has already been correctly calculated. There is no reason to return the string "n" where its tristate value is mod. Fixes: 31847b67bec0 ("kconfig: allow use of relations other than (in)equality") Signed-off-by: Masahiro Yamada --- scripts/kconfig/symbol.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index aa0e25ee5119..0e439d3d48d1 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -14,6 +14,7 @@ struct symbol symbol_yes = { .name = "y", + .type = S_TRISTATE, .curr = { "y", yes }, .menus = LIST_HEAD_INIT(symbol_yes.menus), .flags = SYMBOL_CONST|SYMBOL_VALID, @@ -21,6 +22,7 @@ struct symbol symbol_mod = { .name = "m", + .type = S_TRISTATE, .curr = { "m", mod }, .menus = LIST_HEAD_INIT(symbol_mod.menus), .flags = SYMBOL_CONST|SYMBOL_VALID, @@ -28,6 +30,7 @@ struct symbol symbol_no = { .name = "n", + .type = S_TRISTATE, .curr = { "n", no }, .menus = LIST_HEAD_INIT(symbol_no.menus), .flags = SYMBOL_CONST|SYMBOL_VALID, @@ -820,8 +823,7 @@ const char *sym_get_string_value(struct symbol *sym) case no: return "n"; case mod: - sym_calc_value(modules_sym); - return (modules_sym->curr.tri == no) ?
"n" : "m"; + return "m"; case yes: return "y"; } From 31894d35b51ba61e2931cbf28e80114a4f72bc2b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 19 May 2024 22:34:02 +0900 Subject: [PATCH 161/279] kconfig: remove redundant check in expr_join_or() The check for 'sym1 == sym2' is redundant here because it has already been done a few lines above: if (sym1 != sym2) return NULL; Signed-off-by: Masahiro Yamada --- scripts/kconfig/expr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kconfig/expr.c b/scripts/kconfig/expr.c index a290de36307b..4d95fce5f9a7 100644 --- a/scripts/kconfig/expr.c +++ b/scripts/kconfig/expr.c @@ -476,7 +476,7 @@ static struct expr *expr_join_or(struct expr *e1, struct expr *e2) return expr_alloc_comp(E_UNEQUAL, sym1, &symbol_yes); } } - if (sym1->type == S_BOOLEAN && sym1 == sym2) { + if (sym1->type == S_BOOLEAN) { if ((e1->type == E_NOT && e1->left.expr->type == E_SYMBOL && e2->type == E_SYMBOL) || (e2->type == E_NOT && e2->left.expr->type == E_SYMBOL && e1->type == E_SYMBOL)) return expr_alloc_symbol(&symbol_yes); From 659bbf7e1b08267b8e1dd900b316edcb6f6d9e2e Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 20 May 2024 12:56:52 -0700 Subject: [PATCH 162/279] kbuild: scripts/gdb: Replace missed $(srctree)/$(src) w/ $(src) Recently we went through the source tree and replaced $(srctree)/$(src) w/ $(src). However, the gdb scripts Makefile had a hidden $(srctree)/$(src) that looked like this: $(abspath $(srctree))/$(src) Because we missed that, my installed kernel had symlinks that looked like this: __init__.py -> ${INSTALL_DIR}/${INSTALL_DIR}/scripts/gdb/linux/__init__.py Let's also replace the hidden $(abspath $(srctree))/$(src) with $(src).
Now: __init__.py -> ${INSTALL_DIR}/scripts/gdb/linux/__init__.py Fixes: b1992c3772e6 ("kbuild: use $(src) instead of $(srctree)/$(src) for source directory") Signed-off-by: Douglas Anderson Signed-off-by: Masahiro Yamada --- scripts/gdb/linux/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gdb/linux/Makefile b/scripts/gdb/linux/Makefile index d77ad9079d0f..fd1402c0a1a1 100644 --- a/scripts/gdb/linux/Makefile +++ b/scripts/gdb/linux/Makefile @@ -5,7 +5,7 @@ ifdef building_out_of_srctree symlinks := $(patsubst $(src)/%,%,$(wildcard $(src)/*.py)) quiet_cmd_symlink = SYMLINK $@ - cmd_symlink = ln -fsn $(patsubst $(obj)/%,$(abspath $(srctree))/$(src)/%,$@) $@ + cmd_symlink = ln -fsn $(patsubst $(obj)/%,$(src)/%,$@) $@ always-y += $(symlinks) $(addprefix $(obj)/, $(symlinks)): FORCE From 04b8cb0945b4bf679c71dc2351e0d3c25481e3c6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 20 May 2024 21:42:09 +0900 Subject: [PATCH 163/279] kbuild: avoid unneeded kallsyms step 3 Since commit 951bcae6c5a0 ("kallsyms: Avoid weak references for kallsyms symbols"), the kallsyms step 3 always occurs. You can compare the build logs.
[Before 951bcae6c5a0] $ git checkout 951bcae6c5a0^ $ make defconfig all [ snip ] LD .tmp_vmlinux.kallsyms1 NM .tmp_vmlinux.kallsyms1.syms KSYMS .tmp_vmlinux.kallsyms1.S AS .tmp_vmlinux.kallsyms1.S LD .tmp_vmlinux.kallsyms2 NM .tmp_vmlinux.kallsyms2.syms KSYMS .tmp_vmlinux.kallsyms2.S AS .tmp_vmlinux.kallsyms2.S LD vmlinux [After 951bcae6c5a0] $ git checkout 951bcae6c5a0 $ make defconfig all [ snip ] LD .tmp_vmlinux.kallsyms1 NM .tmp_vmlinux.kallsyms1.syms KSYMS .tmp_vmlinux.kallsyms1.S AS .tmp_vmlinux.kallsyms1.S LD .tmp_vmlinux.kallsyms2 NM .tmp_vmlinux.kallsyms2.syms KSYMS .tmp_vmlinux.kallsyms2.S AS .tmp_vmlinux.kallsyms2.S LD .tmp_vmlinux.kallsyms3 # should not happen NM .tmp_vmlinux.kallsyms3.syms # should not happen KSYMS .tmp_vmlinux.kallsyms3.S # should not happen AS .tmp_vmlinux.kallsyms3.S # should not happen LD vmlinux The resulting vmlinux is correct, but it always requires an additional linking step. The symbols produced by kallsyms are excluded from kallsyms itself because they were previously missing in step 1. With those symbols excluded, the symbol lists matched between step 1 and step 2, eliminating the need for step 3. Now, this has a negative effect. Since 951bcae6c5a0, the PROVIDE() directives provide the fallback definitions, which are not trimmed from the symbol list in step 1 because ${kallsymso_prev} is empty at this point. In step 2, ${kallsymso_prev} is set, and the kallsyms_* symbols are trimmed from the symbol list. Due to the table size difference between step 1 and step 2 (the former is larger due to the presence of kallsyms_*), step 3 is triggered. Now that the kallsyms_* symbols are always linked, let's stop omitting them from kallsyms. This avoids the unnecessary step 3.
Fixes: 951bcae6c5a0 ("kallsyms: Avoid weak references for kallsyms symbols") Signed-off-by: Masahiro Yamada --- scripts/link-vmlinux.sh | 6 +++--- scripts/mksysmap | 11 +---------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 7862a8101747..b0d39a927fbc 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -179,7 +179,7 @@ kallsyms_step() kallsyms_S=${kallsyms_vmlinux}.S vmlinux_link ${kallsyms_vmlinux} "${kallsymso_prev}" ${btf_vmlinux_bin_o} - mksysmap ${kallsyms_vmlinux} ${kallsyms_vmlinux}.syms ${kallsymso_prev} + mksysmap ${kallsyms_vmlinux} ${kallsyms_vmlinux}.syms kallsyms ${kallsyms_vmlinux}.syms ${kallsyms_S} info AS ${kallsyms_S} @@ -193,7 +193,7 @@ kallsyms_step() mksysmap() { info NM ${2} - ${CONFIG_SHELL} "${srctree}/scripts/mksysmap" ${1} ${2} ${3} + ${CONFIG_SHELL} "${srctree}/scripts/mksysmap" ${1} ${2} } sorttable() @@ -282,7 +282,7 @@ if is_enabled CONFIG_DEBUG_INFO_BTF && is_enabled CONFIG_BPF; then ${RESOLVE_BTFIDS} vmlinux fi -mksysmap vmlinux System.map ${kallsymso} +mksysmap vmlinux System.map if is_enabled CONFIG_BUILDTIME_TABLE_SORT; then info SORTTAB vmlinux diff --git a/scripts/mksysmap b/scripts/mksysmap index 57ff5656d566..e46bafe333bd 100755 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -4,7 +4,7 @@ # tools to retrieve the actual addresses of symbols in the kernel. # # Usage -# mksysmap vmlinux System.map [exclude] +# mksysmap vmlinux System.map ##### @@ -92,13 +92,4 @@ ${NM} -n ${1} | sed >${2} -e " # ppc stub /\.long_branch\./d /\.plt_branch\./d - -# --------------------------------------------------------------------------- -# Ignored kallsyms symbols -# -# If the 3rd parameter exists, symbols from it will be omitted from the output. -# This makes kallsyms have the identical symbol lists in the step 1 and 2. -# Without this, the step2 would get new symbols generated by scripts/kallsyms.c -# when CONFIG_KALLSYMS_ALL is enabled. 
That might require one more pass. -$(if [ $# -ge 3 ]; then ${NM} ${3} | sed -n '/ U /!s:.* \([^ ]*\)$:/ \1$/d:p'; fi) " From b18b047002b7d3b19d9fb905c1bd2a214016c153 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 20 May 2024 21:42:10 +0900 Subject: [PATCH 164/279] kbuild: change scripts/mksysmap into sed script The previous commit removed the subshell execution from scripts/mksysmap, which is now simple enough to become a sed script. Signed-off-by: Masahiro Yamada --- scripts/link-vmlinux.sh | 2 +- scripts/mksysmap | 19 ++++++------------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index b0d39a927fbc..c22a213ea6a9 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -193,7 +193,7 @@ kallsyms_step() mksysmap() { info NM ${2} - ${CONFIG_SHELL} "${srctree}/scripts/mksysmap" ${1} ${2} + ${NM} -n "${1}" | "${srctree}/scripts/mksysmap" > "${2}" } sorttable() diff --git a/scripts/mksysmap b/scripts/mksysmap index e46bafe333bd..c12723a04655 100755 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -1,22 +1,16 @@ -#!/bin/sh -x -# Based on the vmlinux file create the System.map file +#!/bin/sed -f +# SPDX-License-Identifier: GPL-2.0-only +# +# sed script to filter out symbols that are not needed for System.map, +# or not suitable for kallsyms. The input should be 'nm -n '. +# # System.map is used by module-init tools and some debugging # tools to retrieve the actual addresses of symbols in the kernel. # -# Usage -# mksysmap vmlinux System.map - - -##### -# Generate System.map (actual filename passed as second argument) -# The following refers to the symbol type as per nm(1). - # readprofile starts reading symbols when _stext is found, and # continue until it finds a symbol which is not either of 'T', 't', # 'W' or 'w'. 
# - -${NM} -n ${1} | sed >${2} -e " # --------------------------------------------------------------------------- # Ignored symbol types # @@ -92,4 +86,3 @@ ${NM} -n ${1} | sed >${2} -e " # ppc stub /\.long_branch\./d /\.plt_branch\./d -" From 3430f65d6130ccbc86f0ff45642eeb9e2032a600 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 20 May 2024 21:42:11 +0900 Subject: [PATCH 165/279] kbuild: fix short log for AS in link-vmlinux.sh In convention, short logs print the output file, not the input file. Let's change the suffix for 'AS' since it assembles *.S into *.o. [Before] LD .tmp_vmlinux.kallsyms1 NM .tmp_vmlinux.kallsyms1.syms KSYMS .tmp_vmlinux.kallsyms1.S AS .tmp_vmlinux.kallsyms1.S LD .tmp_vmlinux.kallsyms2 NM .tmp_vmlinux.kallsyms2.syms KSYMS .tmp_vmlinux.kallsyms2.S AS .tmp_vmlinux.kallsyms2.S LD vmlinux [After] LD .tmp_vmlinux.kallsyms1 NM .tmp_vmlinux.kallsyms1.syms KSYMS .tmp_vmlinux.kallsyms1.S AS .tmp_vmlinux.kallsyms1.o LD .tmp_vmlinux.kallsyms2 NM .tmp_vmlinux.kallsyms2.syms KSYMS .tmp_vmlinux.kallsyms2.S AS .tmp_vmlinux.kallsyms2.o LD vmlinux Signed-off-by: Masahiro Yamada --- scripts/link-vmlinux.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index c22a213ea6a9..7aca51b24e9f 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -182,7 +182,7 @@ kallsyms_step() mksysmap ${kallsyms_vmlinux} ${kallsyms_vmlinux}.syms kallsyms ${kallsyms_vmlinux}.syms ${kallsyms_S} - info AS ${kallsyms_S} + info AS ${kallsymso} ${CC} ${NOSTDINC_FLAGS} ${LINUXINCLUDE} ${KBUILD_CPPFLAGS} \ ${KBUILD_AFLAGS} ${KBUILD_AFLAGS_KERNEL} \ -c -o ${kallsymso} ${kallsyms_S} From 3c562a70cf4da331baef60ebb3f0e30b254006e9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 22 May 2024 19:43:11 +0900 Subject: [PATCH 166/279] kbuild: remove a stale comment about cleaning in link-vmlinux.sh Remove the left-over of commit 51eb95e2da41 ("kbuild: Don't remove link-vmlinux temporary files on 
exit/signal"). Signed-off-by: Masahiro Yamada --- scripts/link-vmlinux.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 7aca51b24e9f..46ce5d04dbeb 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -201,7 +201,6 @@ sorttable() ${objtree}/scripts/sorttable ${1} } -# Delete output files in case of error cleanup() { rm -f .btf.* From e06a698ae62b9ee5ca98e65be2c90a61464192e6 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Tue, 28 May 2024 16:52:18 +0800 Subject: [PATCH 167/279] scripts/make_fit: Drop fdt image entry compatible string According to the FIT image source file format document found in U-boot [1] and the split-out FIT image specification [2], under "'/images' node" -> "Conditionally mandatory property", the "compatible" property is described as "compatible method for loading image", i.e., not the compatible string embedded in the FDT or used for matching. Drop the compatible string from the fdt image entry node. While at it also fix up a typo in the document section of output_dtb. [1] U-boot source "doc/usage/fit/source_file_format.rst", or on the website: https://docs.u-boot.org/en/latest/usage/fit/source_file_format.html [2] https://github.com/open-source-firmware/flat-image-tree/blob/main/source/chapter2-source-file-format.rst Fixes: 7a23b027ec17 ("arm64: boot: Support Flat Image Tree") Signed-off-by: Chen-Yu Tsai Reviewed-by: Simon Glass Signed-off-by: Masahiro Yamada --- scripts/make_fit.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/make_fit.py b/scripts/make_fit.py index 3de90c5a094b..263147df80a4 100755 --- a/scripts/make_fit.py +++ b/scripts/make_fit.py @@ -190,7 +190,7 @@ def output_dtb(fsw, seq, fname, arch, compress): Args: fsw (libfdt.FdtSw): Object to use for writing seq (int): Sequence number (1 for first) - fmame (str): Filename containing the DTB + fname (str): Filename containing the DTB arch: FIT architecture, e.g. 
'arm64' compress (str): Compressed algorithm, e.g. 'gzip' @@ -211,7 +211,6 @@ def output_dtb(fsw, seq, fname, arch, compress): fsw.property_string('type', 'flat_dt') fsw.property_string('arch', arch) fsw.property_string('compression', compress) - fsw.property('compatible', bytes(compat)) with open(fname, 'rb') as inf: compressed = compress_data(inf, compress) From 647535760a00a854c185dd4d7e6eccfea30ea0d5 Mon Sep 17 00:00:00 2001 From: Janusz Krzysztofik Date: Mon, 6 May 2024 20:02:50 +0200 Subject: [PATCH 168/279] Revert "drm/i915: Remove extra multi-gt pm-references" This reverts commit 1f33dc0c1189efb9ae19c6fc22b64dd3e26261fb. There was a patch that was supposed to fix an issue of illegal attempts to free a still active i915 VMA object when parking a GT believed to be idle, reported by CI on 2-GT Meteor Lake. As a solution, an extra wakeref for a Primary GT was acquired from i915_gem_do_execbuffer() -- see commit f56fe3e91787 ("drm/i915: Fix a VMA UAF for multi-gt platform"). However, that fix proved insufficient -- the issue was still reported by CI. That wakeref was released on exit from i915_gem_do_execbuffer(), and thus potentially before completion of the request and deactivation of its associated VMAs. Moreover, CI reports indicated that single-GT platforms also suffered sporadically from the same race. Since that issue was fixed by another commit f3c71b2ded5c ("drm/i915/vma: Fix UAF on destroy against retire race"), the changes introduced by that insufficient fix were dropped as no longer useful. However, that series resulted in another VMA UAF scenario now being triggered in CI. <4> [260.290809] ------------[ cut here ]------------ <4> [260.290988] list_del corruption. prev->next should be ffff888118c5d990, but was ffff888118c5a510. (prev=ffff888118c5a510) <4> [260.291004] WARNING: CPU: 2 PID: 1143 at lib/list_debug.c:62 __list_del_entry_valid_or_report+0xb7/0xe0 ..
<4> [260.291055] CPU: 2 PID: 1143 Comm: kms_plane Not tainted 6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1 <4> [260.291058] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024 <4> [260.291060] RIP: 0010:__list_del_entry_valid_or_report+0xb7/0xe0 ... <4> [260.291087] Call Trace: <4> [260.291089] <4> [260.291124] i915_vma_reopen+0x43/0x80 [i915] <4> [260.291298] eb_lookup_vmas+0x9cb/0xcc0 [i915] <4> [260.291579] i915_gem_do_execbuffer+0xc9a/0x26d0 [i915] <4> [260.291883] i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915] ... <4> [260.292301] ... <4> [260.292506] ---[ end trace 0000000000000000 ]--- <4> [260.292782] general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6ca3: 0000 [#1] PREEMPT SMP NOPTI <4> [260.303575] CPU: 2 PID: 1143 Comm: kms_plane Tainted: G W 6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1 <4> [260.313851] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024 <4> [260.326359] RIP: 0010:eb_validate_vmas+0x114/0xd80 [i915] ... <4> [260.428756] Call Trace: <4> [260.431192] <4> [639.283393] i915_gem_do_execbuffer+0xd05/0x26d0 [i915] <4> [639.305245] i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915] ... <4> [639.411134] ... <4> [639.449979] ---[ end trace 0000000000000000 ]--- We defer actually closing, unbinding and destroying a VMA until next idle point, or until the object is freed in the meantime. By postponing the unbind, we allow for the VMA to be reopened by the client, avoiding the work required to rebind the VMA. Starting from commit b0647a5e79b1 ("drm/i915: Avoid live-lock with i915_vma_parked()"), we assume that as long as a GT is held idle, no VMA would be reopened while we destroy them. 
That assumption is no longer true in multi-GT configurations, where a VMA we reopen may be handled by a GT different from the one that we already keep active via its engine while we set up an execbuf request. Restoring the extra GT0 PM wakeref removed from i915_gem_do_execbuffer() processing path seems to fix this issue. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10608 Signed-off-by: Janusz Krzysztofik Cc: Rodrigo Vivi Cc: Nirmoy Das Reviewed-by: Nirmoy Das Fixes: 1f33dc0c1189 ("drm/i915: Remove extra multi-gt pm-references") Link: https://patchwork.freedesktop.org/patch/msgid/20240506180253.96858-2-janusz.krzysztofik@linux.intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 749670a58d935303ad1ce529acc73f12de25832e) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 42619fc05de4..090724fa766c 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -255,6 +255,7 @@ struct i915_execbuffer { struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */ intel_wakeref_t wakeref; + intel_wakeref_t wakeref_gt0; /** our requests to build */ struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; @@ -2685,6 +2686,7 @@ static int eb_select_engine(struct i915_execbuffer *eb) { struct intel_context *ce, *child; + struct intel_gt *gt; unsigned int idx; int err; @@ -2708,10 +2710,17 @@ eb_select_engine(struct i915_execbuffer *eb) } } eb->num_batches = ce->parallel.number_children + 1; + gt = ce->engine->gt; for_each_child(ce, child) intel_context_get(child); eb->wakeref = intel_gt_pm_get(ce->engine->gt); + /* + * Keep GT0 active on MTL so that i915_vma_parked() doesn't + * free VMAs while execbuf ioctl is validating VMAs. 
+ */ + if (gt->info.id) + eb->wakeref_gt0 = intel_gt_pm_get(to_gt(gt->i915)); if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { err = intel_context_alloc_state(ce); @@ -2750,6 +2759,9 @@ eb_select_engine(struct i915_execbuffer *eb) return err; err: + if (gt->info.id) + intel_gt_pm_put(to_gt(gt->i915), eb->wakeref_gt0); + intel_gt_pm_put(ce->engine->gt, eb->wakeref); for_each_child(ce, child) intel_context_put(child); @@ -2763,6 +2775,12 @@ eb_put_engine(struct i915_execbuffer *eb) struct intel_context *child; i915_vm_put(eb->context->vm); + /* + * This works in conjunction with eb_select_engine() to prevent + * i915_vma_parked() from interfering while execbuf validates vmas. + */ + if (eb->gt->info.id) + intel_gt_pm_put(to_gt(eb->gt->i915), eb->wakeref_gt0); intel_gt_pm_put(eb->context->engine->gt, eb->wakeref); for_each_child(eb->context, child) intel_context_put(child); From 70cb9188ffc75e643debf292fcddff36c9dbd4ae Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 23 Apr 2024 18:23:10 +0200 Subject: [PATCH 169/279] drm/i915/gt: Disarm breadcrumbs if engines are already idle The breadcrumbs use a GT wakeref for guarding the interrupt, but are disarmed during release of the engine wakeref. This leaves a hole where we may attach a breadcrumb just as the engine is parking (after it has parked its breadcrumbs), execute the irq worker with some signalers still attached, but never be woken again. That issue manifests itself in CI with IGT runner timeouts while tests are waiting indefinitely for release of all GT wakerefs. 
<6> [209.151778] i915: Running live_engine_pm_selftests/live_engine_busy_stats <7> [209.231628] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_5 <7> [209.231816] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_4 <7> [209.231944] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_3 <7> [209.232056] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_2 <7> [209.232166] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling DC_off <7> [209.232270] i915 0000:00:02.0: [drm:skl_enable_dc6 [i915]] Enabling DC6 <7> [209.232368] i915 0000:00:02.0: [drm:gen9_set_dc_state.part.0 [i915]] Setting DC state from 00 to 02 <4> [299.356116] [IGT] Inactivity timeout exceeded. Killing the current test with SIGQUIT. ... <6> [299.356526] sysrq: Show State ... <6> [299.373964] task:i915_selftest state:D stack:11784 pid:5578 tgid:5578 ppid:873 flags:0x00004002 <6> [299.373967] Call Trace: <6> [299.373968] <6> [299.373970] __schedule+0x3bb/0xda0 <6> [299.373974] schedule+0x41/0x110 <6> [299.373976] intel_wakeref_wait_for_idle+0x82/0x100 [i915] <6> [299.374083] ? __pfx_var_wake_function+0x10/0x10 <6> [299.374087] live_engine_busy_stats+0x9b/0x500 [i915] <6> [299.374173] __i915_subtests+0xbe/0x240 [i915] <6> [299.374277] ? __pfx___intel_gt_live_setup+0x10/0x10 [i915] <6> [299.374369] ? __pfx___intel_gt_live_teardown+0x10/0x10 [i915] <6> [299.374456] intel_engine_live_selftests+0x1c/0x30 [i915] <6> [299.374547] __run_selftests+0xbb/0x190 [i915] <6> [299.374635] i915_live_selftests+0x4b/0x90 [i915] <6> [299.374717] i915_pci_probe+0x10d/0x210 [i915] At the end of the interrupt worker, if there are no more engines awake, disarm the breadcrumb and go to sleep. 
Fixes: 9d5612ca165a ("drm/i915/gt: Defer enabling the breadcrumb interrupt to after submission") Closes: https://gitlab.freedesktop.org/drm/intel/issues/10026 Signed-off-by: Chris Wilson Cc: Andrzej Hajda Cc: # v5.12+ Signed-off-by: Janusz Krzysztofik Acked-by: Nirmoy Das Reviewed-by: Andrzej Hajda Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240423165505.465734-2-janusz.krzysztofik@linux.intel.com (cherry picked from commit fbad43eccae5cb14594195c20113369aabaa22b5) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index d650beb8ed22..20b9b04ec1e0 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -263,8 +263,13 @@ static void signal_irq_work(struct irq_work *work) i915_request_put(rq); } + /* Lazy irq enabling after HW submission */ if (!READ_ONCE(b->irq_armed) && !list_empty(&b->signalers)) intel_breadcrumbs_arm_irq(b); + + /* And confirm that we still want irqs enabled before we yield */ + if (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) + intel_breadcrumbs_disarm_irq(b); } struct intel_breadcrumbs * @@ -315,13 +320,7 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) return; /* Kick the work once more to drain the signalers, and disarm the irq */ - irq_work_sync(&b->irq_work); - while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { - local_irq_disable(); - signal_irq_work(&b->irq_work); - local_irq_enable(); - cond_resched(); - } + irq_work_queue(&b->irq_work); } void intel_breadcrumbs_free(struct kref *kref) @@ -404,7 +403,7 @@ static void insert_breadcrumb(struct i915_request *rq) * the request as it may have completed and raised the interrupt as * we were attaching it into the lists. 
*/ - if (!b->irq_armed || __i915_request_is_complete(rq)) + if (!READ_ONCE(b->irq_armed) || __i915_request_is_complete(rq)) irq_work_queue(&b->irq_work); } From d4f36db62396b73bed383c0b6e48d36278cafa78 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 30 Apr 2024 09:48:09 -0700 Subject: [PATCH 170/279] drm/i915/guc: avoid FIELD_PREP warning With gcc-7 and earlier, there are lots of warnings like In file included from :0:0: In function '__guc_context_policy_add_priority.isra.66', inlined from '__guc_context_set_prio.isra.67' at drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:3292:3, inlined from 'guc_context_set_prio' at drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:3320:2: include/linux/compiler_types.h:399:38: error: call to '__compiletime_assert_631' declared with attribute error: FIELD_PREP: mask is not constant _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) ^ ... drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:2422:3: note: in expansion of macro 'FIELD_PREP' FIELD_PREP(GUC_KLV_0_KEY, GUC_CONTEXT_POLICIES_KLV_ID_##id) | \ ^~~~~~~~~~ Make sure that GUC_KLV_0_KEY is an unsigned value to avoid the warning. 
Fixes: 77b6f79df66e ("drm/i915/guc: Update to GuC version 69.0.3") Signed-off-by: Arnd Bergmann Reviewed-by: Michal Wajdeczko Signed-off-by: Julia Filipchuk Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20240430164809.482131-1-julia.filipchuk@intel.com (cherry picked from commit 364e039827ef628c650c21c1afe1c54d9c3296d9) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h index bebf28e3c479..525587cfe1af 100644 --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h @@ -29,9 +29,9 @@ */ #define GUC_KLV_LEN_MIN 1u -#define GUC_KLV_0_KEY (0xffff << 16) -#define GUC_KLV_0_LEN (0xffff << 0) -#define GUC_KLV_n_VALUE (0xffffffff << 0) +#define GUC_KLV_0_KEY (0xffffu << 16) +#define GUC_KLV_0_LEN (0xffffu << 0) +#define GUC_KLV_n_VALUE (0xffffffffu << 0) /** * DOC: GuC Self Config KLVs From 33defcacd207196a6b35857087e6335590adad62 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 28 May 2024 22:39:18 +0300 Subject: [PATCH 171/279] drm/panel/lg-sw43408: select CONFIG_DRM_DISPLAY_DP_HELPER This panel driver uses DSC PPS functions and as such depends on the DRM_DISPLAY_DP_HELPER. Select this symbol to make required functions available to the driver. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404200800.kYsRYyli-lkp@intel.com/ Fixes: 069a6c0e94f9 ("drm: panel: Add LG sw43408 panel driver") Reviewed-by: Neil Armstrong Reviewed-by: Marijn Suijten Link: https://patchwork.freedesktop.org/patch/msgid/20240528-panel-sw43408-fix-v4-1-330b42445bcc@linaro.org Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/panel/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig index 982324ef5a41..2ae0eb0638f3 100644 --- a/drivers/gpu/drm/panel/Kconfig +++ b/drivers/gpu/drm/panel/Kconfig @@ -340,6 +340,8 @@ config DRM_PANEL_LG_SW43408 depends on OF depends on DRM_MIPI_DSI depends on BACKLIGHT_CLASS_DEVICE + select DRM_DISPLAY_DP_HELPER + select DRM_DISPLAY_HELPER help Say Y here if you want to enable support for LG sw43408 panel. The panel has a 1080x2160@60Hz resolution and uses 24 bit RGB per From 659a3062c705753a9ec6fd28a4c67ee4254f9584 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Thu, 16 May 2024 17:14:03 +0200 Subject: [PATCH 172/279] drm/i915/selftests: Set always_coherent to false when reading from CPU Commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.") was not complete as for non LLC sharing platforms cpu read can happen from LLC which probably doesn't have the latest changes made by GPU. 
Cc: Andi Shyti Cc: Janusz Krzysztofik Cc: Jonathan Cavitt Fixes: 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.") Reviewed-by: Jonathan Cavitt Reviewed-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240516151403.2875-1-nirmoy.das@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit 007ed70831426d4cc108d879d688de6b8e3e6d45) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c index 65a931ea80e9..3527b8f446fe 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c @@ -196,7 +196,7 @@ static int verify_access(struct drm_i915_private *i915, if (err) goto out_file; - mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, true); + mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, false); vaddr = i915_gem_object_pin_map_unlocked(native_obj, mode); if (IS_ERR(vaddr)) { err = PTR_ERR(vaddr); From 8c318cb70c88aa02068db7518e852b909c9b400f Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 28 May 2024 22:39:19 +0300 Subject: [PATCH 173/279] drm/panel/lg-sw43408: mark sw43408_backlight_ops as static Fix sparse warning regarding symbol 'sw43408_backlight_ops' not being declared. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404200739.hbWZvOhR-lkp@intel.com/ Reviewed-by: Neil Armstrong Fixes: 069a6c0e94f9 ("drm: panel: Add LG sw43408 panel driver") Reviewed-by: Marijn Suijten Link: https://patchwork.freedesktop.org/patch/msgid/20240528-panel-sw43408-fix-v4-2-330b42445bcc@linaro.org Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/panel/panel-lg-sw43408.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-lg-sw43408.c b/drivers/gpu/drm/panel/panel-lg-sw43408.c index 115f4702d59f..2b3a73696dce 100644 --- a/drivers/gpu/drm/panel/panel-lg-sw43408.c +++ b/drivers/gpu/drm/panel/panel-lg-sw43408.c @@ -182,7 +182,7 @@ static int sw43408_backlight_update_status(struct backlight_device *bl) return mipi_dsi_dcs_set_display_brightness_large(dsi, brightness); } -const struct backlight_ops sw43408_backlight_ops = { +static const struct backlight_ops sw43408_backlight_ops = { .update_status = sw43408_backlight_update_status, }; From ee01b6a386eaf9984b58a2476e8f531149679da9 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Fri, 17 May 2024 11:06:16 +0200 Subject: [PATCH 174/279] drm/i915/gt: Fix CCS id's calculation for CCS mode setting The whole point of the previous fixes has been to change the CCS hardware configuration to generate only one stream available to the compute users. We did this by changing the info.engine_mask that is set during device probe, reset during the detection of the fused engines, and finally reset again when choosing the CCS mode. We can't use the engine_mask variable anymore, as with the current configuration, it imposes only one CCS no matter what the hardware configuration is. Before changing the engine_mask for the third time, save it and use it for calculating the CCS mode. After the previous changes, the user reported a performance drop to around 1/4. 
We have tested that the compute operations, with the current patch, have improved by the same factor. Fixes: 6db31251bb26 ("drm/i915/gt: Enable only one CCS for compute workload") Signed-off-by: Andi Shyti Cc: Chris Wilson Cc: Gnattu OC Cc: Joonas Lahtinen Cc: Matt Roper Tested-by: Jian Ye Reviewed-by: Umesh Nerlige Ramappa Tested-by: Gnattu OC Link: https://patchwork.freedesktop.org/patch/msgid/20240517090616.242529-1-andi.shyti@linux.intel.com (cherry picked from commit a09d2327a9ba8e3f5be238bc1b7ca2809255b464) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 6 ++++++ drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c | 2 +- drivers/gpu/drm/i915/gt/intel_gt_types.h | 8 ++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 5c8e9ee3b008..3b740ca25000 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -885,6 +885,12 @@ static intel_engine_mask_t init_engine_mask(struct intel_gt *gt) if (IS_DG2(gt->i915)) { u8 first_ccs = __ffs(CCS_MASK(gt)); + /* + * Store the number of active cslices before + * changing the CCS engine configuration + */ + gt->ccs.cslices = CCS_MASK(gt); + /* Mask off all the CCS engine */ info->engine_mask &= ~GENMASK(CCS3, CCS0); /* Put back in the first CCS engine */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c index 99b71bb7da0a..3c62a44e9106 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c @@ -19,7 +19,7 @@ unsigned int intel_gt_apply_ccs_mode(struct intel_gt *gt) /* Build the value for the fixed CCS load balancing */ for (cslice = 0; cslice < I915_MAX_CCS; cslice++) { - if (CCS_MASK(gt) & BIT(cslice)) + if (gt->ccs.cslices & BIT(cslice)) /* * If available, assign the cslice * to the first available engine... 
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h index def7dd0eb6f1..cfdd2ad5e954 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -207,6 +207,14 @@ struct intel_gt { [MAX_ENGINE_INSTANCE + 1]; enum intel_submission_method submission_method; + struct { + /* + * Mask of the non fused CCS slices + * to be used for the load balancing + */ + intel_engine_mask_t cslices; + } ccs; + /* * Default address space (either GGTT or ppGTT depending on arch). * From 43e2b37e2ab660c3565d4cff27922bc70e79c3f1 Mon Sep 17 00:00:00 2001 From: Vidya Srinivas Date: Mon, 20 May 2024 22:26:34 +0530 Subject: [PATCH 175/279] drm/i915/dpt: Make DPT object unshrinkable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some scenarios, the DPT object gets shrunk but the actual framebuffer does not, and thus it is still there on the DPT's vm->bound_list. Then it tries to rewrite the PTEs via a stale CPU mapping. This causes a panic.
Cc: stable@vger.kernel.org Reported-by: Shawn Lee Fixes: 0dc987b699ce ("drm/i915/display: Add smem fallback allocation for dpt") Signed-off-by: Vidya Srinivas [vsyrjala: Add TODO comment] Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240520165634.1162470-1-vidya.srinivas@intel.com (cherry picked from commit 51064d471c53dcc8eddd2333c3f1c1d9131ba36c) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/i915_gem_object.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h index 3560a062d287..5d7446a48ae7 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h @@ -284,7 +284,9 @@ bool i915_gem_object_has_iomem(const struct drm_i915_gem_object *obj); static inline bool i915_gem_object_is_shrinkable(const struct drm_i915_gem_object *obj) { - return i915_gem_object_type_has(obj, I915_GEM_OBJECT_IS_SHRINKABLE); + /* TODO: make DPT shrinkable when it has no bound vmas */ + return i915_gem_object_type_has(obj, I915_GEM_OBJECT_IS_SHRINKABLE) && + !obj->is_dpt; } static inline bool From 75800e2e4203ea83bbc9d4f63ad97ea582244a08 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 21 May 2024 17:30:22 +0300 Subject: [PATCH 176/279] drm/i915: Fix audio component initialization After registering the audio component in i915_audio_component_init() the audio driver may call i915_audio_component_get_power() via the component ops. This could program AUD_FREQ_CNTRL with an uninitialized value if the latter function is called before display.audio.freq_cntrl gets initialized. The get_power() function also does a modeset which in the above case happens too early before the initialization step and triggers the "Reject display access from task" error message added by the Fixes: commit below. Fix the above issue by registering the audio component only after the initialization step. 
Fixes: 87c1694533c9 ("drm/i915: save AUD_FREQ_CNTRL state at audio domain suspend") Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/10291 Cc: stable@vger.kernel.org # v5.5+ Signed-off-by: Imre Deak Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20240521143022.3784539-1-imre.deak@intel.com (cherry picked from commit fdd0b80172758ce284f19fa8a26d90c61e4371d2) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_audio.c | 32 ++++++++++++------- drivers/gpu/drm/i915/display/intel_audio.h | 1 + .../drm/i915/display/intel_display_driver.c | 2 ++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_audio.c b/drivers/gpu/drm/i915/display/intel_audio.c index ed81e1466c4b..40e7d862675e 100644 --- a/drivers/gpu/drm/i915/display/intel_audio.c +++ b/drivers/gpu/drm/i915/display/intel_audio.c @@ -1252,17 +1252,6 @@ static const struct component_ops i915_audio_component_bind_ops = { static void i915_audio_component_init(struct drm_i915_private *i915) { u32 aud_freq, aud_freq_init; - int ret; - - ret = component_add_typed(i915->drm.dev, - &i915_audio_component_bind_ops, - I915_COMPONENT_AUDIO); - if (ret < 0) { - drm_err(&i915->drm, - "failed to add audio component (%d)\n", ret); - /* continue with reduced functionality */ - return; - } if (DISPLAY_VER(i915) >= 9) { aud_freq_init = intel_de_read(i915, AUD_FREQ_CNTRL); @@ -1285,6 +1274,21 @@ static void i915_audio_component_init(struct drm_i915_private *i915) /* init with current cdclk */ intel_audio_cdclk_change_post(i915); +} + +static void i915_audio_component_register(struct drm_i915_private *i915) +{ + int ret; + + ret = component_add_typed(i915->drm.dev, + &i915_audio_component_bind_ops, + I915_COMPONENT_AUDIO); + if (ret < 0) { + drm_err(&i915->drm, + "failed to add audio component (%d)\n", ret); + /* continue with reduced functionality */ + return; + } i915->display.audio.component_registered = true; } @@ -1317,6 +1321,12 
@@ void intel_audio_init(struct drm_i915_private *i915) i915_audio_component_init(i915); } +void intel_audio_register(struct drm_i915_private *i915) +{ + if (!i915->display.audio.lpe.platdev) + i915_audio_component_register(i915); +} + /** * intel_audio_deinit() - deinitialize the audio driver * @i915: the i915 drm device private data diff --git a/drivers/gpu/drm/i915/display/intel_audio.h b/drivers/gpu/drm/i915/display/intel_audio.h index 9327954b801e..576c061d72a4 100644 --- a/drivers/gpu/drm/i915/display/intel_audio.h +++ b/drivers/gpu/drm/i915/display/intel_audio.h @@ -28,6 +28,7 @@ void intel_audio_codec_get_config(struct intel_encoder *encoder, void intel_audio_cdclk_change_pre(struct drm_i915_private *dev_priv); void intel_audio_cdclk_change_post(struct drm_i915_private *dev_priv); void intel_audio_init(struct drm_i915_private *dev_priv); +void intel_audio_register(struct drm_i915_private *i915); void intel_audio_deinit(struct drm_i915_private *dev_priv); void intel_audio_sdp_split_update(const struct intel_crtc_state *crtc_state); diff --git a/drivers/gpu/drm/i915/display/intel_display_driver.c b/drivers/gpu/drm/i915/display/intel_display_driver.c index 89bd032ed995..794b4af38055 100644 --- a/drivers/gpu/drm/i915/display/intel_display_driver.c +++ b/drivers/gpu/drm/i915/display/intel_display_driver.c @@ -540,6 +540,8 @@ void intel_display_driver_register(struct drm_i915_private *i915) intel_display_driver_enable_user_access(i915); + intel_audio_register(i915); + intel_display_debugfs_register(i915); /* From e9022b31db80019025967b03df1d059433e9f26d Mon Sep 17 00:00:00 2001 From: Minda Chen Date: Tue, 28 May 2024 09:51:20 +0800 Subject: [PATCH 177/279] MAINTAINERS: dwmac: starfive: update Maintainer Update the maintainer of starfive dwmac driver. Signed-off-by: Minda Chen Acked-by: Emil Renner Berthing Signed-off-by: David S. 
Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e80db76fe393..90930a03e7a5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21318,7 +21318,7 @@ F: arch/riscv/boot/dts/starfive/ STARFIVE DWMAC GLUE LAYER M: Emil Renner Berthing -M: Samin Guo +M: Minda Chen S: Maintained F: Documentation/devicetree/bindings/net/starfive,jh7110-dwmac.yaml F: drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c From 068648aab72c9ba7b0597354ef4d81ffaac7b979 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 28 May 2024 11:12:31 +0800 Subject: [PATCH 178/279] nfc/nci: Add the inconsistency check between the input data length and count write$nci(r0, &(0x7f0000000740)=ANY=[@ANYBLOB="610501"], 0xf) Syzbot constructed a write() call with a data length of 3 bytes but a count value of 15, which passed too little data to meet the basic requirements of the function nci_rf_intf_activated_ntf_packet(). Therefore, add a check that compares the data length with the count value to avoid problems caused by inconsistent data length and count. Reported-and-tested-by: syzbot+71bfed2b2bcea46c98f2@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: David S.
Miller --- drivers/nfc/virtual_ncidev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nfc/virtual_ncidev.c b/drivers/nfc/virtual_ncidev.c index 590b038e449e..6b89d596ba9a 100644 --- a/drivers/nfc/virtual_ncidev.c +++ b/drivers/nfc/virtual_ncidev.c @@ -125,6 +125,10 @@ static ssize_t virtual_ncidev_write(struct file *file, kfree_skb(skb); return -EFAULT; } + if (strnlen(skb->data, count) != count) { + kfree_skb(skb); + return -EINVAL; + } nci_recv_frame(vdev->ndev, skb); return count; From b1e7cee96127468c2483cf10c2899c9b5cf79bf8 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 13 May 2024 10:02:48 +0000 Subject: [PATCH 179/279] powerpc/bpf: enforce full ordering for ATOMIC operations with BPF_FETCH The Linux Kernel Memory Model [1][2] requires RMW operations that have a return value to be fully ordered. BPF atomic operations with BPF_FETCH (including BPF_XCHG and BPF_CMPXCHG) return a value back so they need to be JITed to fully ordered operations. POWERPC currently emits relaxed operations for these. We can show this by running the following litmus-test: PPC SB+atomic_add+fetch { 0:r0=x; (* dst reg assuming offset is 0 *) 0:r1=2; (* src reg *) 0:r2=1; 0:r4=y; (* P0 writes to this, P1 reads this *) 0:r5=z; (* P1 writes to this, P0 reads this *) 0:r6=0; 1:r2=1; 1:r4=y; 1:r5=z; } P0 | P1 ; stw r2, 0(r4) | stw r2,0(r5) ; | ; loop:lwarx r3, r6, r0 | ; mr r8, r3 | ; add r3, r3, r1 | sync ; stwcx. r3, r6, r0 | ; bne loop | ; mr r1, r8 | ; | ; lwa r7, 0(r5) | lwa r7,0(r4) ; ~exists(0:r7=0 /\ 1:r7=0) Witnesses Positive: 9 Negative: 3 Condition ~exists (0:r7=0 /\ 1:r7=0) Observation SB+atomic_add+fetch Sometimes 3 9 This test shows that the older store in P0 is reordered with a newer load to a different address. Although there is a RMW operation with fetch between them. 
Adding a sync before and after RMW fixes the issue: Witnesses Positive: 9 Negative: 0 Condition ~exists (0:r7=0 /\ 1:r7=0) Observation SB+atomic_add+fetch Never 0 9 [1] https://www.kernel.org/doc/Documentation/memory-barriers.txt [2] https://www.kernel.org/doc/Documentation/atomic_t.txt Fixes: aea7ef8a82c0 ("powerpc/bpf/32: add support for BPF_ATOMIC bitwise operations") Fixes: 2d9206b22743 ("powerpc/bpf/32: Add instructions for atomic_[cmp]xchg") Fixes: dbe6e2456fb0 ("powerpc/bpf/64: add support for atomic fetch operations") Fixes: 1e82dfaa7819 ("powerpc/bpf/64: Add instructions for atomic_[cmp]xchg") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Puranjay Mohan Reviewed-by: Christophe Leroy Reviewed-by: Naveen N Rao Acked-by: Paul E. McKenney Signed-off-by: Michael Ellerman Link: https://msgid.link/20240513100248.110535-1-puranjay@kernel.org --- arch/powerpc/net/bpf_jit_comp32.c | 12 ++++++++++++ arch/powerpc/net/bpf_jit_comp64.c | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 43b97032a91c..a0c4f1bde83e 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -900,6 +900,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code /* Get offset into TMP_REG */ EMIT(PPC_RAW_LI(tmp_reg, off)); + /* + * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' + * before and after the operation. + * + * This is a requirement in the Linux Kernel Memory Model. + * See __cmpxchg_u32() in asm/cmpxchg.h as an example. 
+ */ + if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); tmp_idx = ctx->idx * 4; /* load value from memory into r0 */ EMIT(PPC_RAW_LWARX(_R0, tmp_reg, dst_reg, 0)); @@ -953,6 +962,9 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code /* For the BPF_FETCH variant, get old data into src_reg */ if (imm & BPF_FETCH) { + /* Emit 'sync' to enforce full ordering */ + if (IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); EMIT(PPC_RAW_MR(ret_reg, ax_reg)); if (!fp->aux->verifier_zext) EMIT(PPC_RAW_LI(ret_reg - 1, 0)); /* higher 32-bit */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 8afc14a4a125..7703dcf48be8 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -846,6 +846,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code /* Get offset into TMP_REG_1 */ EMIT(PPC_RAW_LI(tmp1_reg, off)); + /* + * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' + * before and after the operation. + * + * This is a requirement in the Linux Kernel Memory Model. + * See __cmpxchg_u64() in asm/cmpxchg.h as an example. + */ + if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); tmp_idx = ctx->idx * 4; /* load value from memory into TMP_REG_2 */ if (size == BPF_DW) @@ -908,6 +917,9 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code PPC_BCC_SHORT(COND_NE, tmp_idx); if (imm & BPF_FETCH) { + /* Emit 'sync' to enforce full ordering */ + if (IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); EMIT(PPC_RAW_MR(ret_reg, _R0)); /* * Skip unnecessary zero-extension for 32-bit cmpxchg. From 4a69c1264ff41bc5bf7c03101ada0454fbf08868 Mon Sep 17 00:00:00 2001 From: Witold Sadowski Date: Wed, 29 May 2024 00:40:32 -0700 Subject: [PATCH 180/279] spi: cadence: Ensure data lines set to low during dummy-cycle period During dummy-cycles xSPI will switch GPIO into Hi-Z mode. 
In that dummy period, the voltage on the data lines will slowly drop, which can cause unintentional modebyte transmission. The value sent to the SPI memory chip will depend on the last address and the clock frequency. To prevent unforeseen consequences of that behaviour, force sending a single modebyte (0x00). The modebyte will be sent only if the number of dummy-cycles is not equal to 0. The code must also reduce the dummy-cycle byte count by one, as one byte is sent as the modebyte. Signed-off-by: Witold Sadowski Link: https://msgid.link/r/20240529074037.1345882-2-wsadowski@marvell.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-xspi.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-cadence-xspi.c b/drivers/spi/spi-cadence-xspi.c index 2209e9fc378f..2e3eacd46b72 100644 --- a/drivers/spi/spi-cadence-xspi.c +++ b/drivers/spi/spi-cadence-xspi.c @@ -145,6 +145,9 @@ #define CDNS_XSPI_STIG_DONE_FLAG BIT(0) #define CDNS_XSPI_TRD_STATUS 0x0104 +#define MODE_NO_OF_BYTES GENMASK(25, 24) +#define MODEBYTES_COUNT 1 + /* Helper macros for filling command registers */ #define CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_1(op, data_phase) ( \ FIELD_PREP(CDNS_XSPI_CMD_INSTR_TYPE, (data_phase) ?
\ @@ -157,9 +160,10 @@ FIELD_PREP(CDNS_XSPI_CMD_P1_R2_ADDR3, ((op)->addr.val >> 24) & 0xFF) | \ FIELD_PREP(CDNS_XSPI_CMD_P1_R2_ADDR4, ((op)->addr.val >> 32) & 0xFF)) -#define CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_3(op) ( \ +#define CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_3(op, modebytes) ( \ FIELD_PREP(CDNS_XSPI_CMD_P1_R3_ADDR5, ((op)->addr.val >> 40) & 0xFF) | \ FIELD_PREP(CDNS_XSPI_CMD_P1_R3_CMD, (op)->cmd.opcode) | \ + FIELD_PREP(MODE_NO_OF_BYTES, modebytes) | \ FIELD_PREP(CDNS_XSPI_CMD_P1_R3_NUM_ADDR_BYTES, (op)->addr.nbytes)) #define CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_4(op, chipsel) ( \ @@ -173,12 +177,12 @@ #define CDNS_XSPI_CMD_FLD_DSEQ_CMD_2(op) \ FIELD_PREP(CDNS_XSPI_CMD_DSEQ_R2_DCNT_L, (op)->data.nbytes & 0xFFFF) -#define CDNS_XSPI_CMD_FLD_DSEQ_CMD_3(op) ( \ +#define CDNS_XSPI_CMD_FLD_DSEQ_CMD_3(op, dummybytes) ( \ FIELD_PREP(CDNS_XSPI_CMD_DSEQ_R3_DCNT_H, \ ((op)->data.nbytes >> 16) & 0xffff) | \ FIELD_PREP(CDNS_XSPI_CMD_DSEQ_R3_NUM_OF_DUMMY, \ (op)->dummy.buswidth != 0 ? \ - (((op)->dummy.nbytes * 8) / (op)->dummy.buswidth) : \ + (((dummybytes) * 8) / (op)->dummy.buswidth) : \ 0)) #define CDNS_XSPI_CMD_FLD_DSEQ_CMD_4(op, chipsel) ( \ @@ -351,6 +355,7 @@ static int cdns_xspi_send_stig_command(struct cdns_xspi_dev *cdns_xspi, u32 cmd_regs[6]; u32 cmd_status; int ret; + int dummybytes = op->dummy.nbytes; ret = cdns_xspi_wait_for_controller_idle(cdns_xspi); if (ret < 0) @@ -365,7 +370,12 @@ static int cdns_xspi_send_stig_command(struct cdns_xspi_dev *cdns_xspi, memset(cmd_regs, 0, sizeof(cmd_regs)); cmd_regs[1] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_1(op, data_phase); cmd_regs[2] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_2(op); - cmd_regs[3] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_3(op); + if (dummybytes != 0) { + cmd_regs[3] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_3(op, 1); + dummybytes--; + } else { + cmd_regs[3] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_3(op, 0); + } cmd_regs[4] = CDNS_XSPI_CMD_FLD_P1_INSTR_CMD_4(op, cdns_xspi->cur_cs); @@ -375,7 +385,7 @@ static int cdns_xspi_send_stig_command(struct 
cdns_xspi_dev *cdns_xspi, cmd_regs[0] = CDNS_XSPI_STIG_DONE_FLAG; cmd_regs[1] = CDNS_XSPI_CMD_FLD_DSEQ_CMD_1(op); cmd_regs[2] = CDNS_XSPI_CMD_FLD_DSEQ_CMD_2(op); - cmd_regs[3] = CDNS_XSPI_CMD_FLD_DSEQ_CMD_3(op); + cmd_regs[3] = CDNS_XSPI_CMD_FLD_DSEQ_CMD_3(op, dummybytes); cmd_regs[4] = CDNS_XSPI_CMD_FLD_DSEQ_CMD_4(op, cdns_xspi->cur_cs); From 9dedabe95b49ec9b0d16ce8f0ed1f9a12dd4a040 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 29 May 2024 11:42:35 -0400 Subject: [PATCH 181/279] spi: Assign dummy scatterlist to unidirectional transfers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 8cc3bad9d9d6 ("spi: Remove unneded check for orig_nents") introduced a regression: unmapped data could now be passed to the DMA APIs, resulting in null pointer dereferences. Commit 9f788ba457b4 ("spi: Don't mark message DMA mapped when no transfer in it is") and commit da560097c056 ("spi: Check if transfer is mapped before calling DMA sync APIs") addressed the problem, but only partially. Unidirectional transactions will still result in null pointer dereference. To prevent that from happening, assign a dummy scatterlist when no data is mapped, so that the DMA API can be called and not result in a null pointer dereference. Signed-off-by: Andy Shevchenko Reported-by: Neil Armstrong Closes: https://lore.kernel.org/r/8ae675b5-fcf9-4c9b-b06a-4462f70e1322@linaro.org Reported-by: Nícolas F. R. A. Prado Closes: https://lore.kernel.org/all/d3679496-2e4e-4a7c-97ed-f193bd53af1d@notapiano Closes: https://lore.kernel.org/all/4748499f-789c-45a8-b50a-2dd09f4bac8c@notapiano Fixes: 8cc3bad9d9d6 ("spi: Remove unneded check for orig_nents") Tested-by: Nícolas F. R. A. Prado [nfraprado: wrote the commit message] Signed-off-by: Nícolas F. R. A. 
Prado Link: https://msgid.link/r/20240529-dma-oops-dummy-v1-1-bb43aacfb11b@collabora.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index f94420858c22..9bc9fd10d538 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1220,6 +1220,11 @@ void spi_unmap_buf(struct spi_controller *ctlr, struct device *dev, spi_unmap_buf_attrs(ctlr, dev, sgt, dir, 0); } +/* Dummy SG for unidirect transfers */ +static struct scatterlist dummy_sg = { + .page_link = SG_END, +}; + static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) { struct device *tx_dev, *rx_dev; @@ -1258,6 +1263,8 @@ static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) attrs); if (ret != 0) return ret; + } else { + xfer->tx_sg.sgl = &dummy_sg; } if (xfer->rx_buf != NULL) { @@ -1271,6 +1278,8 @@ static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) return ret; } + } else { + xfer->rx_sg.sgl = &dummy_sg; } } /* No transfer has been mapped, bail out with success */ From 06fe9b1df1086b42718d632aa57e8f7cd1a66a21 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 May 2024 09:38:38 -0600 Subject: [PATCH 182/279] io_uring: don't attempt to mmap larger than what the user asks for MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If IORING_FEAT_SINGLE_MMAP is ignored, as can happen if an application uses an ancient liburing or does setup manually, then 3 mmap's are required to map the ring into userspace. The kernel will still have collapsed the mappings, however userspace may ask for mapping them individually. If so, then we should not use the full number of ring pages, as it may exceed the partial mapping. Doing so will yield an -EFAULT from vm_insert_pages(), as we pass in more pages than what the application asked for. 
Cap the number of pages to match what the application asked for, for the particular mapping operation. Reported-by: Lucas Mülling Link: https://github.com/axboe/liburing/issues/1157 Fixes: 3ab1db3c6039 ("io_uring: get rid of remap_pfn_range() for mapping rings/sqes") Signed-off-by: Jens Axboe --- io_uring/memmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 4785d6af5fee..a0f32a255fd1 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -244,6 +244,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned int npages; void *ptr; ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); @@ -253,8 +254,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: - return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, - ctx->n_ring_pages); + npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); + return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); case IORING_OFF_SQES: return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, ctx->n_sqe_pages); From 3bd27a847a3a4827a948387cc8f0dbc9fa5931d5 Mon Sep 17 00:00:00 2001 From: Matthias Maennich Date: Tue, 28 May 2024 11:32:43 +0000 Subject: [PATCH 183/279] kheaders: explicitly define file modes for archived headers Build environments might be running with different umask settings resulting in indeterministic file modes for the files contained in kheaders.tar.xz. The file itself is served with 444, i.e. world readable. Archive the files explicitly with 744,a+X to improve reproducibility across build environments. --mode=0444 is not suitable as directories need to be executable. Also, 444 makes it hard to delete all the readonly files after extraction. 
Cc: stable@vger.kernel.org Signed-off-by: Matthias Maennich Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 6d443ea22bb7..8b6e0c2bc0df 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -84,7 +84,7 @@ find $cpio_dir -type f -print0 | # Create archive and try to normalize metadata for reproducibility. tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ - --owner=0 --group=0 --sort=name --numeric-owner \ + --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null echo $headers_md5 > kernel/kheaders.md5 From 6e58e0173507e506a5627741358bc770f220e356 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 28 May 2024 18:31:50 +0200 Subject: [PATCH 184/279] kheaders: use `command -v` to test for existence of `cpio` Commit 13e1df09284d ("kheaders: explicitly validate existence of cpio command") added an explicit check for `cpio` using `type`. However, `type` in `dash` (which is used in some popular distributions and base images as the shell script runner) prints the missing message to standard output, and thus no error is printed: $ bash -c 'type missing >/dev/null' bash: line 1: type: missing: not found $ dash -c 'type missing >/dev/null' $ For instance, this issue may be seen by loongarch builders, given its defconfig enables CONFIG_IKHEADERS since commit 9cc1df421f00 ("LoongArch: Update Loongson-3 default config file"). Therefore, use `command -v` instead to have consistent behavior, and take the chance to provide a more explicit error. 
Fixes: 13e1df09284d ("kheaders: explicitly validate existence of cpio command") Signed-off-by: Miguel Ojeda Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 8b6e0c2bc0df..383fd43ac612 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -14,7 +14,12 @@ include/ arch/$SRCARCH/include/ " -type cpio > /dev/null +if ! command -v cpio >/dev/null; then + echo >&2 "***" + echo >&2 "*** 'cpio' could not be found." + echo >&2 "***" + exit 1 +fi # Support incremental builds by skipping archive generation # if timestamps of files being archived are not changed. From 1b1c9f0fd3fb70adf1f3b0aec58ab037d6e595d0 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 28 May 2024 15:02:32 -0500 Subject: [PATCH 185/279] dt-bindings: kbuild: Fix dt_binding_check on unconfigured build The 'dt_binding_check' target shouldn't depend on the kernel configuration, but it has since commit 604a57ba9781 ("dt-bindings: kbuild: Add separate target/dependency for processed-schema.json"). That is because CHECK_DT_BINDING make variable was dropped, but scripts/dtc/Makefile was missed. The CHECK_DTBS variable can be used instead. 
Reported-by: Francesco Dolcini Fixes: 604a57ba9781 ("dt-bindings: kbuild: Add separate target/dependency for processed-schema.json") Signed-off-by: "Rob Herring (Arm)" Signed-off-by: Masahiro Yamada --- scripts/dtc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/dtc/Makefile b/scripts/dtc/Makefile index a18657072541..b47f4daa4515 100644 --- a/scripts/dtc/Makefile +++ b/scripts/dtc/Makefile @@ -3,7 +3,7 @@ # *** Also keep .gitignore in sync when changing *** hostprogs-always-$(CONFIG_DTC) += dtc fdtoverlay -hostprogs-always-$(CHECK_DT_BINDING) += dtc +hostprogs-always-$(CHECK_DTBS) += dtc dtc-objs := dtc.o flattree.o fstree.o data.o livetree.o treesource.o \ srcpos.o checks.o util.o From 4a4be1ad3a6efea16c56615f31117590fd881358 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 29 May 2024 09:39:34 -0700 Subject: [PATCH 186/279] Revert "vfs: Delete the associated dentry when deleting a file" This reverts commit 681ce8623567ba7e7333908e9826b77145312dda. We gave it a try, but it turns out the kernel test robot did in fact find performance regressions for it, so we'll have to look at the more involved alternative fixes for Yafang Shao's Elasticsearch load issue. There were several alternatives discussed, they just weren't as simple as this first attempt. The report is of a -7.4% regression of filebench.sum_operations/s, which appears significant enough to trigger my "this patch may get reverted if somebody finds a performance regression on some other load" rule. So it's still the case that we should end up deleting dentries more aggressively - or just be better at pruning them later - but it needs a bit more finesse than this simple thing. 
Link: https://lore.kernel.org/all/202405291318.4dfbb352-oliver.sang@intel.com/ Cc: Yafang Shao Cc: Al Viro Cc: Christian Brauner Signed-off-by: Linus Torvalds --- fs/dcache.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 1ee6404b430b..407095188f83 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2360,17 +2360,19 @@ EXPORT_SYMBOL(d_hash_and_lookup); * - unhash this dentry and free it. * * Usually, we want to just turn this into - * a negative dentry, but certain workloads can - * generate a large number of negative dentries. - * Therefore, it would be better to simply - * unhash it. + * a negative dentry, but if anybody else is + * currently using the dentry or the inode + * we can't do that and we fall back on removing + * it from the hash queues and waiting for + * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * - * Remove the dentry from the hash queues so it can be deleted later. + * Turn the dentry into a negative dentry if possible, otherwise + * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) @@ -2379,8 +2381,6 @@ void d_delete(struct dentry * dentry) spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); - __d_drop(dentry); - /* * Are we the only user? 
*/ @@ -2388,6 +2388,7 @@ void d_delete(struct dentry * dentry) dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { + __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } From 95d7c452a26564ef0c427f2806761b857106d8c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 21 May 2024 12:52:42 +0200 Subject: [PATCH 187/279] spi: stm32: Don't warn about spurious interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dev_warn to notify about a spurious interrupt was introduced with the reasoning that these are unexpected. However, spurious interrupts tend to trigger continuously and the error message on the serial console prevents the core's detection of spurious interrupts (which disables the irq) from kicking in and just floods the console. Fixes: c64e7efe46b7 ("spi: stm32: make spurious and overrun interrupts visible") Signed-off-by: Uwe Kleine-König Link: https://msgid.link/r/20240521105241.62400-2-u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index e4e7ddb7524a..4c4ff074e3f6 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -1057,7 +1057,7 @@ static irqreturn_t stm32h7_spi_irq_thread(int irq, void *dev_id) mask |= STM32H7_SPI_SR_TXP | STM32H7_SPI_SR_RXP; if (!(sr & mask)) { - dev_warn(spi->dev, "spurious IT (sr=0x%08x, ier=0x%08x)\n", + dev_vdbg(spi->dev, "spurious IT (sr=0x%08x, ier=0x%08x)\n", sr, ier); spin_unlock_irqrestore(&spi->lock, flags); return IRQ_NONE; From 7b038b564b3e2a752d2211e7b0c3c29fd2f6e197 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 May 2024 16:28:41 -0400 Subject: [PATCH 188/279] bcachefs: Fix failure to return error on misaligned dio write This was reported as an error when running coreutils shred.
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-direct.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 09d21aef879a..049b61bc9a5b 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -609,8 +609,10 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) if (unlikely(ret)) goto err_put_write_ref; - if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) { + ret = -EINVAL; goto err_put_write_ref; + } inode_dio_begin(&inode->v); bch2_pagecache_block_get(inode); From ba46b3bda296c4f82b061ac40b90f49d2a00a380 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 15 May 2024 11:25:49 -0400 Subject: [PATCH 189/279] drm/amdgpu: Adjust logic in amdgpu_device_partner_bandwidth() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use current speed/width on devices which don't support dynamic PCIe switching. 
Fixes: 466a7d115326 ("drm/amd: Use the first non-dGPU PCI device for BW limits") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3289 Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 861ccff78af9..932dc93b2e63 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5944,13 +5944,18 @@ static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, *speed = PCI_SPEED_UNKNOWN; *width = PCIE_LNK_WIDTH_UNKNOWN; - while ((parent = pci_upstream_bridge(parent))) { - /* skip upstream/downstream switches internal to dGPU*/ - if (parent->vendor == PCI_VENDOR_ID_ATI) - continue; - *speed = pcie_get_speed_cap(parent); - *width = pcie_get_width_cap(parent); - break; + if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { + while ((parent = pci_upstream_bridge(parent))) { + /* skip upstream/downstream switches internal to dGPU*/ + if (parent->vendor == PCI_VENDOR_ID_ATI) + continue; + *speed = pcie_get_speed_cap(parent); + *width = pcie_get_width_cap(parent); + break; + } + } else { + /* use the current speeds rather than max if switching is not supported */ + pcie_bandwidth_available(adev->pdev, NULL, speed, width); } } From 05d9e24ddb15160164ba6e917a88c00907dc2434 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 16 May 2024 09:51:26 -0400 Subject: [PATCH 190/279] drm/amdgpu: silence UBSAN warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert a variable sized array from [1] to []. 
Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/include/atomfirmware.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/include/atomfirmware.h b/drivers/gpu/drm/amd/include/atomfirmware.h index 1acb2d2c5597..571691837200 100644 --- a/drivers/gpu/drm/amd/include/atomfirmware.h +++ b/drivers/gpu/drm/amd/include/atomfirmware.h @@ -3583,7 +3583,7 @@ struct atom_gpio_voltage_object_v4 uint8_t phase_delay_us; // phase delay in unit of micro second uint8_t reserved; uint32_t gpio_mask_val; // GPIO Mask value - struct atom_voltage_gpio_map_lut voltage_gpio_lut[1]; + struct atom_voltage_gpio_map_lut voltage_gpio_lut[] __counted_by(gpio_entry_num); }; struct atom_svid2_voltage_object_v4 From a0cf36546cc24ae1c95d72253c7795d4d2fc77aa Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Thu, 23 May 2024 17:14:45 +0800 Subject: [PATCH 191/279] drm/amdgpu: fix dereference null return value for the function amdgpu_vm_pt_parent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pointer parent may be NULLed by the function amdgpu_vm_pt_parent. To make the code more robust, check the pointer parent. 
Signed-off-by: Jesse Zhang Suggested-by: Christian König Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 7fdd306a48a0..f07647a9a9d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -706,11 +706,15 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params *params, struct amdgpu_vm_bo_base *entry) { struct amdgpu_vm_bo_base *parent = amdgpu_vm_pt_parent(entry); - struct amdgpu_bo *bo = parent->bo, *pbo; + struct amdgpu_bo *bo, *pbo; struct amdgpu_vm *vm = params->vm; uint64_t pde, pt, flags; unsigned int level; + if (WARN_ON(!parent)) + return -EINVAL; + + bo = parent->bo; for (level = 0, pbo = bo->parent; pbo; ++level) pbo = pbo->parent; From dd2b75fd9a79bf418e088656822af06fc253dbe3 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 20 May 2024 14:41:31 -0400 Subject: [PATCH 192/279] Revert "drm/amdkfd: fix gfx_target_version for certain 11.0.3 devices" This reverts commit 28ebbb4981cb1fad12e0b1227dbecc88810b1ee8. Revert this commit as apparently the LLVM code to take advantage of this never landed. 
Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher Cc: Feifei Xu --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 9596bca57212..afc57df421cd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -408,15 +408,8 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) f2g = &gfx_v11_kfd2kgd; break; case IP_VERSION(11, 0, 3): - if ((adev->pdev->device == 0x7460 && - adev->pdev->revision == 0x00) || - (adev->pdev->device == 0x7461 && - adev->pdev->revision == 0x00)) - /* Note: Compiler version is 11.0.5 while HW version is 11.0.3 */ - gfx_target_version = 110005; - else - /* Note: Compiler version is 11.0.1 while HW version is 11.0.3 */ - gfx_target_version = 110001; + /* Note: Compiler version is 11.0.1 while HW version is 11.0.3 */ + gfx_target_version = 110001; f2g = &gfx_v11_kfd2kgd; break; case IP_VERSION(11, 5, 0): From 1f327dfc846ae82e16e52ed9c559d566826486d2 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 22 May 2024 15:26:50 -0400 Subject: [PATCH 193/279] drm/amdkfd: simplify APU VRAM handling With commit 89773b85599a ("drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs") big and small APU "VRAM" handling in KFD was unified. Since AMD_IS_APU is set for both big and small APUs, we can simplify the checks in the code. 
v2: clean up a few more places (Lang) Acked-by: Felix Kuehling Reviewed-by: Lang Yu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 ++++++++-------- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 ++---- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 - 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 8975cf41a91a..48ad0c04aa72 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, return -EINVAL; vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id); - if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { + if (adev->flags & AMD_IS_APU) { system_mem_needed = size; ttm_mem_needed = size; } @@ -233,7 +233,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, if (adev && xcp_id >= 0) { adev->kfd.vram_used[xcp_id] += vram_needed; adev->kfd.vram_used_aligned[xcp_id] += - (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ? + (adev->flags & AMD_IS_APU) ? vram_needed : ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN); } @@ -261,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, if (adev) { adev->kfd.vram_used[xcp_id] -= size; - if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { + if (adev->flags & AMD_IS_APU) { adev->kfd.vram_used_aligned[xcp_id] -= size; kfd_mem_limit.system_mem_used -= size; kfd_mem_limit.ttm_mem_used -= size; @@ -890,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem, * if peer device has large BAR. 
In contrast, access over xGMI is * allowed for both small and large BAR configurations of peer device */ - if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)) && + if ((adev != bo_adev && !(adev->flags & AMD_IS_APU)) && ((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) || (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) || (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) { @@ -1658,7 +1658,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev, - atomic64_read(&adev->vram_pin_size) - reserved_for_pt; - if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { + if (adev->flags & AMD_IS_APU) { system_mem_available = no_system_mem_limit ? kfd_mem_limit.max_system_mem_limit : kfd_mem_limit.max_system_mem_limit - @@ -1706,7 +1706,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM; - if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { + if (adev->flags & AMD_IS_APU) { domain = AMDGPU_GEM_DOMAIN_GTT; alloc_domain = AMDGPU_GEM_DOMAIN_GTT; alloc_flags = 0; @@ -1953,7 +1953,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( if (size) { if (!is_imported && (mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM || - ((adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) && + ((adev->flags & AMD_IS_APU) && mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT))) *size = bo_size; else @@ -2376,7 +2376,7 @@ static int import_obj_create(struct amdgpu_device *adev, (*mem)->bo = bo; (*mem)->va = va; (*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) && - !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ? + !(adev->flags & AMD_IS_APU) ? 
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT; (*mem)->mapped_to_gpu_memory = 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 4816fcb9803a..8ee3d07ffbdf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -1023,7 +1023,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev) if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1)) return -EINVAL; - if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) + if (adev->flags & AMD_IS_APU) return 0; pgmap = &kfddev->pgmap; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 069b81eeea03..31e500859ab0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2619,8 +2619,7 @@ svm_range_best_restore_location(struct svm_range *prange, return -1; } - if (node->adev->gmc.is_app_apu || - node->adev->flags & AMD_IS_APU) + if (node->adev->flags & AMD_IS_APU) return 0; if (prange->preferred_loc == gpuid || @@ -3338,8 +3337,7 @@ svm_range_best_prefetch_location(struct svm_range *prange) goto out; } - if (bo_node->adev->gmc.is_app_apu || - bo_node->adev->flags & AMD_IS_APU) { + if (bo_node->adev->flags & AMD_IS_APU) { best_loc = 0; goto out; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index 9c37bd0567ef..70c1776611c4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -201,7 +201,6 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s * is initialized to not 0 when page migration register device memory. 
*/ #define KFD_IS_SVM_API_SUPPORTED(adev) ((adev)->kfd.pgmap.type != 0 ||\ - (adev)->gmc.is_app_apu ||\ ((adev)->flags & AMD_IS_APU)) void svm_range_bo_unref_async(struct svm_range_bo *svm_bo); From a9bc5a19e4958fe664254d1ad2dc2a9f5868c210 Mon Sep 17 00:00:00 2001 From: Rajneesh Bhardwaj Date: Wed, 22 May 2024 15:04:29 -0400 Subject: [PATCH 194/279] drm/amdgpu: Make CPX mode auto default in NPS4 On GFXIP9.4.3, make CPX mode as the default compute mode if the node is setup in NPS4 memory partition mode. This change is only applicable for dGPU, for APU, continue to use TPX mode. Reviewed-by: Felix Kuehling Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c index 414ea3f560a7..d4e2aed2efa3 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c @@ -422,7 +422,7 @@ __aqua_vanjaram_get_auto_mode(struct amdgpu_xcp_mgr *xcp_mgr) if (adev->gmc.num_mem_partitions == num_xcc / 2) return (adev->flags & AMD_IS_APU) ? AMDGPU_TPX_PARTITION_MODE : - AMDGPU_QPX_PARTITION_MODE; + AMDGPU_CPX_PARTITION_MODE; if (adev->gmc.num_mem_partitions == 2 && !(adev->flags & AMD_IS_APU)) return AMDGPU_DPX_PARTITION_MODE; From 67c7d4fa267bcfe8d68fb36d938e3c6e0912b57d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 9 May 2024 13:37:27 +0200 Subject: [PATCH 195/279] drm/amd/pm: remove deprecated I2C_CLASS_SPD support from newly added SMU_14_0_2 Support for I2C_CLASS_SPD is currently being removed from the kernel. Only remaining step is to remove the definition of I2C_CLASS_SPD. Setting I2C_CLASS_SPD in a driver is a no-op meanwhile, so remove it here. 
Reviewed-by: Alex Deucher Signed-off-by: Heiner Kallweit Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 706265220292..90703f4542ab 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -1562,7 +1562,6 @@ static int smu_v14_0_2_i2c_control_init(struct smu_context *smu) smu_i2c->port = i; mutex_init(&smu_i2c->mutex); control->owner = THIS_MODULE; - control->class = I2C_CLASS_SPD; control->dev.parent = &adev->pdev->dev; control->algo = &smu_v14_0_2_i2c_algo; snprintf(control->name, sizeof(control->name), "AMDGPU SMU %d", i); From fa0bc8f297b29126b5ae983406e9bc76d48a9a8e Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Wed, 22 May 2024 23:08:09 +0200 Subject: [PATCH 196/279] hwmon: (dell-smm) Add Dell G15 5511 to fan control whitelist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A user reported that he needs to disable BIOS fan control on his Dell G15 5511 in order to be able to control the fans. 
Closes: https://github.com/Wer-Wolf/i8kutils/issues/5 Signed-off-by: Armin Wolf Acked-by: Pali Rohár Link: https://lore.kernel.org/r/20240522210809.294488-1-W_Armin@gmx.de Signed-off-by: Guenter Roeck --- drivers/hwmon/dell-smm-hwmon.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index 48a81c64f00d..942526bd4775 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -1545,6 +1545,14 @@ static const struct dmi_system_id i8k_whitelist_fan_control[] __initconst = { }, .driver_data = (void *)&i8k_fan_control_data[I8K_FAN_30A3_31A3], }, + { + .ident = "Dell G15 5511", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Dell G15 5511"), + }, + .driver_data = (void *)&i8k_fan_control_data[I8K_FAN_30A3_31A3], + }, { } }; From a94ff8e50c20bde6d50864849a98b106e45d30c6 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 23 May 2024 17:47:14 +0200 Subject: [PATCH 197/279] hwmon: (ltc2992) Fix memory leak in ltc2992_parse_dt() A new error path was added to the fwnode_for_each_available_node() loop in ltc2992_parse_dt(), which leads to an early return that requires a call to fwnode_handle_put() to avoid a memory leak in that case. Add the missing fwnode_handle_put() in the error path from a zero value shunt resistor. 
Cc: stable@vger.kernel.org Fixes: 10b029020487 ("hwmon: (ltc2992) Avoid division by zero") Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20240523-fwnode_for_each_available_child_node_scoped-v2-1-701f3a03f2fb@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/ltc2992.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/ltc2992.c b/drivers/hwmon/ltc2992.c index 229aed15d5ca..d4a93223cd3b 100644 --- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -876,9 +876,11 @@ static int ltc2992_parse_dt(struct ltc2992_state *st) ret = fwnode_property_read_u32(child, "shunt-resistor-micro-ohms", &val); if (!ret) { - if (!val) + if (!val) { + fwnode_handle_put(child); return dev_err_probe(&st->client->dev, -EINVAL, "shunt resistor value cannot be zero\n"); + } st->r_sense_uohm[addr] = val; } } From 92f1655aa2b2294d0b49925f3b875a634bd3b59e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 May 2024 11:43:53 +0000 Subject: [PATCH 198/279] net: fix __dst_negative_advice() race __dst_negative_advice() does not enforce proper RCU rules when sk->dst_cache must be cleared, leading to possible UAF. RCU rules are that we must first clear sk->sk_dst_cache, then call dst_release(old_dst). Note that sk_dst_reset(sk) is implementing this protocol correctly, while __dst_negative_advice() uses the wrong order. Given that ip6_negative_advice() has special logic against RTF_CACHE, this means each of the three ->negative_advice() existing methods must perform the sk_dst_reset() themselves. Note the check against NULL dst is centralized in __dst_negative_advice(), there is no need to duplicate it in various callbacks. Many thanks to Clement Lecigne for tracking this issue. This old bug became visible after the blamed commit, using UDP sockets. 
Fixes: a87cb3e48ee8 ("net: Facility to report route quality of connected sockets") Reported-by: Clement Lecigne Diagnosed-by: Clement Lecigne Signed-off-by: Eric Dumazet Cc: Tom Herbert Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240528114353.1794151-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dst_ops.h | 2 +- include/net/sock.h | 13 +++---------- net/ipv4/route.c | 22 ++++++++-------------- net/ipv6/route.c | 29 +++++++++++++++-------------- net/xfrm/xfrm_policy.c | 11 +++-------- 5 files changed, 30 insertions(+), 47 deletions(-) diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 6d1c8541183d..3a9001a042a5 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -24,7 +24,7 @@ struct dst_ops { void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, struct net_device *dev); - struct dst_entry * (*negative_advice)(struct dst_entry *); + void (*negative_advice)(struct sock *sk, struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, diff --git a/include/net/sock.h b/include/net/sock.h index 5f4d0629348f..953c8dc4e259 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2063,17 +2063,10 @@ sk_dst_get(const struct sock *sk) static inline void __dst_negative_advice(struct sock *sk) { - struct dst_entry *ndst, *dst = __sk_dst_get(sk); + struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->ops->negative_advice) { - ndst = dst->ops->negative_advice(dst); - - if (ndst != dst) { - rcu_assign_pointer(sk->sk_dst_cache, ndst); - sk_tx_queue_clear(sk); - WRITE_ONCE(sk->sk_dst_pending_confirm, 0); - } - } + if (dst && dst->ops->negative_advice) + dst->ops->negative_advice(sk, dst); } static inline void dst_negative_advice(struct sock *sk) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5fd54103174f..b3073d1c8f8f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,7 +129,8 @@ 
struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, @@ -825,22 +826,15 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf __ip_do_redirect(rt, skb, &fl4, true); } -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst) { struct rtable *rt = dst_rtable(dst); - struct dst_entry *ret = dst; - if (rt) { - if (dst->obsolete > 0) { - ip_rt_put(rt); - ret = NULL; - } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->dst.expires) { - ip_rt_put(rt); - ret = NULL; - } - } - return ret; + if ((dst->obsolete > 0) || + (rt->rt_flags & RTCF_REDIRECTED) || + rt->dst.expires) + sk_dst_reset(sk); } /* diff --git a/net/ipv6/route.c b/net/ipv6/route.c index bbc2a0dd9314..a504b88ec06b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -87,7 +87,8 @@ struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst); -static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev); @@ -2770,24 +2771,24 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, } EXPORT_INDIRECT_CALLABLE(ip6_dst_check); -static struct dst_entry 
*ip6_negative_advice(struct dst_entry *dst) +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst) { struct rt6_info *rt = dst_rt6_info(dst); - if (rt) { - if (rt->rt6i_flags & RTF_CACHE) { - rcu_read_lock(); - if (rt6_check_expired(rt)) { - rt6_remove_exception_rt(rt); - dst = NULL; - } - rcu_read_unlock(); - } else { - dst_release(dst); - dst = NULL; + if (rt->rt6i_flags & RTF_CACHE) { + rcu_read_lock(); + if (rt6_check_expired(rt)) { + /* counteract the dst_release() in sk_dst_reset() */ + dst_hold(dst); + sk_dst_reset(sk); + + rt6_remove_exception_rt(rt); } + rcu_read_unlock(); + return; } - return dst; + sk_dst_reset(sk); } static void ip6_link_failure(struct sk_buff *skb) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 475b904fe68b..66e07de2de35 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3910,15 +3910,10 @@ static void xfrm_link_failure(struct sk_buff *skb) /* Impossible. Such dst must be popped before reaches point of failure. */ } -static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) +static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) { - if (dst) { - if (dst->obsolete) { - dst_release(dst); - dst = NULL; - } - } - return dst; + if (dst->obsolete) + sk_dst_reset(sk); } static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) From b8c8abefc07b47f0dc9342530b7618237df96724 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 28 May 2024 22:30:30 +0200 Subject: [PATCH 199/279] ipv4: correctly iterate over the target netns in inet_dump_ifaddr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A recent change to inet_dump_ifaddr had the function incorrectly iterate over net rather than tgt_net, resulting in the data coming for the incorrect network namespace. 
Fixes: cdb2f80f1c10 ("inet: use xa_array iterator to implement inet_dump_ifaddr()") Reported-by: Stéphane Graber Closes: https://github.com/lxc/incus/issues/892 Bisected-by: Stéphane Graber Signed-off-by: Alexander Mikhalitsyn Tested-by: Stéphane Graber Acked-by: Christian Brauner Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240528203030.10839-1-aleksandr.mikhalitsyn@canonical.com Signed-off-by: Jakub Kicinski --- net/ipv4/devinet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e827da128c5f..f3892ee9dfb3 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1903,7 +1903,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = inet_base_seq(tgt_net); - for_each_netdev_dump(net, dev, ctx->ifindex) { + for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; From 278d65ccdadb5f0fa0ceaf7b9cc97b305cd72822 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 28 May 2024 14:34:26 -0700 Subject: [PATCH 200/279] net: dsa: microchip: fix RGMII error in KSZ DSA driver The driver should return RMII interface when XMII is running in RMII mode. 
Fixes: 0ab7f6bf1675 ("net: dsa: microchip: ksz9477: use common xmii function") Signed-off-by: Tristram Ha Acked-by: Arun Ramadoss Acked-by: Jerry Ray Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/1716932066-3342-1-git-send-email-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 1e0085cd9a9a..2818e24e2a51 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -3142,7 +3142,7 @@ phy_interface_t ksz_get_xmii(struct ksz_device *dev, int port, bool gbit) else interface = PHY_INTERFACE_MODE_MII; } else if (val == bitval[P_RMII_SEL]) { - interface = PHY_INTERFACE_MODE_RGMII; + interface = PHY_INTERFACE_MODE_RMII; } else { interface = PHY_INTERFACE_MODE_RGMII; if (data8 & P_RGMII_ID_EG_ENABLE) From bfd546a552e140b0a4c8a21527c39d6d21addb28 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Tue, 28 May 2024 15:06:04 -0700 Subject: [PATCH 201/279] e1000e: move force SMBUS near the end of enable_ulp function The commit 861e8086029e ("e1000e: move force SMBUS from enable ulp function to avoid PHY loss issue") introduces a regression on PCH_MTP_I219_LM18 (PCIID: 0x8086550A). Without the referenced commit, the ethernet works well after suspend and resume, but after applying the commit, the ethernet couldn't work anymore after the resume and the dmesg shows that the NIC link changes to 10Mbps (1000Mbps originally): [ 43.305084] e1000e 0000:00:1f.6 enp0s31f6: NIC Link is Up 10 Mbps Full Duplex, Flow Control: Rx/Tx Without the commit, the force SMBUS code will not be executed if "return 0" or "goto out" is executed in the enable_ulp(), and in my case, the "goto out" is executed since FWSM_FW_VALID is set. But after applying the commit, the force SMBUS code will be run unconditionally.
Here move the force SMBUS code back to enable_ulp() and put it immediately ahead of hw->phy.ops.release(hw), this could allow the longest settling time as possible for interface in this function and doesn't change the original code logic. The issue was found on a Lenovo laptop with the ethernet hw as below: 00:1f.6 Ethernet controller [0200]: Intel Corporation Device [8086:550a] (rev 20). And this patch is verified (cable plug and unplug, system suspend and resume) on Lenovo laptops with ethernet hw: [8086:550a], [8086:550b], [8086:15bb], [8086:15be], [8086:1a1f], [8086:1a1c] and [8086:0dc7]. Fixes: 861e8086029e ("e1000e: move force SMBUS from enable ulp function to avoid PHY loss issue") Signed-off-by: Hui Wang Acked-by: Vitaly Lifshits Tested-by: Naama Meir Reviewed-by: Simon Horman Reviewed-by: Paul Menzel Signed-off-by: Tony Nguyen Tested-by: Zhang Rui Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240528-net-2024-05-28-intel-net-fixes-v1-1-dc8593d2bbc6@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 22 +++++++++++++++++++++ drivers/net/ethernet/intel/e1000e/netdev.c | 18 ----------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index f9e94be36e97..2e98a2a0bead 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1225,6 +1225,28 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) } release: + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + + /* Force SMBus mode in PHY */ + ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); + if (ret_val) { + e1000e_enable_phy_retry(hw); + hw->phy.ops.release(hw); + goto out; + } + phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; + e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); + + 
e1000e_enable_phy_retry(hw); + + /* Force SMBus mode in MAC */ + mac_reg = er32(CTRL_EXT); + mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; + ew32(CTRL_EXT, mac_reg); + hw->phy.ops.release(hw); out: if (ret_val) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 220d62fca55d..da5c59daf8ba 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6623,7 +6623,6 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) struct e1000_hw *hw = &adapter->hw; u32 ctrl, ctrl_ext, rctl, status, wufc; int retval = 0; - u16 smb_ctrl; /* Runtime suspend should only enable wakeup for link changes */ if (runtime) @@ -6697,23 +6696,6 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) if (retval) return retval; } - - /* Force SMBUS to allow WOL */ - /* Switching PHY interface always returns MDI error - * so disable retry mechanism to avoid wasting time - */ - e1000e_disable_phy_retry(hw); - - e1e_rphy(hw, CV_SMB_CTRL, &smb_ctrl); - smb_ctrl |= CV_SMB_CTRL_FORCE_SMBUS; - e1e_wphy(hw, CV_SMB_CTRL, smb_ctrl); - - e1000e_enable_phy_retry(hw); - - /* Force SMBus mode in MAC */ - ctrl_ext = er32(CTRL_EXT); - ctrl_ext |= E1000_CTRL_EXT_FORCE_SMBUS; - ew32(CTRL_EXT, ctrl_ext); } /* Ensure that the appropriate bits are set in LPI_CTRL From 218ed820d364ddc2b0150951e6b1a1bd1e49469d Mon Sep 17 00:00:00 2001 From: Thinh Tran Date: Tue, 28 May 2024 15:06:05 -0700 Subject: [PATCH 202/279] i40e: factoring out i40e_suspend/i40e_resume Two new functions, i40e_io_suspend() and i40e_io_resume(), have been introduced. These functions were factored out from the existing i40e_suspend() and i40e_resume() respectively. This factoring was done due to concerns about the logic of the __I40E_SUSPENDED state, which caused the device to be unable to recover. The functions are now used in the EEH handling for device suspend/resume callbacks.
The function i40e_enable_mc_magic_wake() has been moved ahead of i40e_io_suspend() to ensure it is declared before being used. Tested-by: Robert Thomas Signed-off-by: Thinh Tran Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy Reviewed-by: Jacob Keller Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240528-net-2024-05-28-intel-net-fixes-v1-2-dc8593d2bbc6@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 249 +++++++++++--------- 1 file changed, 135 insertions(+), 114 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1f188c052828..d5f25ea304bf 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16334,6 +16334,139 @@ static void i40e_remove(struct pci_dev *pdev) pci_disable_device(pdev); } +/** + * i40e_enable_mc_magic_wake - enable multicast magic packet wake up + * using the mac_address_write admin q function + * @pf: pointer to i40e_pf struct + **/ +static void i40e_enable_mc_magic_wake(struct i40e_pf *pf) +{ + struct i40e_vsi *main_vsi = i40e_pf_get_main_vsi(pf); + struct i40e_hw *hw = &pf->hw; + u8 mac_addr[6]; + u16 flags = 0; + int ret; + + /* Get current MAC address in case it's an LAA */ + if (main_vsi && main_vsi->netdev) { + ether_addr_copy(mac_addr, main_vsi->netdev->dev_addr); + } else { + dev_err(&pf->pdev->dev, + "Failed to retrieve MAC address; using default\n"); + ether_addr_copy(mac_addr, hw->mac.addr); + } + + /* The FW expects the mac address write cmd to first be called with + * one of these flags before calling it again with the multicast + * enable flags. 
+ */ + flags = I40E_AQC_WRITE_TYPE_LAA_WOL; + + if (hw->func_caps.flex10_enable && hw->partition_id != 1) + flags = I40E_AQC_WRITE_TYPE_LAA_ONLY; + + ret = i40e_aq_mac_address_write(hw, flags, mac_addr, NULL); + if (ret) { + dev_err(&pf->pdev->dev, + "Failed to update MAC address registers; cannot enable Multicast Magic packet wake up"); + return; + } + + flags = I40E_AQC_MC_MAG_EN + | I40E_AQC_WOL_PRESERVE_ON_PFR + | I40E_AQC_WRITE_TYPE_UPDATE_MC_MAG; + ret = i40e_aq_mac_address_write(hw, flags, mac_addr, NULL); + if (ret) + dev_err(&pf->pdev->dev, + "Failed to enable Multicast Magic Packet wake up\n"); +} + +/** + * i40e_io_suspend - suspend all IO operations + * @pf: pointer to i40e_pf struct + * + **/ +static int i40e_io_suspend(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + + set_bit(__I40E_DOWN, pf->state); + + /* Ensure service task will not be running */ + del_timer_sync(&pf->service_timer); + cancel_work_sync(&pf->service_task); + + /* Client close must be called explicitly here because the timer + * has been stopped. + */ + i40e_notify_client_of_netdev_close(pf, false); + + if (test_bit(I40E_HW_CAP_WOL_MC_MAGIC_PKT_WAKE, pf->hw.caps) && + pf->wol_en) + i40e_enable_mc_magic_wake(pf); + + /* Since we're going to destroy queues during the + * i40e_clear_interrupt_scheme() we should hold the RTNL lock for this + * whole section + */ + rtnl_lock(); + + i40e_prep_for_reset(pf); + + wr32(hw, I40E_PFPM_APM, (pf->wol_en ? I40E_PFPM_APM_APME_MASK : 0)); + wr32(hw, I40E_PFPM_WUFC, (pf->wol_en ? I40E_PFPM_WUFC_MAG_MASK : 0)); + + /* Clear the interrupt scheme and release our IRQs so that the system + * can safely hibernate even when there are a large number of CPUs. + * Otherwise hibernation might fail when mapping all the vectors back + * to CPU0. 
+ */ + i40e_clear_interrupt_scheme(pf); + + rtnl_unlock(); + + return 0; +} + +/** + * i40e_io_resume - resume IO operations + * @pf: pointer to i40e_pf struct + * + **/ +static int i40e_io_resume(struct i40e_pf *pf) +{ + struct device *dev = &pf->pdev->dev; + int err; + + /* We need to hold the RTNL lock prior to restoring interrupt schemes, + * since we're going to be restoring queues + */ + rtnl_lock(); + + /* We cleared the interrupt scheme when we suspended, so we need to + * restore it now to resume device functionality. + */ + err = i40e_restore_interrupt_scheme(pf); + if (err) { + dev_err(dev, "Cannot restore interrupt scheme: %d\n", + err); + } + + clear_bit(__I40E_DOWN, pf->state); + i40e_reset_and_rebuild(pf, false, true); + + rtnl_unlock(); + + /* Clear suspended state last after everything is recovered */ + clear_bit(__I40E_SUSPENDED, pf->state); + + /* Restart the service task */ + mod_timer(&pf->service_timer, + round_jiffies(jiffies + pf->service_timer_period)); + + return 0; +} + /** * i40e_pci_error_detected - warning that something funky happened in PCI land * @pdev: PCI device information struct @@ -16446,53 +16579,6 @@ static void i40e_pci_error_resume(struct pci_dev *pdev) i40e_handle_reset_warning(pf, false); } -/** - * i40e_enable_mc_magic_wake - enable multicast magic packet wake up - * using the mac_address_write admin q function - * @pf: pointer to i40e_pf struct - **/ -static void i40e_enable_mc_magic_wake(struct i40e_pf *pf) -{ - struct i40e_vsi *main_vsi = i40e_pf_get_main_vsi(pf); - struct i40e_hw *hw = &pf->hw; - u8 mac_addr[6]; - u16 flags = 0; - int ret; - - /* Get current MAC address in case it's an LAA */ - if (main_vsi && main_vsi->netdev) { - ether_addr_copy(mac_addr, main_vsi->netdev->dev_addr); - } else { - dev_err(&pf->pdev->dev, - "Failed to retrieve MAC address; using default\n"); - ether_addr_copy(mac_addr, hw->mac.addr); - } - - /* The FW expects the mac address write cmd to first be called with - * one of these flags 
before calling it again with the multicast - * enable flags. - */ - flags = I40E_AQC_WRITE_TYPE_LAA_WOL; - - if (hw->func_caps.flex10_enable && hw->partition_id != 1) - flags = I40E_AQC_WRITE_TYPE_LAA_ONLY; - - ret = i40e_aq_mac_address_write(hw, flags, mac_addr, NULL); - if (ret) { - dev_err(&pf->pdev->dev, - "Failed to update MAC address registers; cannot enable Multicast Magic packet wake up"); - return; - } - - flags = I40E_AQC_MC_MAG_EN - | I40E_AQC_WOL_PRESERVE_ON_PFR - | I40E_AQC_WRITE_TYPE_UPDATE_MC_MAG; - ret = i40e_aq_mac_address_write(hw, flags, mac_addr, NULL); - if (ret) - dev_err(&pf->pdev->dev, - "Failed to enable Multicast Magic Packet wake up\n"); -} - /** * i40e_shutdown - PCI callback for shutting down * @pdev: PCI device information struct @@ -16552,48 +16638,11 @@ static void i40e_shutdown(struct pci_dev *pdev) static int i40e_suspend(struct device *dev) { struct i40e_pf *pf = dev_get_drvdata(dev); - struct i40e_hw *hw = &pf->hw; /* If we're already suspended, then there is nothing to do */ if (test_and_set_bit(__I40E_SUSPENDED, pf->state)) return 0; - - set_bit(__I40E_DOWN, pf->state); - - /* Ensure service task will not be running */ - del_timer_sync(&pf->service_timer); - cancel_work_sync(&pf->service_task); - - /* Client close must be called explicitly here because the timer - * has been stopped. - */ - i40e_notify_client_of_netdev_close(pf, false); - - if (test_bit(I40E_HW_CAP_WOL_MC_MAGIC_PKT_WAKE, pf->hw.caps) && - pf->wol_en) - i40e_enable_mc_magic_wake(pf); - - /* Since we're going to destroy queues during the - * i40e_clear_interrupt_scheme() we should hold the RTNL lock for this - * whole section - */ - rtnl_lock(); - - i40e_prep_for_reset(pf); - - wr32(hw, I40E_PFPM_APM, (pf->wol_en ? I40E_PFPM_APM_APME_MASK : 0)); - wr32(hw, I40E_PFPM_WUFC, (pf->wol_en ? I40E_PFPM_WUFC_MAG_MASK : 0)); - - /* Clear the interrupt scheme and release our IRQs so that the system - * can safely hibernate even when there are a large number of CPUs. 
- * Otherwise hibernation might fail when mapping all the vectors back - * to CPU0. - */ - i40e_clear_interrupt_scheme(pf); - - rtnl_unlock(); - - return 0; + return i40e_io_suspend(pf); } /** @@ -16603,39 +16652,11 @@ static int i40e_suspend(struct device *dev) static int i40e_resume(struct device *dev) { struct i40e_pf *pf = dev_get_drvdata(dev); - int err; /* If we're not suspended, then there is nothing to do */ if (!test_bit(__I40E_SUSPENDED, pf->state)) return 0; - - /* We need to hold the RTNL lock prior to restoring interrupt schemes, - * since we're going to be restoring queues - */ - rtnl_lock(); - - /* We cleared the interrupt scheme when we suspended, so we need to - * restore it now to resume device functionality. - */ - err = i40e_restore_interrupt_scheme(pf); - if (err) { - dev_err(dev, "Cannot restore interrupt scheme: %d\n", - err); - } - - clear_bit(__I40E_DOWN, pf->state); - i40e_reset_and_rebuild(pf, false, true); - - rtnl_unlock(); - - /* Clear suspended state last after everything is recovered */ - clear_bit(__I40E_SUSPENDED, pf->state); - - /* Restart the service task */ - mod_timer(&pf->service_timer, - round_jiffies(jiffies + pf->service_timer_period)); - - return 0; + return i40e_io_resume(pf); } static const struct pci_error_handlers i40e_err_handler = { From c80b6538d35a7a60d874c5a76c3c5a82b6a28fbb Mon Sep 17 00:00:00 2001 From: Thinh Tran Date: Tue, 28 May 2024 15:06:06 -0700 Subject: [PATCH 203/279] i40e: Fully suspend and resume IO operations in EEH case When EEH events occur, the callback functions in the i40e driver, which are managed by the EEH driver, will completely suspend and resume all IO operations. - In the PCI error detected callback, replaced i40e_prep_for_reset() with i40e_io_suspend(). The change is to fully suspend all I/O operations. - In the PCI error slot reset callback, replaced pci_enable_device_mem() with pci_enable_device(). This change enables both I/O and memory of the device.
- In the PCI error resume callback, replaced i40e_handle_reset_warning() with i40e_io_resume(). This change allows the system to resume I/O operations Fixes: a5f3d2c17b07 ("powerpc/pseries/pci: Add MSI domains") Reviewed-by: Jacob Keller Tested-by: Robert Thomas Signed-off-by: Thinh Tran Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240528-net-2024-05-28-intel-net-fixes-v1-3-dc8593d2bbc6@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index d5f25ea304bf..284c3fad5a6e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11171,6 +11171,8 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit, ret = i40e_reset(pf); if (!ret) i40e_rebuild(pf, reinit, lock_acquired); + else + dev_err(&pf->pdev->dev, "%s: i40e_reset() FAILED", __func__); } /** @@ -16491,7 +16493,7 @@ static pci_ers_result_t i40e_pci_error_detected(struct pci_dev *pdev, /* shutdown all operations */ if (!test_bit(__I40E_SUSPENDED, pf->state)) - i40e_prep_for_reset(pf); + i40e_io_suspend(pf); /* Request a slot reset */ return PCI_ERS_RESULT_NEED_RESET; @@ -16513,7 +16515,8 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev) u32 reg; dev_dbg(&pdev->dev, "%s\n", __func__); - if (pci_enable_device_mem(pdev)) { + /* enable I/O and memory of the device */ + if (pci_enable_device(pdev)) { dev_info(&pdev->dev, "Cannot re-enable PCI device after reset.\n"); result = PCI_ERS_RESULT_DISCONNECT; @@ -16576,7 +16579,7 @@ static void i40e_pci_error_resume(struct pci_dev *pdev) if (test_bit(__I40E_SUSPENDED, pf->state)) return; - i40e_handle_reset_warning(pf, false); + i40e_io_resume(pf); } /** From 2a6d8f2de2224ac46df94dc40f43f8b9701f6703 Mon Sep 17 
00:00:00 2001 From: Paul Greenwalt Date: Tue, 28 May 2024 15:06:08 -0700 Subject: [PATCH 204/279] ice: fix 200G PHY types to link speed mapping Commit 24407a01e57c ("ice: Add 200G speed/phy type use") added support for 200G PHY speeds, but did not include the mapping of 200G PHY types to link speed. As a result the driver is returning UNKNOWN link speed when setting 200G ethtool advertised link modes. To fix this add 200G PHY types to link speed mapping to ice_get_link_speed_based_on_phy_type(). Fixes: 24407a01e57c ("ice: Add 200G speed/phy type use") Reviewed-by: Michal Swiatkowski Signed-off-by: Paul Greenwalt Tested-by: Pucha Himasekhar Reddy Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240528-net-2024-05-28-intel-net-fixes-v1-5-dc8593d2bbc6@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_common.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 5649b257e631..24716a3b494c 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -3148,6 +3148,16 @@ ice_get_link_speed_based_on_phy_type(u64 phy_type_low, u64 phy_type_high) case ICE_PHY_TYPE_HIGH_100G_AUI2: speed_phy_type_high = ICE_AQ_LINK_SPEED_100GB; break; + case ICE_PHY_TYPE_HIGH_200G_CR4_PAM4: + case ICE_PHY_TYPE_HIGH_200G_SR4: + case ICE_PHY_TYPE_HIGH_200G_FR4: + case ICE_PHY_TYPE_HIGH_200G_LR4: + case ICE_PHY_TYPE_HIGH_200G_DR4: + case ICE_PHY_TYPE_HIGH_200G_KR4_PAM4: + case ICE_PHY_TYPE_HIGH_200G_AUI4_AOC_ACC: + case ICE_PHY_TYPE_HIGH_200G_AUI4: + speed_phy_type_high = ICE_AQ_LINK_SPEED_200GB; + break; default: speed_phy_type_high = ICE_AQ_LINK_SPEED_UNKNOWN; break; From a51c9b1c9ab2351e62933357fcad5bfad27f2400 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 28 May 2024 15:06:11 -0700 Subject: [PATCH 205/279] ice: check for unregistering correct number of devlink params On module load, the ice 
driver checks for the lack of a specific PF capability to determine if it should reduce the number of devlink params to register. One situation when this test returns true is when the driver loads in safe mode. The same check is not present on the unload path when devlink params are unregistered. This results in the driver triggering a WARN_ON in the kernel devlink code. The current check and code path use a reduction in the number of elements reported in the list of params. This is fragile and not good for future maintenance. Change the parameters to be held in two lists, one always registered and one dependent on the check. Add a symmetrical check in the unload path so that the correct parameters are unregistered as well. Fixes: 109eb2917284 ("ice: Add tx_scheduling_layers devlink param") CC: Lukasz Czapnik Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Reviewed-by: Jacob Keller Tested-by: Pucha Himasekhar Reddy Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240528-net-2024-05-28-intel-net-fixes-v1-8-dc8593d2bbc6@intel.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/intel/ice/devlink/devlink.c | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c index c4b69655cdf5..704e9ad5144e 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c @@ -1388,7 +1388,7 @@ enum ice_param_id { ICE_DEVLINK_PARAM_ID_TX_SCHED_LAYERS, }; -static const struct devlink_param ice_devlink_params[] = { +static const struct devlink_param ice_dvl_rdma_params[] = { DEVLINK_PARAM_GENERIC(ENABLE_ROCE, BIT(DEVLINK_PARAM_CMODE_RUNTIME), ice_devlink_enable_roce_get, ice_devlink_enable_roce_set, @@ -1397,6 +1397,9 @@ static const struct devlink_param ice_devlink_params[] = { ice_devlink_enable_iw_get, ice_devlink_enable_iw_set, ice_devlink_enable_iw_validate), +}; + +static
const struct devlink_param ice_dvl_sched_params[] = { DEVLINK_PARAM_DRIVER(ICE_DEVLINK_PARAM_ID_TX_SCHED_LAYERS, "tx_scheduling_layers", DEVLINK_PARAM_TYPE_U8, @@ -1464,21 +1467,31 @@ int ice_devlink_register_params(struct ice_pf *pf) { struct devlink *devlink = priv_to_devlink(pf); struct ice_hw *hw = &pf->hw; - size_t params_size; + int status; - params_size = ARRAY_SIZE(ice_devlink_params); + status = devl_params_register(devlink, ice_dvl_rdma_params, + ARRAY_SIZE(ice_dvl_rdma_params)); + if (status) + return status; - if (!hw->func_caps.common_cap.tx_sched_topo_comp_mode_en) - params_size--; + if (hw->func_caps.common_cap.tx_sched_topo_comp_mode_en) + status = devl_params_register(devlink, ice_dvl_sched_params, + ARRAY_SIZE(ice_dvl_sched_params)); - return devl_params_register(devlink, ice_devlink_params, - params_size); + return status; } void ice_devlink_unregister_params(struct ice_pf *pf) { - devl_params_unregister(priv_to_devlink(pf), ice_devlink_params, - ARRAY_SIZE(ice_devlink_params)); + struct devlink *devlink = priv_to_devlink(pf); + struct ice_hw *hw = &pf->hw; + + devl_params_unregister(devlink, ice_dvl_rdma_params, + ARRAY_SIZE(ice_dvl_rdma_params)); + + if (hw->func_caps.common_cap.tx_sched_topo_comp_mode_en) + devl_params_unregister(devlink, ice_dvl_sched_params, + ARRAY_SIZE(ice_dvl_sched_params)); } #define ICE_DEVLINK_READ_BLK_SIZE (1024 * 1024) From 2dc8b1e7177d4f49f492ce648440caf2de0c3616 Mon Sep 17 00:00:00 2001 From: Shay Agroskin Date: Tue, 28 May 2024 20:09:12 +0300 Subject: [PATCH 206/279] net: ena: Fix redundant device NUMA node override The driver overrides the NUMA node id of the device regardless of whether it knows its correct value (often setting it to -1 even though the node id is advertised in 'struct device'). This can lead to suboptimal configurations. This patch fixes this behavior and makes the shared memory allocation functions use the NUMA node id advertised by the underlying device. 
Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Shay Agroskin Link: https://lore.kernel.org/r/20240528170912.1204417-1-shayagr@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_com.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 2d8a66ea82fa..713a595370bf 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -312,7 +312,6 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, struct ena_com_io_sq *io_sq) { size_t size; - int dev_node = 0; memset(&io_sq->desc_addr, 0x0, sizeof(io_sq->desc_addr)); @@ -325,12 +324,9 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, size = io_sq->desc_entry_size * io_sq->q_depth; if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) { - dev_node = dev_to_node(ena_dev->dmadev); - set_dev_node(ena_dev->dmadev, ctx->numa_node); io_sq->desc_addr.virt_addr = dma_alloc_coherent(ena_dev->dmadev, size, &io_sq->desc_addr.phys_addr, GFP_KERNEL); - set_dev_node(ena_dev->dmadev, dev_node); if (!io_sq->desc_addr.virt_addr) { io_sq->desc_addr.virt_addr = dma_alloc_coherent(ena_dev->dmadev, size, @@ -354,10 +350,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, size = (size_t)io_sq->bounce_buf_ctrl.buffer_size * io_sq->bounce_buf_ctrl.buffers_num; - dev_node = dev_to_node(ena_dev->dmadev); - set_dev_node(ena_dev->dmadev, ctx->numa_node); io_sq->bounce_buf_ctrl.base_buffer = devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); - set_dev_node(ena_dev->dmadev, dev_node); if (!io_sq->bounce_buf_ctrl.base_buffer) io_sq->bounce_buf_ctrl.base_buffer = devm_kzalloc(ena_dev->dmadev, size, GFP_KERNEL); @@ -397,7 +390,6 @@ static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, struct ena_com_io_cq *io_cq) { size_t size; - int prev_node = 0; memset(&io_cq->cdesc_addr, 0x0, 
sizeof(io_cq->cdesc_addr)); @@ -409,11 +401,8 @@ static int ena_com_init_io_cq(struct ena_com_dev *ena_dev, size = io_cq->cdesc_entry_size_in_bytes * io_cq->q_depth; - prev_node = dev_to_node(ena_dev->dmadev); - set_dev_node(ena_dev->dmadev, ctx->numa_node); io_cq->cdesc_addr.virt_addr = dma_alloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, GFP_KERNEL); - set_dev_node(ena_dev->dmadev, prev_node); if (!io_cq->cdesc_addr.virt_addr) { io_cq->cdesc_addr.virt_addr = dma_alloc_coherent(ena_dev->dmadev, size, &io_cq->cdesc_addr.phys_addr, From b3dc6e8003b500861fa307e9a3400c52e78e4d3a Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 29 May 2024 17:56:33 +0800 Subject: [PATCH 207/279] ipvlan: Dont Use skb->sk in ipvlan_process_v{4,6}_outbound A raw packet from a PF_PACKET socket on top of an IPv6-backed ipvlan device will hit WARN_ON_ONCE() in sk_mc_loop() through the sch_direct_xmit() path. WARNING: CPU: 2 PID: 0 at net/core/sock.c:775 sk_mc_loop+0x2d/0x70 Modules linked in: sch_netem ipvlan rfkill cirrus drm_shmem_helper sg drm_kms_helper CPU: 2 PID: 0 Comm: swapper/2 Kdump: loaded Not tainted 6.9.0+ #279 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:sk_mc_loop+0x2d/0x70 Code: fa 0f 1f 44 00 00 65 0f b7 15 f7 96 a3 4f 31 c0 66 85 d2 75 26 48 85 ff 74 1c RSP: 0018:ffffa9584015cd78 EFLAGS: 00010212 RAX: 0000000000000011 RBX: ffff91e585793e00 RCX: 0000000002c6a001 RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff91e589c0f000 RBP: ffff91e5855bd100 R08: 0000000000000000 R09: 3d00545216f43d00 R10: ffff91e584fdcc50 R11: 00000060dd8616f4 R12: ffff91e58132d000 R13: ffff91e584fdcc68 R14: ffff91e5869ce800 R15: ffff91e589c0f000 FS: 0000000000000000(0000) GS:ffff91e898100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f788f7c44c0 CR3: 0000000008e1a000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0
DR7: 0000000000000400 Call Trace: ? __warn (kernel/panic.c:693) ? sk_mc_loop (net/core/sock.c:760) ? report_bug (lib/bug.c:201 lib/bug.c:219) ? handle_bug (arch/x86/kernel/traps.c:239) ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1)) ? asm_exc_invalid_op (./arch/x86/include/asm/idtentry.h:621) ? sk_mc_loop (net/core/sock.c:760) ip6_finish_output2 (net/ipv6/ip6_output.c:83 (discriminator 1)) ? nf_hook_slow (net/netfilter/core.c:626) ip6_finish_output (net/ipv6/ip6_output.c:222) ? __pfx_ip6_finish_output (net/ipv6/ip6_output.c:215) ipvlan_xmit_mode_l3 (drivers/net/ipvlan/ipvlan_core.c:602) ipvlan ipvlan_start_xmit (drivers/net/ipvlan/ipvlan_main.c:226) ipvlan dev_hard_start_xmit (net/core/dev.c:3594) sch_direct_xmit (net/sched/sch_generic.c:343) __qdisc_run (net/sched/sch_generic.c:416) net_tx_action (net/core/dev.c:5286) handle_softirqs (kernel/softirq.c:555) __irq_exit_rcu (kernel/softirq.c:589) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1043) The warning is triggered as follows: packet_sendmsg packet_snd //skb->sk is packet sk __dev_queue_xmit __dev_xmit_skb //q->enqueue is not NULL __qdisc_run sch_direct_xmit dev_hard_start_xmit ipvlan_start_xmit ipvlan_xmit_mode_l3 //l3 mode ipvlan_process_outbound //vepa flag ipvlan_process_v6_outbound ip6_local_out __ip6_finish_output ip6_finish_output2 //multicast packet sk_mc_loop //sk->sk_family is AF_PACKET Call ip{6}_local_out() with NULL sk in ipvlan, as other tunnels do, to fix this.
Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") Suggested-by: Eric Dumazet Signed-off-by: Yue Haibing Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240529095633.613103-1-yuehaibing@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ipvlan/ipvlan_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 2d5b021b4ea6..fef4eff7753a 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -439,7 +439,7 @@ static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - err = ip_local_out(net, skb->sk, skb); + err = ip_local_out(net, NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else @@ -494,7 +494,7 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb) memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - err = ip6_local_out(dev_net(dev), skb->sk, skb); + err = ip6_local_out(dev_net(dev), NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else From 13c7c941e72908b8cce5a84b45a7b5e485ca12ed Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 May 2024 09:35:47 -0700 Subject: [PATCH 208/279] netdev: add qstat for csum complete Recent commit 0cfe71f45f42 ("netdev: add queue stats") added a lot of useful stats, but only those immediately needed by virtio. Presumably virtio does not support CHECKSUM_COMPLETE, so statistic for that form of checksumming wasn't included. Other drivers will definitely need it, in fact we expect it to be needed in net-next soon (mlx5). So let's add the definition of the counter for CHECKSUM_COMPLETE to uAPI in net already, so that the counters are in a more natural order (all subsequent counters have not been present in any released kernel, yet). 
Signed-off-by: Jakub Kicinski Reviewed-by: Joe Damato Fixes: 0cfe71f45f42 ("netdev: add queue stats") Link: https://lore.kernel.org/r/20240529163547.3693194-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/netdev.yaml | 4 ++++ include/uapi/linux/netdev.h | 1 + tools/include/uapi/linux/netdev.h | 1 + 3 files changed, 6 insertions(+) diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 11a32373365a..959755be4d7f 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -349,6 +349,10 @@ attribute-sets: Number of packets dropped due to transient lack of resources, such as buffer space, host descriptors etc. type: uint + - + name: rx-csum-complete + doc: Number of packets that were marked as CHECKSUM_COMPLETE. + type: uint - name: rx-csum-unnecessary doc: Number of packets that were marked as CHECKSUM_UNNECESSARY. diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index a8188202413e..43742ac5b00d 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -148,6 +148,7 @@ enum { NETDEV_A_QSTATS_RX_ALLOC_FAIL, NETDEV_A_QSTATS_RX_HW_DROPS, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, + NETDEV_A_QSTATS_RX_CSUM_COMPLETE, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, NETDEV_A_QSTATS_RX_CSUM_NONE, NETDEV_A_QSTATS_RX_CSUM_BAD, diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index a8188202413e..43742ac5b00d 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -148,6 +148,7 @@ enum { NETDEV_A_QSTATS_RX_ALLOC_FAIL, NETDEV_A_QSTATS_RX_HW_DROPS, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, + NETDEV_A_QSTATS_RX_CSUM_COMPLETE, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, NETDEV_A_QSTATS_RX_CSUM_NONE, NETDEV_A_QSTATS_RX_CSUM_BAD, From 12870ae3818e39ea65bf710f645972277b634f72 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Fri, 24 May 2024 14:29:54 -0500 Subject: [PATCH 209/279] 
powerpc/pseries/lparcfg: drop error message from guest name lookup It's not an error or exceptional situation when the hosting environment does not expose a name for the LP/guest via RTAS or the device tree. This happens with qemu when run without the '-name' option. The message also lacks a newline. Remove it. Signed-off-by: Nathan Lynch Fixes: eddaa9a40275 ("powerpc/pseries: read the lpar name from the firmware") Signed-off-by: Michael Ellerman Link: https://msgid.link/20240524-lparcfg-updates-v2-1-62e2e9d28724@linux.ibm.com --- arch/powerpc/platforms/pseries/lparcfg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 6e7029640c0c..62da20f9700a 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -371,8 +371,8 @@ static int read_dt_lpar_name(struct seq_file *m) static void read_lpar_name(struct seq_file *m) { - if (read_rtas_lpar_name(m) && read_dt_lpar_name(m)) - pr_err_once("Error can't get the LPAR name"); + if (read_rtas_lpar_name(m)) + read_dt_lpar_name(m); } #define SPLPAR_MAXLENGTH 1026*(sizeof(char)) From 2d43cc701b96f910f50915ac4c2a0cae5deb734c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 29 May 2024 22:30:28 +1000 Subject: [PATCH 210/279] powerpc/uaccess: Fix build errors seen with GCC 13/14 Building ppc64le_defconfig with GCC 14 fails with assembler errors: CC fs/readdir.o /tmp/ccdQn0mD.s: Assembler messages: /tmp/ccdQn0mD.s:212: Error: operand out of domain (18 is not a multiple of 4) /tmp/ccdQn0mD.s:226: Error: operand out of domain (18 is not a multiple of 4) ... 
[6 lines] /tmp/ccdQn0mD.s:1699: Error: operand out of domain (18 is not a multiple of 4) A snippet of the asm shows: # ../fs/readdir.c:210: unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); ld 9,0(29) # MEM[(u64 *)name_38(D) + _88 * 1], MEM[(u64 *)name_38(D) + _88 * 1] # 210 "../fs/readdir.c" 1 1: std 9,18(8) # put_user # *__pus_addr_52, MEM[(u64 *)name_38(D) + _88 * 1] The 'std' instruction requires a 4-byte aligned displacement because it is a DS-form instruction, and as the assembler says, 18 is not a multiple of 4. A similar error is seen with GCC 13 and CONFIG_UBSAN_SIGNED_WRAP=y. The fix is to change the constraint on the memory operand to put_user(), from "m" which is a general memory reference to "YZ". The "Z" constraint is documented in the GCC manual PowerPC machine constraints, and specifies a "memory operand accessed with indexed or indirect addressing". "Y" is not documented in the manual but specifies a "memory operand for a DS-form instruction". Using both allows the compiler to generate a DS-form "std" or X-form "stdx" as appropriate. The change has to be conditional on CONFIG_PPC_KERNEL_PREFIXED because the "Y" constraint does not guarantee 4-byte alignment when prefixed instructions are enabled. Unfortunately clang doesn't support the "Y" constraint so that has to be behind an ifdef. Although the build error is only seen with GCC 13/14, that appears to just be luck. The constraint has been incorrect since it was first added. 
Fixes: c20beffeec3c ("powerpc/uaccess: Use flexible addressing with __put_user()/__get_user()") Cc: stable@vger.kernel.org # v5.10+ Suggested-by: Kewen Lin Signed-off-by: Michael Ellerman Link: https://msgid.link/20240529123029.146953-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/uaccess.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index de10437fd206..4cba724c8899 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -92,9 +92,25 @@ __pu_failed: \ : label) #endif +#ifdef CONFIG_CC_IS_CLANG +#define DS_FORM_CONSTRAINT "Z<>" +#else +#define DS_FORM_CONSTRAINT "YZ<>" +#endif + #ifdef __powerpc64__ +#ifdef CONFIG_PPC_KERNEL_PREFIXED #define __put_user_asm2_goto(x, ptr, label) \ __put_user_asm_goto(x, ptr, label, "std") +#else +#define __put_user_asm2_goto(x, addr, label) \ + asm goto ("1: std%U1%X1 %0,%1 # put_user\n" \ + EX_TABLE(1b, %l2) \ + : \ + : "r" (x), DS_FORM_CONSTRAINT (*addr) \ + : \ + : label) +#endif // CONFIG_PPC_KERNEL_PREFIXED #else /* __powerpc64__ */ #define __put_user_asm2_goto(x, addr, label) \ asm goto( \ From 50934945d54238d2d6d8db4b7c1d4c90d2696c57 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 29 May 2024 22:30:29 +1000 Subject: [PATCH 211/279] powerpc/uaccess: Use YZ asm constraint for ld The 'ld' instruction requires a 4-byte aligned displacement because it is a DS-form instruction. But the "m" asm constraint doesn't enforce that. Add a special case of __get_user_asm2_goto() so that the "YZ" constraint can be used for "ld". The "Z" constraint is documented in the GCC manual PowerPC machine constraints, and specifies a "memory operand accessed with indexed or indirect addressing". "Y" is not documented in the manual but specifies a "memory operand for a DS-form instruction". Using both allows the compiler to generate a DS-form "ld" or X-form "ldx" as appropriate. 
The change has to be conditional on CONFIG_PPC_KERNEL_PREFIXED because the "Y" constraint does not guarantee 4-byte alignment when prefixed instructions are enabled. No build errors have been reported due to this, but the possibility is there depending on compiler code generation decisions. Fixes: c20beffeec3c ("powerpc/uaccess: Use flexible addressing with __put_user()/__get_user()") Signed-off-by: Michael Ellerman Link: https://msgid.link/20240529123029.146953-2-mpe@ellerman.id.au --- arch/powerpc/include/asm/uaccess.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 4cba724c8899..fd594bf6c6a9 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -181,8 +181,19 @@ do { \ #endif #ifdef __powerpc64__ +#ifdef CONFIG_PPC_KERNEL_PREFIXED #define __get_user_asm2_goto(x, addr, label) \ __get_user_asm_goto(x, addr, label, "ld") +#else +#define __get_user_asm2_goto(x, addr, label) \ + asm_goto_output( \ + "1: ld%U1%X1 %0, %1 # get_user\n" \ + EX_TABLE(1b, %l2) \ + : "=r" (x) \ + : DS_FORM_CONSTRAINT (*addr) \ + : \ + : label) +#endif // CONFIG_PPC_KERNEL_PREFIXED #else /* __powerpc64__ */ #define __get_user_asm2_goto(x, addr, label) \ asm_goto_output( \ From be2fc65d66e0406cc9d39d40becaecdf4ee765f3 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 29 May 2024 09:28:50 -0700 Subject: [PATCH 212/279] powerpc: Limit ARCH_HAS_KERNEL_FPU_SUPPORT to PPC64 When building a 32-bit kernel, some toolchains do not allow mixing soft float and hard float object files: LD vmlinux.o powerpc64le-unknown-linux-musl-ld: lib/test_fpu_impl.o uses hard float, arch/powerpc/kernel/udbg.o uses soft float powerpc64le-unknown-linux-musl-ld: failed to merge target specific data of file lib/test_fpu_impl.o make[2]: *** [scripts/Makefile.vmlinux_o:62: vmlinux.o] Error 1 make[1]: *** [Makefile:1152: vmlinux_o] Error 2 make: *** [Makefile:240: __sub-make] Error 2 This 
is not an issue when building a 64-bit kernel. To unbreak the build, limit ARCH_HAS_KERNEL_FPU_SUPPORT to 64-bit kernels. This is okay because the only real user of this option, amdgpu, was previously limited to PPC64 anyway; see commit a28e4b672f04 ("drm/amd/display: use ARCH_HAS_KERNEL_FPU_SUPPORT"). Fixes: 01db473e1aa3 ("powerpc: implement ARCH_HAS_KERNEL_FPU_SUPPORT") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405250851.Z4daYSWG-lkp@intel.com/ Reported-by: Guenter Roeck Closes: https://lore.kernel.org/lkml/eeffaec3-df63-4e55-ab7a-064a65c00efa@roeck-us.net/ Signed-off-by: Samuel Holland Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://msgid.link/20240529162852.1209-1-samuel.holland@sifive.com --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3c968f2f4ac4..c88c6d46a5bc 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,7 +137,7 @@ config PPC select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_HUGEPD if HUGETLB_PAGE select ARCH_HAS_KCOV - select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC_FPU + select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC64 && PPC_FPU select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEMREMAP_COMPAT_ALIGN if PPC_64S_HASH_MMU From 0e5895ff7fab0fc05ec17daf9a568368828fa6ea Mon Sep 17 00:00:00 2001 From: Gerald Loacker Date: Wed, 29 May 2024 16:42:45 +0200 Subject: [PATCH 213/279] drm/panel: sitronix-st7789v: fix timing for jt240mhqs_hwt_ek_e3 panel Flickering was observed when using partial mode. Moving the vsync to the same position as used by the default sitronix-st7789v timing resolves this issue. 
Fixes: 0fbbe96bfa08 ("drm/panel: sitronix-st7789v: add jasonic jt240mhqs-hwt-ek-e3 support") Acked-by: Jessica Zhang Signed-off-by: Gerald Loacker Link: https://lore.kernel.org/r/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-1-e4821802443d@wolfvision.net Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-1-e4821802443d@wolfvision.net --- drivers/gpu/drm/panel/panel-sitronix-st7789v.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c index 88e80fe98112..32e5c0348038 100644 --- a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c +++ b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c @@ -286,9 +286,9 @@ static const struct drm_display_mode jt240mhqs_hwt_ek_e3_mode = { .hsync_end = 240 + 28 + 10, .htotal = 240 + 28 + 10 + 10, .vdisplay = 280, - .vsync_start = 280 + 8, - .vsync_end = 280 + 8 + 4, - .vtotal = 280 + 8 + 4 + 4, + .vsync_start = 280 + 48, + .vsync_end = 280 + 48 + 4, + .vtotal = 280 + 48 + 4 + 4, .width_mm = 43, .height_mm = 37, .flags = DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC, From 2ba50582634d0bfe3a333ab7575a7f0122a7cde8 Mon Sep 17 00:00:00 2001 From: Gerald Loacker Date: Wed, 29 May 2024 16:42:46 +0200 Subject: [PATCH 214/279] drm/panel: sitronix-st7789v: tweak timing for jt240mhqs_hwt_ek_e3 panel Use the default timing parameters to get a refresh rate of about 60 Hz for a clock of 6 MHz. 
Fixes: 0fbbe96bfa08 ("drm/panel: sitronix-st7789v: add jasonic jt240mhqs-hwt-ek-e3 support") Signed-off-by: Gerald Loacker Acked-by: Jessica Zhang Link: https://lore.kernel.org/r/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-2-e4821802443d@wolfvision.net Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-2-e4821802443d@wolfvision.net --- drivers/gpu/drm/panel/panel-sitronix-st7789v.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c index 32e5c0348038..c7e3f1280404 100644 --- a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c +++ b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c @@ -282,9 +282,9 @@ static const struct drm_display_mode et028013dma_mode = { static const struct drm_display_mode jt240mhqs_hwt_ek_e3_mode = { .clock = 6000, .hdisplay = 240, - .hsync_start = 240 + 28, - .hsync_end = 240 + 28 + 10, - .htotal = 240 + 28 + 10 + 10, + .hsync_start = 240 + 38, + .hsync_end = 240 + 38 + 10, + .htotal = 240 + 38 + 10 + 10, .vdisplay = 280, .vsync_start = 280 + 48, .vsync_end = 280 + 48 + 4, From b62c150c3bae72ac1910dcc588f360159eb0744a Mon Sep 17 00:00:00 2001 From: Gerald Loacker Date: Wed, 29 May 2024 16:42:47 +0200 Subject: [PATCH 215/279] drm/panel: sitronix-st7789v: fix display size for jt240mhqs_hwt_ek_e3 panel This is a portrait mode display. Change the dimensions accordingly. 
Fixes: 0fbbe96bfa08 ("drm/panel: sitronix-st7789v: add jasonic jt240mhqs-hwt-ek-e3 support") Signed-off-by: Gerald Loacker Acked-by: Jessica Zhang Link: https://lore.kernel.org/r/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-3-e4821802443d@wolfvision.net Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240409-bugfix-jt240mhqs_hwt_ek_e3-timing-v2-3-e4821802443d@wolfvision.net --- drivers/gpu/drm/panel/panel-sitronix-st7789v.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c index c7e3f1280404..e8f385b9c618 100644 --- a/drivers/gpu/drm/panel/panel-sitronix-st7789v.c +++ b/drivers/gpu/drm/panel/panel-sitronix-st7789v.c @@ -289,8 +289,8 @@ static const struct drm_display_mode jt240mhqs_hwt_ek_e3_mode = { .vsync_start = 280 + 48, .vsync_end = 280 + 48 + 4, .vtotal = 280 + 48 + 4 + 4, - .width_mm = 43, - .height_mm = 37, + .width_mm = 37, + .height_mm = 43, .flags = DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC, }; From 34bf6bae3286a58762711cfbce2cf74ecd42e1b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2024 22:21:31 +0200 Subject: [PATCH 216/279] x86/topology/amd: Evaluate SMT in CPUID leaf 0x8000001e only on family 0x17 and greater The new AMD/HYGON topology parser evaluates the SMT information in CPUID leaf 0x8000001e unconditionally while the original code restricted it to CPUs with family 0x17 and greater. This breaks family 0x15 CPUs which advertise that leaf and have a non-zero value in the SMT section. The machine boots, but the scheduler complains loudly about the mismatch of the core IDs: WARNING: CPU: 1 PID: 0 at kernel/sched/core.c:6482 sched_cpu_starting+0x183/0x250 WARNING: CPU: 0 PID: 1 at kernel/sched/topology.c:2408 build_sched_domains+0x76b/0x12b0 Add the condition back to cure it. [ bp: Make it actually build because grandpa is not concerned with trivial stuff. 
:-P ] Fixes: f7fb3b2dd92c ("x86/cpu: Provide an AMD/HYGON specific topology parser") Closes: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/56 Reported-by: Tim Teichmann Reported-by: Christian Heusel Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Tim Teichmann Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/7skhx6mwe4hxiul64v6azhlxnokheorksqsdbp7qw6g2jduf6c@7b5pvomauugk --- arch/x86/kernel/cpu/topology_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index d419deed6a48..7d476fa697ca 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -84,9 +84,9 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) /* * If leaf 0xb is available, then the domain shifts are set - * already and nothing to do here. + * already and nothing to do here. Only valid for family >= 0x17. */ - if (!has_topoext) { + if (!has_topoext && tscan->c->x86 >= 0x17) { /* * Leaf 0x80000008 set the CORE domain shift already. * Update the SMT domain, but do not propagate it. From e112311615a24e1618a591c73506571dc304eb8d Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 30 May 2024 07:23:39 -0700 Subject: [PATCH 217/279] io_uring/rw: Free iovec before cleaning async data kmemleak shows that there is a memory leak in io_uring read operation, where a buffer is allocated at iovec import, but never de-allocated. The memory is allocated at io_async_rw->free_iovec, but, then io_async_rw is kfreed, taking the allocated memory with it. I saw this happening when the read operation fails with -11 (EAGAIN). This is the kmemleak splat. unreferenced object 0xffff8881da591c00 (size 256): ... 
backtrace (crc 7a15bdee): [<00000000256f2de4>] __kmalloc+0x2d6/0x410 [<000000007a9f5fc7>] iovec_from_user.part.0+0xc6/0x160 [<00000000cecdf83a>] __import_iovec+0x50/0x220 [<00000000d1d586a2>] __io_import_iovec+0x13d/0x220 [<0000000054ee9bd2>] io_prep_rw+0x186/0x340 [<00000000a9c0372d>] io_prep_rwv+0x31/0x120 [<000000001d1170b9>] io_prep_readv+0xe/0x30 [<0000000070b8eb67>] io_submit_sqes+0x1bd/0x780 [<00000000812496d4>] __do_sys_io_uring_enter+0x3ed/0x5b0 [<0000000081499602>] do_syscall_64+0x5d/0x170 [<00000000de1c5a4d>] entry_SYSCALL_64_after_hwframe+0x76/0x7e This occurs because the async data cleanup functions are not set for read/write operations. As a result, the potentially allocated iovec in the rw async data is not freed before the async data is released, leading to a memory leak. With this following patch, kmemleak does not show the leaked memory anymore, and all liburing tests pass. Fixes: a9165b83c193 ("io_uring/rw: always setup io_async_rw for read/write requests") Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240530142340.1248216-1-leitao@debian.org Signed-off-by: Jens Axboe --- io_uring/opdef.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 2de5cca9504e..2e3b7b16effb 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -516,10 +516,12 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_READ_FIXED] = { .name = "READ_FIXED", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { .name = "WRITE_FIXED", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_POLL_ADD] = { @@ -582,10 +584,12 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_READ] = { .name = "READ", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE] = { .name = "WRITE", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_FADVISE] = { @@ -692,6 +696,7 @@ const struct io_cold_def io_cold_defs[] = { }, 
[IORING_OP_READ_MULTISHOT] = { .name = "READ_MULTISHOT", + .cleanup = io_readv_writev_cleanup, }, [IORING_OP_WAITID] = { .name = "WAITID", From 2a38e4ca302280fdcce370ba2bee79bac16c4587 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 17 May 2024 13:05:34 -0700 Subject: [PATCH 218/279] x86/cpu: Provide default cache line size if not enumerated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tl;dr: CPUs with CPUID.80000008H but without CPUID.01H:EDX[CLFSH] will end up reporting cache_line_size()==0 and bad things happen. Fill in a default on those to avoid the problem. Long Story: The kernel dies a horrible death if c->x86_cache_alignment (aka. cache_line_size()) is 0. Normally, this value is populated from c->x86_clflush_size. Right now the code is set up to get c->x86_clflush_size from two places. First, modern CPUs get it from CPUID. Old CPUs that don't have leaf 0x80000008 (or CPUID at all) just get some sane defaults from the kernel in get_cpu_address_sizes(). The vast majority of CPUs that have leaf 0x80000008 also get ->x86_clflush_size from CPUID. But there are oddballs. Intel Quark CPUs[1] and others[2] have leaf 0x80000008 but don't set CPUID.01H:EDX[CLFSH], so they skip over filling in ->x86_clflush_size: cpuid(0x00000001, &tfms, &misc, &junk, &cap0); if (cap0 & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; So they land in get_cpu_address_sizes() and see that CPUID has level 0x80000008 and jump into the side of the if() that does not fill in c->x86_clflush_size. That assigns a 0 to c->x86_cache_alignment, and hilarity ensues in code like: buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); To fix this, always provide a sane value for ->x86_clflush_size. Big thanks to Andy Shevchenko for finding and reporting this and also providing a first pass at a fix. But his fix was only partial and only worked on the Quark CPUs. It would not, for instance, have worked on the QEMU config. 1.
https://raw.githubusercontent.com/InstLatx64/InstLatx64/master/GenuineIntel/GenuineIntel0000590_Clanton_03_CPUID.txt 2. You can also get this behavior if you use "-cpu 486,+clzero" in QEMU. [ dhansen: remove 'vp_bits_from_cpuid' reference in changelog because bpetkov brutally murdered it recently. ] Fixes: fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach") Reported-by: Andy Shevchenko Signed-off-by: Dave Hansen Tested-by: Andy Shevchenko Tested-by: Jörn Heusipp Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240516173928.3960193-1-andriy.shevchenko@linux.intel.com/ Link: https://lore.kernel.org/lkml/5e31cad3-ad4d-493e-ab07-724cfbfaba44@heusipp.de/ Link: https://lore.kernel.org/all/20240517200534.8EC5F33E%40davehans-spike.ostc.intel.com --- arch/x86/kernel/cpu/common.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2b170da84f97..e31293c9609f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1075,6 +1075,10 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c) c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + + /* Provide a sane default if not enumerated: */ + if (!c->x86_clflush_size) + c->x86_clflush_size = 32; } c->x86_cache_bits = c->x86_phys_bits; From 027a44fedd55fbdf1d45603894634acd960ad04b Mon Sep 17 00:00:00 2001 From: Peter Colberg Date: Tue, 21 May 2024 14:12:46 -0400 Subject: [PATCH 219/279] hwmon: (intel-m10-bmc-hwmon) Fix multiplier for N6000 board power sensor The Intel N6000 BMC outputs the board power value in milliwatt, whereas the hwmon sysfs interface must provide power values in microwatt. 
Fixes: e1983220ae14 ("hwmon: intel-m10-bmc-hwmon: Add N6000 sensors") Signed-off-by: Peter Colberg Reviewed-by: Matthew Gerlach Link: https://lore.kernel.org/r/20240521181246.683833-1-peter.colberg@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/intel-m10-bmc-hwmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/intel-m10-bmc-hwmon.c b/drivers/hwmon/intel-m10-bmc-hwmon.c index 6500ca548f9c..ca2dff158925 100644 --- a/drivers/hwmon/intel-m10-bmc-hwmon.c +++ b/drivers/hwmon/intel-m10-bmc-hwmon.c @@ -429,7 +429,7 @@ static const struct m10bmc_sdata n6000bmc_curr_tbl[] = { }; static const struct m10bmc_sdata n6000bmc_power_tbl[] = { - { 0x724, 0x0, 0x0, 0x0, 0x0, 1, "Board Power" }, + { 0x724, 0x0, 0x0, 0x0, 0x0, 1000, "Board Power" }, }; static const struct hwmon_channel_info * const n6000bmc_hinfo[] = { From 52a2c70c3ec555e670a34dd1ab958986451d2dd2 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 30 May 2024 08:20:14 -0700 Subject: [PATCH 220/279] hwmon: (shtc1) Fix property misspelling The property name is "sensirion,low-precision", not "sensicon,low-precision". 
Cc: Chris Ruehl Fixes: be7373b60df5 ("hwmon: shtc1: add support for device tree bindings") Signed-off-by: Guenter Roeck --- drivers/hwmon/shtc1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/shtc1.c b/drivers/hwmon/shtc1.c index 1f96e94967ee..439dd3dba5fc 100644 --- a/drivers/hwmon/shtc1.c +++ b/drivers/hwmon/shtc1.c @@ -238,7 +238,7 @@ static int shtc1_probe(struct i2c_client *client) if (np) { data->setup.blocking_io = of_property_read_bool(np, "sensirion,blocking-io"); - data->setup.high_precision = !of_property_read_bool(np, "sensicon,low-precision"); + data->setup.high_precision = !of_property_read_bool(np, "sensirion,low-precision"); } else { if (client->dev.platform_data) data->setup = *(struct shtc1_platform_data *)dev->platform_data; From a638b0461b58aa3205cd9d5f14d6f703d795b4af Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Thu, 23 May 2024 11:43:23 +0300 Subject: [PATCH 221/279] riscv: prevent pt_regs corruption for secondary idle threads Top of the kernel thread stack should be reserved for pt_regs. However this is not the case for the idle threads of the secondary boot harts. Their stacks overlap with their pt_regs, so both may get corrupted. Similar issue has been fixed for the primary hart, see c7cdd96eca28 ("riscv: prevent stack corruption by reserving task_pt_regs(p) early"). However that fix was not propagated to the secondary harts. The problem has been noticed in some CPU hotplug tests with V enabled. The function smp_callin stored several registers on stack, corrupting top of pt_regs structure including status field. As a result, kernel attempted to save or restore inexistent V context. 
Fixes: 9a2451f18663 ("RISC-V: Avoid using per cpu array for ordered booting") Fixes: 2875fe056156 ("RISC-V: Add cpu_ops and modify default booting method") Signed-off-by: Sergey Matyukevich Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240523084327.2013211-1-geomatsi@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu_ops_sbi.c | 2 +- arch/riscv/kernel/cpu_ops_spinwait.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c index 1cc7df740edd..e6fbaaf54956 100644 --- a/arch/riscv/kernel/cpu_ops_sbi.c +++ b/arch/riscv/kernel/cpu_ops_sbi.c @@ -72,7 +72,7 @@ static int sbi_cpu_start(unsigned int cpuid, struct task_struct *tidle) /* Make sure tidle is updated */ smp_mb(); bdata->task_ptr = tidle; - bdata->stack_ptr = task_stack_page(tidle) + THREAD_SIZE; + bdata->stack_ptr = task_pt_regs(tidle); /* Make sure boot data is updated */ smp_mb(); hsm_data = __pa(bdata); diff --git a/arch/riscv/kernel/cpu_ops_spinwait.c b/arch/riscv/kernel/cpu_ops_spinwait.c index 613872b0a21a..24869eb88908 100644 --- a/arch/riscv/kernel/cpu_ops_spinwait.c +++ b/arch/riscv/kernel/cpu_ops_spinwait.c @@ -34,8 +34,7 @@ static void cpu_update_secondary_bootdata(unsigned int cpuid, /* Make sure tidle is updated */ smp_mb(); - WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid], - task_stack_page(tidle) + THREAD_SIZE); + WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid], task_pt_regs(tidle)); WRITE_ONCE(__cpu_spinwait_task_pointer[hartid], tidle); } From 7bed51617401dab2be930b13ed5aacf581f7c8ef Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 26 May 2024 13:01:04 +0200 Subject: [PATCH 222/279] riscv: enable HAVE_ARCH_HUGE_VMAP for XIP kernel HAVE_ARCH_HUGE_VMAP also works on XIP kernel, so remove its dependency on !XIP_KERNEL. This also fixes a boot problem for XIP kernel introduced by the commit in "Fixes:". 
This commit used huge page mapping for vmemmap, but huge page vmap was not enabled for XIP kernel. Fixes: ff172d4818ad ("riscv: Use hugepage mappings for vmemmap") Signed-off-by: Nam Cao Cc: Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240526110104.470429-1-namcao@linutronix.de Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index b94176e25be1..0525ee2d63c7 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -106,7 +106,7 @@ config RISCV select HAS_IOPORT if MMU select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP - select HAVE_ARCH_HUGE_VMAP if MMU && 64BIT && !XIP_KERNEL + select HAVE_ARCH_HUGE_VMAP if MMU && 64BIT select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_JUMP_LABEL_RELATIVE if !XIP_KERNEL select HAVE_ARCH_KASAN if MMU && 64BIT From 982a7eb97be685d1129c06671aed4c26d6919af4 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 24 May 2024 11:56:00 -0700 Subject: [PATCH 223/279] Documentation: RISC-V: uabi: Only scalar misaligned loads are supported We're stuck supporting scalar misaligned loads in userspace because they were part of the ISA at the time we froze the uABI. That wasn't the case for vector misaligned accesses, so depending on them unconditionally is a userspace bug. All extant vector hardware traps on these misaligned accesses. Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240524185600.5919-1-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/arch/riscv/uabi.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/arch/riscv/uabi.rst b/Documentation/arch/riscv/uabi.rst index 54d199dce78b..2b420bab0527 100644 --- a/Documentation/arch/riscv/uabi.rst +++ b/Documentation/arch/riscv/uabi.rst @@ -65,4 +65,6 @@ the extension, or may have deliberately removed it from the listing. 
Misaligned accesses ------------------- -Misaligned accesses are supported in userspace, but they may perform poorly. +Misaligned scalar accesses are supported in userspace, but they may perform +poorly. Misaligned vector accesses are only supported if the Zicclsm extension +is supported. From 1d84afaf02524d2558e8ca3ca169be2ef720380b Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 30 May 2024 16:55:46 +0200 Subject: [PATCH 224/279] riscv: Fix fully ordered LR/SC xchg[8|16]() implementations The fully ordered versions of xchg[8|16]() using LR/SC lack the necessary memory barriers to guarantee the order. Fix this by matching what is already implemented in the fully ordered versions of cmpxchg() using LR/SC. Suggested-by: Andrea Parri Reported-by: Andrea Parri Closes: https://lore.kernel.org/linux-riscv/ZlYbupL5XgzgA0MX@andrea/T/#u Fixes: a8ed2b7a2c13 ("riscv/cmpxchg: Implement xchg for variables of size 1 and 2") Signed-off-by: Alexandre Ghiti Reviewed-by: Andrea Parri Link: https://lore.kernel.org/r/20240530145546.394248-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cmpxchg.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index ddb002ed89de..808b4c78462e 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -10,7 +10,7 @@ #include -#define __arch_xchg_masked(prepend, append, r, p, n) \ +#define __arch_xchg_masked(sc_sfx, prepend, append, r, p, n) \ ({ \ u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3); \ ulong __s = ((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE; \ @@ -25,7 +25,7 @@ "0: lr.w %0, %2\n" \ " and %1, %0, %z4\n" \ " or %1, %1, %z3\n" \ - " sc.w %1, %1, %2\n" \ + " sc.w" sc_sfx " %1, %1, %2\n" \ " bnez %1, 0b\n" \ append \ : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \ @@ -46,7 +46,8 @@ : "memory"); \ }) -#define _arch_xchg(ptr, new, sfx, prepend, append) \ 
+#define _arch_xchg(ptr, new, sc_sfx, swap_sfx, prepend, \ + sc_append, swap_append) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ __typeof__(*(__ptr)) __new = (new); \ @@ -55,15 +56,15 @@ switch (sizeof(*__ptr)) { \ case 1: \ case 2: \ - __arch_xchg_masked(prepend, append, \ + __arch_xchg_masked(sc_sfx, prepend, sc_append, \ __ret, __ptr, __new); \ break; \ case 4: \ - __arch_xchg(".w" sfx, prepend, append, \ + __arch_xchg(".w" swap_sfx, prepend, swap_append, \ __ret, __ptr, __new); \ break; \ case 8: \ - __arch_xchg(".d" sfx, prepend, append, \ + __arch_xchg(".d" swap_sfx, prepend, swap_append, \ __ret, __ptr, __new); \ break; \ default: \ @@ -73,16 +74,17 @@ }) #define arch_xchg_relaxed(ptr, x) \ - _arch_xchg(ptr, x, "", "", "") + _arch_xchg(ptr, x, "", "", "", "", "") #define arch_xchg_acquire(ptr, x) \ - _arch_xchg(ptr, x, "", "", RISCV_ACQUIRE_BARRIER) + _arch_xchg(ptr, x, "", "", "", \ + RISCV_ACQUIRE_BARRIER, RISCV_ACQUIRE_BARRIER) #define arch_xchg_release(ptr, x) \ - _arch_xchg(ptr, x, "", RISCV_RELEASE_BARRIER, "") + _arch_xchg(ptr, x, "", "", RISCV_RELEASE_BARRIER, "", "") #define arch_xchg(ptr, x) \ - _arch_xchg(ptr, x, ".aqrl", "", "") + _arch_xchg(ptr, x, ".rl", ".aqrl", "", RISCV_FULL_BARRIER, "") #define xchg32(ptr, x) \ ({ \ From bb195358806847217efba98de62b7decec3b371f Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Tue, 7 May 2024 16:04:40 -0700 Subject: [PATCH 225/279] drm/msm: remove python 3.9 dependency for compiling msm Since commit 5acf49119630 ("drm/msm: import gen_header.py script from Mesa"), compilation is broken on machines having python versions older than 3.9 due to dependency on argparse.BooleanOptionalAction. Switch to use simple bool for the validate flag to remove the dependency. 
Fixes: 5acf49119630 ("drm/msm: import gen_header.py script from Mesa") Signed-off-by: Abhinav Kumar Tested-by: Douglas Anderson Reviewed-by: Dmitry Baryshkov Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20240507230440.3384949-1-quic_abhinavk@quicinc.com --- drivers/gpu/drm/msm/registers/gen_header.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/registers/gen_header.py b/drivers/gpu/drm/msm/registers/gen_header.py index fc3bfdc991d2..3926485bb197 100644 --- a/drivers/gpu/drm/msm/registers/gen_header.py +++ b/drivers/gpu/drm/msm/registers/gen_header.py @@ -538,7 +538,7 @@ class Parser(object): self.variants.add(reg.domain) def do_validate(self, schemafile): - if self.validate == False: + if not self.validate: return try: @@ -948,7 +948,8 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument('--rnn', type=str, required=True) parser.add_argument('--xml', type=str, required=True) - parser.add_argument('--validate', action=argparse.BooleanOptionalAction) + parser.add_argument('--validate', default=False, action='store_true') + parser.add_argument('--no-validate', dest='validate', action='store_false') subparsers = parser.add_subparsers() subparsers.required = True From 18414a4a2eabb0281d12d374c92874327e0e3fe3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 May 2024 13:35:50 -0600 Subject: [PATCH 226/279] io_uring/net: assign kmsg inq/flags before buffer selection syzbot reports that recv is using an uninitialized value: ===================================================== BUG: KMSAN: uninit-value in io_req_cqe_overflow io_uring/io_uring.c:810 [inline] BUG: KMSAN: uninit-value in io_req_complete_post io_uring/io_uring.c:937 [inline] BUG: KMSAN: uninit-value in io_issue_sqe+0x1f1b/0x22c0 io_uring/io_uring.c:1763 io_req_cqe_overflow io_uring/io_uring.c:810 [inline] io_req_complete_post io_uring/io_uring.c:937 [inline] io_issue_sqe+0x1f1b/0x22c0 
io_uring/io_uring.c:1763 io_wq_submit_work+0xa17/0xeb0 io_uring/io_uring.c:1860 io_worker_handle_work+0xc04/0x2000 io_uring/io-wq.c:597 io_wq_worker+0x447/0x1410 io_uring/io-wq.c:651 ret_from_fork+0x6d/0x90 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Uninit was stored to memory at: io_req_set_res io_uring/io_uring.h:215 [inline] io_recv_finish+0xf10/0x1560 io_uring/net.c:861 io_recv+0x12ec/0x1ea0 io_uring/net.c:1175 io_issue_sqe+0x429/0x22c0 io_uring/io_uring.c:1751 io_wq_submit_work+0xa17/0xeb0 io_uring/io_uring.c:1860 io_worker_handle_work+0xc04/0x2000 io_uring/io-wq.c:597 io_wq_worker+0x447/0x1410 io_uring/io-wq.c:651 ret_from_fork+0x6d/0x90 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Uninit was created at: slab_post_alloc_hook mm/slub.c:3877 [inline] slab_alloc_node mm/slub.c:3918 [inline] __do_kmalloc_node mm/slub.c:4038 [inline] __kmalloc+0x6e4/0x1060 mm/slub.c:4052 kmalloc include/linux/slab.h:632 [inline] io_alloc_async_data+0xc0/0x220 io_uring/io_uring.c:1662 io_msg_alloc_async io_uring/net.c:166 [inline] io_recvmsg_prep_setup io_uring/net.c:725 [inline] io_recvmsg_prep+0xbe8/0x1a20 io_uring/net.c:806 io_init_req io_uring/io_uring.c:2135 [inline] io_submit_sqe io_uring/io_uring.c:2182 [inline] io_submit_sqes+0x1135/0x2f10 io_uring/io_uring.c:2335 __do_sys_io_uring_enter io_uring/io_uring.c:3246 [inline] __se_sys_io_uring_enter+0x40f/0x3c80 io_uring/io_uring.c:3183 __x64_sys_io_uring_enter+0x11f/0x1a0 io_uring/io_uring.c:3183 x64_sys_call+0x2c0/0x3b50 arch/x86/include/generated/asm/syscalls_64.h:427 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f which appears to be io_recv_finish() reading kmsg->msg.msg_inq to decide if it needs to set IORING_CQE_F_SOCK_NONEMPTY or not. 
If the recv is entered with buffer selection, but no buffer is available, then we jump to the error path which calls io_recv_finish() without having assigned kmsg->msg.msg_inq. This might cause an errant setting of the NONEMPTY flag for a request that gets errored with -ENOBUFS. Reported-by: syzbot+b1647099e82b3b349fbf@syzkaller.appspotmail.com Fixes: 4a3223f7bfda ("io_uring/net: switch io_recv() to using io_async_msghdr") Signed-off-by: Jens Axboe --- io_uring/net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 0a48596429d9..7c98c4d50946 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1127,6 +1127,9 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_multishot: + kmsg->msg.msg_inq = -1; + kmsg->msg.msg_flags = 0; + if (io_do_buffer_select(req)) { ret = io_recv_buf_select(req, kmsg, &len, issue_flags); if (unlikely(ret)) @@ -1134,9 +1137,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) sr->buf = NULL; } - kmsg->msg.msg_inq = -1; - kmsg->msg.msg_flags = 0; - if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); From b164316808ec5de391c3e7b0148ec937d32d280d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 May 2024 14:40:32 +0900 Subject: [PATCH 227/279] null_blk: Do not allow runt zone with zone capacity smaller then zone size A zoned device with a smaller last zone together with a zone capacity smaller than the zone size does not make any sense as that does not correspond to any possible setup for a real device: 1) For ZNS and zoned UFS devices, all zones are always the same size. 2) For SMR HDDs, all zones always have the same capacity. In other words, if we have a smaller last runt zone, then this zone capacity should always be equal to the zone size. Add a check in null_init_zoned_dev() to prevent a configuration from having both a smaller last zone and a zone capacity smaller than the zone size.
Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240530054035.491497-2-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/zoned.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 79c8e5e99f7f..f118d304f310 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -74,6 +74,17 @@ int null_init_zoned_dev(struct nullb_device *dev, return -EINVAL; } + /* + * If a smaller zone capacity was requested, do not allow a smaller last + * zone at the same time as such zone configuration does not correspond + * to any real zoned device. + */ + if (dev->zone_capacity != dev->zone_size && + dev->size & (dev->zone_size - 1)) { + pr_err("A smaller last zone is not allowed with zone capacity smaller than zone size.\n"); + return -EINVAL; + } + zone_capacity_sects = mb_to_sects(dev->zone_capacity); dev_capacity_sects = mb_to_sects(dev->size); dev->zone_size_sects = mb_to_sects(dev->zone_size); From cd6399936869b4a042dd1270078cbf2bb871a407 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 May 2024 14:40:33 +0900 Subject: [PATCH 228/279] block: Fix validation of zoned device with a runt zone Commit ecfe43b11b02 ("block: Remember zone capacity when revalidating zones") introduced checks to ensure that the capacity of the zones of a zoned device is constant for all zones. However, this check ignores the possibility that a zoned device has a smaller last zone with a size not equal to the capacity of other zones. Such a device corresponds in practice to an SMR drive with a smaller last zone and all zones with a capacity equal to the zone size, leading to the last zone capacity being different than the capacity of other zones. Correctly handle such a device by fixing the check for the constant zone capacity in blk_revalidate_seq_zone() using the new helper function disk_zone_is_last().
This helper function is also used in blk_revalidate_zone_cb() when checking the zone size. Fixes: ecfe43b11b02 ("block: Remember zone capacity when revalidating zones") Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240530054035.491497-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 03aa4eead39e..402a50a1ac4d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -450,6 +450,11 @@ static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector) return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); } +static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) +{ + return zone->start + zone->len >= get_capacity(disk); +} + static bool disk_insert_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { @@ -1693,11 +1698,13 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, /* * Remember the capacity of the first sequential zone and check - * if it is constant for all zones. + * if it is constant for all zones, ignoring the last zone as it can be + * smaller. 
*/ if (!args->zone_capacity) args->zone_capacity = zone->capacity; - if (zone->capacity != args->zone_capacity) { + if (!disk_zone_is_last(disk, zone) && + zone->capacity != args->zone_capacity) { pr_warn("%s: Invalid variable zone capacity\n", disk->disk_name); return -ENODEV; @@ -1732,7 +1739,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, { struct blk_revalidate_zone_args *args = data; struct gendisk *disk = args->disk; - sector_t capacity = get_capacity(disk); sector_t zone_sectors = disk->queue->limits.chunk_sectors; int ret; @@ -1743,7 +1749,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENODEV; } - if (zone->start >= capacity || !zone->len) { + if (zone->start >= get_capacity(disk) || !zone->len) { pr_warn("%s: Invalid zone start %llu, length %llu\n", disk->disk_name, zone->start, zone->len); return -ENODEV; @@ -1753,7 +1759,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, * All zones must have the same size, with the exception on an eventual * smaller last zone. */ - if (zone->start + zone->len < capacity) { + if (!disk_zone_is_last(disk, zone)) { if (zone->len != zone_sectors) { pr_warn("%s: Invalid zoned device with non constant zone size\n", disk->disk_name); From 29459c3eaa5c6261fbe0dea7bdeb9b48d35d862a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 May 2024 14:40:34 +0900 Subject: [PATCH 229/279] block: Fix zone write plugging handling of devices with a runt zone A zoned device may have a last sequential write required zone that is smaller than other zones. However, all tests to check if a zone write plug write offset exceeds the zone capacity use the same capacity value stored in the gendisk zone_capacity field. This is incorrect for a zoned device with a last runt (smaller) zone. Add the new field last_zone_capacity to struct gendisk to store the capacity of the last zone of the device. 
blk_revalidate_seq_zone() and blk_revalidate_conv_zone() are both modified to get this value when disk_zone_is_last() returns true. Similarly to zone_capacity, the value is first stored using the last_zone_capacity field of struct blk_revalidate_zone_args. Once zone revalidation of all zones is done, this is used to set the gendisk last_zone_capacity field. The checks to determine if a zone is full or if a sector offset in a zone exceeds the zone capacity in disk_should_remove_zone_wplug(), disk_zone_wplug_abort_unaligned(), blk_zone_write_plug_init_request(), and blk_zone_wplug_prepare_bio() are modified to use the new helper functions disk_zone_is_full() and disk_zone_wplug_is_full(). disk_zone_is_full() uses the zone index to determine if the zone being tested is the last one of the disk and uses either the disk zone_capacity or last_zone_capacity accordingly. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240530054035.491497-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 35 +++++++++++++++++++++++++++-------- include/linux/blkdev.h | 1 + 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 402a50a1ac4d..52abebf56027 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -455,6 +455,20 @@ static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) return zone->start + zone->len >= get_capacity(disk); } +static bool disk_zone_is_full(struct gendisk *disk, + unsigned int zno, unsigned int offset_in_zone) +{ + if (zno < disk->nr_zones - 1) + return offset_in_zone >= disk->zone_capacity; + return offset_in_zone >= disk->last_zone_capacity; +} + +static bool disk_zone_wplug_is_full(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); +} + static bool
disk_insert_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { @@ -548,7 +562,7 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, return false; /* We can remove zone write plugs for zones that are empty or full. */ - return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; + return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); } static void disk_remove_zone_wplug(struct gendisk *disk, @@ -669,13 +683,12 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, struct blk_zone_wplug *zwplug) { - unsigned int zone_capacity = disk->zone_capacity; unsigned int wp_offset = zwplug->wp_offset; struct bio_list bl = BIO_EMPTY_LIST; struct bio *bio; while ((bio = bio_list_pop(&zwplug->bio_list))) { - if (wp_offset >= zone_capacity || + if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) || (bio_op(bio) != REQ_OP_ZONE_APPEND && bio_offset_from_zone_start(bio) != wp_offset)) { blk_zone_wplug_bio_io_error(zwplug, bio); @@ -914,7 +927,6 @@ void blk_zone_write_plug_init_request(struct request *req) sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); struct request_queue *q = req->q; struct gendisk *disk = q->disk; - unsigned int zone_capacity = disk->zone_capacity; struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, blk_rq_pos(req)); unsigned long flags; @@ -938,7 +950,7 @@ void blk_zone_write_plug_init_request(struct request *req) * into the back of the request. */ spin_lock_irqsave(&zwplug->lock, flags); - while (zwplug->wp_offset < zone_capacity) { + while (!disk_zone_wplug_is_full(disk, zwplug)) { bio = bio_list_peek(&zwplug->bio_list); if (!bio) break; @@ -984,7 +996,7 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, * We know such BIO will fail, and that would potentially overflow our * write pointer offset beyond the end of the zone. 
*/ - if (zwplug->wp_offset >= disk->zone_capacity) + if (disk_zone_wplug_is_full(disk, zwplug)) goto err; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { @@ -1561,6 +1573,7 @@ void disk_free_zone_resources(struct gendisk *disk) kfree(disk->conv_zones_bitmap); disk->conv_zones_bitmap = NULL; disk->zone_capacity = 0; + disk->last_zone_capacity = 0; disk->nr_zones = 0; } @@ -1605,6 +1618,7 @@ struct blk_revalidate_zone_args { unsigned long *conv_zones_bitmap; unsigned int nr_zones; unsigned int zone_capacity; + unsigned int last_zone_capacity; sector_t sector; }; @@ -1622,6 +1636,7 @@ static int disk_update_zone_resources(struct gendisk *disk, disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; + disk->last_zone_capacity = args->last_zone_capacity; swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); if (disk->conv_zones_bitmap) nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, @@ -1673,6 +1688,9 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, return -ENODEV; } + if (disk_zone_is_last(disk, zone)) + args->last_zone_capacity = zone->capacity; + if (!disk_need_zone_resources(disk)) return 0; @@ -1703,8 +1721,9 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, */ if (!args->zone_capacity) args->zone_capacity = zone->capacity; - if (!disk_zone_is_last(disk, zone) && - zone->capacity != args->zone_capacity) { + if (disk_zone_is_last(disk, zone)) { + args->last_zone_capacity = zone->capacity; + } else if (zone->capacity != args->zone_capacity) { pr_warn("%s: Invalid variable zone capacity\n", disk->disk_name); return -ENODEV; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aefdda9f4ec7..24c36929920b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -186,6 +186,7 @@ struct gendisk { */ unsigned int nr_zones; unsigned int zone_capacity; + unsigned int last_zone_capacity; unsigned long *conv_zones_bitmap; unsigned int zone_wplugs_hash_bits; spinlock_t 
zone_wplugs_lock; From 0a751df4566c86e5a24f2a03290dad3d0f215692 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 May 2024 09:45:47 -0400 Subject: [PATCH 230/279] blk-throttle: Fix incorrect display of io.max Commit bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW") attempts to revert the code change introduced by commit cd5ab1b0fcb4 ("blk-throttle: add .low interface"). However, it leaves behind the bps_conf[] and iops_conf[] fields in the throtl_grp structure which aren't set anywhere in the new blk-throttle.c code but are still being used by tg_prfill_limit() to display the limits in io.max. Now io.max always displays the following values if a block queue is used: : rbps=0 wbps=0 riops=0 wiops=0 Fix this problem by removing bps_conf[] and iops_conf[] and use bps[] and iops[] instead to complete the revert. Fixes: bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW") Reported-by: Justin Forbes Closes: https://github.com/containers/podman/issues/22701#issuecomment-2120627789 Signed-off-by: Waiman Long Acked-by: Tejun Heo Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240530134547.970075-1-longman@redhat.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 24 ++++++++++++------------ block/blk-throttle.h | 8 ++------ 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0be180f9a789..c1bf73f8c75d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1399,32 +1399,32 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, bps_dft = U64_MAX; iops_dft = UINT_MAX; - if (tg->bps_conf[READ] == bps_dft && - tg->bps_conf[WRITE] == bps_dft && - tg->iops_conf[READ] == iops_dft && - tg->iops_conf[WRITE] == iops_dft) + if (tg->bps[READ] == bps_dft && + tg->bps[WRITE] == bps_dft && + tg->iops[READ] == iops_dft && + tg->iops[WRITE] == iops_dft) return 0; seq_printf(sf, "%s", dname); - if (tg->bps_conf[READ] == U64_MAX) + if 
(tg->bps[READ] == U64_MAX) seq_printf(sf, " rbps=max"); else - seq_printf(sf, " rbps=%llu", tg->bps_conf[READ]); + seq_printf(sf, " rbps=%llu", tg->bps[READ]); - if (tg->bps_conf[WRITE] == U64_MAX) + if (tg->bps[WRITE] == U64_MAX) seq_printf(sf, " wbps=max"); else - seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE]); + seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); - if (tg->iops_conf[READ] == UINT_MAX) + if (tg->iops[READ] == UINT_MAX) seq_printf(sf, " riops=max"); else - seq_printf(sf, " riops=%u", tg->iops_conf[READ]); + seq_printf(sf, " riops=%u", tg->iops[READ]); - if (tg->iops_conf[WRITE] == UINT_MAX) + if (tg->iops[WRITE] == UINT_MAX) seq_printf(sf, " wiops=max"); else - seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE]); + seq_printf(sf, " wiops=%u", tg->iops[WRITE]); seq_printf(sf, "\n"); return 0; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 393c3d134b96..4d9ef5abdf21 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -95,15 +95,11 @@ struct throtl_grp { bool has_rules_bps[2]; bool has_rules_iops[2]; - /* internally used bytes per second rate limits */ + /* bytes per second rate limits */ uint64_t bps[2]; - /* user configured bps limits */ - uint64_t bps_conf[2]; - /* internally used IOPS limits */ + /* IOPS limits */ unsigned int iops[2]; - /* user configured IOPS limits */ - unsigned int iops_conf[2]; /* Number of bytes dispatched in current slice */ uint64_t bytes_disp[2]; From 0a44078f2b72abcdda47581c942bd5d0468ec50b Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 30 May 2024 13:12:03 -0700 Subject: [PATCH 231/279] perf/x86/rapl: Add missing MODULE_DESCRIPTION() line Fix the warning from 'make C=1 W=1': WARNING: modpost: missing MODULE_DESCRIPTION() in arch/x86/events/rapl.o Signed-off-by: Jeff Johnson Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Link: https://lore.kernel.org/r/20240530-md-arch-x86-events-v1-1-e45ffa8af99f@quicinc.com --- arch/x86/events/rapl.c | 1 
+ 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 46e673585560..0c5e7a7c43ac 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -64,6 +64,7 @@ #include "perf_event.h" #include "probe.h" +MODULE_DESCRIPTION("Support Intel/AMD RAPL energy consumption counters"); MODULE_LICENSE("GPL"); /* From dc8e5dfb52d56e955ad09174330252710845b8d2 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 30 May 2024 13:42:51 -0700 Subject: [PATCH 232/279] perf/x86/intel: Add missing MODULE_DESCRIPTION() lines Fix the 'make W=1 C=1' warnings: WARNING: modpost: missing MODULE_DESCRIPTION() in arch/x86/events/intel/intel-uncore.o WARNING: modpost: missing MODULE_DESCRIPTION() in arch/x86/events/intel/intel-cstate.o Signed-off-by: Jeff Johnson Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Link: https://lore.kernel.org/r/20240530-md-arch-x86-events-intel-v1-1-8252194ed20a@quicinc.com --- arch/x86/events/intel/cstate.c | 1 + arch/x86/events/intel/uncore.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index e64eaa8dda5a..9d6e8f13d13a 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -114,6 +114,7 @@ #include "../perf_event.h" #include "../probe.h" +MODULE_DESCRIPTION("Support for Intel cstate performance events"); MODULE_LICENSE("GPL"); #define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 419c517b8594..c68f5b39952b 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -34,6 +34,7 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); +MODULE_DESCRIPTION("Support for Intel uncore performance events"); MODULE_LICENSE("GPL"); int uncore_pcibus_to_dieid(struct pci_bus *bus) From 
d40605a6823577a6c40fad6fb1f10a40ea0389d7 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Thu, 30 May 2024 14:15:48 -0400 Subject: [PATCH 233/279] sched/x86: Export 'percpu arch_freq_scale' Commit: 7bc263840bc3 ("sched/topology: Consolidate and clean up access to a CPU's max compute capacity") removed rq->cpu_capacity_orig in favor of using arch_scale_freq_capacity() calls. Export the underlying percpu symbol on x86 so that external trace point helper modules can be made to work again. Signed-off-by: Phil Auld Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240530181548.2039216-1-pauld@redhat.com --- arch/x86/kernel/cpu/aperfmperf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index f9a8c7b7943f..b3fa61d45352 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -345,6 +345,7 @@ static DECLARE_WORK(disable_freq_invariance_work, disable_freq_invariance_workfn); DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; +EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); static void scale_freq_tick(u64 acnt, u64 mcnt) { From 86aaa7e9d641c1ad1035ed2df88b8d0b48c86b30 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 30 May 2024 23:28:17 +0200 Subject: [PATCH 234/279] ata: libata-core: Add ATA_HORKAGE_NOLPM for Crucial CT240BX500SSD1 Commit 7627a0edef54 ("ata: ahci: Drop low power policy board type") dropped the board_ahci_low_power board type, and instead enables LPM if: -The AHCI controller reports that it supports LPM (Partial/Slumber), and -CONFIG_SATA_MOBILE_LPM_POLICY != 0, and -The port is not defined as external in the per port PxCMD register, and -The port is not defined as hotplug capable in the per port PxCMD register. Partial and Slumber LPM states can either be initiated by HIPM or DIPM. 
For HIPM (host initiated power management) to get enabled, both the AHCI controller and the drive have to report that they support HIPM. For DIPM (device initiated power management) to get enabled, only the drive has to report that it supports DIPM. However, the HBA will reject device requests to enter LPM states which the HBA does not support. The problem is that Crucial CT240BX500SSD1 drives do not handle low power modes correctly. The problem was most likely not seen before because no one had used this drive with an AHCI controller with LPM enabled. Add a quirk so that we do not enable LPM for this drive, since we see command timeouts if we do (even though the drive claims to support DIPM). Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type") Cc: stable@vger.kernel.org Reported-by: Aarrayy Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218832 Reviewed-by: Mika Westerberg Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 4f35aab81a0a..b0ce621fe2a1 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4136,8 +4136,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "PIONEER BD-RW BDR-207M", NULL, ATA_HORKAGE_NOLPM }, { "PIONEER BD-RW BDR-205", NULL, ATA_HORKAGE_NOLPM }, - /* Crucial BX100 SSD 500GB has broken LPM support */ + /* Crucial devices with broken LPM support */ { "CT500BX100SSD1", NULL, ATA_HORKAGE_NOLPM }, + { "CT240BX500SSD1", NULL, ATA_HORKAGE_NOLPM }, /* 512GB MX100 with MU01 firmware has both queued TRIM and LPM issues */ { "Crucial_CT512MX100*", "MU01", ATA_HORKAGE_NO_NCQ_TRIM | From 473880369304cfd4445720cdd8bae4c6f1e16e60 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 30 May 2024 23:32:44 +0200 Subject: [PATCH 235/279] ata: libata-core: Add ATA_HORKAGE_NOLPM for AMD Radeon S3 SSD Commit 7627a0edef54 ("ata: ahci: 
Drop low power policy board type") dropped the board_ahci_low_power board type, and instead enables LPM if: -The AHCI controller reports that it supports LPM (Partial/Slumber), and -CONFIG_SATA_MOBILE_LPM_POLICY != 0, and -The port is not defined as external in the per port PxCMD register, and -The port is not defined as hotplug capable in the per port PxCMD register. Partial and Slumber LPM states can either be initiated by HIPM or DIPM. For HIPM (host initiated power management) to get enabled, both the AHCI controller and the drive have to report that they support HIPM. For DIPM (device initiated power management) to get enabled, only the drive has to report that it supports DIPM. However, the HBA will reject device requests to enter LPM states which the HBA does not support. The problem is that AMD Radeon S3 SSD drives do not handle low power modes correctly. The problem was most likely not seen before because no one had used this drive with an AHCI controller with LPM enabled. Add a quirk so that we do not enable LPM for this drive, since we see command timeouts if we do (even though the drive claims to support both HIPM and DIPM). 
Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type") Cc: stable@vger.kernel.org Reported-by: Doru Iorgulescu Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218832 Reviewed-by: Mika Westerberg Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index b0ce621fe2a1..4ea859dc381a 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4156,6 +4156,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { ATA_HORKAGE_ZERO_AFTER_TRIM | ATA_HORKAGE_NOLPM }, + /* AMD Radeon devices with broken LPM support */ + { "R3SL240G", NULL, ATA_HORKAGE_NOLPM }, + /* These specific Samsung models/firmware-revs do not handle LPM well */ { "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_HORKAGE_NOLPM }, { "SAMSUNG SSD PM830 mSATA *", "CXM13D1Q", ATA_HORKAGE_NOLPM }, From 3cb648c4dd3e8dde800fb3659250ed11f2d9efa5 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 30 May 2024 23:27:04 +0200 Subject: [PATCH 236/279] ata: libata-core: Add ATA_HORKAGE_NOLPM for Apacer AS340 Commit 7627a0edef54 ("ata: ahci: Drop low power policy board type") dropped the board_ahci_low_power board type, and instead enables LPM if: -The AHCI controller reports that it supports LPM (Partial/Slumber), and -CONFIG_SATA_MOBILE_LPM_POLICY != 0, and -The port is not defined as external in the per port PxCMD register, and -The port is not defined as hotplug capable in the per port PxCMD register. Partial and Slumber LPM states can either be initiated by HIPM or DIPM. For HIPM (host initiated power management) to get enabled, both the AHCI controller and the drive have to report that they support HIPM. For DIPM (device initiated power management) to get enabled, only the drive has to report that it supports DIPM. However, the HBA will reject device requests to enter LPM states which the HBA does not support. 
The problem is that Apacer AS340 drives do not handle low power modes correctly. The problem was most likely not seen before because no one had used this drive with an AHCI controller with LPM enabled. Add a quirk so that we do not enable LPM for this drive, since we see command timeouts if we do (even though the drive claims to support DIPM). Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type") Cc: stable@vger.kernel.org Reported-by: Tim Teichmann Closes: https://lore.kernel.org/linux-ide/87bk4pbve8.ffs@tglx/ Reviewed-by: Mika Westerberg Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 4ea859dc381a..e1bf8a19b3c8 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4159,6 +4159,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { /* AMD Radeon devices with broken LPM support */ { "R3SL240G", NULL, ATA_HORKAGE_NOLPM }, + /* Apacer models with LPM issues */ + { "Apacer AS340*", NULL, ATA_HORKAGE_NOLPM }, + /* These specific Samsung models/firmware-revs do not handle LPM well */ { "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_HORKAGE_NOLPM }, { "SAMSUNG SSD PM830 mSATA *", "CXM13D1Q", ATA_HORKAGE_NOLPM }, From 88da52ccd66e65f2e63a6c35c9dff55d448ef4dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 16 May 2024 20:19:34 +0200 Subject: [PATCH 237/279] landlock: Fix d_parent walk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The WARN_ON_ONCE() in collect_domain_accesses() can be triggered when trying to link a root mount point. This cannot work in practice because this directory is mounted, but the VFS check is done after the call to security_path_link(). Do not use source directory's d_parent when the source directory is the mount point. 
Cc: Günther Noack Cc: Paul Moore Cc: stable@vger.kernel.org Reported-by: syzbot+bf4903dc7e12b18ebc87@syzkaller.appspotmail.com Fixes: b91c3e4ea756 ("landlock: Add support for file reparenting with LANDLOCK_ACCESS_FS_REFER") Closes: https://lore.kernel.org/r/000000000000553d3f0618198200@google.com Link: https://lore.kernel.org/r/20240516181935.1645983-2-mic@digikod.net [mic: Fix commit message] Signed-off-by: Mickaël Salaün --- security/landlock/fs.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 22d8b7c28074..7877a64cc6b8 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1110,6 +1110,7 @@ static int current_check_refer_path(struct dentry *const old_dentry, bool allow_parent1, allow_parent2; access_mask_t access_request_parent1, access_request_parent2; struct path mnt_dir; + struct dentry *old_parent; layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS] = {}, layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS] = {}; @@ -1157,9 +1158,17 @@ static int current_check_refer_path(struct dentry *const old_dentry, mnt_dir.mnt = new_dir->mnt; mnt_dir.dentry = new_dir->mnt->mnt_root; + /* + * old_dentry may be the root of the common mount point and + * !IS_ROOT(old_dentry) at the same time (e.g. with open_tree() and + * OPEN_TREE_CLONE). We do not need to call dget(old_parent) because + * we keep a reference to old_dentry. + */ + old_parent = (old_dentry == mnt_dir.dentry) ? 
old_dentry : + old_dentry->d_parent; + /* new_dir->dentry is equal to new_dentry->d_parent */ - allow_parent1 = collect_domain_accesses(dom, mnt_dir.dentry, - old_dentry->d_parent, + allow_parent1 = collect_domain_accesses(dom, mnt_dir.dentry, old_parent, &layer_masks_parent1); allow_parent2 = collect_domain_accesses( dom, mnt_dir.dentry, new_dir->dentry, &layer_masks_parent2); From 0055f53aac80fd938bf7cdfad7ad414ca6c0e198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 16 May 2024 20:19:35 +0200 Subject: [PATCH 238/279] selftests/landlock: Add layout1.refer_mount_root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add tests to check error codes when linking or renaming a mount root directory. This previously triggered a kernel warning, but it is fixed with the previous commit. Cc: Günther Noack Cc: Paul Moore Link: https://lore.kernel.org/r/20240516181935.1645983-3-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 45 ++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 6b5a9ff88c3d..7d063c652be1 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -35,6 +35,7 @@ * See https://sourceware.org/glibc/wiki/Synchronizing_Headers. */ #include +#include #include "common.h" @@ -47,6 +48,13 @@ int renameat2(int olddirfd, const char *oldpath, int newdirfd, } #endif +#ifndef open_tree +int open_tree(int dfd, const char *filename, unsigned int flags) +{ + return syscall(__NR_open_tree, dfd, filename, flags); +} +#endif + #ifndef RENAME_EXCHANGE #define RENAME_EXCHANGE (1 << 1) #endif @@ -2400,6 +2408,43 @@ TEST_F_FORK(layout1, refer_denied_by_default4) layer_dir_s1d1_refer); } +/* + * Tests walking through a denied root mount. 
+ */ +TEST_F_FORK(layout1, refer_mount_root_deny) +{ + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_MAKE_DIR, + }; + int root_fd, ruleset_fd; + + /* Creates a mount object from a non-mount point. */ + set_cap(_metadata, CAP_SYS_ADMIN); + root_fd = + open_tree(AT_FDCWD, dir_s1d1, + AT_EMPTY_PATH | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + clear_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_LE(0, root_fd); + + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0)); + EXPECT_EQ(0, close(ruleset_fd)); + + /* Link denied by Landlock: EACCES. */ + EXPECT_EQ(-1, linkat(root_fd, ".", root_fd, "does_not_exist", 0)); + EXPECT_EQ(EACCES, errno); + + /* renameat2() always returns EBUSY. */ + EXPECT_EQ(-1, renameat2(root_fd, ".", root_fd, "does_not_exist", 0)); + EXPECT_EQ(EBUSY, errno); + + EXPECT_EQ(0, close(root_fd)); +} + TEST_F_FORK(layout1, reparent_link) { const struct rule layer1[] = { From 518549c120e671c4906f77d1802b97e9b23f673a Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 29 May 2024 18:16:56 -0500 Subject: [PATCH 239/279] cifs: fix creating sockets when using sfu mount options When running fstest generic/423 with sfu mount option, it was being skipped due to inability to create sockets: generic/423 [not run] cifs does not support mknod/mkfifo which can also be easily reproduced with their af_unix tool: ./src/af_unix /mnt1/socket-two bind: Operation not permitted Fix sfu mount option to allow creating and reporting sockets. 
Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifspdu.h | 2 +- fs/smb/client/inode.c | 4 ++++ fs/smb/client/smb2ops.c | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index c46d418c1c0c..a2072ab9e586 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -2574,7 +2574,7 @@ typedef struct { struct win_dev { - unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO*/ + unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO or LnxSOCK */ __le64 major; __le64 minor; } __attribute__((packed)); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 262576573eb5..4a8aa1de9522 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -606,6 +606,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); fattr->cf_rdev = MKDEV(mjr, mnr); } + } else if (memcmp("LnxSOCK", pbuf, 8) == 0) { + cifs_dbg(FYI, "Socket\n"); + fattr->cf_mode |= S_IFSOCK; + fattr->cf_dtype = DT_SOCK; } else if (memcmp("IntxLNK", pbuf, 7) == 0) { cifs_dbg(FYI, "Symlink\n"); fattr->cf_mode |= S_IFLNK; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 4ce6c3121a7e..c8e536540895 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4997,6 +4997,9 @@ static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, pdev.major = cpu_to_le64(MAJOR(dev)); pdev.minor = cpu_to_le64(MINOR(dev)); break; + case S_IFSOCK: + strscpy(pdev.type, "LnxSOCK"); + break; case S_IFIFO: strscpy(pdev.type, "LnxFIFO"); break; From adb77bba9c664f5d120e0ffb1387e9d7408e1529 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 9 Apr 2024 19:31:52 -0700 Subject: [PATCH 240/279] scsi: mpt3sas: Avoid possible run-time warning with long manufacturer strings The prior strscpy() replacement of strncpy() here expected the manufacture_reply strings to be NUL-terminated, but it is possible they are not, as the code pattern 
here shows, e.g., edev->vendor_id being exactly 1 character larger than manufacture_reply->vendor_id, and the replaced strncpy() was copying only up to the size of the source character array. Replace this with memtostr(), which is the unambiguous way to convert a maybe not-NUL-terminated character array into a NUL-terminated string. Fixes: b7e9712a02e8 ("scsi: mpt3sas: Replace deprecated strncpy() with strscpy()") Signed-off-by: Kees Cook Tested-by: Marco Patalano Reviewed-by: Ewan D. Milne Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240410023155.2100422-3-keescook@chromium.org Signed-off-by: Kees Cook --- drivers/scsi/mpt3sas/mpt3sas_base.c | 2 +- drivers/scsi/mpt3sas/mpt3sas_transport.c | 14 +++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 258647fc6bdd..1320e06727df 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -4774,7 +4774,7 @@ _base_display_ioc_capabilities(struct MPT3SAS_ADAPTER *ioc) char desc[17] = {0}; u32 iounit_pg1_flags; - strscpy(desc, ioc->manu_pg0.ChipName, sizeof(desc)); + memtostr(desc, ioc->manu_pg0.ChipName); ioc_info(ioc, "%s: FWVersion(%02d.%02d.%02d.%02d), ChipRevision(0x%02x)\n", desc, (ioc->facts.FWVersion.Word & 0xFF000000) >> 24, diff --git a/drivers/scsi/mpt3sas/mpt3sas_transport.c b/drivers/scsi/mpt3sas/mpt3sas_transport.c index 76f9a9177198..d84413b77d84 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_transport.c +++ b/drivers/scsi/mpt3sas/mpt3sas_transport.c @@ -458,17 +458,13 @@ _transport_expander_report_manufacture(struct MPT3SAS_ADAPTER *ioc, goto out; manufacture_reply = data_out + sizeof(struct rep_manu_request); - strscpy(edev->vendor_id, manufacture_reply->vendor_id, - sizeof(edev->vendor_id)); - strscpy(edev->product_id, manufacture_reply->product_id, - sizeof(edev->product_id)); - strscpy(edev->product_rev, manufacture_reply->product_rev, - 
sizeof(edev->product_rev)); + memtostr(edev->vendor_id, manufacture_reply->vendor_id); + memtostr(edev->product_id, manufacture_reply->product_id); + memtostr(edev->product_rev, manufacture_reply->product_rev); edev->level = manufacture_reply->sas_format & 1; if (edev->level) { - strscpy(edev->component_vendor_id, - manufacture_reply->component_vendor_id, - sizeof(edev->component_vendor_id)); + memtostr(edev->component_vendor_id, + manufacture_reply->component_vendor_id); tmp = (u8 *)&manufacture_reply->component_id; edev->component_id = tmp[0] << 8 | tmp[1]; edev->component_revision_id = From 4e173c825b1914d5b118bbf26f0168102d56ae95 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 31 May 2024 08:54:52 -0700 Subject: [PATCH 241/279] mailmap: update entry for Kees Cook I'm tired of gmail breaking DKIM. Switch everything over to my @kernel.org alias instead. Signed-off-by: Kees Cook --- .mailmap | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.mailmap b/.mailmap index 43cd2995dbc2..efd9fa867a8e 100644 --- a/.mailmap +++ b/.mailmap @@ -337,10 +337,11 @@ Kalyan Thota Karthikeyan Periyasamy Kathiravan T Kay Sievers -Kees Cook -Kees Cook -Kees Cook -Kees Cook +Kees Cook +Kees Cook +Kees Cook +Kees Cook +Kees Cook Keith Busch Keith Busch Kenneth W Chen From d551ce15d08114514d489fad63bd275de2aca862 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 14:02:35 +0200 Subject: [PATCH 242/279] mailbox: zynqmp-ipi: drop irq_to_desc() call irq_to_desc() is not exported to loadable modules, so this driver now fails to link in some configurations: ERROR: modpost: "irq_to_desc" [drivers/mailbox/zynqmp-ipi-mailbox.ko] undefined! I can't see a purpose for this call, since the return value is unused and probably left over from some code refactoring. Address the link failure by just removing the line. 
Fixes: 6ffb1635341b ("mailbox: zynqmp: handle SGI for shared IPI") Signed-off-by: Arnd Bergmann Tested-by: Tanmay Shah Signed-off-by: Jassi Brar --- drivers/mailbox/zynqmp-ipi-mailbox.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mailbox/zynqmp-ipi-mailbox.c b/drivers/mailbox/zynqmp-ipi-mailbox.c index 7c90bac3de21..4acf5612487c 100644 --- a/drivers/mailbox/zynqmp-ipi-mailbox.c +++ b/drivers/mailbox/zynqmp-ipi-mailbox.c @@ -850,7 +850,6 @@ static int xlnx_mbox_init_sgi(struct platform_device *pdev, return ret; } - irq_to_desc(pdata->virq_sgi); irq_set_status_flags(pdata->virq_sgi, IRQ_PER_CPU); /* Setup function for the CPU hot-plug cases */ From 0c2f6d04619ec2b53ad4b0b591eafc9389786e86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 May 2024 17:29:18 +0200 Subject: [PATCH 243/279] x86/topology/intel: Unlock CPUID before evaluating anything Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. If this bit is set by the BIOS then CPUID evaluation including topology enumeration does not work correctly as the evaluation code does not try to analyze any leaf greater than two. This went unnoticed before because the original topology code just repeated evaluation several times and managed to overwrite the initial limited information with the correct one later. The new evaluation code does it once and therefore ends up with the limited and wrong information. Cure this by unlocking CPUID right before evaluating anything which depends on the maximum CPUID leaf being greater than two instead of rereading stuff after unlock. 
Fixes: 22d63660c35e ("x86/cpu: Use common topology code for Intel") Reported-by: Peter Schneider Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Peter Schneider Cc: Link: https://lore.kernel.org/r/fd3f73dc-a86f-4bcf-9c60-43556a21eb42@googlemail.com --- arch/x86/kernel/cpu/common.c | 3 ++- arch/x86/kernel/cpu/cpu.h | 2 ++ arch/x86/kernel/cpu/intel.c | 25 ++++++++++++++++--------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e31293c9609f..d4e539d4e158 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1589,6 +1589,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (have_cpuid_p()) { cpu_detect(c); get_cpu_vendor(c); + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); setup_force_cpu_cap(X86_FEATURE_CPUID); get_cpu_address_sizes(c); @@ -1748,7 +1749,7 @@ static void generic_identify(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); - + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); get_cpu_address_sizes(c); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index ea9e07d57c8d..1beccefbaff9 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -61,9 +61,11 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; extern void __init tsx_init(void); void tsx_ap_init(void); +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c); #else static inline void tsx_init(void) { } static inline void tsx_ap_init(void) { } +static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { } #endif /* CONFIG_CPU_SUP_INTEL */ extern void init_spectral_chicken(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3c3e7e5695ba..fdf3489d92a4 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -269,19 +269,26 @@ static void detect_tme_early(struct cpuinfo_x86 *c) c->x86_phys_bits -= keyid_bits; } +void 
intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd)) + return; + + /* + * The BIOS can have limited CPUID to leaf 2, which breaks feature + * enumeration. Unlock it and update the maximum leaf info. + */ + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) + c->cpuid_level = cpuid_eax(0); +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; - /* Unmask CPUID levels if masked: */ - if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - if (msr_clear_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { - c->cpuid_level = cpuid_eax(0); - get_cpu_cap(c); - } - } - if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); From 99a6087dfdc65303d26ab5fba2dacd8931b82b08 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 31 May 2024 11:57:07 -0700 Subject: [PATCH 244/279] kunit/fortify: Remove __kmalloc_node() test __kmalloc_node() is considered an "internal" function to the Slab, so drop it from explicit testing. 
Link: https://lore.kernel.org/r/20240531185703.work.588-kees@kernel.org Signed-off-by: Kees Cook --- lib/fortify_kunit.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c index 39da5b3bc649..f9cc467334ce 100644 --- a/lib/fortify_kunit.c +++ b/lib/fortify_kunit.c @@ -235,9 +235,6 @@ static void fortify_test_alloc_size_##allocator##_dynamic(struct kunit *test) \ kmalloc_array_node(alloc_size, 1, gfp, NUMA_NO_NODE), \ kfree(p)); \ checker(expected_size, __kmalloc(alloc_size, gfp), \ - kfree(p)); \ - checker(expected_size, \ - __kmalloc_node(alloc_size, gfp, NUMA_NO_NODE), \ kfree(p)); \ \ orig = kmalloc(alloc_size, gfp); \ From 7bc4244c882a7d7d79f4afefc50893244eb11d07 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Jun 2024 07:28:21 +0200 Subject: [PATCH 245/279] Revert "VT: Use macros to define ioctls" This reverts commit 8c467f3300591a206fa8dcc6988d768910799872. Turns out this breaks many architectures as the vt ioctls do not all match up everywhere due to historical reasons, so the original commit is invalid for many values. 
Reported-by: Nick Bowler Reported-by: Arnd Bergmann Reported-by: Jiri Slaby Reported-by: Christian Zigotzky Reported-by: Michael Ellerman Cc: Al Viro Cc: Alexey Gladkov Link: https://lore.kernel.org/r/ad4e561c-1d49-4f25-882c-7a36c6b1b5c0@draconx.ca Link: https://lore.kernel.org/r/0da9785e-ba44-4718-9d08-4e96c1ba7ab2@kernel.org Link: https://lore.kernel.org/all/34d848f4-670b-4493-bf21-130ef862521b@xenosoft.de/ Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/kd.h | 96 ++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 49 deletions(-) diff --git a/include/uapi/linux/kd.h b/include/uapi/linux/kd.h index 8ddb2219a84b..6b384065c013 100644 --- a/include/uapi/linux/kd.h +++ b/include/uapi/linux/kd.h @@ -5,61 +5,60 @@ #include /* 0x4B is 'K', to avoid collision with termios and vt */ -#define KD_IOCTL_BASE 'K' -#define GIO_FONT _IO(KD_IOCTL_BASE, 0x60) /* gets font in expanded form */ -#define PIO_FONT _IO(KD_IOCTL_BASE, 0x61) /* use font in expanded form */ +#define GIO_FONT 0x4B60 /* gets font in expanded form */ +#define PIO_FONT 0x4B61 /* use font in expanded form */ -#define GIO_FONTX _IO(KD_IOCTL_BASE, 0x6B) /* get font using struct consolefontdesc */ -#define PIO_FONTX _IO(KD_IOCTL_BASE, 0x6C) /* set font using struct consolefontdesc */ +#define GIO_FONTX 0x4B6B /* get font using struct consolefontdesc */ +#define PIO_FONTX 0x4B6C /* set font using struct consolefontdesc */ struct consolefontdesc { unsigned short charcount; /* characters in font (256 or 512) */ unsigned short charheight; /* scan lines per character (1-32) */ char __user *chardata; /* font data in expanded form */ }; -#define PIO_FONTRESET _IO(KD_IOCTL_BASE, 0x6D) /* reset to default font */ +#define PIO_FONTRESET 0x4B6D /* reset to default font */ -#define GIO_CMAP _IO(KD_IOCTL_BASE, 0x70) /* gets colour palette on VGA+ */ -#define PIO_CMAP _IO(KD_IOCTL_BASE, 0x71) /* sets colour palette on VGA+ */ +#define GIO_CMAP 0x4B70 /* gets colour palette on VGA+ */ +#define 
PIO_CMAP 0x4B71 /* sets colour palette on VGA+ */ -#define KIOCSOUND _IO(KD_IOCTL_BASE, 0x2F) /* start sound generation (0 for off) */ -#define KDMKTONE _IO(KD_IOCTL_BASE, 0x30) /* generate tone */ +#define KIOCSOUND 0x4B2F /* start sound generation (0 for off) */ +#define KDMKTONE 0x4B30 /* generate tone */ -#define KDGETLED _IO(KD_IOCTL_BASE, 0x31) /* return current led state */ -#define KDSETLED _IO(KD_IOCTL_BASE, 0x32) /* set led state [lights, not flags] */ +#define KDGETLED 0x4B31 /* return current led state */ +#define KDSETLED 0x4B32 /* set led state [lights, not flags] */ #define LED_SCR 0x01 /* scroll lock led */ #define LED_NUM 0x02 /* num lock led */ #define LED_CAP 0x04 /* caps lock led */ -#define KDGKBTYPE _IO(KD_IOCTL_BASE, 0x33) /* get keyboard type */ +#define KDGKBTYPE 0x4B33 /* get keyboard type */ #define KB_84 0x01 #define KB_101 0x02 /* this is what we always answer */ #define KB_OTHER 0x03 -#define KDADDIO _IO(KD_IOCTL_BASE, 0x34) /* add i/o port as valid */ -#define KDDELIO _IO(KD_IOCTL_BASE, 0x35) /* del i/o port as valid */ -#define KDENABIO _IO(KD_IOCTL_BASE, 0x36) /* enable i/o to video board */ -#define KDDISABIO _IO(KD_IOCTL_BASE, 0x37) /* disable i/o to video board */ +#define KDADDIO 0x4B34 /* add i/o port as valid */ +#define KDDELIO 0x4B35 /* del i/o port as valid */ +#define KDENABIO 0x4B36 /* enable i/o to video board */ +#define KDDISABIO 0x4B37 /* disable i/o to video board */ -#define KDSETMODE _IO(KD_IOCTL_BASE, 0x3A) /* set text/graphics mode */ +#define KDSETMODE 0x4B3A /* set text/graphics mode */ #define KD_TEXT 0x00 #define KD_GRAPHICS 0x01 #define KD_TEXT0 0x02 /* obsolete */ #define KD_TEXT1 0x03 /* obsolete */ -#define KDGETMODE _IO(KD_IOCTL_BASE, 0x3B) /* get current mode */ +#define KDGETMODE 0x4B3B /* get current mode */ -#define KDMAPDISP _IO(KD_IOCTL_BASE, 0x3C) /* map display into address space */ -#define KDUNMAPDISP _IO(KD_IOCTL_BASE, 0x3D) /* unmap display from address space */ +#define KDMAPDISP 0x4B3C /* 
map display into address space */ +#define KDUNMAPDISP 0x4B3D /* unmap display from address space */ typedef char scrnmap_t; #define E_TABSZ 256 -#define GIO_SCRNMAP _IO(KD_IOCTL_BASE, 0x40) /* get screen mapping from kernel */ -#define PIO_SCRNMAP _IO(KD_IOCTL_BASE, 0x41) /* put screen mapping table in kernel */ -#define GIO_UNISCRNMAP _IO(KD_IOCTL_BASE, 0x69) /* get full Unicode screen mapping */ -#define PIO_UNISCRNMAP _IO(KD_IOCTL_BASE, 0x6A) /* set full Unicode screen mapping */ +#define GIO_SCRNMAP 0x4B40 /* get screen mapping from kernel */ +#define PIO_SCRNMAP 0x4B41 /* put screen mapping table in kernel */ +#define GIO_UNISCRNMAP 0x4B69 /* get full Unicode screen mapping */ +#define PIO_UNISCRNMAP 0x4B6A /* set full Unicode screen mapping */ -#define GIO_UNIMAP _IO(KD_IOCTL_BASE, 0x66) /* get unicode-to-font mapping from kernel */ +#define GIO_UNIMAP 0x4B66 /* get unicode-to-font mapping from kernel */ struct unipair { unsigned short unicode; unsigned short fontpos; @@ -68,8 +67,8 @@ struct unimapdesc { unsigned short entry_ct; struct unipair __user *entries; }; -#define PIO_UNIMAP _IO(KD_IOCTL_BASE, 0x67) /* put unicode-to-font mapping in kernel */ -#define PIO_UNIMAPCLR _IO(KD_IOCTL_BASE, 0x68) /* clear table, possibly advise hash algorithm */ +#define PIO_UNIMAP 0x4B67 /* put unicode-to-font mapping in kernel */ +#define PIO_UNIMAPCLR 0x4B68 /* clear table, possibly advise hash algorithm */ struct unimapinit { unsigned short advised_hashsize; /* 0 if no opinion */ unsigned short advised_hashstep; /* 0 if no opinion */ @@ -84,19 +83,19 @@ struct unimapinit { #define K_MEDIUMRAW 0x02 #define K_UNICODE 0x03 #define K_OFF 0x04 -#define KDGKBMODE _IO(KD_IOCTL_BASE, 0x44) /* gets current keyboard mode */ -#define KDSKBMODE _IO(KD_IOCTL_BASE, 0x45) /* sets current keyboard mode */ +#define KDGKBMODE 0x4B44 /* gets current keyboard mode */ +#define KDSKBMODE 0x4B45 /* sets current keyboard mode */ #define K_METABIT 0x03 #define K_ESCPREFIX 0x04 -#define 
KDGKBMETA _IO(KD_IOCTL_BASE, 0x62) /* gets meta key handling mode */ -#define KDSKBMETA _IO(KD_IOCTL_BASE, 0x63) /* sets meta key handling mode */ +#define KDGKBMETA 0x4B62 /* gets meta key handling mode */ +#define KDSKBMETA 0x4B63 /* sets meta key handling mode */ #define K_SCROLLLOCK 0x01 #define K_NUMLOCK 0x02 #define K_CAPSLOCK 0x04 -#define KDGKBLED _IO(KD_IOCTL_BASE, 0x64) /* get led flags (not lights) */ -#define KDSKBLED _IO(KD_IOCTL_BASE, 0x65) /* set led flags (not lights) */ +#define KDGKBLED 0x4B64 /* get led flags (not lights) */ +#define KDSKBLED 0x4B65 /* set led flags (not lights) */ struct kbentry { unsigned char kb_table; @@ -108,15 +107,15 @@ struct kbentry { #define K_ALTTAB 0x02 #define K_ALTSHIFTTAB 0x03 -#define KDGKBENT _IO(KD_IOCTL_BASE, 0x46) /* gets one entry in translation table */ -#define KDSKBENT _IO(KD_IOCTL_BASE, 0x47) /* sets one entry in translation table */ +#define KDGKBENT 0x4B46 /* gets one entry in translation table */ +#define KDSKBENT 0x4B47 /* sets one entry in translation table */ struct kbsentry { unsigned char kb_func; unsigned char kb_string[512]; }; -#define KDGKBSENT _IO(KD_IOCTL_BASE, 0x48) /* gets one function key string entry */ -#define KDSKBSENT _IO(KD_IOCTL_BASE, 0x49) /* sets one function key string entry */ +#define KDGKBSENT 0x4B48 /* gets one function key string entry */ +#define KDSKBSENT 0x4B49 /* sets one function key string entry */ struct kbdiacr { unsigned char diacr, base, result; @@ -125,8 +124,8 @@ struct kbdiacrs { unsigned int kb_cnt; /* number of entries in following array */ struct kbdiacr kbdiacr[256]; /* MAX_DIACR from keyboard.h */ }; -#define KDGKBDIACR _IO(KD_IOCTL_BASE, 0x4A) /* read kernel accent table */ -#define KDSKBDIACR _IO(KD_IOCTL_BASE, 0x4B) /* write kernel accent table */ +#define KDGKBDIACR 0x4B4A /* read kernel accent table */ +#define KDSKBDIACR 0x4B4B /* write kernel accent table */ struct kbdiacruc { unsigned int diacr, base, result; @@ -135,16 +134,16 @@ struct kbdiacrsuc 
{ unsigned int kb_cnt; /* number of entries in following array */ struct kbdiacruc kbdiacruc[256]; /* MAX_DIACR from keyboard.h */ }; -#define KDGKBDIACRUC _IO(KD_IOCTL_BASE, 0xFA) /* read kernel accent table - UCS */ -#define KDSKBDIACRUC _IO(KD_IOCTL_BASE, 0xFB) /* write kernel accent table - UCS */ +#define KDGKBDIACRUC 0x4BFA /* read kernel accent table - UCS */ +#define KDSKBDIACRUC 0x4BFB /* write kernel accent table - UCS */ struct kbkeycode { unsigned int scancode, keycode; }; -#define KDGETKEYCODE _IO(KD_IOCTL_BASE, 0x4C) /* read kernel keycode table entry */ -#define KDSETKEYCODE _IO(KD_IOCTL_BASE, 0x4D) /* write kernel keycode table entry */ +#define KDGETKEYCODE 0x4B4C /* read kernel keycode table entry */ +#define KDSETKEYCODE 0x4B4D /* write kernel keycode table entry */ -#define KDSIGACCEPT _IO(KD_IOCTL_BASE, 0x4E) /* accept kbd generated signals */ +#define KDSIGACCEPT 0x4B4E /* accept kbd generated signals */ struct kbd_repeat { int delay; /* in msec; <= 0: don't change */ @@ -152,11 +151,10 @@ struct kbd_repeat { /* earlier this field was misnamed "rate" */ }; -#define KDKBDREP _IO(KD_IOCTL_BASE, 0x52) /* set keyboard delay/repeat rate; - * actually used values are returned - */ +#define KDKBDREP 0x4B52 /* set keyboard delay/repeat rate; + * actually used values are returned */ -#define KDFONTOP _IO(KD_IOCTL_BASE, 0x72) /* font operations */ +#define KDFONTOP 0x4B72 /* font operations */ struct console_font_op { unsigned int op; /* operation code KD_FONT_OP_* */ From c3f38fa61af77b49866b006939479069cd451173 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 2 Jun 2024 15:44:56 -0700 Subject: [PATCH 246/279] Linux 6.10-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f975b6396328..7f921ae547f1 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Baby Opossum Posse # *DOCUMENTATION* From 
e0e8e4bce61cac674fdabd85d070e7bab1634a8b Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Mon, 3 Jun 2024 10:32:23 +0300 Subject: [PATCH 247/279] ASoC: SOF: Intel: hda-dai: skip tlv for dspless mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sof_ipc4_dma_config_tlv{} is for Audio DSP firmware only. Don't set it in dspless mode. Fixes: 17386cb1b48b ("ASoC: SOF: Intel: hda-dai: set dma_stream_channel_map device") Signed-off-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Signed-off-by: Peter Ujfalusi Link: https://msgid.link/r/20240603073224.14726-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-dai.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index ce675c22a5ab..a2b6dbcfa918 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -525,6 +525,9 @@ int sdw_hda_dai_hw_params(struct snd_pcm_substream *substream, return ret; } + if (sdev->dspless_mode_selected) + return 0; + ipc4_copier = widget_to_copier(w); dma_config_tlv = &ipc4_copier->dma_config_tlv[cpu_dai_id]; dma_config = &dma_config_tlv->dma_config; From 3b06e137089fc0beb5ffa6a869de9a93df984072 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Mon, 3 Jun 2024 10:32:24 +0300 Subject: [PATCH 248/279] ASoC: SOF: Intel: hda-dai: remove skip_tlv label MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We just return 0 after the skip_tlv label. No need to use a label. 
Signed-off-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Signed-off-by: Peter Ujfalusi Link: https://msgid.link/r/20240603073224.14726-3-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-dai.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index a2b6dbcfa918..c61d298ea6b3 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -379,7 +379,7 @@ static int non_hda_dai_hw_params_data(struct snd_pcm_substream *substream, sdev = widget_to_sdev(w); if (sdev->dspless_mode_selected) - goto skip_tlv; + return 0; /* get stream_id */ hext_stream = ops->get_hext_stream(sdev, cpu_dai, substream); @@ -423,7 +423,6 @@ static int non_hda_dai_hw_params_data(struct snd_pcm_substream *substream, dma_config->dma_stream_channel_map.device_count = 1; dma_config->dma_priv_config_size = 0; -skip_tlv: return 0; } From d3cb3516f2540e6c384eef96b4ffeb49425175ed Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 31 May 2024 01:30:54 +0300 Subject: [PATCH 249/279] MAINTAINERS: copy linux-arm-msm for sound/qcom changes Not having linux-arm-msm@ in cc for audio-related changes for Qualcomm platforms means that interested parties can easily miss the patches. Add corresponding L: entry so that linux-arm-msm ML gets CC'ed for audio patches too. 
Signed-off-by: Dmitry Baryshkov Acked-by: Srinivas Kandagatla Link: https://msgid.link/r/20240531-asoc-qcom-cc-lamsm-v1-1-f026ad618496@linaro.org Signed-off-by: Mark Brown --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8754ac2c259d..451c1aa5af3c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18212,6 +18212,7 @@ QCOM AUDIO (ASoC) DRIVERS M: Srinivas Kandagatla M: Banajit Goswami L: alsa-devel@alsa-project.org (moderated for non-subscribers) +L: linux-arm-msm@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/soc/qcom/qcom,apr* F: Documentation/devicetree/bindings/sound/qcom,* From a73a83021ae136ab6b0d08eb196d84b1d02814e9 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 09:30:13 -0700 Subject: [PATCH 250/279] ASoC: mxs: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in sound/soc/mxs/snd-soc-mxs-pcm.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://msgid.link/r/20240602-md-snd-soc-mxs-pcm-v1-1-1e663d11328d@quicinc.com Signed-off-by: Mark Brown --- sound/soc/mxs/mxs-pcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/mxs/mxs-pcm.c b/sound/soc/mxs/mxs-pcm.c index df2e4be992d2..9bb08cadeb18 100644 --- a/sound/soc/mxs/mxs-pcm.c +++ b/sound/soc/mxs/mxs-pcm.c @@ -43,4 +43,5 @@ int mxs_pcm_platform_register(struct device *dev) } EXPORT_SYMBOL_GPL(mxs_pcm_platform_register); +MODULE_DESCRIPTION("MXS ASoC PCM driver"); MODULE_LICENSE("GPL"); From 7478e15bcc16cbc0fa1b8c431163bf651033c088 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 10:00:27 -0700 Subject: [PATCH 251/279] ASoC: fsl: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in sound/soc/fsl/imx-pcm-dma.o Add the missing invocation of the MODULE_DESCRIPTION() macro. 
Signed-off-by: Jeff Johnson Link: https://msgid.link/r/20240602-md-snd-fsl-imx-pcm-dma-v1-1-e7efc33c6bf3@quicinc.com Signed-off-by: Mark Brown --- sound/soc/fsl/imx-pcm-dma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/fsl/imx-pcm-dma.c b/sound/soc/fsl/imx-pcm-dma.c index 14e94270911c..4fa208d6a032 100644 --- a/sound/soc/fsl/imx-pcm-dma.c +++ b/sound/soc/fsl/imx-pcm-dma.c @@ -50,4 +50,5 @@ int imx_pcm_dma_init(struct platform_device *pdev) } EXPORT_SYMBOL_GPL(imx_pcm_dma_init); +MODULE_DESCRIPTION("Freescale i.MX PCM DMA interface"); MODULE_LICENSE("GPL"); From 968c974c08106fcf911d8d390d0f049af855d348 Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Mon, 3 Jun 2024 10:47:16 +0000 Subject: [PATCH 252/279] ASoC: rt722-sdca-sdw: add silence detection register as volatile Including silence detection register as volatile. Signed-off-by: Jack Yu Link: https://msgid.link/r/c66a6bd6d220426793096b42baf85437@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt722-sdca-sdw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/codecs/rt722-sdca-sdw.c b/sound/soc/codecs/rt722-sdca-sdw.c index b33da2215ade..f73ee3bf90f5 100644 --- a/sound/soc/codecs/rt722-sdca-sdw.c +++ b/sound/soc/codecs/rt722-sdca-sdw.c @@ -68,6 +68,7 @@ static bool rt722_sdca_mbq_readable_register(struct device *dev, unsigned int re case 0x200007f: case 0x2000082 ... 0x200008e: case 0x2000090 ... 0x2000094: + case 0x3110000: case 0x5300000 ... 0x5300002: case 0x5400002: case 0x5600000 ... 
0x5600007: @@ -125,6 +126,7 @@ static bool rt722_sdca_mbq_volatile_register(struct device *dev, unsigned int re case 0x2000067: case 0x2000084: case 0x2000086: + case 0x3110000: return true; default: return false; From 65909a7e7aa8b25c9cc5f04c1fd5d6f0f1d76fcd Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 3 Jun 2024 17:16:07 -0700 Subject: [PATCH 253/279] ASoC: qcom: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in sound/soc/qcom/snd-soc-qcom-sdw.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://msgid.link/r/20240603-md-snd-soc-qcom-sdw-v1-1-101ea8bcdd38@quicinc.com Signed-off-by: Mark Brown --- sound/soc/qcom/sdw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/qcom/sdw.c b/sound/soc/qcom/sdw.c index eaa8bb016e50..f2eda2ff46c0 100644 --- a/sound/soc/qcom/sdw.c +++ b/sound/soc/qcom/sdw.c @@ -160,4 +160,5 @@ int qcom_snd_sdw_hw_free(struct snd_pcm_substream *substream, return 0; } EXPORT_SYMBOL_GPL(qcom_snd_sdw_hw_free); +MODULE_DESCRIPTION("Qualcomm ASoC SoundWire helper functions"); MODULE_LICENSE("GPL"); From afe377286ad49e0b69071d2a767e2c6553f4094b Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Tue, 4 Jun 2024 14:28:43 +0100 Subject: [PATCH 254/279] ASoC: cs42l43: Increase default type detect time and button delay Some problematic headsets have been discovered, to help with correctly identifying these, the detect time must be increased. Also improve the reliability of the impedance value from the button detect by slightly increasing the button detect delay. 
Fixes: 686b8f711b99 ("ASoC: cs42l43: Lower default type detect time") Signed-off-by: Maciej Strozek Signed-off-by: Charles Keepax Link: https://msgid.link/r/20240604132843.3309114-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43-jack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs42l43-jack.c b/sound/soc/codecs/cs42l43-jack.c index 901b9dbcf585..d9ab003e166b 100644 --- a/sound/soc/codecs/cs42l43-jack.c +++ b/sound/soc/codecs/cs42l43-jack.c @@ -121,7 +121,7 @@ int cs42l43_set_jack(struct snd_soc_component *component, priv->buttons[3] = 735; } - ret = cs42l43_find_index(priv, "cirrus,detect-us", 1000, &priv->detect_us, + ret = cs42l43_find_index(priv, "cirrus,detect-us", 50000, &priv->detect_us, cs42l43_accdet_us, ARRAY_SIZE(cs42l43_accdet_us)); if (ret < 0) goto error; @@ -433,7 +433,7 @@ irqreturn_t cs42l43_button_press(int irq, void *data) // Wait for 2 full cycles of comb filter to ensure good reading queue_delayed_work(system_wq, &priv->button_press_work, - msecs_to_jiffies(10)); + msecs_to_jiffies(20)); return IRQ_HANDLED; } From b7c40988808f8d7426dee1e4d96a4e204de4a8bc Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 4 Jun 2024 10:19:46 +0800 Subject: [PATCH 255/279] ASoC: codecs: ES8326: Solve headphone detection issue When switching between OMTP and CTIA headset, we can hear pop noise. 
To solve this issue, We modified the configuration for headphone detection Signed-off-by: Zhang Yi Link: https://msgid.link/r/20240604021946.2911-1-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index 03b539ba540f..6a4e42e5e35b 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -857,12 +857,16 @@ static void es8326_jack_detect_handler(struct work_struct *work) * set auto-check mode, then restart jack_detect_work after 400ms. * Don't report jack status. */ - regmap_write(es8326->regmap, ES8326_INT_SOURCE, - (ES8326_INT_SRC_PIN9 | ES8326_INT_SRC_BUTTON)); + regmap_write(es8326->regmap, ES8326_INT_SOURCE, 0x00); regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x03, 0x01); + regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x10, 0x00); es8326_enable_micbias(es8326->component); usleep_range(50000, 70000); regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x03, 0x00); + regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x10, 0x10); + usleep_range(50000, 70000); + regmap_write(es8326->regmap, ES8326_INT_SOURCE, + (ES8326_INT_SRC_PIN9 | ES8326_INT_SRC_BUTTON)); regmap_write(es8326->regmap, ES8326_SYS_BIAS, 0x1f); regmap_update_bits(es8326->regmap, ES8326_HP_DRIVER_REF, 0x0f, 0x08); queue_delayed_work(system_wq, &es8326->jack_detect_work, From ccd8d753f0fe8f16745fa2b6be5946349731d901 Mon Sep 17 00:00:00 2001 From: Alibek Omarov Date: Tue, 4 Jun 2024 21:47:52 +0300 Subject: [PATCH 256/279] ASoC: rockchip: i2s-tdm: Fix trcm mode by setting clock on right mclk When TRCM mode is enabled, I2S RX and TX clocks are synchronized through selected clock source. Without this fix BCLK and LRCK might get parented to an uninitialized MCLK and the DAI will receive data at wrong pace. 
However, unlike in original i2s-tdm driver, there is no need to manually synchronize mclk_rx and mclk_tx, as only one gets used anyway. Tested on a board with RK3568 SoC and Silergy SY24145S codec with enabled and disabled TRCM mode. Fixes: 9e2ab4b18ebd ("ASoC: rockchip: i2s-tdm: Fix inaccurate sampling rates") Signed-off-by: Alibek Omarov Reviewed-by: Luca Ceresoli Link: https://msgid.link/r/20240604184752.697313-1-a1ba.omarov@gmail.com Signed-off-by: Mark Brown --- sound/soc/rockchip/rockchip_i2s_tdm.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sound/soc/rockchip/rockchip_i2s_tdm.c b/sound/soc/rockchip/rockchip_i2s_tdm.c index 9fa020ef7eab..ee517d7b5b7b 100644 --- a/sound/soc/rockchip/rockchip_i2s_tdm.c +++ b/sound/soc/rockchip/rockchip_i2s_tdm.c @@ -655,8 +655,17 @@ static int rockchip_i2s_tdm_hw_params(struct snd_pcm_substream *substream, int err; if (i2s_tdm->is_master_mode) { - struct clk *mclk = (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) ? - i2s_tdm->mclk_tx : i2s_tdm->mclk_rx; + struct clk *mclk; + + if (i2s_tdm->clk_trcm == TRCM_TX) { + mclk = i2s_tdm->mclk_tx; + } else if (i2s_tdm->clk_trcm == TRCM_RX) { + mclk = i2s_tdm->mclk_rx; + } else if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { + mclk = i2s_tdm->mclk_tx; + } else { + mclk = i2s_tdm->mclk_rx; + } err = clk_set_rate(mclk, DEFAULT_MCLK_FS * params_rate(params)); if (err) From 97d8613679eb53bd0c07d0fbd3d8471e46ba46c1 Mon Sep 17 00:00:00 2001 From: Hsin-Te Yuan Date: Fri, 31 May 2024 08:37:54 +0000 Subject: [PATCH 257/279] ASoC: mediatek: mt8183-da7219-max98357: Fix kcontrol name collision Since "Headphone Switch" kcontrol name has already been used by da7219, rename the control name from "Headphone" to "Headphones" to prevent the collision. Also, this change makes kcontrol name align with the one in mt8186-mt6366-da7219-max98357.c.
Fixes: 9c7388baa2053 ("ASoC: mediatek: mt8183-da7219-max98357: Map missing jack kcontrols") Change-Id: I9ae69a4673cd04786b247cc514fdd20f878ef009 Signed-off-by: Hsin-Te Yuan Reviewed-by: Chen-Yu Tsai Link: https://msgid.link/r/20240531-da7219-v1-1-ac3343f3ae6a@chromium.org Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8183/mt8183-da7219-max98357.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sound/soc/mediatek/mt8183/mt8183-da7219-max98357.c b/sound/soc/mediatek/mt8183/mt8183-da7219-max98357.c index acaf81fd6c9b..f848e14b091a 100644 --- a/sound/soc/mediatek/mt8183/mt8183-da7219-max98357.c +++ b/sound/soc/mediatek/mt8183/mt8183-da7219-max98357.c @@ -31,7 +31,7 @@ struct mt8183_da7219_max98357_priv { static struct snd_soc_jack_pin mt8183_da7219_max98357_jack_pins[] = { { - .pin = "Headphone", + .pin = "Headphones", .mask = SND_JACK_HEADPHONE, }, { @@ -626,7 +626,7 @@ static struct snd_soc_codec_conf mt6358_codec_conf[] = { }; static const struct snd_kcontrol_new mt8183_da7219_max98357_snd_controls[] = { - SOC_DAPM_PIN_SWITCH("Headphone"), + SOC_DAPM_PIN_SWITCH("Headphones"), SOC_DAPM_PIN_SWITCH("Headset Mic"), SOC_DAPM_PIN_SWITCH("Speakers"), SOC_DAPM_PIN_SWITCH("Line Out"), @@ -634,7 +634,7 @@ static const struct snd_kcontrol_new mt8183_da7219_max98357_snd_controls[] = { static const struct snd_soc_dapm_widget mt8183_da7219_max98357_dapm_widgets[] = { - SND_SOC_DAPM_HP("Headphone", NULL), + SND_SOC_DAPM_HP("Headphones", NULL), SND_SOC_DAPM_MIC("Headset Mic", NULL), SND_SOC_DAPM_SPK("Speakers", NULL), SND_SOC_DAPM_SPK("Line Out", NULL), @@ -680,7 +680,7 @@ static struct snd_soc_codec_conf mt8183_da7219_rt1015_codec_conf[] = { }; static const struct snd_kcontrol_new mt8183_da7219_rt1015_snd_controls[] = { - SOC_DAPM_PIN_SWITCH("Headphone"), + SOC_DAPM_PIN_SWITCH("Headphones"), SOC_DAPM_PIN_SWITCH("Headset Mic"), SOC_DAPM_PIN_SWITCH("Left Spk"), SOC_DAPM_PIN_SWITCH("Right Spk"), @@ -689,7 +689,7 @@ static const struct 
snd_kcontrol_new mt8183_da7219_rt1015_snd_controls[] = { static const struct snd_soc_dapm_widget mt8183_da7219_rt1015_dapm_widgets[] = { - SND_SOC_DAPM_HP("Headphone", NULL), + SND_SOC_DAPM_HP("Headphones", NULL), SND_SOC_DAPM_MIC("Headset Mic", NULL), SND_SOC_DAPM_SPK("Left Spk", NULL), SND_SOC_DAPM_SPK("Right Spk", NULL), From 2ed22161b19b11239aa742804549f63edd7c91e3 Mon Sep 17 00:00:00 2001 From: Andrei Simion Date: Tue, 4 Jun 2024 13:10:30 +0300 Subject: [PATCH 258/279] ASoC: atmel: atmel-classd: Re-add dai_link->platform to fix card init The removed dai_link->platform component cause a fail which is exposed at runtime. (ex: when a sound tool is used) This patch re-adds the dai_link->platform component to have a full card registered. Before this patch: :~$ aplay -l **** List of PLAYBACK Hardware Devices **** card 0: CLASSD [CLASSD], device 0: CLASSD PCM snd-soc-dummy-dai-0 [] Subdevices: 1/1 Subdevice #0: subdevice #0 :~$ speaker-test -t sine speaker-test 1.2.6 Playback device is default Stream parameters are 48000Hz, S16_LE, 1 channels Sine wave rate is 440.0000Hz Playback open error: -22,Invalid argument After this patch which restores the platform component: :~$ aplay -l **** List of PLAYBACK Hardware Devices **** card 0: CLASSD [CLASSD], device 0: CLASSD PCM snd-soc-dummy-dai-0 [CLASSD PCM snd-soc-dummy-dai-0] Subdevices: 1/1 Subdevice #0: subdevice #0 -> Resolve the playback error. 
Fixes: 2f650f87c03c ("ASoC: atmel: remove unnecessary dai_link->platform") Signed-off-by: Andrei Simion Acked-by: Kuninori Morimoto Link: https://msgid.link/r/20240604101030.237792-1-andrei.simion@microchip.com Signed-off-by: Mark Brown --- sound/soc/atmel/atmel-classd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sound/soc/atmel/atmel-classd.c b/sound/soc/atmel/atmel-classd.c index 6aed1ee443b4..ba314b279919 100644 --- a/sound/soc/atmel/atmel-classd.c +++ b/sound/soc/atmel/atmel-classd.c @@ -473,19 +473,22 @@ static int atmel_classd_asoc_card_init(struct device *dev, if (!dai_link) return -ENOMEM; - comp = devm_kzalloc(dev, sizeof(*comp), GFP_KERNEL); + comp = devm_kzalloc(dev, 2 * sizeof(*comp), GFP_KERNEL); if (!comp) return -ENOMEM; - dai_link->cpus = comp; + dai_link->cpus = &comp[0]; dai_link->codecs = &snd_soc_dummy_dlc; + dai_link->platforms = &comp[1]; dai_link->num_cpus = 1; dai_link->num_codecs = 1; + dai_link->num_platforms = 1; dai_link->name = "CLASSD"; dai_link->stream_name = "CLASSD PCM"; dai_link->cpus->dai_name = dev_name(dev); + dai_link->platforms->name = dev_name(dev); card->dai_link = dai_link; card->num_links = 1; From 97ab304ecd95c0b1703ff8c8c3956dc6e2afe8e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Mon, 3 Jun 2024 12:28:15 +0200 Subject: [PATCH 259/279] ASoC: topology: Fix references to freed memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most users after parsing a topology file, release memory used by it, so having pointer references directly into topology file contents is wrong. Use devm_kmemdup(), to allocate memory as needed. 
Reported-by: Jason Montleon Link: https://github.com/thesofproject/avs-topology-xml/issues/22#issuecomment-2127892605 Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20240603102818.36165-2-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 90ca37e008b3..75d9395a18ed 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1060,15 +1060,32 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, break; } - route->source = elem->source; - route->sink = elem->sink; + route->source = devm_kmemdup(tplg->dev, elem->source, + min(strlen(elem->source), + SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + GFP_KERNEL); + route->sink = devm_kmemdup(tplg->dev, elem->sink, + min(strlen(elem->sink), SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + GFP_KERNEL); + if (!route->source || !route->sink) { + ret = -ENOMEM; + break; + } /* set to NULL atm for tplg users */ route->connected = NULL; - if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == 0) + if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == 0) { route->control = NULL; - else - route->control = elem->control; + } else { + route->control = devm_kmemdup(tplg->dev, elem->control, + min(strlen(elem->control), + SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + GFP_KERNEL); + if (!route->control) { + ret = -ENOMEM; + break; + } + } /* add route dobj to dobj_list */ route->dobj.type = SND_SOC_DOBJ_GRAPH; From fd660b1bd015e5aa9a558ee04088f2431010548d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Mon, 3 Jun 2024 12:28:16 +0200 Subject: [PATCH 260/279] ASoC: Intel: avs: Fix route override MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of overriding existing memory strings that may be too short, just allocate needed 
memory and point the route at it. Reported-by: Jason Montleon Link: https://github.com/thesofproject/avs-topology-xml/issues/22#issuecomment-2127892605 Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20240603102818.36165-3-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/avs/topology.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sound/soc/intel/avs/topology.c b/sound/soc/intel/avs/topology.c index 02bae207f6ec..b6c5d94a1554 100644 --- a/sound/soc/intel/avs/topology.c +++ b/sound/soc/intel/avs/topology.c @@ -1545,8 +1545,8 @@ static int avs_route_load(struct snd_soc_component *comp, int index, { struct snd_soc_acpi_mach *mach = dev_get_platdata(comp->card->dev); size_t len = SNDRV_CTL_ELEM_ID_NAME_MAXLEN; - char buf[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; int ssp_port, tdm_slot; + char *buf; /* See parse_link_formatted_string() for dynamic naming when(s). */ if (!avs_mach_singular_ssp(mach)) @@ -1557,13 +1557,24 @@ static int avs_route_load(struct snd_soc_component *comp, int index, return 0; tdm_slot = avs_mach_ssp_tdm(mach, ssp_port); + buf = devm_kzalloc(comp->card->dev, len, GFP_KERNEL); + if (!buf) + return -ENOMEM; avs_ssp_sprint(buf, len, route->source, ssp_port, tdm_slot); - strscpy((char *)route->source, buf, len); + route->source = buf; + + buf = devm_kzalloc(comp->card->dev, len, GFP_KERNEL); + if (!buf) + return -ENOMEM; avs_ssp_sprint(buf, len, route->sink, ssp_port, tdm_slot); - strscpy((char *)route->sink, buf, len); + route->sink = buf; + if (route->control) { + buf = devm_kzalloc(comp->card->dev, len, GFP_KERNEL); + if (!buf) + return -ENOMEM; avs_ssp_sprint(buf, len, route->control, ssp_port, tdm_slot); - strscpy((char *)route->control, buf, len); + route->control = buf; } return 0; From daf0b99d4720c9f05bdb81c73b2efdb43fa9def3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Mon, 3 Jun 2024 12:28:17 
+0200 Subject: [PATCH 261/279] ASoC: topology: Do not assign fields that are already set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The routes are allocated with kzalloc(), so all fields are zeroed by default, skip unnecessary assignments. Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20240603102818.36165-4-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 75d9395a18ed..1db540aaad45 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1072,11 +1072,7 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, break; } - /* set to NULL atm for tplg users */ - route->connected = NULL; - if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == 0) { - route->control = NULL; - } else { + if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) != 0) { route->control = devm_kmemdup(tplg->dev, elem->control, min(strlen(elem->control), SNDRV_CTL_ELEM_ID_NAME_MAXLEN), From e0e7bc2cbee93778c4ad7d9a792d425ffb5af6f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Mon, 3 Jun 2024 12:28:18 +0200 Subject: [PATCH 262/279] ASoC: topology: Clean up route loading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using very long macro name, assign it to shorter variable and use it instead. While doing that, we can reduce multiple if checks using this define to one. 
Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20240603102818.36165-5-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 1db540aaad45..2ac442644ed4 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1021,6 +1021,7 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, struct snd_soc_tplg_hdr *hdr) { struct snd_soc_dapm_context *dapm = &tplg->comp->dapm; + const size_t maxlen = SNDRV_CTL_ELEM_ID_NAME_MAXLEN; struct snd_soc_tplg_dapm_graph_elem *elem; struct snd_soc_dapm_route *route; int count, i; @@ -1044,38 +1045,27 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, tplg->pos += sizeof(struct snd_soc_tplg_dapm_graph_elem); /* validate routes */ - if (strnlen(elem->source, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == - SNDRV_CTL_ELEM_ID_NAME_MAXLEN) { - ret = -EINVAL; - break; - } - if (strnlen(elem->sink, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == - SNDRV_CTL_ELEM_ID_NAME_MAXLEN) { - ret = -EINVAL; - break; - } - if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) == - SNDRV_CTL_ELEM_ID_NAME_MAXLEN) { + if ((strnlen(elem->source, maxlen) == maxlen) || + (strnlen(elem->sink, maxlen) == maxlen) || + (strnlen(elem->control, maxlen) == maxlen)) { ret = -EINVAL; break; } route->source = devm_kmemdup(tplg->dev, elem->source, - min(strlen(elem->source), - SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + min(strlen(elem->source), maxlen), GFP_KERNEL); route->sink = devm_kmemdup(tplg->dev, elem->sink, - min(strlen(elem->sink), SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + min(strlen(elem->sink), maxlen), GFP_KERNEL); if (!route->source || !route->sink) { ret = -ENOMEM; break; } - if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) != 0) { + if (strnlen(elem->control, maxlen) != 0) { route->control = 
devm_kmemdup(tplg->dev, elem->control, - min(strlen(elem->control), - SNDRV_CTL_ELEM_ID_NAME_MAXLEN), + min(strlen(elem->control), maxlen), GFP_KERNEL); if (!route->control) { ret = -ENOMEM; From e3209a1827646daaab744aa6a5767b1f57fb5385 Mon Sep 17 00:00:00 2001 From: Thomas GENTY Date: Sat, 8 Jun 2024 19:02:51 +0200 Subject: [PATCH 263/279] bytcr_rt5640 : inverse jack detect for Archos 101 cesium When headphones are plugged in, they appear absent; when they are removed, they appear present. Add a specific entry in bytcr_rt5640 for this device Signed-off-by: Thomas GENTY Reviewed-by: Hans de Goede Acked-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20240608170251.99936-1-tomlohave@gmail.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/bytcr_rt5640.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index b41a1147f1c3..a64d1989e28a 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -610,6 +610,17 @@ static const struct dmi_system_id byt_rt5640_quirk_table[] = { BYT_RT5640_SSP0_AIF1 | BYT_RT5640_MCLK_EN), }, + { + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ARCHOS"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "ARCHOS 101 CESIUM"), + }, + .driver_data = (void *)(BYTCR_INPUT_DEFAULTS | + BYT_RT5640_JD_NOT_INV | + BYT_RT5640_DIFF_MIC | + BYT_RT5640_SSP0_AIF1 | + BYT_RT5640_MCLK_EN), + }, { .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ARCHOS"), From e8343410ddf08fc36a9b9cc7c51a4e53a262d4c6 Mon Sep 17 00:00:00 2001 From: Jai Luthra Date: Tue, 11 Jun 2024 18:02:55 +0530 Subject: [PATCH 264/279] ALSA: dmaengine: Synchronize dma channel after drop() Sometimes the stream may be stopped due to XRUN events, in which case the userspace can call snd_pcm_drop() and snd_pcm_prepare() to stop and start the stream again. 
In these cases, we must wait for the DMA channel to synchronize before marking the stream as prepared for playback, as the DMA channel gets stopped by drop() without any synchronization. Make sure the ALSA core synchronizes the DMA channel by adding a sync_stop() hook. Reviewed-by: Peter Ujfalusi Signed-off-by: Jai Luthra Link: https://lore.kernel.org/r/20240611-asoc_next-v3-1-fcfd84b12164@ti.com Signed-off-by: Mark Brown --- include/sound/dmaengine_pcm.h | 1 + sound/core/pcm_dmaengine.c | 10 ++++++++++ sound/soc/soc-generic-dmaengine-pcm.c | 8 ++++++++ 3 files changed, 19 insertions(+) diff --git a/include/sound/dmaengine_pcm.h b/include/sound/dmaengine_pcm.h index c11aaf8079fb..f6baa9a01868 100644 --- a/include/sound/dmaengine_pcm.h +++ b/include/sound/dmaengine_pcm.h @@ -36,6 +36,7 @@ snd_pcm_uframes_t snd_dmaengine_pcm_pointer_no_residue(struct snd_pcm_substream int snd_dmaengine_pcm_open(struct snd_pcm_substream *substream, struct dma_chan *chan); int snd_dmaengine_pcm_close(struct snd_pcm_substream *substream); +int snd_dmaengine_pcm_sync_stop(struct snd_pcm_substream *substream); int snd_dmaengine_pcm_open_request_chan(struct snd_pcm_substream *substream, dma_filter_fn filter_fn, void *filter_data); diff --git a/sound/core/pcm_dmaengine.c b/sound/core/pcm_dmaengine.c index 12aa1cef11a1..ed07fa5693d2 100644 --- a/sound/core/pcm_dmaengine.c +++ b/sound/core/pcm_dmaengine.c @@ -349,6 +349,16 @@ int snd_dmaengine_pcm_open_request_chan(struct snd_pcm_substream *substream, } EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_open_request_chan); +int snd_dmaengine_pcm_sync_stop(struct snd_pcm_substream *substream) +{ + struct dmaengine_pcm_runtime_data *prtd = substream_to_prtd(substream); + + dmaengine_synchronize(prtd->dma_chan); + + return 0; +} +EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_sync_stop); + /** * snd_dmaengine_pcm_close - Close a dmaengine based PCM substream * @substream: PCM substream diff --git a/sound/soc/soc-generic-dmaengine-pcm.c 
b/sound/soc/soc-generic-dmaengine-pcm.c index ea3bc9318412..a63e942fdc0b 100644 --- a/sound/soc/soc-generic-dmaengine-pcm.c +++ b/sound/soc/soc-generic-dmaengine-pcm.c @@ -318,6 +318,12 @@ static int dmaengine_copy(struct snd_soc_component *component, return 0; } +static int dmaengine_pcm_sync_stop(struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + return snd_dmaengine_pcm_sync_stop(substream); +} + static const struct snd_soc_component_driver dmaengine_pcm_component = { .name = SND_DMAENGINE_PCM_DRV_NAME, .probe_order = SND_SOC_COMP_ORDER_LATE, @@ -327,6 +333,7 @@ static const struct snd_soc_component_driver dmaengine_pcm_component = { .trigger = dmaengine_pcm_trigger, .pointer = dmaengine_pcm_pointer, .pcm_construct = dmaengine_pcm_new, + .sync_stop = dmaengine_pcm_sync_stop, }; static const struct snd_soc_component_driver dmaengine_pcm_component_process = { @@ -339,6 +346,7 @@ static const struct snd_soc_component_driver dmaengine_pcm_component_process = { .pointer = dmaengine_pcm_pointer, .copy = dmaengine_copy, .pcm_construct = dmaengine_pcm_new, + .sync_stop = dmaengine_pcm_sync_stop, }; static const char * const dmaengine_pcm_dma_channel_names[] = { From c5dcf8ab10606e76c1d8a0ec77f27d84a392e874 Mon Sep 17 00:00:00 2001 From: Jai Luthra Date: Tue, 11 Jun 2024 18:02:56 +0530 Subject: [PATCH 265/279] ASoC: ti: davinci-mcasp: Set min period size using FIFO config The minimum period size was enforced to 64 as older devices integrating McASP with EDMA used an internal FIFO of 64 samples. With UDMA based platforms this internal McASP FIFO is optional, as the DMA engine internally does some buffering which is already accounted for when registering the platform. So we should read the actual FIFO configuration (txnumevt/rxnumevt) instead of hardcoding frames.min to 64. 
Acked-by: Peter Ujfalusi Signed-off-by: Jai Luthra Link: https://lore.kernel.org/r/20240611-asoc_next-v3-2-fcfd84b12164@ti.com Signed-off-by: Mark Brown --- sound/soc/ti/davinci-mcasp.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sound/soc/ti/davinci-mcasp.c b/sound/soc/ti/davinci-mcasp.c index 1e760c315521..2b1ed91a736c 100644 --- a/sound/soc/ti/davinci-mcasp.c +++ b/sound/soc/ti/davinci-mcasp.c @@ -1472,10 +1472,11 @@ static int davinci_mcasp_hw_rule_min_periodsize( { struct snd_interval *period_size = hw_param_interval(params, SNDRV_PCM_HW_PARAM_PERIOD_SIZE); + u8 numevt = *((u8 *)rule->private); struct snd_interval frames; snd_interval_any(&frames); - frames.min = 64; + frames.min = numevt; frames.integer = 1; return snd_interval_refine(period_size, &frames); @@ -1490,6 +1491,7 @@ static int davinci_mcasp_startup(struct snd_pcm_substream *substream, u32 max_channels = 0; int i, dir, ret; int tdm_slots = mcasp->tdm_slots; + u8 *numevt; /* Do not allow more then one stream per direction */ if (mcasp->substreams[substream->stream]) @@ -1589,9 +1591,12 @@ static int davinci_mcasp_startup(struct snd_pcm_substream *substream, return ret; } + numevt = (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) ? + &mcasp->txnumevt : + &mcasp->rxnumevt; snd_pcm_hw_rule_add(substream->runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, - davinci_mcasp_hw_rule_min_periodsize, NULL, + davinci_mcasp_hw_rule_min_periodsize, numevt, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, -1); return 0; From 524d3f126362b6033e92cbe107ae2158d7fbff94 Mon Sep 17 00:00:00 2001 From: Primoz Fiser Date: Mon, 10 Jun 2024 14:58:47 +0200 Subject: [PATCH 266/279] ASoC: ti: omap-hdmi: Fix too long driver name Set driver name to "HDMI". 
This simplifies the code and gets rid of the following error messages: ASoC: driver name too long 'HDMI 58040000.encoder' -> 'HDMI_58040000_e' Signed-off-by: Primoz Fiser Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20240610125847.773394-1-primoz.fiser@norik.com Signed-off-by: Mark Brown --- sound/soc/ti/omap-hdmi.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sound/soc/ti/omap-hdmi.c b/sound/soc/ti/omap-hdmi.c index 639bc83f4263..cf43ac19c4a6 100644 --- a/sound/soc/ti/omap-hdmi.c +++ b/sound/soc/ti/omap-hdmi.c @@ -354,11 +354,7 @@ static int omap_hdmi_audio_probe(struct platform_device *pdev) if (!card) return -ENOMEM; - card->name = devm_kasprintf(dev, GFP_KERNEL, - "HDMI %s", dev_name(ad->dssdev)); - if (!card->name) - return -ENOMEM; - + card->name = "HDMI"; card->owner = THIS_MODULE; card->dai_link = devm_kzalloc(dev, sizeof(*(card->dai_link)), GFP_KERNEL); From 6f2a43e3d14f6e31a3b041a1043195d02c54d615 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 12 Jun 2024 15:12:03 +0300 Subject: [PATCH 267/279] ASoC: SOF: sof-audio: Skip unprepare for in-use widgets on error rollback If the ipc_prepare() callback fails for a module instance, on error rewind we must skip the ipc_unprepare() call for ones that has positive use count. The positive use count means that the module instance is in active use, it cannot be unprepared. The issue affects capture direction paths with branches (single dai with multiple PCMs), the affected widgets are in the shared part of the paths. 
Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Kai Vehmanen Reviewed-by: Ranjani Sridharan Link: https://lore.kernel.org/r/20240612121203.15468-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/sof-audio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sof/sof-audio.c b/sound/soc/sof/sof-audio.c index b3ac040811e7..ef9318947d74 100644 --- a/sound/soc/sof/sof-audio.c +++ b/sound/soc/sof/sof-audio.c @@ -485,7 +485,7 @@ sof_prepare_widgets_in_path(struct snd_sof_dev *sdev, struct snd_soc_dapm_widget if (ret < 0) { /* unprepare the source widget */ if (widget_ops[widget->id].ipc_unprepare && - swidget && swidget->prepared) { + swidget && swidget->prepared && swidget->use_count == 0) { widget_ops[widget->id].ipc_unprepare(swidget); swidget->prepared = false; } From f3b198e4788fcc8d03ed0c8bd5e3856c6a5760c5 Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Wed, 12 Jun 2024 09:01:07 +0000 Subject: [PATCH 268/279] ASoC: rt722-sdca-sdw: add debounce time for type detection Add debounce time in headset type detection for better performance. 
Signed-off-by: Jack Yu Link: https://lore.kernel.org/r/7e502e9a9dd94122a1b60deb5ceb60fb@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt722-sdca-sdw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/rt722-sdca-sdw.c b/sound/soc/codecs/rt722-sdca-sdw.c index f73ee3bf90f5..87354bb1564e 100644 --- a/sound/soc/codecs/rt722-sdca-sdw.c +++ b/sound/soc/codecs/rt722-sdca-sdw.c @@ -352,7 +352,7 @@ static int rt722_sdca_interrupt_callback(struct sdw_slave *slave, if (status->sdca_cascade && !rt722->disable_irq) mod_delayed_work(system_power_efficient_wq, - &rt722->jack_detect_work, msecs_to_jiffies(30)); + &rt722->jack_detect_work, msecs_to_jiffies(280)); mutex_unlock(&rt722->disable_irq_lock); From 0298f51652be47b79780833e0b63194e1231fa34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Thu, 13 Jun 2024 11:01:26 +0200 Subject: [PATCH 269/279] ASoC: topology: Fix route memory corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was reported that recent fix for memory corruption during topology load, causes corruption in other cases. Instead of being overeager with checking topology, assume that it is properly formatted and just duplicate strings. 
Reported-by: Pierre-Louis Bossart Closes: https://lore.kernel.org/linux-sound/171812236450.201359.3019210915105428447.b4-ty@kernel.org/T/#m8c4bd5abf453960fde6f826c4b7f84881da63e9d Suggested-by: Péter Ujfalusi Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20240613090126.841189-1-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 2ac442644ed4..6951ff7bc61e 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1052,21 +1052,15 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg, break; } - route->source = devm_kmemdup(tplg->dev, elem->source, - min(strlen(elem->source), maxlen), - GFP_KERNEL); - route->sink = devm_kmemdup(tplg->dev, elem->sink, - min(strlen(elem->sink), maxlen), - GFP_KERNEL); + route->source = devm_kstrdup(tplg->dev, elem->source, GFP_KERNEL); + route->sink = devm_kstrdup(tplg->dev, elem->sink, GFP_KERNEL); if (!route->source || !route->sink) { ret = -ENOMEM; break; } if (strnlen(elem->control, maxlen) != 0) { - route->control = devm_kmemdup(tplg->dev, elem->control, - min(strlen(elem->control), maxlen), - GFP_KERNEL); + route->control = devm_kstrdup(tplg->dev, elem->control, GFP_KERNEL); if (!route->control) { ret = -ENOMEM; break; From 8af49868e51ed1ba117b74728af12abe1eda82e5 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 13 Jun 2024 14:25:27 +0100 Subject: [PATCH 270/279] ASoC: cs35l56: Disconnect ASP1 TX sources when ASP1 DAI is hooked up If the ASP1 DAI is hooked up by the machine driver the ASP TX mixer sources should be initialized to disconnected. There aren't currently any available products using the ASP so this doesn't affect any existing systems. The cs35l56 does not have any fixed default for the mixer source registers. 
When the cs35l56 boots, its firmware patches these registers to setup a system-specific routing; this is so that Windows can use generic SDCA drivers instead of needing knowledge of chip-specific registers. The setup varies between end-products, which each have customized firmware, and so the default register state varies between end-products. It can also change if the firmware on an end-product is upgraded - for example if a change was needed to the routing for Windows use-cases. It must be emphasized that the settings applied by the firmware are not internal magic tuning; they are statically implementing use-case setup that on Linux would be done via ALSA controls. The driver is currently syncing the mixer controls with whatever initial state the firmware wrote to the registers, so that they report the actual audio routing. But if the ASP DAI is hooked up this can create a powered-up DAPM graph without anything intentionally setting up a path. This can lead to parts of the audio system powering up unexpectedly. For example when cs35l56 is connected to cs42l43 using a codec-codec link, this can create a complete DAPM graph which then powers-up cs42l43. But the cs42l43 can only be clocked from its SoundWire bus so this causes a bunch of errors in the kernel log where cs42l43 is unexpectedly powered-up without a clock. If the host is taking ownership of the ASP (either directly or as a codec-to-codec link) there is no need to keep the mixer settings that the firmware wrote. The driver has ALSA controls for setting these using standard Linux mechanisms. So if the machine driver hooks up the ASP the ASP mixers are initialized to "None" (no input). This prevents unintended DAPM-graph power-ups, and means the initial state of the mixers is always going to be None. Since the initial state of the mixers can vary from system to system and potentially between firmware upgrades, no use-case manager can currently assume that cs35l56 has a known initial state. 
The firmware could just as easily default them to "None" as to any input source. So defaulting them to "None" in the driver is not increasing the entropy of the system. Signed-off-by: Richard Fitzgerald Link: https://lore.kernel.org/r/20240613132527.46537-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56-shared.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index 8af89a263594..30497152e02a 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -215,6 +215,10 @@ static const struct reg_sequence cs35l56_asp1_defaults[] = { REG_SEQ0(CS35L56_ASP1_FRAME_CONTROL5, 0x00020100), REG_SEQ0(CS35L56_ASP1_DATA_CONTROL1, 0x00000018), REG_SEQ0(CS35L56_ASP1_DATA_CONTROL5, 0x00000018), + REG_SEQ0(CS35L56_ASP1TX1_INPUT, 0x00000000), + REG_SEQ0(CS35L56_ASP1TX2_INPUT, 0x00000000), + REG_SEQ0(CS35L56_ASP1TX3_INPUT, 0x00000000), + REG_SEQ0(CS35L56_ASP1TX4_INPUT, 0x00000000), }; /* From be1fae62cf253a5b67526cee9fbc07689b97c125 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Thu, 13 Jun 2024 13:13:05 +0100 Subject: [PATCH 271/279] ASoC: q6apm-lpass-dai: close graph on prepare errors There is an issue around with error handling and graph management with the existing code, none of the error paths close the graph, which results in leaving the loaded graph in dsp, however the driver thinks otherwise. This can have a nasty side effect especially when we try to load the same graph to dsp, dsp returns error which leaves the board with no sound and requires restart. Fix this by properly closing the graph when we hit errors between open and close.
Fixes: 30ad723b93ad ("ASoC: qdsp6: audioreach: add q6apm lpass dai support") Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Tested-by: Dmitry Baryshkov # X13s Link: https://lore.kernel.org/r/20240613-q6apm-fixes-v1-1-d88953675ab3@linaro.org Signed-off-by: Mark Brown --- sound/soc/qcom/qdsp6/q6apm-lpass-dais.c | 32 +++++++++++++++---------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c index 68a38f63a2db..66b911b49e3f 100644 --- a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c +++ b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c @@ -141,14 +141,17 @@ static void q6apm_lpass_dai_shutdown(struct snd_pcm_substream *substream, struct struct q6apm_lpass_dai_data *dai_data = dev_get_drvdata(dai->dev); int rc; - if (!dai_data->is_port_started[dai->id]) - return; - rc = q6apm_graph_stop(dai_data->graph[dai->id]); - if (rc < 0) - dev_err(dai->dev, "fail to close APM port (%d)\n", rc); + if (dai_data->is_port_started[dai->id]) { + rc = q6apm_graph_stop(dai_data->graph[dai->id]); + dai_data->is_port_started[dai->id] = false; + if (rc < 0) + dev_err(dai->dev, "fail to close APM port (%d)\n", rc); + } - q6apm_graph_close(dai_data->graph[dai->id]); - dai_data->is_port_started[dai->id] = false; + if (dai_data->graph[dai->id]) { + q6apm_graph_close(dai_data->graph[dai->id]); + dai_data->graph[dai->id] = NULL; + } } static int q6apm_lpass_dai_prepare(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) @@ -163,8 +166,10 @@ static int q6apm_lpass_dai_prepare(struct snd_pcm_substream *substream, struct s q6apm_graph_stop(dai_data->graph[dai->id]); dai_data->is_port_started[dai->id] = false; - if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) + if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { q6apm_graph_close(dai_data->graph[dai->id]); + dai_data->graph[dai->id] = NULL; + } } /** @@ -183,26 +188,29 @@ static int q6apm_lpass_dai_prepare(struct snd_pcm_substream 
*substream, struct s cfg->direction = substream->stream; rc = q6apm_graph_media_format_pcm(dai_data->graph[dai->id], cfg); - if (rc) { dev_err(dai->dev, "Failed to set media format %d\n", rc); - return rc; + goto err; } rc = q6apm_graph_prepare(dai_data->graph[dai->id]); if (rc) { dev_err(dai->dev, "Failed to prepare Graph %d\n", rc); - return rc; + goto err; } rc = q6apm_graph_start(dai_data->graph[dai->id]); if (rc < 0) { dev_err(dai->dev, "fail to start APM port %x\n", dai->id); - return rc; + goto err; } dai_data->is_port_started[dai->id] = true; return 0; +err: + q6apm_graph_close(dai_data->graph[dai->id]); + dai_data->graph[dai->id] = NULL; + return rc; } static int q6apm_lpass_dai_startup(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) From 60ff540a1d476c2d48b96f7bc8ac8581b820e878 Mon Sep 17 00:00:00 2001 From: Shuming Fan Date: Wed, 12 Jun 2024 15:57:40 +0800 Subject: [PATCH 272/279] ASoC: Intel: soc-acpi: mtl: fix speaker no sound on Dell SKU 0C64 Dell SKU 0C64 has a single rt1318 amplifier. The prefix name of control still needs to be set rt1318-1 corresponding to UCM config. 
Signed-off-by: Shuming Fan Reviewed-by: Bard Liao Link: https://msgid.link/r/20240612075740.1678082-1-shumingf@realtek.com Signed-off-by: Mark Brown --- sound/soc/intel/common/soc-acpi-intel-mtl-match.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/intel/common/soc-acpi-intel-mtl-match.c b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c index 48252fa9e39e..8e0ae3635a35 100644 --- a/sound/soc/intel/common/soc-acpi-intel-mtl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c @@ -293,7 +293,7 @@ static const struct snd_soc_acpi_adr_device rt1318_1_single_adr[] = { .adr = 0x000130025D131801, .num_endpoints = 1, .endpoints = &single_endpoint, - .name_prefix = "rt1318" + .name_prefix = "rt1318-1" } }; From 98d919dfee1cc402ca29d45da642852d7c9a2301 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 17 Jun 2024 12:58:34 +0530 Subject: [PATCH 273/279] ASoC: amd: acp: add a null check for chip_pdev structure When acp platform device creation is skipped, chip->chip_pdev value will remain NULL. Add NULL check for chip->chip_pdev structure in snd_acp_resume() function to avoid null pointer dereference. 
Fixes: 088a40980efb ("ASoC: amd: acp: add pm ops support for acp pci driver") Signed-off-by: Vijendar Mukunda Link: https://msgid.link/r/20240617072844.871468-1-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-pci.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sound/soc/amd/acp/acp-pci.c b/sound/soc/amd/acp/acp-pci.c index ad320b29e87d..aa3e72d13451 100644 --- a/sound/soc/amd/acp/acp-pci.c +++ b/sound/soc/amd/acp/acp-pci.c @@ -199,10 +199,12 @@ static int __maybe_unused snd_acp_resume(struct device *dev) ret = acp_init(chip); if (ret) dev_err(dev, "ACP init failed\n"); - child = chip->chip_pdev->dev; - adata = dev_get_drvdata(&child); - if (adata) - acp_enable_interrupts(adata); + if (chip->chip_pdev) { + child = chip->chip_pdev->dev; + adata = dev_get_drvdata(&child); + if (adata) + acp_enable_interrupts(adata); + } return ret; } From 70fa3900c3ed92158628710e81d274e5cb52f92b Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 17 Jun 2024 12:58:35 +0530 Subject: [PATCH 274/279] ASoC: amd: acp: remove i2s configuration check in acp_i2s_probe() ACP supports different pin configurations for I2S IO. Checking ACP pin configuration value against specific value breaks the functionality for other I2S pin configurations. This check is no longer required in i2s dai driver probe call as i2s configuration check will be verified during acp platform device creation sequence. Remove i2s_mode check in acp_i2s_probe() function. 
Fixes: b24484c18b10 ("ASoC: amd: acp: ACP code generic to support newer platforms") Signed-off-by: Vijendar Mukunda Link: https://msgid.link/r/20240617072844.871468-2-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-i2s.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sound/soc/amd/acp/acp-i2s.c b/sound/soc/amd/acp/acp-i2s.c index 60cbc881be6e..ef12f97ddc69 100644 --- a/sound/soc/amd/acp/acp-i2s.c +++ b/sound/soc/amd/acp/acp-i2s.c @@ -588,20 +588,12 @@ static int acp_i2s_probe(struct snd_soc_dai *dai) { struct device *dev = dai->component->dev; struct acp_dev_data *adata = dev_get_drvdata(dev); - struct acp_resource *rsrc = adata->rsrc; - unsigned int val; if (!adata->acp_base) { dev_err(dev, "I2S base is NULL\n"); return -EINVAL; } - val = readl(adata->acp_base + rsrc->i2s_pin_cfg_offset); - if (val != rsrc->i2s_mode) { - dev_err(dev, "I2S Mode not supported val %x\n", val); - return -EINVAL; - } - return 0; } From 379bcd2c9197bf2c429434e8a01cea0ee1852316 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 17 Jun 2024 12:58:36 +0530 Subject: [PATCH 275/279] ASoC: amd: acp: move chip->flag variable assignment chip->flag variable assignment will be skipped when acp platform device creation is skipped. In this case chip->flag value will not be set. chip->flag variable should be assigned along with other structure variables for 'chip' structure. Move chip->flag variable assignment prior to acp platform device creation.
Fixes: 3a94c8ad0aae ("ASoC: amd: acp: add code for scanning acp pdm controller") Signed-off-by: Vijendar Mukunda Link: https://msgid.link/r/20240617072844.871468-3-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/amd/acp/acp-pci.c b/sound/soc/amd/acp/acp-pci.c index aa3e72d13451..777b5a78d8a9 100644 --- a/sound/soc/amd/acp/acp-pci.c +++ b/sound/soc/amd/acp/acp-pci.c @@ -100,6 +100,7 @@ static int acp_pci_probe(struct pci_dev *pci, const struct pci_device_id *pci_id ret = -EINVAL; goto release_regions; } + chip->flag = flag; dmic_dev = platform_device_register_data(dev, "dmic-codec", PLATFORM_DEVID_NONE, NULL, 0); if (IS_ERR(dmic_dev)) { dev_err(dev, "failed to create DMIC device\n"); @@ -139,7 +140,6 @@ static int acp_pci_probe(struct pci_dev *pci, const struct pci_device_id *pci_id } } - chip->flag = flag; memset(&pdevinfo, 0, sizeof(pdevinfo)); pdevinfo.name = chip->name; From 90f3feb24172185f1832636264943e8b5e289245 Mon Sep 17 00:00:00 2001 From: Elinor Montmasson Date: Thu, 20 Jun 2024 15:25:03 +0200 Subject: [PATCH 276/279] ASoC: fsl-asoc-card: set priv->pdev before using it priv->pdev pointer was set after being used in fsl_asoc_card_audmux_init(). Move this assignment to the start of the probe function, so sub-functions can correctly use pdev through priv. fsl_asoc_card_audmux_init() dereferences priv->pdev to get access to the dev struct, used with dev_err macros. As priv is zero-initialised, there would be a NULL pointer dereference. Note that if priv->pdev is dereferenced before assignment but never used, for example if there is no error to be printed, the driver probably won't crash due to compiler optimisations.
Fixes: 708b4351f08c ("ASoC: fsl: Add Freescale Generic ASoC Sound Card with ASRC support") Signed-off-by: Elinor Montmasson Link: https://patch.msgid.link/20240620132511.4291-2-elinor.montmasson@savoirfairelinux.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl-asoc-card.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/fsl/fsl-asoc-card.c b/sound/soc/fsl/fsl-asoc-card.c index 5ddc0c2fe53f..eb67689dcd6e 100644 --- a/sound/soc/fsl/fsl-asoc-card.c +++ b/sound/soc/fsl/fsl-asoc-card.c @@ -559,6 +559,8 @@ static int fsl_asoc_card_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; + priv->pdev = pdev; + cpu_np = of_parse_phandle(np, "audio-cpu", 0); /* Give a chance to old DT binding */ if (!cpu_np) @@ -787,7 +789,6 @@ static int fsl_asoc_card_probe(struct platform_device *pdev) } /* Initialize sound card */ - priv->pdev = pdev; priv->card.dev = &pdev->dev; priv->card.owner = THIS_MODULE; ret = snd_soc_of_parse_card_name(&priv->card, "model"); From 282a4482e198e03781c152c88aac8aa382ef9a55 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 24 Jun 2024 14:12:56 +0800 Subject: [PATCH 277/279] ASoC: mediatek: mt8195: Add platform entry for ETDM1_OUT_BE dai link Commit e70b8dd26711 ("ASoC: mediatek: mt8195: Remove afe-dai component and rework codec link") removed the codec entry for the ETDM1_OUT_BE dai link entirely instead of replacing it with COMP_EMPTY(). This worked by accident as the remaining COMP_EMPTY() platform entry became the codec entry, and the platform entry became completely empty, effectively the same as COMP_DUMMY() since snd_soc_fill_dummy_dai() doesn't do anything for platform entries. 
This causes a KASAN out-of-bounds warning in mtk_soundcard_common_probe() in sound/soc/mediatek/common/mtk-soundcard-driver.c: for_each_card_prelinks(card, i, dai_link) { if (adsp_node && !strncmp(dai_link->name, "AFE_SOF", strlen("AFE_SOF"))) dai_link->platforms->of_node = adsp_node; else if (!dai_link->platforms->name && !dai_link->platforms->of_node) dai_link->platforms->of_node = platform_node; } where the code expects the platforms array to have space for at least one entry. Add a COMP_EMPTY() entry so that dai_link->platforms has space. Fixes: e70b8dd26711 ("ASoC: mediatek: mt8195: Remove afe-dai component and rework codec link") Signed-off-by: Chen-Yu Tsai Reviewed-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20240624061257.3115467-1-wenst@chromium.org Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8195/mt8195-mt6359.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/mediatek/mt8195/mt8195-mt6359.c b/sound/soc/mediatek/mt8195/mt8195-mt6359.c index ca8751190520..2832ef78eaed 100644 --- a/sound/soc/mediatek/mt8195/mt8195-mt6359.c +++ b/sound/soc/mediatek/mt8195/mt8195-mt6359.c @@ -827,6 +827,7 @@ SND_SOC_DAILINK_DEFS(ETDM2_IN_BE, SND_SOC_DAILINK_DEFS(ETDM1_OUT_BE, DAILINK_COMP_ARRAY(COMP_CPU("ETDM1_OUT")), + DAILINK_COMP_ARRAY(COMP_EMPTY()), DAILINK_COMP_ARRAY(COMP_EMPTY())); SND_SOC_DAILINK_DEFS(ETDM2_OUT_BE, From 63b47f026cc841bd3d3438dd6fccbc394dfead87 Mon Sep 17 00:00:00 2001 From: Vyacheslav Frantsishko Date: Wed, 26 Jun 2024 10:03:34 +0300 Subject: [PATCH 278/279] ASoC: amd: yc: Fix non-functional mic on ASUS M5602RA The Vivobook S 16X IPS needs a quirks-table entry for the internal microphone to function properly.
Signed-off-by: Vyacheslav Frantsishko Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/20240626070334.45633-1-itmymaill@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index 1760b5d42460..4e3a8ce690a4 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -283,6 +283,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "M5402RA"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "M5602RA"), + } + }, { .driver_data = &acp6x_card, .matches = { From 68f97fe330e01450ace63da0ce5cab676fc97f9a Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Wed, 26 Jun 2024 08:25:34 +0000 Subject: [PATCH 279/279] ASoC: rt5645: fix issue of random interrupt from push-button Modify register setting sequence of enabling inline command to fix issue of random interrupt from push-button. 
Signed-off-by: Jack Yu Link: https://patch.msgid.link/9a7a3a66cbcb426487ca6f558f45e922@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5645.c | 24 ++++++++++++++++++------ sound/soc/codecs/rt5645.h | 6 ++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/sound/soc/codecs/rt5645.c b/sound/soc/codecs/rt5645.c index cdb7ff7020e9..51187b1e0ed2 100644 --- a/sound/soc/codecs/rt5645.c +++ b/sound/soc/codecs/rt5645.c @@ -81,7 +81,7 @@ static const struct reg_sequence init_list[] = { static const struct reg_sequence rt5650_init_list[] = { {0xf6, 0x0100}, {RT5645_PWR_ANLG1, 0x02}, - {RT5645_IL_CMD3, 0x0018}, + {RT5645_IL_CMD3, 0x6728}, }; static const struct reg_default rt5645_reg[] = { @@ -3130,20 +3130,32 @@ static void rt5645_enable_push_button_irq(struct snd_soc_component *component, bool enable) { struct snd_soc_dapm_context *dapm = snd_soc_component_get_dapm(component); + int ret; if (enable) { snd_soc_dapm_force_enable_pin(dapm, "ADC L power"); snd_soc_dapm_force_enable_pin(dapm, "ADC R power"); snd_soc_dapm_sync(dapm); + snd_soc_component_update_bits(component, RT5650_4BTN_IL_CMD2, + RT5645_EN_4BTN_IL_MASK | RT5645_RST_4BTN_IL_MASK, + RT5645_EN_4BTN_IL_EN | RT5645_RST_4BTN_IL_RST); + usleep_range(10000, 15000); + snd_soc_component_update_bits(component, RT5650_4BTN_IL_CMD2, + RT5645_EN_4BTN_IL_MASK | RT5645_RST_4BTN_IL_MASK, + RT5645_EN_4BTN_IL_EN | RT5645_RST_4BTN_IL_NORM); + msleep(50); + ret = snd_soc_component_read(component, RT5645_INT_IRQ_ST); + pr_debug("%s read %x = %x\n", __func__, RT5645_INT_IRQ_ST, + snd_soc_component_read(component, RT5645_INT_IRQ_ST)); + snd_soc_component_write(component, RT5645_INT_IRQ_ST, ret); + ret = snd_soc_component_read(component, RT5650_4BTN_IL_CMD1); + pr_debug("%s read %x = %x\n", __func__, RT5650_4BTN_IL_CMD1, + snd_soc_component_read(component, RT5650_4BTN_IL_CMD1)); + snd_soc_component_write(component, RT5650_4BTN_IL_CMD1, ret); snd_soc_component_update_bits(component, RT5650_4BTN_IL_CMD1, 0x3, 
0x3); snd_soc_component_update_bits(component, RT5645_INT_IRQ_ST, 0x8, 0x8); - snd_soc_component_update_bits(component, - RT5650_4BTN_IL_CMD2, 0x8000, 0x8000); - snd_soc_component_read(component, RT5650_4BTN_IL_CMD1); - pr_debug("%s read %x = %x\n", __func__, RT5650_4BTN_IL_CMD1, - snd_soc_component_read(component, RT5650_4BTN_IL_CMD1)); } else { snd_soc_component_update_bits(component, RT5650_4BTN_IL_CMD2, 0x8000, 0x0); snd_soc_component_update_bits(component, RT5645_INT_IRQ_ST, 0x8, 0x0); diff --git a/sound/soc/codecs/rt5645.h b/sound/soc/codecs/rt5645.h index 90816b2c5489..bef74b29fd54 100644 --- a/sound/soc/codecs/rt5645.h +++ b/sound/soc/codecs/rt5645.h @@ -2011,6 +2011,12 @@ #define RT5645_ZCD_HP_DIS (0x0 << 15) #define RT5645_ZCD_HP_EN (0x1 << 15) +/* Buttons Inline Command Function 2 (0xe0) */ +#define RT5645_EN_4BTN_IL_MASK (0x1 << 15) +#define RT5645_EN_4BTN_IL_EN (0x1 << 15) +#define RT5645_RST_4BTN_IL_MASK (0x1 << 14) +#define RT5645_RST_4BTN_IL_RST (0x0 << 14) +#define RT5645_RST_4BTN_IL_NORM (0x1 << 14) /* Codec Private Register definition */ /* DAC ADC Digital Volume (0x00) */