From 2c87309ea741341c6722efdf1fb3f50dd427c823 Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Tue, 29 Oct 2024 19:27:12 +0100 Subject: [PATCH 001/807] ieee802154: ca8210: Add missing check for kfifo_alloc() in ca8210_probe() ca8210_test_interface_init() returns the result of kfifo_alloc(), which can be non-zero in case of an error. The caller, ca8210_probe(), should check the return value and do error-handling if it fails. Fixes: ded845a781a5 ("ieee802154: Add CA8210 IEEE 802.15.4 device driver") Signed-off-by: Keisuke Nishimura Reviewed-by: Simon Horman Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/20241029182712.318271-1-keisuke.nishimura@inria.fr Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/ca8210.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index e685a7f946f0..753215ebc67c 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -3072,7 +3072,11 @@ static int ca8210_probe(struct spi_device *spi_device) spi_set_drvdata(priv->spi, priv); if (IS_ENABLED(CONFIG_IEEE802154_CA8210_DEBUGFS)) { cascoda_api_upstream = ca8210_test_int_driver_write; - ca8210_test_interface_init(priv); + ret = ca8210_test_interface_init(priv); + if (ret) { + dev_crit(&spi_device->dev, "ca8210_test_interface_init failed\n"); + goto error; + } } else { cascoda_api_upstream = NULL; } From eb09fbeb48709fe66c0d708aed81e910a577a30a Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Wed, 13 Nov 2024 17:51:29 +0800 Subject: [PATCH 002/807] mac802154: check local interfaces before deleting sdata list syzkaller reported a corrupted list in ieee802154_if_remove. [1] Remove an IEEE 802.15.4 network interface after unregister an IEEE 802.15.4 hardware device from the system. 
CPU0 CPU1 ==== ==== genl_family_rcv_msg_doit ieee802154_unregister_hw ieee802154_del_iface ieee802154_remove_interfaces rdev_del_virtual_intf_deprecated list_del(&sdata->list) ieee802154_if_remove list_del_rcu The net device has been unregistered, since the rcu grace period, unregistration must be run before ieee802154_if_remove. To avoid this issue, add a check for local->interfaces before deleting sdata list. [1] kernel BUG at lib/list_debug.c:58! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 6277 Comm: syz-executor157 Not tainted 6.12.0-rc6-syzkaller-00005-g557329bcecc2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__list_del_entry_valid_or_report+0xf4/0x140 lib/list_debug.c:56 Code: e8 a1 7e 00 07 90 0f 0b 48 c7 c7 e0 37 60 8c 4c 89 fe e8 8f 7e 00 07 90 0f 0b 48 c7 c7 40 38 60 8c 4c 89 fe e8 7d 7e 00 07 90 <0f> 0b 48 c7 c7 a0 38 60 8c 4c 89 fe e8 6b 7e 00 07 90 0f 0b 48 c7 RSP: 0018:ffffc9000490f3d0 EFLAGS: 00010246 RAX: 000000000000004e RBX: dead000000000122 RCX: d211eee56bb28d00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88805b278dd8 R08: ffffffff8174a12c R09: 1ffffffff2852f0d R10: dffffc0000000000 R11: fffffbfff2852f0e R12: dffffc0000000000 R13: dffffc0000000000 R14: dead000000000100 R15: ffff88805b278cc0 FS: 0000555572f94380(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056262e4a3000 CR3: 0000000078496000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __list_del_entry_valid include/linux/list.h:124 [inline] __list_del_entry include/linux/list.h:215 [inline] list_del_rcu include/linux/rculist.h:157 [inline] ieee802154_if_remove+0x86/0x1e0 net/mac802154/iface.c:687 rdev_del_virtual_intf_deprecated net/ieee802154/rdev-ops.h:24 [inline] 
ieee802154_del_iface+0x2c0/0x5c0 net/ieee802154/nl-phy.c:323 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0xb14/0xec0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2551 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8e4/0xcb0 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:744 ____sys_sendmsg+0x52a/0x7e0 net/socket.c:2607 ___sys_sendmsg net/socket.c:2661 [inline] __sys_sendmsg+0x292/0x380 net/socket.c:2690 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Reported-and-tested-by: syzbot+985f827280dc3a6e7e92@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=985f827280dc3a6e7e92 Signed-off-by: Lizhi Xu Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/20241113095129.1457225-1-lizhi.xu@windriver.com Signed-off-by: Stefan Schmidt --- net/mac802154/iface.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index c0e2da5072be..9e4631fade90 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -684,6 +684,10 @@ void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata) ASSERT_RTNL(); mutex_lock(&sdata->local->iflist_mtx); + if (list_empty(&sdata->local->interfaces)) { + mutex_unlock(&sdata->local->iflist_mtx); + return; + } list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); From bc7acc0bd0f94c26bc0defc902311794a3d0fae9 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 20 Nov 2024 15:31:16 -0800 Subject: [PATCH 003/807] of: property: fw_devlink: Do not use interrupt-parent directly commit 7f00be96f125 ("of: property: Add device link 
support for interrupt-parent, dmas and -gpio(s)") started adding device links for the interrupt-parent property. commit 4104ca776ba3 ("of: property: Add fw_devlink support for interrupts") and commit f265f06af194 ("of: property: Fix fw_devlink handling of interrupts/interrupts-extended") later added full support for parsing the interrupts and interrupts-extended properties, which includes looking up the node of the parent domain. This made the handler for the interrupt-parent property redundant. In fact, creating device links based solely on interrupt-parent is problematic, because it can create spurious cycles. A node may have this property without itself being an interrupt controller or consumer. For example, this property is often present in the root node or a /soc bus node to set the default interrupt parent for child nodes. However, it is incorrect for the bus to depend on the interrupt controller, as some of the bus's children may not be interrupt consumers at all or may have a different interrupt parent. Resolving these spurious dependency cycles can cause an incorrect probe order for interrupt controller drivers. This was observed on a RISC-V system with both an APLIC and IMSIC under /soc, where interrupt-parent in /soc points to the APLIC, and the APLIC msi-parent points to the IMSIC. fw_devlink found three dependency cycles and attempted to probe the APLIC before the IMSIC. After applying this patch, there were no dependency cycles and the probe order was correct. 
Acked-by: Marc Zyngier Cc: stable@vger.kernel.org Fixes: 4104ca776ba3 ("of: property: Add fw_devlink support for interrupts") Signed-off-by: Samuel Holland Link: https://lore.kernel.org/r/20241120233124.3649382-1-samuel.holland@sifive.com Signed-off-by: Rob Herring (Arm) --- drivers/of/property.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/of/property.c b/drivers/of/property.c index 519bf9229e61..cfc8aea002e4 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -1286,7 +1286,6 @@ DEFINE_SIMPLE_PROP(iommus, "iommus", "#iommu-cells") DEFINE_SIMPLE_PROP(mboxes, "mboxes", "#mbox-cells") DEFINE_SIMPLE_PROP(io_channels, "io-channels", "#io-channel-cells") DEFINE_SIMPLE_PROP(io_backends, "io-backends", "#io-backend-cells") -DEFINE_SIMPLE_PROP(interrupt_parent, "interrupt-parent", NULL) DEFINE_SIMPLE_PROP(dmas, "dmas", "#dma-cells") DEFINE_SIMPLE_PROP(power_domains, "power-domains", "#power-domain-cells") DEFINE_SIMPLE_PROP(hwlocks, "hwlocks", "#hwlock-cells") @@ -1432,7 +1431,6 @@ static const struct supplier_bindings of_supplier_bindings[] = { { .parse_prop = parse_mboxes, }, { .parse_prop = parse_io_channels, }, { .parse_prop = parse_io_backends, }, - { .parse_prop = parse_interrupt_parent, }, { .parse_prop = parse_dmas, .optional = true, }, { .parse_prop = parse_power_domains, }, { .parse_prop = parse_hwlocks, }, From 1a75e81baf4f1b322f3498ffd373eaada8e60589 Mon Sep 17 00:00:00 2001 From: Andrea della Porta Date: Sun, 24 Nov 2024 11:05:36 +0100 Subject: [PATCH 004/807] of/unittest: Add empty dma-ranges address translation tests Intermediate DT PCI nodes dynamically generated by enabling CONFIG_PCI_DYNAMIC_OF_NODES have empty dma-ranges property. PCI address specifiers have 3 cells and when dma-ranges is missing or empty, of_translate_one() is currently dropping the flag portion of PCI addresses which are subnodes of the aforementioned ones, failing the translation. Add new tests covering this case. 
With this test, we get 1 new failure which is fixed in subsequent commit: FAIL of_unittest_pci_empty_dma_ranges():1245 for_each_of_pci_range wrong CPU addr (ffffffffffffffff) on node /testcase-data/address-tests2/pcie@d1070000/pci@0,0/dev@0,0/local-bus@0 Signed-off-by: Andrea della Porta Link: https://lore.kernel.org/r/08f8fee4fdc0379240fda2f4a0e6f11ebf9647a8.1732441813.git.andrea.porta@suse.com Signed-off-by: Rob Herring (Arm) --- drivers/of/unittest-data/tests-address.dtsi | 2 ++ drivers/of/unittest.c | 39 +++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/drivers/of/unittest-data/tests-address.dtsi b/drivers/of/unittest-data/tests-address.dtsi index 3344f15c3755..f02a181bb125 100644 --- a/drivers/of/unittest-data/tests-address.dtsi +++ b/drivers/of/unittest-data/tests-address.dtsi @@ -114,6 +114,7 @@ device_type = "pci"; ranges = <0x82000000 0 0xe8000000 0 0xe8000000 0 0x7f00000>, <0x81000000 0 0x00000000 0 0xefff0000 0 0x0010000>; + dma-ranges = <0x43000000 0x10 0x00 0x00 0x00 0x00 0x10000000>; reg = <0x00000000 0xd1070000 0x20000>; pci@0,0 { @@ -142,6 +143,7 @@ #size-cells = <0x01>; ranges = <0xa0000000 0 0 0 0x2000000>, <0xb0000000 1 0 0 0x1000000>; + dma-ranges = <0xc0000000 0x43000000 0x10 0x00 0x10000000>; dev@e0000000 { reg = <0xa0001000 0x1000>, diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index daf9a2dddd7e..80483e38d7b4 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -1213,6 +1213,44 @@ static void __init of_unittest_pci_dma_ranges(void) of_node_put(np); } +static void __init of_unittest_pci_empty_dma_ranges(void) +{ + struct device_node *np; + struct of_pci_range range; + struct of_pci_range_parser parser; + + if (!IS_ENABLED(CONFIG_PCI)) + return; + + np = of_find_node_by_path("/testcase-data/address-tests2/pcie@d1070000/pci@0,0/dev@0,0/local-bus@0"); + if (!np) { + pr_err("missing testcase data\n"); + return; + } + + if (of_pci_dma_range_parser_init(&parser, np)) { + pr_err("missing dma-ranges 
property\n"); + return; + } + + /* + * Get the dma-ranges from the device tree + */ + for_each_of_pci_range(&parser, &range) { + unittest(range.size == 0x10000000, + "for_each_of_pci_range wrong size on node %pOF size=%llx\n", + np, range.size); + unittest(range.cpu_addr == 0x00000000, + "for_each_of_pci_range wrong CPU addr (%llx) on node %pOF", + range.cpu_addr, np); + unittest(range.pci_addr == 0xc0000000, + "for_each_of_pci_range wrong DMA addr (%llx) on node %pOF", + range.pci_addr, np); + } + + of_node_put(np); +} + static void __init of_unittest_bus_ranges(void) { struct device_node *np; @@ -4272,6 +4310,7 @@ static int __init of_unittest(void) of_unittest_dma_get_max_cpu_address(); of_unittest_parse_dma_ranges(); of_unittest_pci_dma_ranges(); + of_unittest_pci_empty_dma_ranges(); of_unittest_bus_ranges(); of_unittest_bus_3cell_ranges(); of_unittest_reg(); From 7f05e20b989ac33c9c0f8c2028ec0a566493548f Mon Sep 17 00:00:00 2001 From: Andrea della Porta Date: Sun, 24 Nov 2024 11:05:37 +0100 Subject: [PATCH 005/807] of: address: Preserve the flags portion on 1:1 dma-ranges mapping A missing or empty dma-ranges in a DT node implies a 1:1 mapping for dma translations. In this specific case, the current behaviour is to zero out the entire specifier so that the translation could be carried on as an offset from zero. This includes address specifier that has flags (e.g. PCI ranges). Once the flags portion has been zeroed, the translation chain is broken since the mapping functions will check the upcoming address specifier against mismatching flags, always failing the 1:1 mapping and its entire purpose of always succeeding. Set to zero only the address portion while passing the flags through. 
Fixes: dbbdee94734b ("of/address: Merge all of the bus translation code") Cc: stable@vger.kernel.org Signed-off-by: Andrea della Porta Tested-by: Herve Codina Link: https://lore.kernel.org/r/e51ae57874e58a9b349c35e2e877425ebc075d7a.1732441813.git.andrea.porta@suse.com Signed-off-by: Rob Herring (Arm) --- drivers/of/address.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/of/address.c b/drivers/of/address.c index c5b925ac469f..5b7ee3ed5296 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -459,7 +459,8 @@ static int of_translate_one(const struct device_node *parent, const struct of_bu } if (ranges == NULL || rlen == 0) { offset = of_read_number(addr, na); - memset(addr, 0, pna * 4); + /* set address to zero, pass flags through */ + memset(addr + pbus->flag_cells, 0, (pna - pbus->flag_cells) * 4); pr_debug("empty ranges; 1:1 translation\n"); goto finish; } From 61a6ba233fe1198e9eacc9ca1d1cbdb27f70cee5 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 13 Nov 2024 16:56:13 -0600 Subject: [PATCH 006/807] dt-bindings: Unify "fsl,liodn" type definitions The type definition of "fsl,liodn" is defined as uint32 in crypto/fsl,sec-v4.0.yaml and uint32-array in soc/fsl/fsl,bman.yaml, soc/fsl/fsl,qman-portal.yaml, and soc/fsl/fsl,qman.yaml. Unify the type to be uint32-array and constraint the single entry cases. 
Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20241113225614.1782862-1-robh@kernel.org Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/crypto/fsl,sec-v4.0.yaml | 10 ++++++---- .../devicetree/bindings/soc/fsl/fsl,qman-portal.yaml | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml b/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml index 9c8c9991f29a..f0c4a7c83568 100644 --- a/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml +++ b/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml @@ -114,8 +114,9 @@ patternProperties: table that specifies the PPID to LIODN mapping. Needed if the PAMU is used. Value is a 12 bit value where value is a LIODN ID for this JR. This property is normally set by boot firmware. - $ref: /schemas/types.yaml#/definitions/uint32 - maximum: 0xfff + $ref: /schemas/types.yaml#/definitions/uint32-array + items: + - maximum: 0xfff '^rtic@[0-9a-f]+$': type: object @@ -186,8 +187,9 @@ patternProperties: Needed if the PAMU is used. Value is a 12 bit value where value is a LIODN ID for this JR. This property is normally set by boot firmware. - $ref: /schemas/types.yaml#/definitions/uint32 - maximum: 0xfff + $ref: /schemas/types.yaml#/definitions/uint32-array + items: + - maximum: 0xfff fsl,rtic-region: description: diff --git a/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml b/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml index 17016184143f..e459fec02ba8 100644 --- a/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml +++ b/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml @@ -35,6 +35,7 @@ properties: fsl,liodn: $ref: /schemas/types.yaml#/definitions/uint32-array + maxItems: 2 description: See pamu.txt. Two LIODN(s). 
DQRR LIODN (DLIODN) and Frame LIODN (FLIODN) @@ -69,6 +70,7 @@ patternProperties: type: object properties: fsl,liodn: + $ref: /schemas/types.yaml#/definitions/uint32-array description: See pamu.txt, PAMU property used for static LIODN assignment fsl,iommu-parent: From e60b14f47d779edc38bc1f14d2c995d477cec6f9 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 28 Nov 2024 20:21:47 +0530 Subject: [PATCH 007/807] arm64: dts: qcom: sa8775p: Fix the size of 'addr_space' regions For both the controller instances, size of the 'addr_space' region should be 0x1fe00000 as per the hardware memory layout. Otherwise, endpoint drivers cannot request even reasonable BAR size of 1MB. Cc: stable@vger.kernel.org # 6.11 Fixes: c5f5de8434ec ("arm64: dts: qcom: sa8775p: Add ep pcie1 controller node") Fixes: 1924f5518224 ("arm64: dts: qcom: sa8775p: Add ep pcie0 controller node") Signed-off-by: Manivannan Sadhasivam Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20241128145147.145618-1-manivannan.sadhasivam@linaro.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/sa8775p.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sa8775p.dtsi b/arch/arm64/boot/dts/qcom/sa8775p.dtsi index 9f315a51a7c1..368bcf7c9802 100644 --- a/arch/arm64/boot/dts/qcom/sa8775p.dtsi +++ b/arch/arm64/boot/dts/qcom/sa8775p.dtsi @@ -6092,7 +6092,7 @@ <0x0 0x40000000 0x0 0xf20>, <0x0 0x40000f20 0x0 0xa8>, <0x0 0x40001000 0x0 0x4000>, - <0x0 0x40200000 0x0 0x100000>, + <0x0 0x40200000 0x0 0x1fe00000>, <0x0 0x01c03000 0x0 0x1000>, <0x0 0x40005000 0x0 0x2000>; reg-names = "parf", "dbi", "elbi", "atu", "addr_space", @@ -6250,7 +6250,7 @@ <0x0 0x60000000 0x0 0xf20>, <0x0 0x60000f20 0x0 0xa8>, <0x0 0x60001000 0x0 0x4000>, - <0x0 0x60200000 0x0 0x100000>, + <0x0 0x60200000 0x0 0x1fe00000>, <0x0 0x01c13000 0x0 0x1000>, <0x0 0x60005000 0x0 0x2000>; reg-names = "parf", "dbi", "elbi", "atu", "addr_space", From 
098d8374033f323dae87a1d792a3c8911c2cf57f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Sun, 1 Dec 2024 18:11:20 +0100 Subject: [PATCH 008/807] bus: mhi: host: pci_generic: fix MHI BAR mapping A recent change converting the MHI pci_generic driver to use pcim_iomap_region() failed to update the BAR parameter which is an index rather than a mask. This specifically broke the modem on machines like the Lenovo ThinkPad X13s and x1e80100 CRD: mhi-pci-generic 0004:01:00.0: failed to map pci region: -22 mhi-pci-generic 0004:01:00.0: probe with driver mhi-pci-generic failed with error -22 Fixes: bd23e836423e ("bus: mhi: host: pci_generic: Use pcim_iomap_region() to request and map MHI BAR") Signed-off-by: Johan Hovold Signed-off-by: Manivannan Sadhasivam Reviewed-by: Manivannan Sadhasivam Cc: Manivannan Sadhasivam Cc: Mayank Rana Link: https://lore.kernel.org/r/20241201171120.31616-1-johan+linaro@kernel.org --- drivers/bus/mhi/host/pci_generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bus/mhi/host/pci_generic.c b/drivers/bus/mhi/host/pci_generic.c index 07645ce2119a..56ba4192c89c 100644 --- a/drivers/bus/mhi/host/pci_generic.c +++ b/drivers/bus/mhi/host/pci_generic.c @@ -917,7 +917,7 @@ static int mhi_pci_claim(struct mhi_controller *mhi_cntrl, return err; } - mhi_cntrl->regs = pcim_iomap_region(pdev, 1 << bar_num, pci_name(pdev)); + mhi_cntrl->regs = pcim_iomap_region(pdev, bar_num, pci_name(pdev)); if (IS_ERR(mhi_cntrl->regs)) { err = PTR_ERR(mhi_cntrl->regs); dev_err(&pdev->dev, "failed to map pci region: %d\n", err); From b905bafdea21a75d75a96855edd9e0b6051eee30 Mon Sep 17 00:00:00 2001 From: Leo Stone Date: Sat, 30 Nov 2024 21:14:19 -0800 Subject: [PATCH 009/807] hfs: Sanity check the root record In the syzbot reproducer, the hfs_cat_rec for the root dir has type HFS_CDR_FIL after being read with hfs_bnode_read() in hfs_super_fill(). This indicates it should be used as an hfs_cat_file, which is 102 bytes. 
Only the first 70 bytes of that struct are initialized, however, because the entrylength passed into hfs_bnode_read() is still the length of a directory record. This causes uninitialized values to be used later on, when the hfs_cat_rec union is treated as the larger hfs_cat_file struct. Add a check to make sure the retrieved record has the correct type for the root directory (HFS_CDR_DIR), and make sure we load the correct number of bytes for a directory record. Reported-by: syzbot+2db3c7526ba68f4ea776@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2db3c7526ba68f4ea776 Tested-by: syzbot+2db3c7526ba68f4ea776@syzkaller.appspotmail.com Tested-by: Leo Stone Signed-off-by: Leo Stone Link: https://lore.kernel.org/r/20241201051420.77858-1-leocstone@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/hfs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 3bee9b5dba5e..fe09c2093a93 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -349,11 +349,13 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) goto bail_no_root; res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); if (!res) { - if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + if (fd.entrylength != sizeof(rec.dir)) { res = -EIO; goto bail_hfs_find; } hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + if (rec.type != HFS_CDR_DIR) + res = -EIO; } if (res) goto bail_hfs_find; From 60bc447c85f80d3184c7ac327e1d29e0b0a11d46 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 2 Dec 2024 14:15:17 +0100 Subject: [PATCH 010/807] of: Add #address-cells/#size-cells in the device-tree root empty node On systems where ACPI is enabled or when a device-tree is not passed to the kernel by the bootloader, a device-tree root empty node is created. 
This device-tree root empty node does not have the #address-cells and the #size-cells properties. This leads to the use of the default address cells and size cells values, which are defined in the code as 1 for the address cells value and 1 for the size cells value. According to the devicetree specification and the OpenFirmware standard (IEEE 1275-1994) the default value for #address-cells should be 2. Also, according to the devicetree specification, the #address-cells and the #size-cells are required properties in the root node. The device tree compiler already uses 2 as default value for address cells and 1 for size cells. The powerpc PROM code also uses 2 as default value for address cells and 1 for size cells. Modern implementations should have the #address-cells and the #size-cells properties set and should not rely on default values. On x86, this root empty node is used and the code default values are used. In preparation for supporting the device-tree overlay on PCI devices feature on x86 (i.e. the creation of the PCI root bus device-tree node), the default value for #address-cells needs to be updated. Indeed, on x86_64, addresses are 64 bits wide and the upper part of an address is needed for correct address translations. On x86_32 having the default value updated does not lead to issues as long as the upper part of a 64-bit value is zero. Changing the default value for all architectures may break device-tree compatibility. Indeed, existing dts files without the #address-cells property set in the root node will not be compatible with this modification. Instead of updating default values, add both required #address-cells and #size-cells properties in the device-tree empty node. Use 2 as the value of both properties in order to fully support 64-bit addresses and sizes on systems using this empty root node. 
Signed-off-by: Herve Codina Link: https://lore.kernel.org/r/20241202131522.142268-6-herve.codina@bootlin.com Signed-off-by: Rob Herring (Arm) --- drivers/of/empty_root.dts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/of/empty_root.dts b/drivers/of/empty_root.dts index cf9e97a60f48..cbe169ba3db5 100644 --- a/drivers/of/empty_root.dts +++ b/drivers/of/empty_root.dts @@ -2,5 +2,12 @@ /dts-v1/; / { - + /* + * #address-cells/#size-cells are required properties at root node. + * Use 2 cells for both address cells and size cells in order to fully + * support 64-bit addresses and sizes on systems using this empty root + * node. + */ + #address-cells = <0x02>; + #size-cells = <0x02>; }; From c43ec96e8d34399bd9dab2f2dc316b904892133f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 29 Oct 2024 08:28:45 +0000 Subject: [PATCH 011/807] dmaengine: at_xdmac: avoid null_prt_deref in at_xdmac_prep_dma_memset The at_xdmac_memset_create_desc may return NULL, which will lead to a null pointer dereference. For example, the len input is error, or the atchan->free_descs_list is empty and memory is exhausted. Therefore, add check to avoid this. 
Fixes: b206d9a23ac7 ("dmaengine: xdmac: Add memset support") Signed-off-by: Chen Ridong Link: https://lore.kernel.org/r/20241029082845.1185380-1-chenridong@huaweicloud.com Signed-off-by: Vinod Koul --- drivers/dma/at_xdmac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 9c7b40220004..ba25c23164e7 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1363,6 +1363,8 @@ at_xdmac_prep_dma_memset(struct dma_chan *chan, dma_addr_t dest, int value, return NULL; desc = at_xdmac_memset_create_desc(chan, atchan, dest, len, value); + if (!desc) + return NULL; list_add_tail(&desc->desc_node, &desc->descs_list); desc->tx_dma_desc.cookie = -EBUSY; From f0e870a0e9c5521f2952ea9f3ea9d3d122631a89 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 4 Nov 2024 11:50:50 +0200 Subject: [PATCH 012/807] dmaengine: dw: Select only supported masters for ACPI devices The recently submitted fix-commit revealed a problem in the iDMA 32-bit platform code. Even though the controller supported only a single master the dw_dma_acpi_filter() method hard-coded two master interfaces with IDs 0 and 1. As a result the sanity check implemented in the commit b336268dde75 ("dmaengine: dw: Add peripheral bus width verification") got incorrect interface data width and thus prevented the client drivers from configuring the DMA-channel with the EINVAL error returned. E.g., the next error was printed for the PXA2xx SPI controller driver trying to configure the requested channels: > [ 164.525604] pxa2xx_spi_pci 0000:00:07.1: DMA slave config failed > [ 164.536105] pxa2xx_spi_pci 0000:00:07.1: failed to get DMA TX descriptor > [ 164.543213] spidev spi-SPT0001:00: SPI transfer failed: -16 The problem would have been spotted much earlier if the iDMA 32-bit controller supported more than one master interfaces. 
But since it supports just a single master and the iDMA 32-bit specific code just ignores the master IDs in the CTLLO preparation method, the issue has been gone unnoticed so far. Fix the problem by specifying the default master ID for both memory and peripheral devices in the driver data. Thus the issue noticed for the iDMA 32-bit controllers will be eliminated and the ACPI-probed DW DMA controllers will be configured with the correct master ID by default. Cc: stable@vger.kernel.org Fixes: b336268dde75 ("dmaengine: dw: Add peripheral bus width verification") Fixes: 199244d69458 ("dmaengine: dw: add support of iDMA 32-bit hardware") Reported-by: Ferry Toth Closes: https://lore.kernel.org/dmaengine/ZuXbCKUs1iOqFu51@black.fi.intel.com/ Reported-by: Andy Shevchenko Closes: https://lore.kernel.org/dmaengine/ZuXgI-VcHpMgbZ91@black.fi.intel.com/ Tested-by: Ferry Toth Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241104095142.157925-1-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- drivers/dma/dw/acpi.c | 6 ++++-- drivers/dma/dw/internal.h | 8 ++++++++ drivers/dma/dw/pci.c | 4 ++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/dma/dw/acpi.c b/drivers/dma/dw/acpi.c index c510c109d2c3..b6452fffa657 100644 --- a/drivers/dma/dw/acpi.c +++ b/drivers/dma/dw/acpi.c @@ -8,13 +8,15 @@ static bool dw_dma_acpi_filter(struct dma_chan *chan, void *param) { + struct dw_dma *dw = to_dw_dma(chan->device); + struct dw_dma_chip_pdata *data = dev_get_drvdata(dw->dma.dev); struct acpi_dma_spec *dma_spec = param; struct dw_dma_slave slave = { .dma_dev = dma_spec->dev, .src_id = dma_spec->slave_id, .dst_id = dma_spec->slave_id, - .m_master = 0, - .p_master = 1, + .m_master = data->m_master, + .p_master = data->p_master, }; return dw_dma_filter(chan, &slave); diff --git a/drivers/dma/dw/internal.h b/drivers/dma/dw/internal.h index 563ce73488db..f1bd06a20cd6 100644 --- a/drivers/dma/dw/internal.h +++ b/drivers/dma/dw/internal.h @@ 
-51,11 +51,15 @@ struct dw_dma_chip_pdata { int (*probe)(struct dw_dma_chip *chip); int (*remove)(struct dw_dma_chip *chip); struct dw_dma_chip *chip; + u8 m_master; + u8 p_master; }; static __maybe_unused const struct dw_dma_chip_pdata dw_dma_chip_pdata = { .probe = dw_dma_probe, .remove = dw_dma_remove, + .m_master = 0, + .p_master = 1, }; static const struct dw_dma_platform_data idma32_pdata = { @@ -72,6 +76,8 @@ static __maybe_unused const struct dw_dma_chip_pdata idma32_chip_pdata = { .pdata = &idma32_pdata, .probe = idma32_dma_probe, .remove = idma32_dma_remove, + .m_master = 0, + .p_master = 0, }; static const struct dw_dma_platform_data xbar_pdata = { @@ -88,6 +94,8 @@ static __maybe_unused const struct dw_dma_chip_pdata xbar_chip_pdata = { .pdata = &xbar_pdata, .probe = idma32_dma_probe, .remove = idma32_dma_remove, + .m_master = 0, + .p_master = 0, }; #endif /* _DMA_DW_INTERNAL_H */ diff --git a/drivers/dma/dw/pci.c b/drivers/dma/dw/pci.c index ad2d4d012cf7..e8a0eb81726a 100644 --- a/drivers/dma/dw/pci.c +++ b/drivers/dma/dw/pci.c @@ -56,10 +56,10 @@ static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid) if (ret) return ret; - dw_dma_acpi_controller_register(chip->dw); - pci_set_drvdata(pdev, data); + dw_dma_acpi_controller_register(chip->dw); + return 0; } From 4b65d5322e1d8994acfdb9b867aa00bdb30d177b Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Mon, 28 Oct 2024 17:34:13 +0800 Subject: [PATCH 013/807] dmaengine: loongson2-apb: Change GENMASK to GENMASK_ULL Fix the following smatch static checker warning: drivers/dma/loongson2-apb-dma.c:189 ls2x_dma_write_cmd() warn: was expecting a 64 bit value instead of '~(((0)) + (((~((0))) - (((1)) << (0)) + 1) & (~((0)) >> ((8 * 4) - 1 - (4)))))' The GENMASK macro used "unsigned long", which caused build issues when using a 32-bit toolchain because it would try to access bits > 31. This patch switches GENMASK to GENMASK_ULL, which uses "unsigned long long". 
Fixes: 71e7d3cb6e55 ("dmaengine: ls2x-apb: New driver for the Loongson LS2X APB DMA controller") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/87cdc025-7246-4548-85ca-3d36fdc2be2d@stanley.mountain/ Signed-off-by: Binbin Zhou Link: https://lore.kernel.org/r/20241028093413.1145820-1-zhoubinbin@loongson.cn Signed-off-by: Vinod Koul --- drivers/dma/loongson2-apb-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/loongson2-apb-dma.c b/drivers/dma/loongson2-apb-dma.c index 367ed34ce4da..c528f02b9f84 100644 --- a/drivers/dma/loongson2-apb-dma.c +++ b/drivers/dma/loongson2-apb-dma.c @@ -31,7 +31,7 @@ #define LDMA_ASK_VALID BIT(2) #define LDMA_START BIT(3) /* DMA start operation */ #define LDMA_STOP BIT(4) /* DMA stop operation */ -#define LDMA_CONFIG_MASK GENMASK(4, 0) /* DMA controller config bits mask */ +#define LDMA_CONFIG_MASK GENMASK_ULL(4, 0) /* DMA controller config bits mask */ /* Bitfields in ndesc_addr field of HW descriptor */ #define LDMA_DESC_EN BIT(0) /*1: The next descriptor is valid */ From 8b9c12757f919157752646faf3821abf2b7d2a64 Mon Sep 17 00:00:00 2001 From: Chukun Pan Date: Fri, 22 Nov 2024 15:30:05 +0800 Subject: [PATCH 014/807] arm64: dts: rockchip: add reset-names for combphy on rk3568 The reset-names of combphy are missing, add it. 
Signed-off-by: Chukun Pan Fixes: fd3ac6e80497 ("dt-bindings: phy: rockchip: rk3588 has two reset lines") Link: https://lore.kernel.org/r/20241122073006.99309-1-amadeus@jmu.edu.cn Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3568.dtsi | 1 + arch/arm64/boot/dts/rockchip/rk356x-base.dtsi | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3568.dtsi b/arch/arm64/boot/dts/rockchip/rk3568.dtsi index ecaefe208e3e..695cccbdab0f 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3568.dtsi @@ -350,6 +350,7 @@ assigned-clocks = <&pmucru CLK_PCIEPHY0_REF>; assigned-clock-rates = <100000000>; resets = <&cru SRST_PIPEPHY0>; + reset-names = "phy"; rockchip,pipe-grf = <&pipegrf>; rockchip,pipe-phy-grf = <&pipe_phy_grf0>; #phy-cells = <1>; diff --git a/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi b/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi index 62be06f3b863..e55390629114 100644 --- a/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi @@ -1681,6 +1681,7 @@ assigned-clocks = <&pmucru CLK_PCIEPHY1_REF>; assigned-clock-rates = <100000000>; resets = <&cru SRST_PIPEPHY1>; + reset-names = "phy"; rockchip,pipe-grf = <&pipegrf>; rockchip,pipe-phy-grf = <&pipe_phy_grf1>; #phy-cells = <1>; @@ -1697,6 +1698,7 @@ assigned-clocks = <&pmucru CLK_PCIEPHY2_REF>; assigned-clock-rates = <100000000>; resets = <&cru SRST_PIPEPHY2>; + reset-names = "phy"; rockchip,pipe-grf = <&pipegrf>; rockchip,pipe-phy-grf = <&pipe_phy_grf2>; #phy-cells = <1>; From 2ddd93481bce86c6a46223f45accdb3b149a43e4 Mon Sep 17 00:00:00 2001 From: FUKAUMI Naoki Date: Thu, 28 Nov 2024 12:06:30 +0000 Subject: [PATCH 015/807] arm64: dts: rockchip: rename rfkill label for Radxa ROCK 5B on ROCK 5B, there is no PCIe slot, instead there is a M.2 slot. rfkill pin is not exclusive to PCIe devices, there is SDIO Wi-Fi devices. 
rename rfkill label from "rfkill-pcie-wlan" to "rfkill-m2-wlan", it matches with rfkill-bt. Fixes: 82d40b141a4c ("arm64: dts: rockchip: add rfkill node for M.2 Key E WiFi on rock-5b") Reviewed-by: Dragan Simic Signed-off-by: FUKAUMI Naoki Fixes: 82d40b141a4c ("arm64: dts: rockchip: add rfkill node for M.2 Key E WiFi on rock-5b") Link: https://lore.kernel.org/r/20241128120631.37458-1-naoki@radxa.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts index c44d001da169..d597112f1d5b 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts @@ -72,7 +72,7 @@ rfkill { compatible = "rfkill-gpio"; - label = "rfkill-pcie-wlan"; + label = "rfkill-m2-wlan"; radio-type = "wlan"; shutdown-gpios = <&gpio4 RK_PA2 GPIO_ACTIVE_HIGH>; }; From 989e0cdc0f18a594b25cabc60426d29659aeaf58 Mon Sep 17 00:00:00 2001 From: Brahmajit Das Date: Sat, 5 Oct 2024 01:21:32 +0530 Subject: [PATCH 016/807] fs/qnx6: Fix building with GCC 15 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qnx6_checkroot() had been using weirdly spelled initializer - it needed to initialize 3-element arrays of char and it used NUL-padded 3-character string literals (i.e. 4-element initializers, with completely pointless zeroes at the end). That had been spotted by gcc-15[*]; prior to that gcc quietly dropped the 4th element of initializers. However, none of that had been needed in the first place - all this array is used for is checking that the first directory entry in root directory is "." and the second - "..". The check had been expressed as a loop, using that match_root[] array. 
Since there is no chance that we ever want to extend that list of entries, the entire thing is much too fancy for its own good; what we need is just a couple of explicit memcmp() and that's it. [*]: fs/qnx6/inode.c: In function ‘qnx6_checkroot’: fs/qnx6/inode.c:182:41: error: initializer-string for array of ‘char’ is too long [-Werror=unterminated-string-initialization] 182 | static char match_root[2][3] = {".\0\0", "..\0"}; | ^~~~~~~ fs/qnx6/inode.c:182:50: error: initializer-string for array of ‘char’ is too long [-Werror=unterminated-string-initialization] 182 | static char match_root[2][3] = {".\0\0", "..\0"}; | ^~~~~~ Signed-off-by: Brahmajit Das Link: https://lore.kernel.org/r/20241004195132.1393968-1-brahmajit.xyz@gmail.com Acked-by: Al Viro Signed-off-by: Christian Brauner --- fs/qnx6/inode.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 85925ec0051a..3310d1ad4d0e 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -179,8 +179,7 @@ static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) */ static const char *qnx6_checkroot(struct super_block *s) { - static char match_root[2][3] = {".\0\0", "..\0"}; - int i, error = 0; + int error = 0; struct qnx6_dir_entry *dir_entry; struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; @@ -189,11 +188,9 @@ static const char *qnx6_checkroot(struct super_block *s) if (IS_ERR(folio)) return "error reading root directory"; dir_entry = kmap_local_folio(folio, 0); - for (i = 0; i < 2; i++) { - /* maximum 3 bytes - due to match_root limitation */ - if (strncmp(dir_entry[i].de_fname, match_root[i], 3)) - error = 1; - } + if (memcmp(dir_entry[0].de_fname, ".", 2) || + memcmp(dir_entry[1].de_fname, "..", 3)) + error = 1; folio_release_kmap(folio, dir_entry); if (error) return "error reading root directory."; From b77bd3ba762f34e5eb731134cf50e233d1060053 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 27 
Nov 2024 16:06:05 -0300 Subject: [PATCH 017/807] ARM: imx: Re-introduce the PINCTRL selection Since commit 17d210018914 ("ARM: imx: Allow user to disable pinctrl"), the CONFIG_PINCTRL option is no longer implicitly selected, causing several i.MX SoC pinctrl drivers no longer getting selected by default. This causes boot regressions on the ARMv4, ARMv5, ARMv6 and ARMv7 i.MX SoCs. Fix it by selecting CONFIG_PINCTRL as before. This defeats the purpose of 17d210018914 ("ARM: imx: Allow user to disable pinctrl"), but it is the least invasive fix for the boot regressions. The attempt to build Layerscape without pinctrl can still be explored later as suggested by Arnd: "Overall, my best advice here is still to not change the way i.MX pinctrl works at all, but just fix Layerscape to not depend on i.MX. The reason for the 'select' here is clearly that the i.MX machines would fail to boot without pinctrl, and changing that because of Layerscape seems backwards." Fixes: 17d210018914 ("ARM: imx: Allow user to disable pinctrl") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/linux-arm-kernel/49ff070a-ce67-42d7-84ec-8b54fd7e9742@roeck-us.net/ Signed-off-by: Fabio Estevam Acked-by: Arnd Bergmann Reviewed-by: Linus Walleij Tested-by: Guenter Roeck Link: https://lore.kernel.org/20241127190605.1367157-1-festevam@gmail.com Signed-off-by: Linus Walleij --- arch/arm/mach-imx/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig index e4fe059cd861..dc47b2312127 100644 --- a/arch/arm/mach-imx/Kconfig +++ b/arch/arm/mach-imx/Kconfig @@ -6,6 +6,7 @@ menuconfig ARCH_MXC select CLKSRC_IMX_GPT select GENERIC_IRQ_CHIP select GPIOLIB + select PINCTRL select PM_OPP if PM select SOC_BUS select SRAM From 514b2262ade48a0503ac6aa03c3bfb8c5be69b21 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 16 Nov 2024 00:05:18 +0100 Subject: [PATCH 018/807] firmware: arm_scmi: Fix i.MX build dependency The newly added SCMI vendor driver
references functions in the protocol driver but needs a Kconfig dependency to ensure it can link, essentially the Kconfig dependency needs to be reversed to match the link time dependency: | arm-linux-gnueabi-ld: sound/soc/fsl/fsl_mqs.o: in function `fsl_mqs_sm_write': | fsl_mqs.c:(.text+0x1aa): undefined reference to `scmi_imx_misc_ctrl_set' | arm-linux-gnueabi-ld: sound/soc/fsl/fsl_mqs.o: in function `fsl_mqs_sm_read': | fsl_mqs.c:(.text+0x1ee): undefined reference to `scmi_imx_misc_ctrl_get' This however only works after changing the dependency in the SND_SOC_FSL_MQS driver as well, which uses 'select IMX_SCMI_MISC_DRV' to turn on a driver it depends on. This is generally a bad idea, so the best solution is to change that into a dependency. To allow the ASoC driver to keep building with the SCMI support, this needs to be an optional dependency that enforces the link-time dependency if IMX_SCMI_MISC_DRV is a loadable module but not depend on it if that is disabled. Fixes: 61c9f03e22fc ("firmware: arm_scmi: Add initial support for i.MX MISC protocol") Fixes: 101c9023594a ("ASoC: fsl_mqs: Support accessing registers by scmi interface") Signed-off-by: Arnd Bergmann Acked-by: Mark Brown Acked-by: Shengjiu Wang Message-Id: <20241115230555.2435004-1-arnd@kernel.org> Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/vendors/imx/Kconfig | 1 + drivers/firmware/imx/Kconfig | 1 - sound/soc/fsl/Kconfig | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/arm_scmi/vendors/imx/Kconfig b/drivers/firmware/arm_scmi/vendors/imx/Kconfig index 2883ed24a84d..a01bf5e47301 100644 --- a/drivers/firmware/arm_scmi/vendors/imx/Kconfig +++ b/drivers/firmware/arm_scmi/vendors/imx/Kconfig @@ -15,6 +15,7 @@ config IMX_SCMI_BBM_EXT config IMX_SCMI_MISC_EXT tristate "i.MX SCMI MISC EXTENSION" depends on ARM_SCMI_PROTOCOL || (COMPILE_TEST && OF) + depends on IMX_SCMI_MISC_DRV default y if ARCH_MXC help This enables i.MX System MISC control logic such as 
gpio expander diff --git a/drivers/firmware/imx/Kconfig b/drivers/firmware/imx/Kconfig index 477d3f32d99a..907cd149c40a 100644 --- a/drivers/firmware/imx/Kconfig +++ b/drivers/firmware/imx/Kconfig @@ -25,7 +25,6 @@ config IMX_SCU config IMX_SCMI_MISC_DRV tristate "IMX SCMI MISC Protocol driver" - depends on IMX_SCMI_MISC_EXT || COMPILE_TEST default y if ARCH_MXC help The System Controller Management Interface firmware (SCMI FW) is diff --git a/sound/soc/fsl/Kconfig b/sound/soc/fsl/Kconfig index 8e88830e8e57..678540b78280 100644 --- a/sound/soc/fsl/Kconfig +++ b/sound/soc/fsl/Kconfig @@ -29,8 +29,8 @@ config SND_SOC_FSL_SAI config SND_SOC_FSL_MQS tristate "Medium Quality Sound (MQS) module support" depends on SND_SOC_FSL_SAI + depends on IMX_SCMI_MISC_DRV || !IMX_SCMI_MISC_DRV select REGMAP_MMIO - select IMX_SCMI_MISC_DRV if IMX_SCMI_MISC_EXT !=n help Say Y if you want to add Medium Quality Sound (MQS) support for the Freescale CPUs. From 239521712b2b568b99d5f0ef7c1f874d797f4a29 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 13 Nov 2024 16:56:31 -0600 Subject: [PATCH 019/807] dt-bindings: mtd: fixed-partitions: Fix "compression" typo The example erroneously has "compress" property rather than the documented "compression" property. 
Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20241113225632.1783241-1-robh@kernel.org Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/mtd/partitions/fixed-partitions.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml b/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml index 058253d6d889..62086366837c 100644 --- a/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml +++ b/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml @@ -82,7 +82,7 @@ examples: uimage@100000 { reg = <0x0100000 0x200000>; - compress = "lzma"; + compression = "lzma"; }; }; From d7dfa7fde63dde4d2ec0083133efe2c6686c03ff Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 2 Dec 2024 17:58:19 +0100 Subject: [PATCH 020/807] of: Fix error path in of_parse_phandle_with_args_map() The current code uses some 'goto put;' to cancel the parsing operation and can lead to a return code value of 0 even on error cases. Indeed, some goto calls are done from a loop without setting the ret value explicitly before the goto call and so the ret value can be set to 0 due to an operation done in a previous loop iteration. For instance match can be set to 0 in the previous loop iteration (leading to a new iteration) but ret can also be set to 0 if the of_property_read_u32() call succeeds. In that case if no match is found or if an error is detected in the new iteration, the return value can be wrongly 0. Avoid those cases by setting the ret value explicitly before the goto calls.
Fixes: bd6f2fd5a1d5 ("of: Support parsing phandle argument lists through a nexus node") Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Link: https://lore.kernel.org/r/20241202165819.158681-1-herve.codina@bootlin.com Signed-off-by: Rob Herring (Arm) --- drivers/of/base.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index a8b0c42bdc8e..44b1c8bf9cc0 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1471,8 +1471,10 @@ int of_parse_phandle_with_args_map(const struct device_node *np, map_len--; /* Check if not found */ - if (!new) + if (!new) { + ret = -EINVAL; goto put; + } if (!of_device_is_available(new)) match = 0; @@ -1482,17 +1484,20 @@ int of_parse_phandle_with_args_map(const struct device_node *np, goto put; /* Check for malformed properties */ - if (WARN_ON(new_size > MAX_PHANDLE_ARGS)) - goto put; - if (map_len < new_size) + if (WARN_ON(new_size > MAX_PHANDLE_ARGS) || + map_len < new_size) { + ret = -EINVAL; goto put; + } /* Move forward by new node's #-cells amount */ map += new_size; map_len -= new_size; } - if (!match) + if (!match) { + ret = -ENOENT; goto put; + } /* Get the -map-pass-thru property (optional) */ pass = of_get_property(cur, pass_name, NULL); From 793baff3f24f16dab9061045e23eea67724feae6 Mon Sep 17 00:00:00 2001 From: Honglei Wang Date: Fri, 29 Nov 2024 17:10:03 +0800 Subject: [PATCH 021/807] sched_ext: Add __weak to fix the build errors commit 5cbb302880f5 ("sched_ext: Rename scx_bpf_dispatch[_vtime]_from_dsq*() -> scx_bpf_dsq_move[_vtime]*()") introduced several new functions which caused compilation errors when compiled with clang. Let's fix this by adding __weak markers. 
Signed-off-by: Honglei Wang Signed-off-by: Tejun Heo Fixes: 5cbb302880f5 ("sched_ext: Rename scx_bpf_dispatch[_vtime]_from_dsq*() -> scx_bpf_dsq_move[_vtime]*()") Acked-by: Andrii Nakryiko --- tools/sched_ext/include/scx/common.bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 2f36b7b6418d..625f5b046776 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -40,9 +40,9 @@ void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_fl void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; -bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym; -void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; -void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; +bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak; +void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; u32 scx_bpf_reenqueue_local(void) __ksym; From b03917e02bf9861be887a7e67c399b3b014f88be Mon Sep 17 00:00:00 2001 From: Konstantin Andrikopoulos Date: Wed, 27 Nov 2024 15:07:38 +0000 Subject: [PATCH 022/807] rust: add safety comment in workqueue traits Add missing safety comments for the implementation of the unsafe traits WorkItemPointer and RawWorkItem for Arc in workqueue.rs Link: https://github.com/Rust-for-Linux/linux/issues/351. 
Co-developed-by: Vangelis Mamalakis Signed-off-by: Vangelis Mamalakis Suggested-by: Miguel Ojeda Reviewed-by: Alice Ryhl Signed-off-by: Konstantin Andrikopoulos Signed-off-by: Tejun Heo --- rust/kernel/workqueue.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs index 4d1d2062f6eb..fd3e97192ed8 100644 --- a/rust/kernel/workqueue.rs +++ b/rust/kernel/workqueue.rs @@ -519,7 +519,15 @@ impl_has_work! { impl{T} HasWork for ClosureWork { self.work } } -// SAFETY: TODO. +// SAFETY: The `__enqueue` implementation in RawWorkItem uses a `work_struct` initialized with the +// `run` method of this trait as the function pointer because: +// - `__enqueue` gets the `work_struct` from the `Work` field, using `T::raw_get_work`. +// - The only safe way to create a `Work` object is through `Work::new`. +// - `Work::new` makes sure that `T::Pointer::run` is passed to `init_work_with_key`. +// - Finally `Work` and `RawWorkItem` guarantee that the correct `Work` field +// will be used because of the ID const generic bound. This makes sure that `T::raw_get_work` +// uses the correct offset for the `Work` field, and `Work::new` picks the correct +// implementation of `WorkItemPointer` for `Arc`. unsafe impl WorkItemPointer for Arc where T: WorkItem, @@ -537,7 +545,13 @@ where } } -// SAFETY: TODO. +// SAFETY: The `work_struct` raw pointer is guaranteed to be valid for the duration of the call to +// the closure because we get it from an `Arc`, which means that the ref count will be at least 1, +// and we don't drop the `Arc` ourselves. 
If `queue_work_on` returns true, it is further guaranteed +// to be valid until a call to the function pointer in `work_struct` because we leak the memory it +// points to, and only reclaim it if the closure returns false, or in `WorkItemPointer::run`, which +// is what the function pointer in the `work_struct` must be pointing to, according to the safety +// requirements of `WorkItemPointer`. unsafe impl RawWorkItem for Arc where T: WorkItem, From 6fe437cfe2cdc797b03f63b338a13fac96ed6a08 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Tue, 3 Dec 2024 14:31:08 +0000 Subject: [PATCH 023/807] firmware: arm_ffa: Fix the race around setting ffa_dev->properties Currently, ffa_dev->properties is set after the ffa_device_register() call returns in ffa_setup_partitions(). This could potentially result in a race where the partition's properties are accessed while probing struct ffa_device before they are set. Update the ffa_device_register() to receive ffa_partition_info so all the data from the partition information received from the firmware can be updated into the struct ffa_device before calling device_register() in ffa_device_register().
Fixes: e781858488b9 ("firmware: arm_ffa: Add initial FFA bus support for device enumeration") Signed-off-by: Levi Yun Message-Id: <20241203143109.1030514-2-yeoreum.yun@arm.com> Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/bus.c | 15 +++++++++++---- drivers/firmware/arm_ffa/driver.c | 7 +------ include/linux/arm_ffa.h | 13 ++++++++----- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/firmware/arm_ffa/bus.c b/drivers/firmware/arm_ffa/bus.c index eb17d03b66fe..dfda5ffc14db 100644 --- a/drivers/firmware/arm_ffa/bus.c +++ b/drivers/firmware/arm_ffa/bus.c @@ -187,13 +187,18 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) return valid; } -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops) +struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops) { int id, ret; + uuid_t uuid; struct device *dev; struct ffa_device *ffa_dev; + if (!part_info) + return NULL; + id = ida_alloc_min(&ffa_bus_id, 1, GFP_KERNEL); if (id < 0) return NULL; @@ -210,9 +215,11 @@ struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, dev_set_name(&ffa_dev->dev, "arm-ffa-%d", id); ffa_dev->id = id; - ffa_dev->vm_id = vm_id; + ffa_dev->vm_id = part_info->id; + ffa_dev->properties = part_info->properties; ffa_dev->ops = ops; - uuid_copy(&ffa_dev->uuid, uuid); + import_uuid(&uuid, (u8 *)part_info->uuid); + uuid_copy(&ffa_dev->uuid, &uuid); ret = device_register(&ffa_dev->dev); if (ret) { diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c index b14cbdae94e8..2c2ec3c35f15 100644 --- a/drivers/firmware/arm_ffa/driver.c +++ b/drivers/firmware/arm_ffa/driver.c @@ -1387,7 +1387,6 @@ static struct notifier_block ffa_bus_nb = { static int ffa_setup_partitions(void) { int count, idx, ret; - uuid_t uuid; struct ffa_device *ffa_dev; struct ffa_dev_part_info *info; struct ffa_partition_info *pbuf, *tpbuf; @@ -1406,23 +1405,19 
@@ static int ffa_setup_partitions(void) xa_init(&drv_info->partition_info); for (idx = 0, tpbuf = pbuf; idx < count; idx++, tpbuf++) { - import_uuid(&uuid, (u8 *)tpbuf->uuid); - /* Note that if the UUID will be uuid_null, that will require * ffa_bus_notifier() to find the UUID of this partition id * with help of ffa_device_match_uuid(). FF-A v1.1 and above * provides UUID here for each partition as part of the * discovery API and the same is passed. */ - ffa_dev = ffa_device_register(&uuid, tpbuf->id, &ffa_drv_ops); + ffa_dev = ffa_device_register(tpbuf, &ffa_drv_ops); if (!ffa_dev) { pr_err("%s: failed to register partition ID 0x%x\n", __func__, tpbuf->id); continue; } - ffa_dev->properties = tpbuf->properties; - if (drv_info->version > FFA_VERSION_1_0 && !(tpbuf->properties & FFA_PARTITION_AARCH64_EXEC)) ffa_mode_32bit_set(ffa_dev); diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index a28e2a6a13d0..74169dd0f659 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -166,9 +166,12 @@ static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) return dev_get_drvdata(&fdev->dev); } +struct ffa_partition_info; + #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops); +struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops); void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); @@ -176,9 +179,9 @@ void ffa_driver_unregister(struct ffa_driver *driver); bool ffa_device_is_valid(struct ffa_device *ffa_dev); #else -static inline -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops) +static inline struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops) { return NULL; } From 
ac1e21bd8c883aeac2f1835fc93b39c1e6838b35 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 3 Dec 2024 09:44:06 +0800 Subject: [PATCH 024/807] jbd2: increase IO priority for writing revoke records Commit '6a3afb6ac6df ("jbd2: increase the journal IO's priority")' increases the priority of journal I/O by marking I/O with the JBD2_JOURNAL_REQ_FLAGS. However, that commit missed the revoke buffers, so address that kind of I/O as well. Fixes: 6a3afb6ac6df ("jbd2: increase the journal IO's priority") Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20241203014407.805916-2-yi.zhang@huaweicloud.com Reviewed-by: Kemeng Shi Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/jbd2/revoke.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 4556e4689024..ce63d5fde9c3 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -654,7 +654,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, REQ_SYNC); + write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS); } #endif From a0851ea9cd555c333795b85ddd908898b937c4e1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 3 Dec 2024 09:44:07 +0800 Subject: [PATCH 025/807] jbd2: flush filesystem device before updating tail sequence When committing a transaction in jbd2_journal_commit_transaction(), the disk caches for the filesystem device should be flushed before updating the journal tail sequence. However, this step is missed if the journal is not located on the filesystem device. As a result, the filesystem may become inconsistent following a power failure or system crash. Fix it by ensuring that the filesystem device is flushed appropriately.
Fixes: 3339578f0578 ("jbd2: cleanup journal tail after transaction commit") Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20241203014407.805916-3-yi.zhang@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/jbd2/commit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 9153ff3a08e7..e8e80761ac73 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -772,9 +772,9 @@ start_journal_io: /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue - * the commit record + * the commit record and update the journal tail sequence. */ - if (commit_transaction->t_need_data_flush && + if ((commit_transaction->t_need_data_flush || update_tail) && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev); From 790fb9956eead785b720ccc0851f09a5ca3a093e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Dec 2024 09:20:04 -0800 Subject: [PATCH 026/807] linux/dmaengine.h: fix a few kernel-doc warnings The comment block for "Interleaved Transfer Request" should not begin with "/**" since it is not in kernel-doc format. Fix doc name for enum sum_check_flags. Fix all (4) missing struct member warnings. Use "Warning:" for one "Note:" in enum dma_desc_metadata_mode since scripts/kernel-doc does not allow more than one Note: per function or identifier description. This leaves around 49 kernel-doc warnings like: include/linux/dmaengine.h:43: warning: Enum value 'DMA_OUT_OF_ORDER' not described in enum 'dma_status' and another scripts/kernel-doc problem with it not being able to parse some typedefs. 
Fixes: b14dab792dee ("DMAEngine: Define interleaved transfer request api") Fixes: ad283ea4a3ce ("async_tx: add sum check flags") Fixes: 272420214d26 ("dmaengine: Add DMA_CTRL_REUSE") Fixes: f067025bc676 ("dmaengine: add support to provide error result from a DMA transation") Fixes: d38a8c622a1b ("dmaengine: prepare for generic 'unmap' data") Fixes: 5878853fc938 ("dmaengine: Add API function dmaengine_prep_peripheral_dma_vec()") Signed-off-by: Randy Dunlap Cc: Dan Williams Cc: Dave Jiang Cc: Paul Cercueil Cc: Nuno Sa Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Link: https://lore.kernel.org/r/20241202172004.76020-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index b137fdb56093..346251bf1026 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -84,7 +84,7 @@ enum dma_transfer_direction { DMA_TRANS_NONE, }; -/** +/* * Interleaved Transfer Request * ---------------------------- * A chunk is collection of contiguous bytes to be transferred. @@ -223,7 +223,7 @@ enum sum_check_bits { }; /** - * enum pq_check_flags - result of async_{xor,pq}_zero_sum operations + * enum sum_check_flags - result of async_{xor,pq}_zero_sum operations * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise */ @@ -286,7 +286,7 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; * pointer to the engine's metadata area * 4. Read out the metadata from the pointer * - * Note: the two mode is not compatible and clients must use one mode for a + * Warning: the two modes are not compatible and clients must use one mode for a * descriptor. 
*/ enum dma_desc_metadata_mode { @@ -594,9 +594,13 @@ struct dma_descriptor_metadata_ops { * @phys: physical address of the descriptor * @chan: target channel for this operation * @tx_submit: accept the descriptor, assign ordered cookie and mark the + * @desc_free: driver's callback function to free a resusable descriptor + * after completion * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete + * @callback_result: error result from a DMA transaction * @callback_param: general parameter to pass to the callback routine + * @unmap: hook for generic DMA unmap data * @desc_metadata_mode: core managed metadata mode to protect mixed use of * DESC_METADATA_CLIENT or DESC_METADATA_ENGINE. Otherwise * DESC_METADATA_NONE @@ -827,6 +831,9 @@ struct dma_filter { * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_memset_sg: prepares a memset operation over a scatter list * @device_prep_dma_interrupt: prepares an end of chain interrupt operation + * @device_prep_peripheral_dma_vec: prepares a scatter-gather DMA transfer, + * where the address and size of each segment is located in one entry of + * the dma_vec array. * @device_prep_slave_sg: prepares a slave dma operation * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. * The function takes a buffer of size buf_len. The callback function will From 8d55e8a16f019211163f1180fd9f9fbe05901900 Mon Sep 17 00:00:00 2001 From: Sasha Finkelstein Date: Sun, 24 Nov 2024 16:48:28 +0100 Subject: [PATCH 027/807] dmaengine: apple-admac: Avoid accessing registers in probe The ADMAC attached to the AOP has complex power sequencing, and is power gated when the probe callback runs. Move the register reads to other functions, where we can guarantee that the hardware is switched on. 
Fixes: 568aa6dd641f ("dmaengine: apple-admac: Allocate cache SRAM to channels") Signed-off-by: Sasha Finkelstein Link: https://lore.kernel.org/r/20241124-admac-power-v1-1-58f2165a4d55@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/apple-admac.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/dma/apple-admac.c b/drivers/dma/apple-admac.c index c499173d80b2..bd49f0374291 100644 --- a/drivers/dma/apple-admac.c +++ b/drivers/dma/apple-admac.c @@ -153,6 +153,8 @@ static int admac_alloc_sram_carveout(struct admac_data *ad, { struct admac_sram *sram; int i, ret = 0, nblocks; + ad->txcache.size = readl_relaxed(ad->base + REG_TX_SRAM_SIZE); + ad->rxcache.size = readl_relaxed(ad->base + REG_RX_SRAM_SIZE); if (dir == DMA_MEM_TO_DEV) sram = &ad->txcache; @@ -912,12 +914,7 @@ static int admac_probe(struct platform_device *pdev) goto free_irq; } - ad->txcache.size = readl_relaxed(ad->base + REG_TX_SRAM_SIZE); - ad->rxcache.size = readl_relaxed(ad->base + REG_RX_SRAM_SIZE); - dev_info(&pdev->dev, "Audio DMA Controller\n"); - dev_info(&pdev->dev, "imprint %x TX cache %u RX cache %u\n", - readl_relaxed(ad->base + REG_IMPRINT), ad->txcache.size, ad->rxcache.size); return 0; From dcbef0798eb825cd584f7a93f62bed63f7fbbfc9 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 18 Sep 2024 11:10:22 -0700 Subject: [PATCH 028/807] dmaengine: amd: qdma: Remove using the private get and set dma_ops APIs The get_dma_ops and set_dma_ops APIs were never for driver to use. Remove these calls from QDMA driver. Instead, pass the DMA device pointer from the qdma_platdata structure. 
Fixes: 73d5fc92a11c ("dmaengine: amd: qdma: Add AMD QDMA driver") Signed-off-by: Lizhi Hou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240918181022.2155715-1-lizhi.hou@amd.com Signed-off-by: Vinod Koul --- drivers/dma/amd/qdma/qdma.c | 28 +++++++++++--------------- include/linux/platform_data/amd_qdma.h | 2 ++ 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/dma/amd/qdma/qdma.c b/drivers/dma/amd/qdma/qdma.c index 6d9079458fe9..66f00ad67351 100644 --- a/drivers/dma/amd/qdma/qdma.c +++ b/drivers/dma/amd/qdma/qdma.c @@ -7,9 +7,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -492,18 +492,9 @@ static int qdma_device_verify(struct qdma_device *qdev) static int qdma_device_setup(struct qdma_device *qdev) { - struct device *dev = &qdev->pdev->dev; u32 ring_sz = QDMA_DEFAULT_RING_SIZE; int ret = 0; - while (dev && get_dma_ops(dev)) - dev = dev->parent; - if (!dev) { - qdma_err(qdev, "dma device not found"); - return -EINVAL; - } - set_dma_ops(&qdev->pdev->dev, get_dma_ops(dev)); - ret = qdma_setup_fmap_context(qdev); if (ret) { qdma_err(qdev, "Failed setup fmap context"); @@ -548,11 +539,12 @@ static void qdma_free_queue_resources(struct dma_chan *chan) { struct qdma_queue *queue = to_qdma_queue(chan); struct qdma_device *qdev = queue->qdev; - struct device *dev = qdev->dma_dev.dev; + struct qdma_platdata *pdata; qdma_clear_queue_context(queue); vchan_free_chan_resources(&queue->vchan); - dma_free_coherent(dev, queue->ring_size * QDMA_MM_DESC_SIZE, + pdata = dev_get_platdata(&qdev->pdev->dev); + dma_free_coherent(pdata->dma_dev, queue->ring_size * QDMA_MM_DESC_SIZE, queue->desc_base, queue->dma_desc_base); } @@ -565,6 +557,7 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) struct qdma_queue *queue = to_qdma_queue(chan); struct qdma_device *qdev = queue->qdev; struct qdma_ctxt_sw_desc desc; + struct qdma_platdata *pdata; size_t size; int ret; @@ -572,8 +565,9 @@ 
static int qdma_alloc_queue_resources(struct dma_chan *chan) if (ret) return ret; + pdata = dev_get_platdata(&qdev->pdev->dev); size = queue->ring_size * QDMA_MM_DESC_SIZE; - queue->desc_base = dma_alloc_coherent(qdev->dma_dev.dev, size, + queue->desc_base = dma_alloc_coherent(pdata->dma_dev, size, &queue->dma_desc_base, GFP_KERNEL); if (!queue->desc_base) { @@ -588,7 +582,7 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) if (ret) { qdma_err(qdev, "Failed to setup SW desc ctxt for %s", chan->name); - dma_free_coherent(qdev->dma_dev.dev, size, queue->desc_base, + dma_free_coherent(pdata->dma_dev, size, queue->desc_base, queue->dma_desc_base); return ret; } @@ -948,8 +942,9 @@ static int qdma_init_error_irq(struct qdma_device *qdev) static int qdmam_alloc_qintr_rings(struct qdma_device *qdev) { - u32 ctxt[QDMA_CTXT_REGMAP_LEN]; + struct qdma_platdata *pdata = dev_get_platdata(&qdev->pdev->dev); struct device *dev = &qdev->pdev->dev; + u32 ctxt[QDMA_CTXT_REGMAP_LEN]; struct qdma_intr_ring *ring; struct qdma_ctxt_intr intr_ctxt; u32 vector; @@ -969,7 +964,8 @@ static int qdmam_alloc_qintr_rings(struct qdma_device *qdev) ring->msix_id = qdev->err_irq_idx + i + 1; ring->ridx = i; ring->color = 1; - ring->base = dmam_alloc_coherent(dev, QDMA_INTR_RING_SIZE, + ring->base = dmam_alloc_coherent(pdata->dma_dev, + QDMA_INTR_RING_SIZE, &ring->dev_base, GFP_KERNEL); if (!ring->base) { qdma_err(qdev, "Failed to alloc intr ring %d", i); diff --git a/include/linux/platform_data/amd_qdma.h b/include/linux/platform_data/amd_qdma.h index 576d952f97ed..967a6ef31cf9 100644 --- a/include/linux/platform_data/amd_qdma.h +++ b/include/linux/platform_data/amd_qdma.h @@ -26,11 +26,13 @@ struct dma_slave_map; * @max_mm_channels: Maximum number of MM DMA channels in each direction * @device_map: DMA slave map * @irq_index: The index of first IRQ + * @dma_dev: The device pointer for dma operations */ struct qdma_platdata { u32 max_mm_channels; u32 irq_index; struct dma_slave_map 
*device_map; + struct device *dma_dev; }; #endif /* _PLATDATA_AMD_QDMA_H */ From eb867d797d294a00a092b5027d08439da68940b2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 Nov 2024 15:10:31 +0200 Subject: [PATCH 029/807] RDMA/bnxt_re: Remove always true dattr validity check res->dattr is always valid at this point as it was initialized during device addition in bnxt_re_add_device(). This change is fixing the following smatch error: drivers/infiniband/hw/bnxt_re/qplib_fp.c:1090 bnxt_qplib_create_qp() error: we previously assumed 'res->dattr' could be null (see line 985) Fixes: 07f830ae4913 ("RDMA/bnxt_re: Adds MSN table capability for Gen P7 adapters") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202411222329.YTrwonWi-lkp@intel.com/ Link: https://patch.msgid.link/be0d8836b64cba3e479fbcbca717acad04aae02e.1732626579.git.leonro@nvidia.com Acked-by: Selvin Xavier Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index e42abf5be6c0..9af8aaadc99a 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1000,9 +1000,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) u32 tbl_indx; u16 nsge; - if (res->dattr) - qp->is_host_msn_tbl = _is_host_msn_table(res->dattr->dev_cap_flags2); - + qp->is_host_msn_tbl = _is_host_msn_table(res->dattr->dev_cap_flags2); sq->dbinfo.flags = 0; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_QP, From 0a92ea87bdd6f77ca4e17fe19649882cf5209edd Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Thu, 24 Oct 2024 14:35:40 -0700 Subject: [PATCH 030/807] phy: usb: Toggle the PHY power during init When bringing up the PHY, it might be in a bad state if left powered. 
One case is we lose the PLL lock if the PLL is gated while the PHY is powered. Toggle the PHY power so we can start from a known state. Fixes: 4e5b9c9a73b3 ("phy: usb: Add support for new Synopsys USB controller on the 7216") Signed-off-by: Justin Chen Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20241024213540.1059412-1-justin.chen@broadcom.com Signed-off-by: Vinod Koul --- drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c b/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c index 950b7ae1d1a8..dc452610934a 100644 --- a/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c +++ b/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c @@ -325,6 +325,12 @@ static void usb_init_common_7216(struct brcm_usb_init_params *params) void __iomem *ctrl = params->regs[BRCM_REGS_CTRL]; USB_CTRL_UNSET(ctrl, USB_PM, XHC_S2_CLK_SWITCH_EN); + + /* + * The PHY might be in a bad state if it is already powered + * up. Toggle the power just in case. + */ + USB_CTRL_SET(ctrl, USB_PM, USB_PWRDN); USB_CTRL_UNSET(ctrl, USB_PM, USB_PWRDN); /* 1 millisecond - for USB clocks to settle down */ From fbcbffbac994aca1264e3c14da96ac9bfd90466e Mon Sep 17 00:00:00 2001 From: Chukun Pan Date: Fri, 22 Nov 2024 15:30:06 +0800 Subject: [PATCH 031/807] phy: rockchip: naneng-combphy: fix phy reset Currently, the USB port via combophy on the RK3528/RK3588 SoC is broken. usb usb8-port1: Cannot enable. Maybe the USB cable is bad? This is due to the combphy of RK3528/RK3588 SoC has multiple resets, but only "phy resets" need assert and deassert, "apb resets" don't need. So change the driver to only match the phy resets, which is also what the vendor kernel does. 
Fixes: 7160820d742a ("phy: rockchip: add naneng combo phy for RK3568") Cc: FUKAUMI Naoki Cc: Michael Zimmermann Signed-off-by: Chukun Pan Reviewed-by: Heiko Stuebner Tested-by: FUKAUMI Naoki Link: https://lore.kernel.org/r/20241122073006.99309-2-amadeus@jmu.edu.cn Signed-off-by: Vinod Koul --- drivers/phy/rockchip/phy-rockchip-naneng-combphy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c b/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c index 0a9989e41237..2eb3329ca23f 100644 --- a/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c +++ b/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c @@ -309,7 +309,7 @@ static int rockchip_combphy_parse_dt(struct device *dev, struct rockchip_combphy priv->ext_refclk = device_property_present(dev, "rockchip,ext-refclk"); - priv->phy_rst = devm_reset_control_array_get_exclusive(dev); + priv->phy_rst = devm_reset_control_get(dev, "phy"); if (IS_ERR(priv->phy_rst)) return dev_err_probe(dev, PTR_ERR(priv->phy_rst), "failed to get phy reset\n"); From d0257e089d1bbd35c69b6c97ff73e3690ab149a9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 30 Nov 2024 13:06:41 +0300 Subject: [PATCH 032/807] RDMA/uverbs: Prevent integer overflow issue In the expression "cmd.wqe_size * cmd.wr_count", both variables are u32 values that come from the user so the multiplication can lead to integer wrapping. Then we pass the result to uverbs_request_next_ptr() which also could potentially wrap. The "cmd.sge_count * sizeof(struct ib_uverbs_sge)" multiplication can also overflow on 32bit systems although it's fine on 64bit systems. This patch does two things. First, I've re-arranged the condition in uverbs_request_next_ptr() so that the user-controlled variable "len" is on one side of the comparison by itself without any math. Then I've modified all the callers to use size_mul() for the multiplications.
Fixes: 67cdb40ca444 ("[IB] uverbs: Implement more commands") Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/b8765ab3-c2da-4611-aae0-ddd6ba173d23@stanley.mountain Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_cmd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 66b02fbf077a..5ad14c39d48c 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -161,7 +161,7 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter, { const void __user *res = iter->cur; - if (iter->cur + len > iter->end) + if (len > iter->end - iter->cur) return (void __force __user *)ERR_PTR(-ENOSPC); iter->cur += len; return res; @@ -2008,11 +2008,13 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd)); if (ret) return ret; - wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count); + wqes = uverbs_request_next_ptr(&iter, size_mul(cmd.wqe_size, + cmd.wr_count)); if (IS_ERR(wqes)) return PTR_ERR(wqes); - sgls = uverbs_request_next_ptr( - &iter, cmd.sge_count * sizeof(struct ib_uverbs_sge)); + sgls = uverbs_request_next_ptr(&iter, + size_mul(cmd.sge_count, + sizeof(struct ib_uverbs_sge))); if (IS_ERR(sgls)) return PTR_ERR(sgls); ret = uverbs_request_finish(&iter); @@ -2198,11 +2200,11 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, if (wqe_size < sizeof(struct ib_uverbs_recv_wr)) return ERR_PTR(-EINVAL); - wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count); + wqes = uverbs_request_next_ptr(iter, size_mul(wqe_size, wr_count)); if (IS_ERR(wqes)) return ERR_CAST(wqes); - sgls = uverbs_request_next_ptr( - iter, sge_count * sizeof(struct ib_uverbs_sge)); + sgls = uverbs_request_next_ptr(iter, size_mul(sge_count, + sizeof(struct ib_uverbs_sge))); if 
(IS_ERR(sgls)) return ERR_CAST(sgls); ret = uverbs_request_finish(iter); From 8886fb3240931a0afce82dea87edfe46bcb0a586 Mon Sep 17 00:00:00 2001 From: Krishna Kurapati Date: Tue, 12 Nov 2024 14:58:31 +0530 Subject: [PATCH 033/807] phy: qcom-qmp: Fix register name in RX Lane config of SC8280XP In RX Lane configuration sequence of SC8280XP, the register V5_RX_UCDR_FO_GAIN is incorrectly spelled as RX_UCDR_SO_GAIN and hence the programming sequence is wrong. Fix the register sequence accordingly to avoid any compliance failures. This has been tested on SA8775P by checking device mode enumeration in SuperSpeed. Cc: stable@vger.kernel.org Fixes: c0c7769cdae2 ("phy: qcom-qmp: Add SC8280XP USB3 UNI phy") Signed-off-by: Krishna Kurapati Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20241112092831.4110942-1-quic_kriskura@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-usb.c b/drivers/phy/qualcomm/phy-qcom-qmp-usb.c index acd6075bf6d9..c9c337840715 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-usb.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-usb.c @@ -1052,7 +1052,7 @@ static const struct qmp_phy_init_tbl sc8280xp_usb3_uniphy_rx_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FASTLOCK_FO_GAIN, 0x2f), QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FASTLOCK_COUNT_LOW, 0xff), QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FASTLOCK_COUNT_HIGH, 0x0f), - QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_SO_GAIN, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FO_GAIN, 0x0a), QMP_PHY_INIT_CFG(QSERDES_V5_RX_VGA_CAL_CNTRL1, 0x54), QMP_PHY_INIT_CFG(QSERDES_V5_RX_VGA_CAL_CNTRL2, 0x0f), QMP_PHY_INIT_CFG(QSERDES_V5_RX_RX_EQU_ADAPTOR_CNTRL2, 0x0f), From 2de679ecd724b823c2cb58caab8508c7eec8aefc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Nov 2024 11:37:02 +0100 Subject: [PATCH 034/807] phy: stm32: work around constant-value overflow assertion FIELD_PREP() checks that a 
constant fits into the available bitfield, but if one of the two lookup tables in stm32_impedance_tune() does not find a matching entry, the index is out of range, which gcc correctly complains about: In file included from : In function 'stm32_impedance_tune', inlined from 'stm32_combophy_pll_init' at drivers/phy/st/phy-stm32-combophy.c:247:9: include/linux/compiler_types.h:517:38: error: call to '__compiletime_assert_447' declared with attribute error: FIELD_PREP: value too large for the field 517 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^ include/linux/bitfield.h:68:3: note: in expansion of macro 'BUILD_BUG_ON_MSG' 68 | BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ 115 | __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \ | ^~~~~~~~~~~~~~~~ drivers/phy/st/phy-stm32-combophy.c:162:8: note: in expansion of macro 'FIELD_PREP' 162 | FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of)); | ^~~~~~~~~~ Rework this so the field value gets set inside of the loop and otherwise set to zero. 
Fixes: 47e1bb6b4ba0 ("phy: stm32: Add support for STM32MP25 COMBOPHY.") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241111103712.3520611-1-arnd@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/st/phy-stm32-combophy.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/phy/st/phy-stm32-combophy.c b/drivers/phy/st/phy-stm32-combophy.c index 765bb34fe358..49e9fa90a681 100644 --- a/drivers/phy/st/phy-stm32-combophy.c +++ b/drivers/phy/st/phy-stm32-combophy.c @@ -122,6 +122,7 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) u32 max_vswing = imp_lookup[imp_size - 1].vswing[vswing_size - 1]; u32 min_vswing = imp_lookup[0].vswing[0]; u32 val; + u32 regval; if (!of_property_read_u32(combophy->dev->of_node, "st,output-micro-ohms", &val)) { if (val < min_imp || val > max_imp) { @@ -129,16 +130,20 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) return -EINVAL; } - for (imp_of = 0; imp_of < ARRAY_SIZE(imp_lookup); imp_of++) - if (imp_lookup[imp_of].microohm <= val) + regval = 0; + for (imp_of = 0; imp_of < ARRAY_SIZE(imp_lookup); imp_of++) { + if (imp_lookup[imp_of].microohm <= val) { + regval = FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_OHM, imp_of); break; + } + } dev_dbg(combophy->dev, "Set %u micro-ohms output impedance\n", imp_lookup[imp_of].microohm); regmap_update_bits(combophy->regmap, SYSCFG_PCIEPRGCR, STM32MP25_PCIEPRG_IMPCTRL_OHM, - FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_OHM, imp_of)); + regval); } else { regmap_read(combophy->regmap, SYSCFG_PCIEPRGCR, &val); imp_of = FIELD_GET(STM32MP25_PCIEPRG_IMPCTRL_OHM, val); @@ -150,16 +155,20 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) return -EINVAL; } - for (vswing_of = 0; vswing_of < ARRAY_SIZE(imp_lookup[imp_of].vswing); vswing_of++) - if (imp_lookup[imp_of].vswing[vswing_of] >= val) + regval = 0; + for (vswing_of = 0; vswing_of < ARRAY_SIZE(imp_lookup[imp_of].vswing); vswing_of++) { + if 
(imp_lookup[imp_of].vswing[vswing_of] >= val) { + regval = FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of); break; + } + } dev_dbg(combophy->dev, "Set %u microvolt swing\n", imp_lookup[imp_of].vswing[vswing_of]); regmap_update_bits(combophy->regmap, SYSCFG_PCIEPRGCR, STM32MP25_PCIEPRG_IMPCTRL_VSWING, - FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of)); + regval); } return 0; From ef7009decc30eb2515a64253791d61b72229c119 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Thu, 21 Nov 2024 21:40:17 +0000 Subject: [PATCH 035/807] selftests/sched_ext: fix build after renames in sched_ext API The selftests are failing to build on current tip of bpf-next and sched_ext [1]. This has broken BPF CI [2] after merge from upstream. Use appropriate function names in the selftests according to the recent changes in the sched_ext API [3]. [1] https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=fc39fb56917bb3cb53e99560ca3612a84456ada2 [2] https://github.com/kernel-patches/bpf/actions/runs/11959327258/job/33340923745 [3] https://lore.kernel.org/all/20241109194853.580310-1-tj@kernel.org/ Signed-off-by: Ihor Solodrai Acked-by: Andrea Righi Acked-by: David Vernet Signed-off-by: Tejun Heo --- .../testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c | 2 +- .../selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 2 +- .../selftests/sched_ext/enq_select_cpu_fails.bpf.c | 2 +- tools/testing/selftests/sched_ext/exit.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/maximal.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c | 2 +- .../selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c | 2 +- .../testing/selftests/sched_ext/select_cpu_dispatch.bpf.c | 2 +- .../selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c | 2 +- .../selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c | 8 ++++---- 12 files changed, 19
insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c index 37d9bf6fb745..6f4c3f5a1c5d 100644 --- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c @@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, * If we dispatch to a bogus DSQ that will fall back to the * builtin global DSQ, we fail gracefully. */ - scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + scx_bpf_dsq_insert_vtime(p, 0xcafef00d, SCX_SLICE_DFL, p->scx.dsq_vtime, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c index dffc97d9cdf1..e4a55027778f 100644 --- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c @@ -17,8 +17,8 @@ s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, if (cpu >= 0) { /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. 
*/ - scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, - p->scx.dsq_vtime, 0); + scx_bpf_dsq_insert_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index 6a7db1502c29..6325bf76f47e 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -45,7 +45,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) target = bpf_get_prandom_u32() % nr_cpus; - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); bpf_task_release(p); } diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c index 1efb50d61040..a7cf868d5e31 100644 --- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c @@ -31,7 +31,7 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, /* Can only call from ops.select_cpu() */ scx_bpf_select_cpu_dfl(p, 0, 0, &found); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } SEC(".struct_ops.link") diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c index d75d4faf07f6..4bc36182d3ff 100644 --- a/tools/testing/selftests/sched_ext/exit.bpf.c +++ b/tools/testing/selftests/sched_ext/exit.bpf.c @@ -33,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) if (exit_point == EXIT_ENQUEUE) EXIT_CLEANLY(); - scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) @@ -41,7 +41,7 @@ void 
BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) if (exit_point == EXIT_DISPATCH) EXIT_CLEANLY(); - scx_bpf_consume(DSQ_ID); + scx_bpf_dsq_move_to_local(DSQ_ID); } void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c index 4d4cd8d966db..4c005fa71810 100644 --- a/tools/testing/selftests/sched_ext/maximal.bpf.c +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) { - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) @@ -28,7 +28,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) { - scx_bpf_consume(SCX_DSQ_GLOBAL); + scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); } void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c index f171ac470970..13d0f5be788d 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c @@ -30,7 +30,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, } scx_bpf_put_idle_cpumask(idle_mask); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } SEC(".struct_ops.link") diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c index 9efdbb7da928..815f1d5d61ac 100644 --- 
a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c @@ -67,7 +67,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, saw_local = true; } - scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); } s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c index 59bfc4f36167..4bb99699e920 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c @@ -29,7 +29,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, cpu = prev_cpu; dispatch: - scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c index 3bbd5fcdfb18..2a75de11b2cf 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c @@ -18,7 +18,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p s32 prev_cpu, u64 wake_flags) { /* Dispatching to a random DSQ should fail. 
*/ - scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, 0xcafef00d, SCX_SLICE_DFL, 0); return prev_cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c index 0fda57fe0ecf..99d075695c97 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c @@ -18,8 +18,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p s32 prev_cpu, u64 wake_flags) { /* Dispatching twice in a row is disallowed. */ - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); return prev_cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c index e6c67bcf5e6e..bfcb96cd4954 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c @@ -2,8 +2,8 @@ /* * A scheduler that validates that enqueue flags are properly stored and * applied at dispatch time when a task is directly dispatched from - * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and - * making the test a very basic vtime scheduler. + * ops.select_cpu(). We validate this by using scx_bpf_dsq_insert_vtime(), + * and making the test a very basic vtime scheduler. * * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
* Copyright (c) 2024 David Vernet @@ -47,13 +47,13 @@ s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, cpu = prev_cpu; scx_bpf_test_and_clear_cpu_idle(cpu); ddsp: - scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + scx_bpf_dsq_insert_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); return cpu; } void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) { - if (scx_bpf_consume(VTIME_DSQ)) + if (scx_bpf_dsq_move_to_local(VTIME_DSQ)) consumed = true; } From f24d192985cbd6782850fdbb3839039da2f0ee76 Mon Sep 17 00:00:00 2001 From: guanjing Date: Sun, 17 Nov 2024 10:51:29 +0800 Subject: [PATCH 036/807] sched_ext: fix application of sizeof to pointer sizeof when applied to a pointer typed expression gives the size of the pointer. The proper fix in this particular case is to code sizeof(*cpuset) instead of sizeof(cpuset). This issue was detected with the help of Coccinelle. Fixes: 22a920209ab6 ("sched_ext: Implement tickless support") Signed-off-by: guanjing Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/scx_central.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 21deea320bd7..e938156ed0a0 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -97,7 +97,7 @@ restart: SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); CPU_ZERO(cpuset); CPU_SET(skel->rodata->central_cpu, cpuset); - SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), + SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset), "Failed to affinitize to central CPU %d (max %d)", skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); CPU_FREE(cpuset); From e05feab22fd7dabcd6d272c4e2401ec1acdfdb9b Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Tue, 3 Dec 2024 15:45:37 +0200 Subject: [PATCH 037/807] RDMA/mlx5: Enforce same type port association for multiport RoCE Different core device types such as 
PFs and VFs shouldn't be affiliated together since they have different capabilities, fix that by enforcing type check before doing the affiliation. Fixes: 32f69e4be269 ("{net, IB}/mlx5: Manage port association for multiport RoCE") Reviewed-by: Mark Bloch Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/88699500f690dff1c1852c1ddb71f8a1cc8b956e.1733233480.git.leonro@nvidia.com Reviewed-by: Mateusz Polchlopek Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 6 ++++-- include/linux/mlx5/driver.h | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bc7930d0c564..c2314797afc9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3639,7 +3639,8 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, list) { if (dev->sys_image_guid == mpi->sys_image_guid && - (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + (mlx5_core_native_port_num(mpi->mdev) - 1) == i && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) { bound = mlx5_ib_bind_slave_port(dev, mpi); } @@ -4785,7 +4786,8 @@ static int mlx5r_mp_probe(struct auxiliary_device *adev, mutex_lock(&mlx5_ib_multiport_mutex); list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { - if (dev->sys_image_guid == mpi->sys_image_guid) + if (dev->sys_image_guid == mpi->sys_image_guid && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) bound = mlx5_ib_bind_slave_port(dev, mpi); if (bound) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..4f9e6f6dbaab 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1202,6 +1202,12 @@ static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_VF; } +static inline bool mlx5_core_same_coredev_type(const struct mlx5_core_dev *dev1, + 
const struct mlx5_core_dev *dev2) +{ + return dev1->coredev_type == dev2->coredev_type; +} + static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; From 79d330fbdffd8cee06d8bdf38d82cb62d8363a27 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Wed, 4 Dec 2024 13:24:12 +0530 Subject: [PATCH 038/807] RDMA/bnxt_re: Fix max SGEs for the Work Request Gen P7 supports up to 13 SGEs for now. WQE software structure can hold only 6 now. Since the max send sge is reported as 13, the stack can give requests up to 13 SGEs. This is causing traffic failures and system crashes. Use the define for max SGE supported for variable size. This will work for both static and variable WQEs. Fixes: 227f51743b61 ("RDMA/bnxt_re: Fix the max WQE size for static WQE support") Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-2-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index ef3424c81345..19e279871f10 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -114,7 +114,6 @@ struct bnxt_qplib_sge { u32 size; }; -#define BNXT_QPLIB_QP_MAX_SGL 6 struct bnxt_qplib_swq { u64 wr_id; int next_idx; @@ -154,7 +153,7 @@ struct bnxt_qplib_swqe { #define BNXT_QPLIB_SWQE_FLAGS_UC_FENCE BIT(2) #define BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT BIT(3) #define BNXT_QPLIB_SWQE_FLAGS_INLINE BIT(4) - struct bnxt_qplib_sge sg_list[BNXT_QPLIB_QP_MAX_SGL]; + struct bnxt_qplib_sge sg_list[BNXT_VAR_MAX_SGE]; int num_sge; /* Max inline data is 96 bytes */ u32 inline_len; From 5effcacc8a8f3eb2a9f069d7e81a9ac793598dfb Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Wed, 4 Dec 2024 13:24:13 +0530 Subject: [PATCH 039/807] RDMA/bnxt_re: Avoid initializing 
the software queue for user queues Software Queues to hold the WRs needs to be created for only kernel queues. Avoid allocating the unnecessary memory for user Queues. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Fixes: 159fb4ceacd7 ("RDMA/bnxt_re: introduce a function to allocate swq") Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-3-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 42 +++++++++++++----------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 9af8aaadc99a..72f35070f671 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -659,13 +659,6 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, rc = bnxt_qplib_alloc_init_hwq(&srq->hwq, &hwq_attr); if (rc) return rc; - - srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), - GFP_KERNEL); - if (!srq->swq) { - rc = -ENOMEM; - goto fail; - } srq->dbinfo.flags = 0; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_SRQ, @@ -694,9 +687,17 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, spin_lock_init(&srq->lock); srq->start_idx = 0; srq->last_idx = srq->hwq.max_elements - 1; - for (idx = 0; idx < srq->hwq.max_elements; idx++) - srq->swq[idx].next_idx = idx + 1; - srq->swq[srq->last_idx].next_idx = -1; + if (!srq->hwq.is_user) { + srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), + GFP_KERNEL); + if (!srq->swq) { + rc = -ENOMEM; + goto fail; + } + for (idx = 0; idx < srq->hwq.max_elements; idx++) + srq->swq[idx].next_idx = idx + 1; + srq->swq[srq->last_idx].next_idx = -1; + } srq->id = le32_to_cpu(resp.xid); srq->dbinfo.hwq = &srq->hwq; @@ -1042,13 +1043,14 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (rc) return rc; - rc = 
bnxt_qplib_alloc_init_swq(sq); - if (rc) - goto fail_sq; - - if (psn_sz) - bnxt_qplib_init_psn_ptr(qp, psn_sz); + if (!sq->hwq.is_user) { + rc = bnxt_qplib_alloc_init_swq(sq); + if (rc) + goto fail_sq; + if (psn_sz) + bnxt_qplib_init_psn_ptr(qp, psn_sz); + } req.sq_size = cpu_to_le32(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); pbl = &sq->hwq.pbl[PBL_LVL_0]; req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); @@ -1074,9 +1076,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr); if (rc) goto sq_swq; - rc = bnxt_qplib_alloc_init_swq(rq); - if (rc) - goto fail_rq; + if (!rq->hwq.is_user) { + rc = bnxt_qplib_alloc_init_swq(rq); + if (rc) + goto fail_rq; + } req.rq_size = cpu_to_le32(rq->max_wqe); pbl = &rq->hwq.pbl[PBL_LVL_0]; From 064c22408a73b9e945139b64614c534cbbefb591 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Wed, 4 Dec 2024 13:24:14 +0530 Subject: [PATCH 040/807] RDMA/bnxt_re: Avoid sending the modify QP workaround for latest adapters The workaround to modify the UD QP from RTS to RTS is required only for older adapters. Issuing this for latest adapters can cause some unexpected behavior.
Fix it Fixes: 1801d87b3598 ("RDMA/bnxt_re: Support new 5760X P7 devices") Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-4-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 82023394e330..5428a1408cee 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2824,7 +2824,8 @@ bad: wr = wr->next; } bnxt_qplib_post_send_db(&qp->qplib_qp); - bnxt_ud_qp_hw_stall_workaround(qp); + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->rdev->chip_ctx)) + bnxt_ud_qp_hw_stall_workaround(qp); spin_unlock_irqrestore(&qp->sq_lock, flags); return rc; } @@ -2936,7 +2937,8 @@ bad: wr = wr->next; } bnxt_qplib_post_send_db(&qp->qplib_qp); - bnxt_ud_qp_hw_stall_workaround(qp); + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->rdev->chip_ctx)) + bnxt_ud_qp_hw_stall_workaround(qp); spin_unlock_irqrestore(&qp->sq_lock, flags); return rc; From d507d29bfde3fee6a74d098a9ac640b8fc1a549b Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 4 Dec 2024 13:24:16 +0530 Subject: [PATCH 041/807] RDMA/bnxt_re: Don't fail destroy QP and cleanup debugfs earlier Change bnxt_re_destroy_qp to always return 0 and don't fail in case of error during destroy. In addition, delete debugfs QP to earlier stage. 
Fixes: d7d54769c042 ("RDMA/bnxt_re: Add debugfs hook in the driver") Reviewed-by: Kashyap Desai Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-6-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 5428a1408cee..215074c0860b 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -967,13 +967,13 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) unsigned int flags; int rc; + bnxt_re_debug_rem_qpinfo(rdev, qp); + bnxt_qplib_flush_cqn_wq(&qp->qplib_qp); rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); - if (rc) { + if (rc) ibdev_err(&rdev->ibdev, "Failed to destroy HW QP"); - return rc; - } if (rdma_is_kernel_res(&qp->ib_qp.res)) { flags = bnxt_re_lock_cqs(qp); @@ -983,11 +983,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp); - if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) { - rc = bnxt_re_destroy_gsi_sqp(qp); - if (rc) - return rc; - } + if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) + bnxt_re_destroy_gsi_sqp(qp); mutex_lock(&rdev->qp_lock); list_del(&qp->list); @@ -998,8 +995,6 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) else if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD) atomic_dec(&rdev->stats.res.ud_qp_count); - bnxt_re_debug_rem_qpinfo(rdev, qp); - ib_umem_release(qp->rumem); ib_umem_release(qp->sumem); From d8e4771f99c0400a1873235704b28bb803c83d17 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 23 Oct 2024 11:40:56 +0300 Subject: [PATCH 042/807] mtd: rawnand: fix double free in atmel_pmecc_create_user() The "user" pointer was converted from being allocated 
with kzalloc() to being allocated by devm_kzalloc(). Calling kfree(user) will lead to a double free. Fixes: 6d734f1bfc33 ("mtd: rawnand: atmel: Fix possible memory leak") Signed-off-by: Dan Carpenter Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/atmel/pmecc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/mtd/nand/raw/atmel/pmecc.c b/drivers/mtd/nand/raw/atmel/pmecc.c index a22aab4ed4e8..3c7dee1be21d 100644 --- a/drivers/mtd/nand/raw/atmel/pmecc.c +++ b/drivers/mtd/nand/raw/atmel/pmecc.c @@ -380,10 +380,8 @@ atmel_pmecc_create_user(struct atmel_pmecc *pmecc, user->delta = user->dmu + req->ecc.strength + 1; gf_tables = atmel_pmecc_get_gf_tables(req); - if (IS_ERR(gf_tables)) { - kfree(user); + if (IS_ERR(gf_tables)) return ERR_CAST(gf_tables); - } user->gf_tables = gf_tables; From 9b458e8be0d13e81ed03fffa23f8f9b528bbd786 Mon Sep 17 00:00:00 2001 From: Zichen Xie Date: Wed, 23 Oct 2024 16:13:10 -0500 Subject: [PATCH 043/807] mtd: diskonchip: Cast an operand to prevent potential overflow There may be a potential integer overflow issue in inftl_partscan(). parts[0].size is defined as "uint64_t" while mtd->erasesize and ip->firstUnit are defined as 32-bit unsigned integer. The result of the calculation will be limited to 32 bits without correct casting. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Zichen Xie Cc: stable@vger.kernel.org Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/diskonchip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/diskonchip.c b/drivers/mtd/nand/raw/diskonchip.c index 8db7fc424571..70d6c2250f32 100644 --- a/drivers/mtd/nand/raw/diskonchip.c +++ b/drivers/mtd/nand/raw/diskonchip.c @@ -1098,7 +1098,7 @@ static inline int __init inftl_partscan(struct mtd_info *mtd, struct mtd_partiti (i == 0) && (ip->firstUnit > 0)) { parts[0].name = " DiskOnChip IPL / Media Header partition"; parts[0].offset = 0; - parts[0].size = mtd->erasesize * ip->firstUnit; + parts[0].size = (uint64_t)mtd->erasesize * ip->firstUnit; numparts = 1; } From b086a46dae48829e11c0c02580e30d920b76743c Mon Sep 17 00:00:00 2001 From: Maciej Andrzejewski Date: Mon, 2 Dec 2024 13:51:07 +0100 Subject: [PATCH 044/807] mtd: rawnand: arasan: Fix double assertion of chip-select When two chip-selects are configured in the device tree, and the second is a non-native GPIO, both the GPIO-based chip-select and the first native chip-select may be asserted simultaneously. This double assertion causes incorrect read and write operations. The issue occurs because when nfc->ncs <= 2, nfc->spare_cs is always initialized to 0 due to static initialization. Consequently, when the second chip-select (GPIO-based) is selected in anfc_assert_cs(), it is detected by anfc_is_gpio_cs(), and nfc->native_cs is assigned the value 0. This results in both the GPIO-based chip-select being asserted and the NAND controller register receiving 0, erroneously selecting the native chip-select. This patch resolves the issue, as confirmed by oscilloscope testing with configurations involving two or more chip-selects in the device tree. 
Fixes: acbd3d0945f9 ("mtd: rawnand: arasan: Leverage additional GPIO CS") Cc: stable@vger.kernel.org Signed-off-by: Maciej Andrzejewski Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/arasan-nand-controller.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/arasan-nand-controller.c b/drivers/mtd/nand/raw/arasan-nand-controller.c index db42aa0c7b6b..26b506107a1a 100644 --- a/drivers/mtd/nand/raw/arasan-nand-controller.c +++ b/drivers/mtd/nand/raw/arasan-nand-controller.c @@ -1409,8 +1409,8 @@ static int anfc_parse_cs(struct arasan_nfc *nfc) * case, the "not" chosen CS is assigned to nfc->spare_cs and selected * whenever a GPIO CS must be asserted. */ - if (nfc->cs_array && nfc->ncs > 2) { - if (!nfc->cs_array[0] && !nfc->cs_array[1]) { + if (nfc->cs_array) { + if (nfc->ncs > 2 && !nfc->cs_array[0] && !nfc->cs_array[1]) { dev_err(nfc->dev, "Assign a single native CS when using GPIOs\n"); return -EINVAL; From 11e6831fd81468cf48155b9b3c11295c391da723 Mon Sep 17 00:00:00 2001 From: Maciej Andrzejewski Date: Mon, 2 Dec 2024 19:58:36 +0100 Subject: [PATCH 045/807] mtd: rawnand: arasan: Fix missing de-registration of NAND The NAND chip-selects are registered for the Arasan driver during initialization but are not de-registered when the driver is unloaded. As a result, if the driver is loaded again, the chip-selects remain registered and busy, making them unavailable for use. 
Fixes: 197b88fecc50 ("mtd: rawnand: arasan: Add new Arasan NAND controller") Cc: stable@vger.kernel.org Signed-off-by: Maciej Andrzejewski ICEYE Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/arasan-nand-controller.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/mtd/nand/raw/arasan-nand-controller.c b/drivers/mtd/nand/raw/arasan-nand-controller.c index 26b506107a1a..865754737f5f 100644 --- a/drivers/mtd/nand/raw/arasan-nand-controller.c +++ b/drivers/mtd/nand/raw/arasan-nand-controller.c @@ -1478,8 +1478,15 @@ static int anfc_probe(struct platform_device *pdev) static void anfc_remove(struct platform_device *pdev) { + int i; struct arasan_nfc *nfc = platform_get_drvdata(pdev); + for (i = 0; i < nfc->ncs; i++) { + if (nfc->cs_array[i]) { + gpiod_put(nfc->cs_array[i]); + } + } + anfc_chips_cleanup(nfc); } From 140054a25f85036ec847e722c76cc1bfaf3f0d96 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Tue, 3 Dec 2024 15:33:17 +0200 Subject: [PATCH 046/807] mtd: rawnand: omap2: Fix build warnings with W=1 Add kernel-doc for functions to get rid of below warnings when built with W=1. 
drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:467: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_out_dma_pref' drivers/mtd/nand/raw/omap2.c:467: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_out_dma_pref' drivers/mtd/nand/raw/omap2.c:467: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_out_dma_pref' drivers/mtd/nand/raw/omap2.c:467: 
warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_out_dma_pref' Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412031716.JfNIh1Uu-lkp@intel.com/ Signed-off-by: Roger Quadros Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/omap2.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/mtd/nand/raw/omap2.c b/drivers/mtd/nand/raw/omap2.c index d9141f3c0dd1..b8af3a3533fc 100644 --- a/drivers/mtd/nand/raw/omap2.c +++ b/drivers/mtd/nand/raw/omap2.c @@ -254,6 +254,10 @@ static int omap_prefetch_reset(int cs, struct omap_nand_info *info) /** * omap_nand_data_in_pref - NAND data in using prefetch engine + * @chip: NAND chip + * @buf: output buffer where NAND data is placed into + * @len: length of transfer + * @force_8bit: force 8-bit transfers */ static void omap_nand_data_in_pref(struct nand_chip *chip, void *buf, unsigned int len, bool force_8bit) @@ -297,6 +301,10 @@ static void omap_nand_data_in_pref(struct nand_chip *chip, void *buf, /** * omap_nand_data_out_pref - NAND data out using Write Posting engine + * @chip: NAND chip + * @buf: input buffer that is sent to NAND + * @len: length of transfer + * @force_8bit: force 8-bit transfers */ static void omap_nand_data_out_pref(struct nand_chip *chip, const void *buf, unsigned int len, @@ -440,6 +448,10 @@ out_copy: /** * omap_nand_data_in_dma_pref - NAND data in using DMA and Prefetch + * @chip: NAND chip + * @buf: output buffer where NAND data is placed into + * @len: length of transfer + * @force_8bit: force 8-bit transfers */ static void omap_nand_data_in_dma_pref(struct nand_chip *chip, void *buf, unsigned int len, bool force_8bit) @@ -460,6 +472,10 @@ static void omap_nand_data_in_dma_pref(struct nand_chip *chip, void *buf, /** * omap_nand_data_out_dma_pref - NAND data out using DMA and write posting + * @chip: NAND chip + * @buf: input buffer that is sent to NAND + * @len: length of transfer + * @force_8bit: force 
8-bit transfers */ static void omap_nand_data_out_dma_pref(struct nand_chip *chip, const void *buf, unsigned int len, From 4f776d81bf927a4f25d5e32a4d0df08ee509dd6c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V (Arm)" Date: Thu, 28 Nov 2024 20:55:43 +0530 Subject: [PATCH 047/807] arm64: dts: fvp: Update PCIe bus-range property These days, the Fixed Virtual Platforms(FVP) Base RevC model supports more PCI devices. Update the max bus number so that Linux can enumerate them correctly. Without this, the kernel throws the below error while booting with the default hierarchy | pci_bus 0000:01: busn_res: [bus 01] end is updated to 01 | pci_bus 0000:02: busn_res: can not insert [bus 02-01] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:02: busn_res: [bus 02-01] end is updated to 02 | pci_bus 0000:02: busn_res: can not insert [bus 02] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:03: busn_res: can not insert [bus 03-01] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:03: busn_res: [bus 03-01] end is updated to 03 | pci_bus 0000:03: busn_res: can not insert [bus 03] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:04: busn_res: can not insert [bus 04-01] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:04: busn_res: [bus 04-01] end is updated to 04 | pci_bus 0000:04: busn_res: can not insert [bus 04] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci 0000:00:01.0: BAR 14: assigned [mem 0x50000000-0x500fffff] | pci-host-generic 40000000.pci: ECAM at [mem 0x40000000-0x4fffffff] | for [bus 00-01] The change is using 0xff as max bus number because the ECAM window is 256MB in size. 
Below is the lspci output with and without the change: without fix =========== | 00:00.0 Host bridge: ARM Device 00ba (rev 01) | 00:01.0 PCI bridge: ARM Device 0def | 00:02.0 PCI bridge: ARM Device 0def | 00:03.0 PCI bridge: ARM Device 0def | 00:04.0 PCI bridge: ARM Device 0def | 00:1e.0 Unassigned class [ff00]: ARM Device ff80 | 00:1e.1 Unassigned class [ff00]: ARM Device ff80 | 00:1f.0 SATA controller: Device 0abc:aced (rev 01) | 01:00.0 SATA controller: Device 0abc:aced (rev 01) with fix ======== | 00:00.0 Host bridge: ARM Device 00ba (rev 01) | 00:01.0 PCI bridge: ARM Device 0def | 00:02.0 PCI bridge: ARM Device 0def | 00:03.0 PCI bridge: ARM Device 0def | 00:04.0 PCI bridge: ARM Device 0def | 00:1e.0 Unassigned class [ff00]: ARM Device ff80 | 00:1e.1 Unassigned class [ff00]: ARM Device ff80 | 00:1f.0 SATA controller: Device 0abc:aced (rev 01) | 01:00.0 SATA controller: Device 0abc:aced (rev 01) | 02:00.0 Unassigned class [ff00]: ARM Device ff80 | 02:00.4 Unassigned class [ff00]: ARM Device ff80 | 03:00.0 PCI bridge: ARM Device 0def | 04:00.0 PCI bridge: ARM Device 0def | 04:01.0 PCI bridge: ARM Device 0def | 04:02.0 PCI bridge: ARM Device 0def | 05:00.0 SATA controller: Device 0abc:aced (rev 01) | 06:00.0 Unassigned class [ff00]: ARM Device ff80 | 06:00.7 Unassigned class [ff00]: ARM Device ff80 | 07:00.0 Unassigned class [ff00]: ARM Device ff80 | 07:00.3 Unassigned class [ff00]: ARM Device ff80 | 08:00.0 Unassigned class [ff00]: ARM Device ff80 | 08:00.1 Unassigned class [ff00]: ARM Device ff80 Cc: Sudeep Holla Cc: Lorenzo Pieralisi Cc: Rob Herring Cc: Krzysztof Kozlowski Cc: Conor Dooley Reviewed-by: Liviu Dudau Signed-off-by: Aneesh Kumar K.V (Arm) Message-Id: <20241128152543.1821878-1-aneesh.kumar@kernel.org> Signed-off-by: Sudeep Holla --- arch/arm64/boot/dts/arm/fvp-base-revc.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/arm/fvp-base-revc.dts b/arch/arm64/boot/dts/arm/fvp-base-revc.dts index 
19973ab4ea6b..9e10d7a6b5a2 100644 --- a/arch/arm64/boot/dts/arm/fvp-base-revc.dts +++ b/arch/arm64/boot/dts/arm/fvp-base-revc.dts @@ -233,7 +233,7 @@ #interrupt-cells = <0x1>; compatible = "pci-host-ecam-generic"; device_type = "pci"; - bus-range = <0x0 0x1>; + bus-range = <0x0 0xff>; reg = <0x0 0x40000000 0x0 0x10000000>; ranges = <0x2000000 0x0 0x50000000 0x0 0x50000000 0x0 0x10000000>; interrupt-map = <0 0 0 1 &gic 0 0 GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>, From 48808b55b07c3cea64805267a5547f03e6452a9f Mon Sep 17 00:00:00 2001 From: Valentina Fernandez Date: Mon, 18 Nov 2024 15:53:54 +0000 Subject: [PATCH 048/807] firmware: microchip: fix UL_IAP lock check in mpfs_auto_update_state() To verify that Auto Update is possible, the mpfs_auto_update_state() function performs a "Query Security Service Request" to the system controller. Previously, the check was performed on the first element of the response message, which was accessed using a 32-bit pointer. This caused the bitwise operation to reference incorrect data, as the response should be inspected at the byte level. Fixed this by casting the response to a u8 * pointer, ensuring the check correctly inspects the appropriate byte of the response message. Additionally, rename "UL_Auto Update" to "UL_IAP" to match the PolarFire Family System Services User Guide. 
Signed-off-by: Valentina Fernandez Signed-off-by: Conor Dooley --- drivers/firmware/microchip/mpfs-auto-update.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/microchip/mpfs-auto-update.c b/drivers/firmware/microchip/mpfs-auto-update.c index 38a03698cec9..e194f7acb2a9 100644 --- a/drivers/firmware/microchip/mpfs-auto-update.c +++ b/drivers/firmware/microchip/mpfs-auto-update.c @@ -402,10 +402,10 @@ static int mpfs_auto_update_available(struct mpfs_auto_update_priv *priv) return -EIO; /* - * Bit 5 of byte 1 is "UL_Auto Update" & if it is set, Auto Update is + * Bit 5 of byte 1 is "UL_IAP" & if it is set, Auto Update is * not possible. */ - if (response_msg[1] & AUTO_UPDATE_FEATURE_ENABLED) + if ((((u8 *)response_msg)[1] & AUTO_UPDATE_FEATURE_ENABLED)) return -EPERM; return 0; From c0599762f0c7e260b99c6b7bceb8eae69b804c94 Mon Sep 17 00:00:00 2001 From: Zicheng Qu Date: Thu, 7 Nov 2024 01:10:14 +0000 Subject: [PATCH 049/807] staging: iio: ad9834: Correct phase range check User Perspective: When a user sets the phase value, the ad9834_write_phase() is called. The phase register has a 12-bit resolution, so the valid range is 0 to 4095. If the phase offset value of 4096 is input, it effectively exactly equals 0 in the lower 12 bits, meaning no offset. Reasons for the Change: 1) Original Condition (phase > BIT(AD9834_PHASE_BITS)): This condition allows a phase value equal to 2^12, which is 4096. However, this value exceeds the valid 12-bit range, as the maximum valid phase value should be 4095. 2) Modified Condition (phase >= BIT(AD9834_PHASE_BITS)): Ensures that the phase value is within the valid range, preventing invalid data from being written. Impact on Subsequent Logic: st->data = cpu_to_be16(addr | phase): If the phase value is 2^12, i.e., 4096 (0001 0000 0000 0000), and addr is AD9834_REG_PHASE0 (1100 0000 0000 0000), then addr | phase results in 1101 0000 0000 0000, occupying DB12.
According to the section of WRITING TO A PHASE REGISTER in the datasheet, the MSB 12 PHASE0 bits should be DB11. The original condition leads to incorrect DB12 usage, which contradicts the datasheet and could pose potential issues for future updates if DB12 is used in such related cases. Fixes: 12b9d5bf76bf ("Staging: IIO: DDS: AD9833 / AD9834 driver") Cc: stable@vger.kernel.org Signed-off-by: Zicheng Qu Reviewed-by: Dan Carpenter Link: https://patch.msgid.link/20241107011015.2472600-2-quzicheng@huawei.com Signed-off-by: Jonathan Cameron --- drivers/staging/iio/frequency/ad9834.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/iio/frequency/ad9834.c b/drivers/staging/iio/frequency/ad9834.c index 47e7d7e6d920..6e99e008c5f4 100644 --- a/drivers/staging/iio/frequency/ad9834.c +++ b/drivers/staging/iio/frequency/ad9834.c @@ -131,7 +131,7 @@ static int ad9834_write_frequency(struct ad9834_state *st, static int ad9834_write_phase(struct ad9834_state *st, unsigned long addr, unsigned long phase) { - if (phase > BIT(AD9834_PHASE_BITS)) + if (phase >= BIT(AD9834_PHASE_BITS)) return -EINVAL; st->data = cpu_to_be16(addr | phase); From 4636e859ebe0011f41e35fa79bab585b8004e9a3 Mon Sep 17 00:00:00 2001 From: Zicheng Qu Date: Thu, 7 Nov 2024 01:10:15 +0000 Subject: [PATCH 050/807] staging: iio: ad9832: Correct phase range check User Perspective: When a user sets the phase value, the ad9832_write_phase() is called. The phase register has a 12-bit resolution, so the valid range is 0 to 4095. If the phase offset value of 4096 is input, it effectively exactly equals 0 in the lower 12 bits, meaning no offset. Reasons for the Change: 1) Original Condition (phase > BIT(AD9832_PHASE_BITS)): This condition allows a phase value equal to 2^12, which is 4096. However, this value exceeds the valid 12-bit range, as the maximum valid phase value should be 4095. 
2) Modified Condition (phase >= BIT(AD9832_PHASE_BITS)): Ensures that the phase value is within the valid range, preventing invalid data from being written. Impact on Subsequent Logic: st->data = cpu_to_be16(addr | phase): If the phase value is 2^12, i.e., 4096 (0001 0000 0000 0000), and addr is AD9832_REG_PHASE0 (1100 0000 0000 0000), then addr | phase results in 1101 0000 0000 0000, occupying DB12. According to the section of WRITING TO A PHASE REGISTER in the datasheet, the MSB 12 PHASE0 bits should be DB11. The original condition leads to incorrect DB12 usage, which contradicts the datasheet and could pose potential issues for future updates if DB12 is used in such related cases. Fixes: ea707584bac1 ("Staging: IIO: DDS: AD9832 / AD9835 driver") Cc: stable@vger.kernel.org Signed-off-by: Zicheng Qu Link: https://patch.msgid.link/20241107011015.2472600-3-quzicheng@huawei.com Signed-off-by: Jonathan Cameron --- drivers/staging/iio/frequency/ad9832.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/iio/frequency/ad9832.c b/drivers/staging/iio/frequency/ad9832.c index 492612e8f8ba..140ee4f9c137 100644 --- a/drivers/staging/iio/frequency/ad9832.c +++ b/drivers/staging/iio/frequency/ad9832.c @@ -158,7 +158,7 @@ static int ad9832_write_frequency(struct ad9832_state *st, static int ad9832_write_phase(struct ad9832_state *st, unsigned long addr, unsigned long phase) { - if (phase > BIT(AD9832_PHASE_BITS)) + if (phase >= BIT(AD9832_PHASE_BITS)) return -EINVAL; st->phase_data[0] = cpu_to_be16((AD9832_CMD_PHA8BITSW << CMD_SHIFT) | From 4be339af334c283a1a1af3cb28e7e448a0aa8a7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Nov 2024 11:19:04 +0100 Subject: [PATCH 051/807] iio: adc: ad7124: Disable all channels at probe time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When during a measurement two channels are enabled, two measurements are done that are reported
sequentially in the DATA register. As the code triggered by reading one of the sysfs properties expects that only one channel is enabled it only reads the first data set which might or might not belong to the intended channel. To prevent this situation disable all channels during probe. This fixes a problem in practice because the reset default for channel 0 is enabled. So all measurements before the first measurement on channel 0 (which disables channel 0 at the end) might report wrong values. Fixes: 7b8d045e497a ("iio: adc: ad7124: allow more than 8 channels") Reviewed-by: Nuno Sa Signed-off-by: Uwe Kleine-König Link: https://patch.msgid.link/20241104101905.845737-2-u.kleine-koenig@baylibre.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7124.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c index 7314fb32bdec..3d678c420cbf 100644 --- a/drivers/iio/adc/ad7124.c +++ b/drivers/iio/adc/ad7124.c @@ -917,6 +917,9 @@ static int ad7124_setup(struct ad7124_state *st) * set all channels to this default value. */ ad7124_set_channel_odr(st, i, 10); + + /* Disable all channels to prevent unintended conversions. */ + ad_sd_write_reg(&st->sd, AD7124_CHANNEL(i), 2, 0); } ret = ad_sd_write_reg(&st->sd, AD7124_ADC_CONTROL, 2, st->adc_control); From aaa90d0751071d38f3e74c4e14bc1083abcb0c15 Mon Sep 17 00:00:00 2001 From: Pei Xiao Date: Wed, 30 Oct 2024 11:48:54 +0800 Subject: [PATCH 052/807] iio: test : check null return of kunit_kmalloc in iio_rescale_test_scale kunit_kmalloc may fail, return value might be NULL and will cause NULL pointer dereference. Add KUNIT_ASSERT_NOT_ERR_OR_NULL to fix it.
Signed-off-by: Pei Xiao Fixes: 8e74a48d17d5 ("iio: test: add basic tests for the iio-rescale driver") Link: https://patch.msgid.link/ecd56a85e54a96c2f0313c114075a21a76071ea2.1730259869.git.xiaopei01@kylinos.cn Signed-off-by: Jonathan Cameron --- drivers/iio/test/iio-test-rescale.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/iio/test/iio-test-rescale.c b/drivers/iio/test/iio-test-rescale.c index cbf13337ed1f..bbc6a2e1c2c1 100644 --- a/drivers/iio/test/iio-test-rescale.c +++ b/drivers/iio/test/iio-test-rescale.c @@ -652,6 +652,8 @@ static void iio_rescale_test_scale(struct kunit *test) int rel_ppm; int ret; + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buff); + rescale.numerator = t->numerator; rescale.denominator = t->denominator; rescale.offset = t->offset; @@ -681,6 +683,8 @@ static void iio_rescale_test_offset(struct kunit *test) int values[2]; int ret; + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buff_off); + rescale.numerator = t->numerator; rescale.denominator = t->denominator; rescale.offset = t->offset; From fa13ac6cdf9b6c358e7d77c29fb60145c7a87965 Mon Sep 17 00:00:00 2001 From: Carlos Song Date: Sat, 16 Nov 2024 10:29:45 -0500 Subject: [PATCH 053/807] iio: gyro: fxas21002c: Fix missing data update in trigger handler The fxas21002c_trigger_handler() may fail to acquire sample data because the runtime PM enters the autosuspend state and sensor cannot return sample data in standby mode. Resume the sensor before reading the sample data into the buffer within the trigger handler. After the data is read, place the sensor back into the autosuspend state.
Fixes: a0701b6263ae ("iio: gyro: add core driver for fxas21002c") Signed-off-by: Carlos Song Signed-off-by: Frank Li Link: https://patch.msgid.link/20241116152945.4006374-1-Frank.Li@nxp.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/gyro/fxas21002c_core.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/iio/gyro/fxas21002c_core.c b/drivers/iio/gyro/fxas21002c_core.c index 0391c78c2f18..754c8a564ba4 100644 --- a/drivers/iio/gyro/fxas21002c_core.c +++ b/drivers/iio/gyro/fxas21002c_core.c @@ -730,14 +730,21 @@ static irqreturn_t fxas21002c_trigger_handler(int irq, void *p) int ret; mutex_lock(&data->lock); - ret = regmap_bulk_read(data->regmap, FXAS21002C_REG_OUT_X_MSB, - data->buffer, CHANNEL_SCAN_MAX * sizeof(s16)); + ret = fxas21002c_pm_get(data); if (ret < 0) goto out_unlock; + ret = regmap_bulk_read(data->regmap, FXAS21002C_REG_OUT_X_MSB, + data->buffer, CHANNEL_SCAN_MAX * sizeof(s16)); + if (ret < 0) + goto out_pm_put; + iio_push_to_buffers_with_timestamp(indio_dev, data->buffer, data->timestamp); +out_pm_put: + fxas21002c_pm_put(data); + out_unlock: mutex_unlock(&data->lock); From c0f866de4ce447bca3191b9cefac60c4b36a7922 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Maneyrol Date: Tue, 12 Nov 2024 10:30:10 +0100 Subject: [PATCH 054/807] iio: imu: inv_icm42600: fix spi burst write not supported Burst write with SPI is not working for all icm42600 chips. It was only used for setting user offsets with regmap_bulk_write. Add specific SPI regmap config for using only single write with SPI. 
Fixes: 9f9ff91b775b ("iio: imu: inv_icm42600: add SPI driver for inv_icm42600 driver") Cc: stable@vger.kernel.org Signed-off-by: Jean-Baptiste Maneyrol Link: https://patch.msgid.link/20241112-inv-icm42600-fix-spi-burst-write-not-supported-v2-1-97690dc03607@tdk.com Signed-off-by: Jonathan Cameron --- drivers/iio/imu/inv_icm42600/inv_icm42600.h | 1 + drivers/iio/imu/inv_icm42600/inv_icm42600_core.c | 15 +++++++++++++++ drivers/iio/imu/inv_icm42600/inv_icm42600_spi.c | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600.h b/drivers/iio/imu/inv_icm42600/inv_icm42600.h index 3a07e43e4cf1..18787a43477b 100644 --- a/drivers/iio/imu/inv_icm42600/inv_icm42600.h +++ b/drivers/iio/imu/inv_icm42600/inv_icm42600.h @@ -403,6 +403,7 @@ struct inv_icm42600_sensor_state { typedef int (*inv_icm42600_bus_setup)(struct inv_icm42600_state *); extern const struct regmap_config inv_icm42600_regmap_config; +extern const struct regmap_config inv_icm42600_spi_regmap_config; extern const struct dev_pm_ops inv_icm42600_pm_ops; const struct iio_mount_matrix * diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c index 561d245c1d64..e43538e536f0 100644 --- a/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c +++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c @@ -87,6 +87,21 @@ const struct regmap_config inv_icm42600_regmap_config = { }; EXPORT_SYMBOL_NS_GPL(inv_icm42600_regmap_config, "IIO_ICM42600"); +/* define specific regmap for SPI not supporting burst write */ +const struct regmap_config inv_icm42600_spi_regmap_config = { + .name = "inv_icm42600", + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x4FFF, + .ranges = inv_icm42600_regmap_ranges, + .num_ranges = ARRAY_SIZE(inv_icm42600_regmap_ranges), + .volatile_table = inv_icm42600_regmap_volatile_accesses, + .rd_noinc_table = inv_icm42600_regmap_rd_noinc_accesses, + .cache_type = REGCACHE_RBTREE, + .use_single_write = 
true, +}; +EXPORT_SYMBOL_NS_GPL(inv_icm42600_spi_regmap_config, "IIO_ICM42600"); + struct inv_icm42600_hw { uint8_t whoami; const char *name; diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_spi.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_spi.c index c55d8e672183..2bd2c4c8e50c 100644 --- a/drivers/iio/imu/inv_icm42600/inv_icm42600_spi.c +++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_spi.c @@ -59,7 +59,8 @@ static int inv_icm42600_probe(struct spi_device *spi) return -EINVAL; chip = (uintptr_t)match; - regmap = devm_regmap_init_spi(spi, &inv_icm42600_regmap_config); + /* use SPI specific regmap */ + regmap = devm_regmap_init_spi(spi, &inv_icm42600_spi_regmap_config); if (IS_ERR(regmap)) return PTR_ERR(regmap); From 65a60a590142c54a3f3be11ff162db2d5b0e1e06 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Maneyrol Date: Wed, 13 Nov 2024 21:25:45 +0100 Subject: [PATCH 055/807] iio: imu: inv_icm42600: fix timestamps after suspend if sensor is on Currently suspending while sensors are on will result in timestamping continuing without gap at resume. It can work with monotonic clock but not with other clocks. Fix that by resetting timestamping.
Fixes: ec74ae9fd37c ("iio: imu: inv_icm42600: add accurate timestamping") Cc: stable@vger.kernel.org Signed-off-by: Jean-Baptiste Maneyrol Link: https://patch.msgid.link/20241113-inv_icm42600-fix-timestamps-after-suspend-v1-1-dfc77c394173@tdk.com Signed-off-by: Jonathan Cameron --- drivers/iio/imu/inv_icm42600/inv_icm42600_core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c index e43538e536f0..ef9875d3b79d 100644 --- a/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c +++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c @@ -829,6 +829,8 @@ out_unlock: static int inv_icm42600_resume(struct device *dev) { struct inv_icm42600_state *st = dev_get_drvdata(dev); + struct inv_icm42600_sensor_state *gyro_st = iio_priv(st->indio_gyro); + struct inv_icm42600_sensor_state *accel_st = iio_priv(st->indio_accel); int ret; mutex_lock(&st->lock); @@ -849,9 +851,12 @@ static int inv_icm42600_resume(struct device *dev) goto out_unlock; /* restore FIFO data streaming */ - if (st->fifo.on) + if (st->fifo.on) { + inv_sensors_timestamp_reset(&gyro_st->ts); + inv_sensors_timestamp_reset(&accel_st->ts); ret = regmap_write(st->map, INV_ICM42600_REG_FIFO_CONFIG, INV_ICM42600_FIFO_CONFIG_STREAM); + } out_unlock: mutex_unlock(&st->lock); From dddfd0c489e9a3fde98fedda8832cb9ecaae3abf Mon Sep 17 00:00:00 2001 From: Trevor Gamblin Date: Wed, 13 Nov 2024 15:52:58 -0500 Subject: [PATCH 056/807] iio: adc: ad4695: fix buffered read, single sample timings Modify ad4695_buffer_preenable() by adding an extra SPI transfer after each data read to help ensure that the timing requirement between the last SCLK rising edge and the next CNV rising edge is met. This requires a restructure of the buf_read_xfer array in ad4695_state. Also define AD4695_T_SCK_CNV_DELAY_NS to use for each added transfer. 
Without this change it is possible for the data to become corrupted on sequential buffered reads due to the device not properly exiting conversion mode. Similarly, make adjustments to ad4695_read_one_sample() so that timings are respected, and clean up the function slightly in the process. Fixes: 6cc7e4bf2e08 ("iio: adc: ad4695: implement triggered buffer") Co-developed-by: David Lechner Signed-off-by: David Lechner Signed-off-by: Trevor Gamblin Reviewed-by: David Lechner Tested-by: David Lechner Link: https://patch.msgid.link/20241113-tgamblin-ad4695_improvements-v2-1-b6bb7c758fc4@baylibre.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad4695.c | 98 ++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 30 deletions(-) diff --git a/drivers/iio/adc/ad4695.c b/drivers/iio/adc/ad4695.c index 595ec4158e73..0146aed9069f 100644 --- a/drivers/iio/adc/ad4695.c +++ b/drivers/iio/adc/ad4695.c @@ -91,6 +91,7 @@ #define AD4695_T_WAKEUP_SW_MS 3 #define AD4695_T_REFBUF_MS 100 #define AD4695_T_REGCONFIG_NS 20 +#define AD4695_T_SCK_CNV_DELAY_NS 80 #define AD4695_REG_ACCESS_SCLK_HZ (10 * MEGA) /* Max number of voltage input channels. */ @@ -132,8 +133,13 @@ struct ad4695_state { unsigned int vref_mv; /* Common mode input pin voltage. */ unsigned int com_mv; - /* 1 per voltage and temperature chan plus 1 xfer to trigger 1st CNV */ - struct spi_transfer buf_read_xfer[AD4695_MAX_CHANNELS + 2]; + /* + * 2 per voltage and temperature chan plus 1 xfer to trigger 1st + * CNV. Excluding the trigger xfer, every 2nd xfer only serves + * to control CS and add a delay between the last SCLK and next + * CNV rising edges. + */ + struct spi_transfer buf_read_xfer[AD4695_MAX_CHANNELS * 2 + 3]; struct spi_message buf_read_msg; /* Raw conversion data received. 
*/ u8 buf[ALIGN((AD4695_MAX_CHANNELS + 2) * AD4695_MAX_CHANNEL_SIZE, @@ -423,7 +429,7 @@ static int ad4695_buffer_preenable(struct iio_dev *indio_dev) u8 temp_chan_bit = st->chip_info->num_voltage_inputs; u32 bit, num_xfer, num_slots; u32 temp_en = 0; - int ret; + int ret, rx_buf_offset = 0; /* * We are using the advanced sequencer since it is the only way to read @@ -449,11 +455,9 @@ static int ad4695_buffer_preenable(struct iio_dev *indio_dev) iio_for_each_active_channel(indio_dev, bit) { xfer = &st->buf_read_xfer[num_xfer]; xfer->bits_per_word = 16; - xfer->rx_buf = &st->buf[(num_xfer - 1) * 2]; + xfer->rx_buf = &st->buf[rx_buf_offset]; xfer->len = 2; - xfer->cs_change = 1; - xfer->cs_change_delay.value = AD4695_T_CONVERT_NS; - xfer->cs_change_delay.unit = SPI_DELAY_UNIT_NSECS; + rx_buf_offset += xfer->len; if (bit == temp_chan_bit) { temp_en = 1; @@ -468,21 +472,44 @@ static int ad4695_buffer_preenable(struct iio_dev *indio_dev) } num_xfer++; + + /* + * We need to add a blank xfer in data reads, to meet the timing + * requirement of a minimum delay between the last SCLK rising + * edge and the CS deassert. + */ + xfer = &st->buf_read_xfer[num_xfer]; + xfer->delay.value = AD4695_T_SCK_CNV_DELAY_NS; + xfer->delay.unit = SPI_DELAY_UNIT_NSECS; + xfer->cs_change = 1; + xfer->cs_change_delay.value = AD4695_T_CONVERT_NS; + xfer->cs_change_delay.unit = SPI_DELAY_UNIT_NSECS; + + num_xfer++; } /* * The advanced sequencer requires that at least 2 slots are enabled. * Since slot 0 is always used for other purposes, we need only 1 - * enabled voltage channel to meet this requirement. If the temperature - * channel is the only enabled channel, we need to add one more slot - * in the sequence but not read from it. + * enabled voltage channel to meet this requirement. If the temperature + * channel is the only enabled channel, we need to add one more slot in + * the sequence but not read from it. 
This is because the temperature + * sensor is sampled at the end of the channel sequence in advanced + * sequencer mode (see datasheet page 38). + * + * From the iio_for_each_active_channel() block above, we now have an + * xfer with data followed by a blank xfer to allow us to meet the + * timing spec, so move both of those up before adding an extra to + * handle the temperature-only case. */ if (num_slots < 2) { - /* move last xfer so we can insert one more xfer before it */ - st->buf_read_xfer[num_xfer] = *xfer; + /* Move last two xfers */ + st->buf_read_xfer[num_xfer] = st->buf_read_xfer[num_xfer - 1]; + st->buf_read_xfer[num_xfer - 1] = st->buf_read_xfer[num_xfer - 2]; num_xfer++; - /* modify 2nd to last xfer for extra slot */ + /* Modify inserted xfer for extra slot. */ + xfer = &st->buf_read_xfer[num_xfer - 3]; memset(xfer, 0, sizeof(*xfer)); xfer->cs_change = 1; xfer->delay.value = st->chip_info->t_acq_ns; @@ -499,6 +526,12 @@ static int ad4695_buffer_preenable(struct iio_dev *indio_dev) return ret; num_slots++; + + /* + * We still want to point at the last xfer when finished, so + * update the pointer. + */ + xfer = &st->buf_read_xfer[num_xfer - 1]; } /* @@ -583,8 +616,20 @@ out: */ static int ad4695_read_one_sample(struct ad4695_state *st, unsigned int address) { - struct spi_transfer xfer[2] = { }; - int ret, i = 0; + struct spi_transfer xfers[2] = { + { + .speed_hz = AD4695_REG_ACCESS_SCLK_HZ, + .bits_per_word = 16, + .tx_buf = &st->cnv_cmd, + .len = 2, + }, + { + /* Required delay between last SCLK and CNV/CS */ + .delay.value = AD4695_T_SCK_CNV_DELAY_NS, + .delay.unit = SPI_DELAY_UNIT_NSECS, + } + }; + int ret; ret = ad4695_set_single_cycle_mode(st, address); if (ret) @@ -592,29 +637,22 @@ static int ad4695_read_one_sample(struct ad4695_state *st, unsigned int address) /* * Setting the first channel to the temperature channel isn't supported - * in single-cycle mode, so we have to do an extra xfer to read the - * temperature. 
+ * in single-cycle mode, so we have to do an extra conversion to read + * the temperature. */ if (address == AD4695_CMD_TEMP_CHAN) { - /* We aren't reading, so we can make this a short xfer. */ - st->cnv_cmd2 = AD4695_CMD_TEMP_CHAN << 3; - xfer[0].tx_buf = &st->cnv_cmd2; - xfer[0].len = 1; - xfer[0].cs_change = 1; - xfer[0].cs_change_delay.value = AD4695_T_CONVERT_NS; - xfer[0].cs_change_delay.unit = SPI_DELAY_UNIT_NSECS; + st->cnv_cmd = AD4695_CMD_TEMP_CHAN << 11; - i = 1; + ret = spi_sync_transfer(st->spi, xfers, ARRAY_SIZE(xfers)); + if (ret) + return ret; } /* Then read the result and exit conversion mode. */ st->cnv_cmd = AD4695_CMD_EXIT_CNV_MODE << 11; - xfer[i].bits_per_word = 16; - xfer[i].tx_buf = &st->cnv_cmd; - xfer[i].rx_buf = &st->raw_data; - xfer[i].len = 2; + xfers[0].rx_buf = &st->raw_data; - return spi_sync_transfer(st->spi, xfer, i + 1); + return spi_sync_transfer(st->spi, xfers, ARRAY_SIZE(xfers)); } static int ad4695_read_raw(struct iio_dev *indio_dev, From ad8479ac083b841da42975d79288b25c088c5cc3 Mon Sep 17 00:00:00 2001 From: Olivier Moysan Date: Thu, 14 Nov 2024 11:24:59 +0100 Subject: [PATCH 057/807] iio: adc: stm32-dfsdm: handle label as an optional property The label property is defined as optional in the DFSDM binding. Parse the label property only when it is defined in the device tree. 
Fixes: 3208fa0cd919 ("iio: adc: stm32-dfsdm: adopt generic channels bindings") Signed-off-by: Olivier Moysan Link: https://patch.msgid.link/20241114102459.2497178-1-olivier.moysan@foss.st.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/stm32-dfsdm-adc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/iio/adc/stm32-dfsdm-adc.c b/drivers/iio/adc/stm32-dfsdm-adc.c index 1f9eca2fb2bf..fe11b0d8eab3 100644 --- a/drivers/iio/adc/stm32-dfsdm-adc.c +++ b/drivers/iio/adc/stm32-dfsdm-adc.c @@ -691,11 +691,14 @@ static int stm32_dfsdm_generic_channel_parse_of(struct stm32_dfsdm *dfsdm, return -EINVAL; } - ret = fwnode_property_read_string(node, "label", &ch->datasheet_name); - if (ret < 0) { - dev_err(&indio_dev->dev, - " Error parsing 'label' for idx %d\n", ch->channel); - return ret; + if (fwnode_property_present(node, "label")) { + /* label is optional */ + ret = fwnode_property_read_string(node, "label", &ch->datasheet_name); + if (ret < 0) { + dev_err(&indio_dev->dev, + " Error parsing 'label' for idx %d\n", ch->channel); + return ret; + } } df_ch = &dfsdm->ch_list[ch->channel]; From bcb394bb28e55312cace75362b8e489eb0e02a30 Mon Sep 17 00:00:00 2001 From: Charles Han Date: Mon, 18 Nov 2024 17:02:08 +0800 Subject: [PATCH 058/807] iio: adc: ti-ads1298: Add NULL check in ads1298_init devm_kasprintf() can return a NULL pointer on failure. A check on the return value of such a call in ads1298_init() is missing. Add it. 
Fixes: 00ef7708fa60 ("iio: adc: ti-ads1298: Add driver") Signed-off-by: Charles Han Link: https://patch.msgid.link/20241118090208.14586-1-hanchunchao@inspur.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti-ads1298.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/adc/ti-ads1298.c b/drivers/iio/adc/ti-ads1298.c index 36d43495f603..03f762415fa5 100644 --- a/drivers/iio/adc/ti-ads1298.c +++ b/drivers/iio/adc/ti-ads1298.c @@ -613,6 +613,8 @@ static int ads1298_init(struct iio_dev *indio_dev) } indio_dev->name = devm_kasprintf(dev, GFP_KERNEL, "ads129%u%s", indio_dev->num_channels, suffix); + if (!indio_dev->name) + return -ENOMEM; /* Enable internal test signal, double amplitude, double frequency */ ret = regmap_write(priv->regmap, ADS1298_REG_CONFIG2, From 55d82a7ac7e9432d2c92ed485c29aad0aa99281d Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Fri, 29 Nov 2024 15:54:42 +0100 Subject: [PATCH 059/807] dt-bindings: iio: st-sensors: Re-add IIS2MDC magnetometer "iio: st-sensors: Update ST Sensor bindings" accidentally dropped the compatible for the IIS2MDC magnetometer.
Fixes: 0cd71145803d ("iio: st-sensors: Update ST Sensor bindings") Signed-off-by: Sean Nyekjaer Acked-by: Conor Dooley Link: https://patch.msgid.link/20241129-stmagdt-v1-1-963f0347fb0a@geanix.com Signed-off-by: Jonathan Cameron --- Documentation/devicetree/bindings/iio/st,st-sensors.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/iio/st,st-sensors.yaml b/Documentation/devicetree/bindings/iio/st,st-sensors.yaml index 71c1ee33a393..e955eb8e8797 100644 --- a/Documentation/devicetree/bindings/iio/st,st-sensors.yaml +++ b/Documentation/devicetree/bindings/iio/st,st-sensors.yaml @@ -65,6 +65,7 @@ properties: - st,lsm9ds0-gyro - description: STMicroelectronics Magnetometers enum: + - st,iis2mdc - st,lis2mdl - st,lis3mdl-magn - st,lsm303agr-magn From fbeba4364c5619428714625a70cd8444e6b1e4fd Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 28 Nov 2024 10:46:29 +0200 Subject: [PATCH 060/807] iio: test: Fix GTS test config The test config contained a copy-paste error. The IIO GTS helper test was erroneously titled as "Test IIO formatting functions" in the menuconfig. Change the title of the tests to reflect what is tested.
Fixes: cf996f039679 ("iio: test: test gain-time-scale helpers") Signed-off-by: Matti Vaittinen Link: https://patch.msgid.link/Z0gt5R86WdeK73u2@mva-rohm Signed-off-by: Jonathan Cameron --- drivers/iio/test/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/test/Kconfig b/drivers/iio/test/Kconfig index 33cca49c8058..7a181cac3cc9 100644 --- a/drivers/iio/test/Kconfig +++ b/drivers/iio/test/Kconfig @@ -5,7 +5,7 @@ # Keep in alphabetical order config IIO_GTS_KUNIT_TEST - tristate "Test IIO formatting functions" if !KUNIT_ALL_TESTS + tristate "Test IIO gain-time-scale helpers" if !KUNIT_ALL_TESTS depends on KUNIT select IIO_GTS_HELPER select TEST_KUNIT_DEVICE_HELPERS From 333be433ee908a53f283beb95585dfc14c8ffb46 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 25 Nov 2024 22:16:17 +0100 Subject: [PATCH 061/807] iio: dummy: iio_simply_dummy_buffer: fix information leak in triggered buffer The 'data' array is allocated via kmalloc() and it is used to push data to user space from a triggered buffer, but it does not set values for inactive channels, as it only uses iio_for_each_active_channel() to assign new values. Use kzalloc for the memory allocation to avoid pushing uninitialized information to userspace. 
Cc: stable@vger.kernel.org Fixes: 415f79244757 ("iio: Move IIO Dummy Driver out of staging") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241125-iio_memset_scan_holes-v1-9-0cb6e98d895c@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/dummy/iio_simple_dummy_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/dummy/iio_simple_dummy_buffer.c b/drivers/iio/dummy/iio_simple_dummy_buffer.c index 4ca3f1aaff99..288880346707 100644 --- a/drivers/iio/dummy/iio_simple_dummy_buffer.c +++ b/drivers/iio/dummy/iio_simple_dummy_buffer.c @@ -48,7 +48,7 @@ static irqreturn_t iio_simple_dummy_trigger_h(int irq, void *p) int i = 0, j; u16 *data; - data = kmalloc(indio_dev->scan_bytes, GFP_KERNEL); + data = kzalloc(indio_dev->scan_bytes, GFP_KERNEL); if (!data) goto done; From 2a7377ccfd940cd6e9201756aff1e7852c266e69 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 25 Nov 2024 22:16:16 +0100 Subject: [PATCH 062/807] iio: adc: ti-ads8688: fix information leak in triggered buffer The 'buffer' local array is used to push data to user space from a triggered buffer, but it does not set values for inactive channels, as it only uses iio_for_each_active_channel() to assign new values. Initialize the array to zero before using it to avoid pushing uninitialized information to userspace. 
Cc: stable@vger.kernel.org Fixes: 61fa5dfa5f52 ("iio: adc: ti-ads8688: Fix alignment of buffer in iio_push_to_buffers_with_timestamp()") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241125-iio_memset_scan_holes-v1-8-0cb6e98d895c@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti-ads8688.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/adc/ti-ads8688.c b/drivers/iio/adc/ti-ads8688.c index 9b1814f1965a..a31658b760a4 100644 --- a/drivers/iio/adc/ti-ads8688.c +++ b/drivers/iio/adc/ti-ads8688.c @@ -381,7 +381,7 @@ static irqreturn_t ads8688_trigger_handler(int irq, void *p) struct iio_poll_func *pf = p; struct iio_dev *indio_dev = pf->indio_dev; /* Ensure naturally aligned timestamp */ - u16 buffer[ADS8688_MAX_CHANNELS + sizeof(s64)/sizeof(u16)] __aligned(8); + u16 buffer[ADS8688_MAX_CHANNELS + sizeof(s64)/sizeof(u16)] __aligned(8) = { }; int i, j = 0; iio_for_each_active_channel(indio_dev, i) { From b62fbe3b8eedd3cf3c9ad0b7cb9f72c3f40815f0 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 25 Nov 2024 22:16:15 +0100 Subject: [PATCH 063/807] iio: light: bh1745: fix information leak in triggered buffer The 'scan' local struct is used to push data to user space from a triggered buffer, but it does not set values for inactive channels, as it only uses iio_for_each_active_channel() to assign new values. Initialize the struct to zero before using it to avoid pushing uninitialized information to userspace. 
Cc: stable@vger.kernel.org Fixes: eab35358aae7 ("iio: light: ROHM BH1745 colour sensor") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241125-iio_memset_scan_holes-v1-7-0cb6e98d895c@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/light/bh1745.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/light/bh1745.c b/drivers/iio/light/bh1745.c index 56e32689bb97..63bf729df517 100644 --- a/drivers/iio/light/bh1745.c +++ b/drivers/iio/light/bh1745.c @@ -746,6 +746,8 @@ static irqreturn_t bh1745_trigger_handler(int interrupt, void *p) int i; int j = 0; + memset(&scan, 0, sizeof(scan)); + iio_for_each_active_channel(indio_dev, i) { ret = regmap_bulk_read(data->regmap, BH1745_RED_LSB + 2 * i, &value, 2); From 47b43e53c0a0edf5578d5d12f5fc71c019649279 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 25 Nov 2024 22:16:14 +0100 Subject: [PATCH 064/807] iio: light: vcnl4035: fix information leak in triggered buffer The 'buffer' local array is used to push data to userspace from a triggered buffer, but it does not set an initial value for the single data element, which is an u16 aligned to 8 bytes. That leaves at least 4 bytes uninitialized even after writing an integer value with regmap_read(). Initialize the array to zero before using it to avoid pushing uninitialized information to userspace. 
Cc: stable@vger.kernel.org Fixes: ec90b52c07c0 ("iio: light: vcnl4035: Fix buffer alignment in iio_push_to_buffers_with_timestamp()") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241125-iio_memset_scan_holes-v1-6-0cb6e98d895c@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/light/vcnl4035.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/light/vcnl4035.c b/drivers/iio/light/vcnl4035.c index 337a1332c2c6..67c94be02018 100644 --- a/drivers/iio/light/vcnl4035.c +++ b/drivers/iio/light/vcnl4035.c @@ -105,7 +105,7 @@ static irqreturn_t vcnl4035_trigger_consumer_handler(int irq, void *p) struct iio_dev *indio_dev = pf->indio_dev; struct vcnl4035_data *data = iio_priv(indio_dev); /* Ensure naturally aligned timestamp */ - u8 buffer[ALIGN(sizeof(u16), sizeof(s64)) + sizeof(s64)] __aligned(8); + u8 buffer[ALIGN(sizeof(u16), sizeof(s64)) + sizeof(s64)] __aligned(8) = { }; int ret; ret = regmap_read(data->regmap, VCNL4035_ALS_DATA, (int *)buffer); From 6ae053113f6a226a2303caa4936a4c37f3bfff7b Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 25 Nov 2024 22:16:13 +0100 Subject: [PATCH 065/807] iio: imu: kmx61: fix information leak in triggered buffer The 'buffer' local array is used to push data to user space from a triggered buffer, but it does not set values for inactive channels, as it only uses iio_for_each_active_channel() to assign new values. Initialize the array to zero before using it to avoid pushing uninitialized information to userspace. 
Cc: stable@vger.kernel.org Fixes: c3a23ecc0901 ("iio: imu: kmx61: Add support for data ready triggers") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241125-iio_memset_scan_holes-v1-5-0cb6e98d895c@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/imu/kmx61.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/imu/kmx61.c b/drivers/iio/imu/kmx61.c index 324c38764656..e19c5d3137c6 100644 --- a/drivers/iio/imu/kmx61.c +++ b/drivers/iio/imu/kmx61.c @@ -1193,7 +1193,7 @@ static irqreturn_t kmx61_trigger_handler(int irq, void *p) struct kmx61_data *data = kmx61_get_data(indio_dev); int bit, ret, i = 0; u8 base; - s16 buffer[8]; + s16 buffer[8] = { }; if (indio_dev == data->acc_indio_dev) base = KMX61_ACC_XOUT_L; From 38724591364e1e3b278b4053f102b49ea06ee17c Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 25 Nov 2024 22:16:12 +0100 Subject: [PATCH 066/807] iio: adc: rockchip_saradc: fix information leak in triggered buffer The 'data' local struct is used to push data to user space from a triggered buffer, but it does not set values for inactive channels, as it only uses iio_for_each_active_channel() to assign new values. Initialize the struct to zero before using it to avoid pushing uninitialized information to userspace. 
Cc: stable@vger.kernel.org Fixes: 4e130dc7b413 ("iio: adc: rockchip_saradc: Add support iio buffers") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241125-iio_memset_scan_holes-v1-4-0cb6e98d895c@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/rockchip_saradc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/adc/rockchip_saradc.c b/drivers/iio/adc/rockchip_saradc.c index 240cfa391674..dfd47a6e1f4a 100644 --- a/drivers/iio/adc/rockchip_saradc.c +++ b/drivers/iio/adc/rockchip_saradc.c @@ -368,6 +368,8 @@ static irqreturn_t rockchip_saradc_trigger_handler(int irq, void *p) int ret; int i, j = 0; + memset(&data, 0, sizeof(data)); + mutex_lock(&info->lock); iio_for_each_active_channel(i_dev, i) { From 6007d10c5262f6f71479627c1216899ea7f09073 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 25 Nov 2024 22:16:11 +0100 Subject: [PATCH 067/807] iio: pressure: zpa2326: fix information leak in triggered buffer The 'sample' local struct is used to push data to user space from a triggered buffer, but it has a hole between the temperature and the timestamp (u32 pressure, u16 temperature, GAP, u64 timestamp). This hole is never initialized. Initialize the struct to zero before using it to avoid pushing uninitialized information to userspace. 
Cc: stable@vger.kernel.org Fixes: 03b262f2bbf4 ("iio:pressure: initial zpa2326 barometer support") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241125-iio_memset_scan_holes-v1-3-0cb6e98d895c@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/pressure/zpa2326.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/pressure/zpa2326.c b/drivers/iio/pressure/zpa2326.c index 2adea84f5b4d..9db1c94dfc18 100644 --- a/drivers/iio/pressure/zpa2326.c +++ b/drivers/iio/pressure/zpa2326.c @@ -586,6 +586,8 @@ static int zpa2326_fill_sample_buffer(struct iio_dev *indio_dev, } sample; int err; + memset(&sample, 0, sizeof(sample)); + if (test_bit(0, indio_dev->active_scan_mask)) { /* Get current pressure from hardware FIFO. */ err = zpa2326_dequeue_pressure(indio_dev, &sample.pressure); From 75f339d3ecd38cb1ce05357d647189d4a7f7ed08 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 25 Nov 2024 22:16:10 +0100 Subject: [PATCH 068/807] iio: adc: ti-ads1119: fix information leak in triggered buffer The 'scan' local struct is used to push data to user space from a triggered buffer, but it has a hole between the sample (unsigned int) and the timestamp. This hole is never initialized. Initialize the struct to zero before using it to avoid pushing uninitialized information to userspace. 
Cc: stable@vger.kernel.org Fixes: a9306887eba4 ("iio: adc: ti-ads1119: Add driver") Signed-off-by: Javier Carrasco Reviewed-by: Francesco Dolcini Link: https://patch.msgid.link/20241125-iio_memset_scan_holes-v1-2-0cb6e98d895c@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti-ads1119.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/adc/ti-ads1119.c b/drivers/iio/adc/ti-ads1119.c index e9d9d4d46d38..2615a275acb3 100644 --- a/drivers/iio/adc/ti-ads1119.c +++ b/drivers/iio/adc/ti-ads1119.c @@ -506,6 +506,8 @@ static irqreturn_t ads1119_trigger_handler(int irq, void *private) unsigned int index; int ret; + memset(&scan, 0, sizeof(scan)); + if (!iio_trigger_using_own(indio_dev)) { index = find_first_bit(indio_dev->active_scan_mask, iio_get_masklength(indio_dev)); From 2a8e34096ec70d73ebb6d9920688ea312700cbd9 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Fri, 22 Nov 2024 13:43:08 -0300 Subject: [PATCH 069/807] iio: adc: ti-ads124s08: Use gpiod_set_value_cansleep() Using gpiod_set_value() to control the reset GPIO causes some verbose warnings during boot when the reset GPIO is controlled by an I2C IO expander. As the caller can sleep, use the gpiod_set_value_cansleep() variant to fix the issue. Tested on a custom i.MX93 board with a ADS124S08 ADC. 
Cc: stable@kernel.org Fixes: e717f8c6dfec ("iio: adc: Add the TI ads124s08 ADC code") Signed-off-by: Fabio Estevam Link: https://patch.msgid.link/20241122164308.390340-1-festevam@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti-ads124s08.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/ti-ads124s08.c b/drivers/iio/adc/ti-ads124s08.c index 425b48d8986f..f452f57f11c9 100644 --- a/drivers/iio/adc/ti-ads124s08.c +++ b/drivers/iio/adc/ti-ads124s08.c @@ -183,9 +183,9 @@ static int ads124s_reset(struct iio_dev *indio_dev) struct ads124s_private *priv = iio_priv(indio_dev); if (priv->reset_gpio) { - gpiod_set_value(priv->reset_gpio, 0); + gpiod_set_value_cansleep(priv->reset_gpio, 0); udelay(200); - gpiod_set_value(priv->reset_gpio, 1); + gpiod_set_value_cansleep(priv->reset_gpio, 1); } else { return ads124s_write_cmd(indio_dev, ADS124S08_CMD_RESET); } From 36a44e05cd807a54e5ffad4b96d0d67f68ad8576 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Wed, 27 Nov 2024 14:01:53 -0600 Subject: [PATCH 070/807] iio: adc: ad7173: fix using shared static info struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a possible race condition during driver probe in the ad7173 driver due to using a shared static info struct. If more than one instance of the driver is probed at the same time, some of the info could be overwritten by the other instance, leading to incorrect operation. To fix this, make the static info struct const so that it is read-only and make a copy of the info struct for each instance of the driver that can be modified.
Reported-by: Uwe Kleine-König Fixes: 76a1e6a42802 ("iio: adc: ad7173: add AD7173 driver") Signed-off-by: David Lechner Tested-by: Guillaume Ranquet Link: https://patch.msgid.link/20241127-iio-adc-ad7313-fix-non-const-info-struct-v2-1-b6d7022b7466@baylibre.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7173.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iio/adc/ad7173.c b/drivers/iio/adc/ad7173.c index 8a0c931ca83a..8b03c1e5567e 100644 --- a/drivers/iio/adc/ad7173.c +++ b/drivers/iio/adc/ad7173.c @@ -200,6 +200,7 @@ struct ad7173_channel { struct ad7173_state { struct ad_sigma_delta sd; + struct ad_sigma_delta_info sigma_delta_info; const struct ad7173_device_info *info; struct ad7173_channel *channels; struct regulator_bulk_data regulators[3]; @@ -753,7 +754,7 @@ static int ad7173_disable_one(struct ad_sigma_delta *sd, unsigned int chan) return ad_sd_write_reg(sd, AD7173_REG_CH(chan), 2, 0); } -static struct ad_sigma_delta_info ad7173_sigma_delta_info = { +static const struct ad_sigma_delta_info ad7173_sigma_delta_info = { .set_channel = ad7173_set_channel, .append_status = ad7173_append_status, .disable_all = ad7173_disable_all, @@ -1403,7 +1404,7 @@ static int ad7173_fw_parse_device_config(struct iio_dev *indio_dev) if (ret < 0) return dev_err_probe(dev, ret, "Interrupt 'rdy' is required\n"); - ad7173_sigma_delta_info.irq_line = ret; + st->sigma_delta_info.irq_line = ret; return ad7173_fw_parse_channel_config(indio_dev); } @@ -1436,8 +1437,9 @@ static int ad7173_probe(struct spi_device *spi) spi->mode = SPI_MODE_3; spi_setup(spi); - ad7173_sigma_delta_info.num_slots = st->info->num_configs; - ret = ad_sd_init(&st->sd, indio_dev, spi, &ad7173_sigma_delta_info); + st->sigma_delta_info = ad7173_sigma_delta_info; + st->sigma_delta_info.num_slots = st->info->num_configs; + ret = ad_sd_init(&st->sd, indio_dev, spi, &st->sigma_delta_info); if (ret) return ret; From de6a73bad1743e9e81ea5a24c178c67429ff510b Mon Sep 17 
00:00:00 2001 From: Joe Hattori Date: Sat, 7 Dec 2024 13:30:45 +0900 Subject: [PATCH 071/807] iio: adc: at91: call input_free_device() on allocated iio_dev Current implementation of at91_ts_register() calls input_free_device() on st->ts_input, however, the err label can be reached before the allocated iio_dev is stored to st->ts_input. Thus call input_free_device() on input instead of st->ts_input. Fixes: 84882b060301 ("iio: adc: at91_adc: Add support for touchscreens without TSMR") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241207043045.1255409-1-joe@pf.is.s.u-tokyo.ac.jp Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/at91_adc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/adc/at91_adc.c b/drivers/iio/adc/at91_adc.c index a3f0a2321666..5927756b749a 100644 --- a/drivers/iio/adc/at91_adc.c +++ b/drivers/iio/adc/at91_adc.c @@ -979,7 +979,7 @@ static int at91_ts_register(struct iio_dev *idev, return ret; err: - input_free_device(st->ts_input); + input_free_device(input); return ret; } From bbf6b6d53e29b6db4f31eb25b5533a12b9134302 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 6 Dec 2024 17:39:28 +0100 Subject: [PATCH 072/807] iio: adc: ad9467: Fix the "don't allow reading vref if not available" case The commit in Fixes adds a special case when only one possible scale is available. If several scales are available, it sets the .read_avail field of the struct iio_info to ad9467_read_avail(). However, this field already holds this function pointer, so the code is a no-op. Use another struct iio_info instead to actually reflect the intent described in the commit message. This way, the structure to use is selected at runtime and they can be kept as const. This is safer because modifying static structs that are shared between all instances like this, based on the properties of a single instance, is asking for trouble down the road.
Fixes: b92f94f74826 ("iio: adc: ad9467: don't allow reading vref if not available") Signed-off-by: Christophe JAILLET Link: https://patch.msgid.link/cc65da19e0578823d29e11996f86042e84d5715c.1733503146.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad9467.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/iio/adc/ad9467.c b/drivers/iio/adc/ad9467.c index d358958ab310..f30119b42ba0 100644 --- a/drivers/iio/adc/ad9467.c +++ b/drivers/iio/adc/ad9467.c @@ -895,7 +895,7 @@ static int ad9467_update_scan_mode(struct iio_dev *indio_dev, return 0; } -static struct iio_info ad9467_info = { +static const struct iio_info ad9467_info = { .read_raw = ad9467_read_raw, .write_raw = ad9467_write_raw, .update_scan_mode = ad9467_update_scan_mode, @@ -903,6 +903,14 @@ static struct iio_info ad9467_info = { .read_avail = ad9467_read_avail, }; +/* Same as above, but without .read_avail */ +static const struct iio_info ad9467_info_no_read_avail = { + .read_raw = ad9467_read_raw, + .write_raw = ad9467_write_raw, + .update_scan_mode = ad9467_update_scan_mode, + .debugfs_reg_access = ad9467_reg_access, +}; + static int ad9467_scale_fill(struct ad9467_state *st) { const struct ad9467_chip_info *info = st->info; @@ -1214,11 +1222,12 @@ static int ad9467_probe(struct spi_device *spi) } if (st->info->num_scales > 1) - ad9467_info.read_avail = ad9467_read_avail; + indio_dev->info = &ad9467_info; + else + indio_dev->info = &ad9467_info_no_read_avail; indio_dev->name = st->info->name; indio_dev->channels = st->info->channels; indio_dev->num_channels = st->info->num_channels; - indio_dev->info = &ad9467_info; ret = ad9467_iio_backend_get(st); if (ret) From 9d23e48654620fdccfcc74cc2cef04eaf7353d07 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Wed, 23 Oct 2024 20:29:54 +0300 Subject: [PATCH 073/807] phy: rockchip: samsung-hdptx: Set drvdata before enabling runtime PM In some cases, rk_hdptx_phy_runtime_resume() 
may be invoked before platform_set_drvdata() is executed in ->probe(), leading to a NULL pointer dereference when using the return of dev_get_drvdata(). Ensure platform_set_drvdata() is called before devm_pm_runtime_enable(). Reported-by: Dmitry Osipenko Fixes: 553be2830c5f ("phy: rockchip: Add Samsung HDMI/eDP Combo PHY driver") Signed-off-by: Cristian Ciocaltea Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20241023-phy-sam-hdptx-rpm-fix-v1-1-87f4c994e346@collabora.com Signed-off-by: Vinod Koul --- drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c b/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c index ceab9c71d3b5..0965b9d4f9cf 100644 --- a/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c +++ b/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c @@ -1101,6 +1101,8 @@ static int rk_hdptx_phy_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(hdptx->grf), "Could not get GRF syscon\n"); + platform_set_drvdata(pdev, hdptx); + ret = devm_pm_runtime_enable(dev); if (ret) return dev_err_probe(dev, ret, "Failed to enable runtime PM\n"); @@ -1110,7 +1112,6 @@ static int rk_hdptx_phy_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(hdptx->phy), "Failed to create HDMI PHY\n"); - platform_set_drvdata(pdev, hdptx); phy_set_drvdata(hdptx->phy, hdptx); phy_set_bus_width(hdptx->phy, 8); From 64f43895b4457532a3cc524ab250b7a30739a1b1 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Wed, 4 Dec 2024 20:13:42 +0900 Subject: [PATCH 074/807] iio: inkern: call iio_device_put() only on mapped devices In the error path of iio_channel_get_all(), iio_device_put() is called on all IIO devices, which can cause a refcount imbalance. Fix this error by calling iio_device_put() only on IIO devices whose refcounts were previously incremented by iio_device_get(). 
Fixes: 314be14bb893 ("iio: Rename _st_ functions to loose the bit that meant the staging version.") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241204111342.1246706-1-joe@pf.is.s.u-tokyo.ac.jp Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/inkern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c index 136b225b6bc8..9050a59129e6 100644 --- a/drivers/iio/inkern.c +++ b/drivers/iio/inkern.c @@ -500,7 +500,7 @@ struct iio_channel *iio_channel_get_all(struct device *dev) return_ptr(chans); error_free_chans: - for (i = 0; i < nummaps; i++) + for (i = 0; i < mapind; i++) iio_device_put(chans[i].indio_dev); return ERR_PTR(ret); } From 2f43d5200c7330143089bfd1f2440753bac10617 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Wed, 4 Dec 2024 00:55:31 +0100 Subject: [PATCH 075/807] iio: temperature: tmp006: fix information leak in triggered buffer The 'scan' local struct is used to push data to user space from a triggered buffer, but it has a hole between the two 16-bit data channels and the timestamp. This hole is never initialized. Initialize the struct to zero before using it to avoid pushing uninitialized information to userspace. 
Fixes: 91f75ccf9f03 ("iio: temperature: tmp006: add triggered buffer support") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241204-iio_memset_scan_holes-v2-1-3f941592a76d@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/temperature/tmp006.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/temperature/tmp006.c b/drivers/iio/temperature/tmp006.c index 0c844137d7aa..02b27f471baa 100644 --- a/drivers/iio/temperature/tmp006.c +++ b/drivers/iio/temperature/tmp006.c @@ -252,6 +252,8 @@ static irqreturn_t tmp006_trigger_handler(int irq, void *p) } scan; s32 ret; + memset(&scan, 0, sizeof(scan)); + ret = i2c_smbus_read_word_data(data->client, TMP006_VOBJECT); if (ret < 0) goto err; From 54d394905c92b9ecc65c1f9b2692c8e10716d8e1 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 2 Dec 2024 20:18:44 +0100 Subject: [PATCH 076/807] iio: adc: ti-ads1119: fix sample size in scan struct for triggered buffer This device returns signed, 16-bit samples as stated in its datasheet (see 8.5.2 Data Format). That is in line with the scan_type definition for the IIO_VOLTAGE channel, but 'unsigned int' is being used to read and push the data to userspace. Given that the size of that type depends on the architecture (at least 2 bytes to store values up to 65535, but its actual size is often 4 bytes), use the 's16' type to provide the same structure in all cases. 
Fixes: a9306887eba4 ("iio: adc: ti-ads1119: Add driver") Signed-off-by: Javier Carrasco Reviewed-by: Francesco Dolcini Link: https://patch.msgid.link/20241202-ti-ads1119_s16_chan-v1-1-fafe3136dc90@gmail.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti-ads1119.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/adc/ti-ads1119.c b/drivers/iio/adc/ti-ads1119.c index 2615a275acb3..c268e27eec12 100644 --- a/drivers/iio/adc/ti-ads1119.c +++ b/drivers/iio/adc/ti-ads1119.c @@ -500,7 +500,7 @@ static irqreturn_t ads1119_trigger_handler(int irq, void *private) struct iio_dev *indio_dev = pf->indio_dev; struct ads1119_state *st = iio_priv(indio_dev); struct { - unsigned int sample; + s16 sample; s64 timestamp __aligned(8); } scan; unsigned int index; From c70812cb281fd2529051e818ea25eb736b369753 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 28 Oct 2024 15:49:32 -0400 Subject: [PATCH 077/807] arm64: dts: imx8-ss-audio: add fallback compatible string fsl,imx6ull-esai for esai The ESAI of i.MX8QM is the same as i.MX6ULL. So add fsl,imx6ull-esai for esai. 
Signed-off-by: Frank Li Acked-by: Rob Herring (Arm) Fixes: adf7ea48ce05 ("ASoC: dt-bindings: fsl-esai: allow fsl,imx8qm-esai fallback to fsl,imx6ull-esai") Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-audio.dtsi | 2 +- arch/arm64/boot/dts/freescale/imx8qm-ss-audio.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-audio.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-audio.dtsi index a60ebb718789..c32a6947ae9c 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-audio.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-audio.dtsi @@ -165,7 +165,7 @@ audio_subsys: bus@59000000 { }; esai0: esai@59010000 { - compatible = "fsl,imx8qm-esai"; + compatible = "fsl,imx8qm-esai", "fsl,imx6ull-esai"; reg = <0x59010000 0x10000>; interrupts = ; clocks = <&esai0_lpcg IMX_LPCG_CLK_4>, diff --git a/arch/arm64/boot/dts/freescale/imx8qm-ss-audio.dtsi b/arch/arm64/boot/dts/freescale/imx8qm-ss-audio.dtsi index e24e639b98ee..c9b55f02497a 100644 --- a/arch/arm64/boot/dts/freescale/imx8qm-ss-audio.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8qm-ss-audio.dtsi @@ -134,7 +134,7 @@ }; esai1: esai@59810000 { - compatible = "fsl,imx8qm-esai"; + compatible = "fsl,imx8qm-esai", "fsl,imx6ull-esai"; reg = <0x59810000 0x10000>; interrupts = ; clocks = <&esai1_lpcg IMX_LPCG_CLK_0>, From c5b8d2c370842e3f9a15655893d8c597e2d981d9 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 5 Nov 2024 13:46:02 +0800 Subject: [PATCH 078/807] arm64: dts: imx95: correct the address length of netcmix_blk_ctrl The netc_blk_ctrl is controlled by the imx95-blk-ctl clock driver and provides relevant clock configurations for NETC, SAI and MQS. Its address length should be 8 bytes instead of 0x1000. 
Fixes: 7764fef26ea9 ("arm64: dts: imx95: Add NETCMIX block control support") Signed-off-by: Wei Fang Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx95.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi index d10f62eacfe0..e9c7a8265d71 100644 --- a/arch/arm64/boot/dts/freescale/imx95.dtsi +++ b/arch/arm64/boot/dts/freescale/imx95.dtsi @@ -1673,7 +1673,7 @@ netcmix_blk_ctrl: syscon@4c810000 { compatible = "nxp,imx95-netcmix-blk-ctrl", "syscon"; - reg = <0x0 0x4c810000 0x0 0x10000>; + reg = <0x0 0x4c810000 0x0 0x8>; #clock-cells = <1>; clocks = <&scmi_clk IMX95_CLK_BUSNETCMIX>; assigned-clocks = <&scmi_clk IMX95_CLK_BUSNETCMIX>; From 1ddb61a7c0150ba2ab42ec9cb74c392c5c4616b4 Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Fri, 8 Nov 2024 16:34:34 +0100 Subject: [PATCH 079/807] ARM: imx_v6_v7_defconfig: enable SND_SOC_SPDIF Enable SND_SOC_SPDIF in imx_v6_v7_defconfig to support SPDIF audio. With commit d469b771afe1 ("ARM: dts: imx6: update spdif sound card node properties"), the more generic audio-codec property is used instead of the old spdif-controller property. Since most i.MX6 boards now use the audio-codec property together with the linux,spdif-dit and linux,spdif-dir compatible driver, it makes sense to enable SND_SOC_SPDIF in the imx_v6_v7_defconfig. This will ensure compatibility with the updated device tree. 
Without this change, boards that use the audio-codec property will show the following error message during boot when using the imx_v6_v7_defconfig and spdif audio is not working: [ 24.165534] platform sound-spdif: deferred probe pending: fsl-asoc-card: snd_soc_register_card failed Signed-off-by: Stefan Eichenberger Signed-off-by: Shawn Guo --- arch/arm/configs/imx_v6_v7_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index 0beecdde55f5..f25eadcba5e6 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -323,6 +323,7 @@ CONFIG_SND_SOC_IMX_SGTL5000=y CONFIG_SND_SOC_FSL_ASOC_CARD=y CONFIG_SND_SOC_AC97_CODEC=y CONFIG_SND_SOC_CS42XX8_I2C=y +CONFIG_SND_SOC_SPDIF=y CONFIG_SND_SOC_TLV320AIC3X_I2C=y CONFIG_SND_SOC_WM8960=y CONFIG_SND_SOC_WM8962=y From 5f122030061db3e5d2bddd9cf5c583deaa6c54ff Mon Sep 17 00:00:00 2001 From: Jesse Taube Date: Mon, 18 Nov 2024 10:36:41 -0500 Subject: [PATCH 080/807] ARM: dts: imxrt1050: Fix clocks for mmc One of the usdhc1 controller's clocks should be IMXRT1050_CLK_AHB_PODF not IMXRT1050_CLK_OSC. 
Fixes: 1c4f01be3490 ("ARM: dts: imx: Add i.MXRT1050-EVK support") Signed-off-by: Jesse Taube Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imxrt1050.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/nxp/imx/imxrt1050.dtsi b/arch/arm/boot/dts/nxp/imx/imxrt1050.dtsi index dd714d235d5f..b0bad0d1ba36 100644 --- a/arch/arm/boot/dts/nxp/imx/imxrt1050.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imxrt1050.dtsi @@ -87,7 +87,7 @@ reg = <0x402c0000 0x4000>; interrupts = <110>; clocks = <&clks IMXRT1050_CLK_IPG_PDOF>, - <&clks IMXRT1050_CLK_OSC>, + <&clks IMXRT1050_CLK_AHB_PODF>, <&clks IMXRT1050_CLK_USDHC1>; clock-names = "ipg", "ahb", "per"; bus-width = <4>; From cb1b78f1c726c938bd47497c1ab16b01ce967f37 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 10 Sep 2024 00:44:32 +0000 Subject: [PATCH 081/807] tools: hv: Fix a compiler warning in the fcopy uio daemon hv_fcopy_uio_daemon.c:436:53: warning: '%s' directive output may be truncated writing up to 14 bytes into a region of size 10 [-Wformat-truncation=] 436 | snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name); Also added 'static' for the array 'desc[]'.
Fixes: 82b0945ce2c2 ("tools: hv: Add new fcopy application based on uio driver") Cc: stable@vger.kernel.org # 6.10+ Signed-off-by: Dexuan Cui Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20240910004433.50254-1-decui@microsoft.com Signed-off-by: Wei Liu Message-ID: <20240910004433.50254-1-decui@microsoft.com> --- tools/hv/hv_fcopy_uio_daemon.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c index 7a00f3066a98..12743d7f164f 100644 --- a/tools/hv/hv_fcopy_uio_daemon.c +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -35,8 +35,6 @@ #define WIN8_SRV_MINOR 1 #define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) -#define MAX_FOLDER_NAME 15 -#define MAX_PATH_LEN 15 #define FCOPY_UIO "/sys/bus/vmbus/devices/eb765408-105f-49b6-b4aa-c123b64d17d4/uio" #define FCOPY_VER_COUNT 1 @@ -51,7 +49,7 @@ static const int fw_versions[] = { #define HV_RING_SIZE 0x4000 /* 16KB ring buffer size */ -unsigned char desc[HV_RING_SIZE]; +static unsigned char desc[HV_RING_SIZE]; static int target_fd; static char target_fname[PATH_MAX]; @@ -409,8 +407,8 @@ int main(int argc, char *argv[]) struct vmbus_br txbr, rxbr; void *ring; uint32_t len = HV_RING_SIZE; - char uio_name[MAX_FOLDER_NAME] = {0}; - char uio_dev_path[MAX_PATH_LEN] = {0}; + char uio_name[NAME_MAX] = {0}; + char uio_dev_path[PATH_MAX] = {0}; static struct option long_options[] = { {"help", no_argument, 0, 'h' }, From bcc80dec91ee745b3d66f3e48f0ec2efdea97149 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Tue, 17 Sep 2024 11:09:17 +0530 Subject: [PATCH 082/807] x86/hyperv: Fix hv tsc page based sched_clock for hibernation read_hv_sched_clock_tsc() assumes that the Hyper-V clock counter is bigger than the variable hv_sched_clock_offset, which is cached during early boot, but depending on the timing this assumption may be false when a hibernated VM starts again (the clock counter starts from 0 again) and is resuming back (Note: 
hv_init_tsc_clocksource() is not called during hibernation/resume); consequently, read_hv_sched_clock_tsc() may return a negative integer (which is interpreted as a huge positive integer since the return type is u64) and new kernel messages are prefixed with huge timestamps before read_hv_sched_clock_tsc() grows big enough (which typically takes several seconds). Fix the issue by saving the Hyper-V clock counter just before the suspend, and using it to correct the hv_sched_clock_offset in resume. This makes hv tsc page based sched_clock continuous and ensures that post resume, it starts from where it left off during suspend. Override x86_platform.save_sched_clock_state and x86_platform.restore_sched_clock_state routines to correct this as soon as possible. Note: if Invariant TSC is available, the issue doesn't happen because 1) we don't register read_hv_sched_clock_tsc() for sched clock: See commit e5313f1c5404 ("clocksource/drivers/hyper-v: Rework clocksource and sched clock setup"); 2) the common x86 code adjusts TSC similarly: see __restore_processor_state() -> tsc_verify_tsc_adjust(true) and x86_platform.restore_sched_clock_state(). 
Cc: stable@vger.kernel.org Fixes: 1349401ff1aa ("clocksource/drivers/hyper-v: Suspend/resume Hyper-V clocksource for hibernation") Co-developed-by: Dexuan Cui Signed-off-by: Dexuan Cui Signed-off-by: Naman Jain Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20240917053917.76787-1-namjain@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20240917053917.76787-1-namjain@linux.microsoft.com> --- arch/x86/kernel/cpu/mshyperv.c | 58 ++++++++++++++++++++++++++++++ drivers/clocksource/hyperv_timer.c | 14 +++++++- include/clocksource/hyperv_timer.h | 2 ++ 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index d18078834ded..dc12fe5ef3ca 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -223,6 +223,63 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) hyperv_cleanup(); } #endif /* CONFIG_CRASH_DUMP */ + +static u64 hv_ref_counter_at_suspend; +static void (*old_save_sched_clock_state)(void); +static void (*old_restore_sched_clock_state)(void); + +/* + * Hyper-V clock counter resets during hibernation. Save and restore clock + * offset during suspend/resume, while also considering the time passed + * before suspend. This is to make sure that sched_clock using hv tsc page + * based clocksource, proceeds from where it left off during suspend and + * it shows correct time for the timestamps of kernel messages after resume. + */ +static void save_hv_clock_tsc_state(void) +{ + hv_ref_counter_at_suspend = hv_read_reference_counter(); +} + +static void restore_hv_clock_tsc_state(void) +{ + /* + * Adjust the offsets used by hv tsc clocksource to + * account for the time spent before hibernation. + * adjusted value = reference counter (time) at suspend + * - reference counter (time) now. 
+ */ + hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter()); +} + +/* + * Functions to override save_sched_clock_state and restore_sched_clock_state + * functions of x86_platform. The Hyper-V clock counter is reset during + * suspend-resume and the offset used to measure time needs to be + * corrected, post resume. + */ +static void hv_save_sched_clock_state(void) +{ + old_save_sched_clock_state(); + save_hv_clock_tsc_state(); +} + +static void hv_restore_sched_clock_state(void) +{ + restore_hv_clock_tsc_state(); + old_restore_sched_clock_state(); +} + +static void __init x86_setup_ops_for_tsc_pg_clock(void) +{ + if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) + return; + + old_save_sched_clock_state = x86_platform.save_sched_clock_state; + x86_platform.save_sched_clock_state = hv_save_sched_clock_state; + + old_restore_sched_clock_state = x86_platform.restore_sched_clock_state; + x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state; +} #endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) @@ -579,6 +636,7 @@ static void __init ms_hyperv_init_platform(void) /* Register Hyper-V specific clocksource */ hv_init_clocksource(); + x86_setup_ops_for_tsc_pg_clock(); hv_vtl_init_platform(); #endif /* diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 99177835cade..b39dee7b93af 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -27,7 +27,8 @@ #include static struct clock_event_device __percpu *hv_clock_event; -static u64 hv_sched_clock_offset __ro_after_init; +/* Note: offset can hold negative values after hibernation. 
*/ +static u64 hv_sched_clock_offset __read_mostly; /* * If false, we're using the old mechanism for stimer0 interrupts @@ -470,6 +471,17 @@ static void resume_hv_clock_tsc(struct clocksource *arg) hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); } +/* + * Called during resume from hibernation, from overridden + * x86_platform.restore_sched_clock_state routine. This is to adjust offsets + * used to calculate time for hv tsc page based sched_clock, to account for + * time spent before hibernation. + */ +void hv_adj_sched_clock_offset(u64 offset) +{ + hv_sched_clock_offset -= offset; +} + #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK static int hv_cs_enable(struct clocksource *cs) { diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h index 6cdc873ac907..aa5233b1eba9 100644 --- a/include/clocksource/hyperv_timer.h +++ b/include/clocksource/hyperv_timer.h @@ -38,6 +38,8 @@ extern void hv_remap_tsc_clocksource(void); extern unsigned long hv_get_tsc_pfn(void); extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); +extern void hv_adj_sched_clock_offset(u64 offset); + static __always_inline bool hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc, u64 *time) From 91ae69c7ed9e262f24240c425ad1eef2cf6639b7 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Wed, 16 Oct 2024 16:35:10 +0200 Subject: [PATCH 083/807] tools: hv: change permissions of NetworkManager configuration file Align permissions of the resulting .nmconnection file, instead of the input file from hv_kvp_daemon. To avoid the tiny time frame where the output file is world-readable, use umask instead of chmod. 
Fixes: 42999c904612 ("hv/hv_kvp_daemon:Support for keyfile based connection profile") Signed-off-by: Olaf Hering Reviewed-by: Shradha Gupta Link: https://lore.kernel.org/r/20241016143521.3735-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241016143521.3735-1-olaf@aepfle.de> --- tools/hv/hv_set_ifconfig.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hv/hv_set_ifconfig.sh b/tools/hv/hv_set_ifconfig.sh index 440a91b35823..2f8baed2b8f7 100755 --- a/tools/hv/hv_set_ifconfig.sh +++ b/tools/hv/hv_set_ifconfig.sh @@ -81,7 +81,7 @@ echo "ONBOOT=yes" >> $1 cp $1 /etc/sysconfig/network-scripts/ -chmod 600 $2 +umask 0177 interface=$(echo $2 | awk -F - '{ print $2 }') filename="${2##*/}" From 67b5e1042d90d8a9814f22312c1147b4c9cd501a Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Wed, 30 Oct 2024 17:47:36 +0000 Subject: [PATCH 084/807] drivers: hv: Convert open-coded timeouts to secs_to_jiffies() We have several places where timeouts are open-coded as N (seconds) * HZ, but best practice is to use the utility functions from jiffies.h. Convert the timeouts to be compliant. This doesn't fix any bugs, it's a simple code improvement. Signed-off-by: Easwar Hariharan Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20241030-open-coded-timeouts-v3-2-9ba123facf88@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20241030-open-coded-timeouts-v3-2-9ba123facf88@linux.microsoft.com> --- drivers/hv/hv_balloon.c | 9 +++++---- drivers/hv/hv_kvp.c | 4 ++-- drivers/hv/hv_snapshot.c | 3 ++- drivers/hv/vmbus_drv.c | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index c38dcdfcb914..a99112e6f0b8 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -756,7 +756,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, * adding succeeded, it is ok to proceed even if the memory was * not onlined in time. 
*/ - wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ); + wait_for_completion_timeout(&dm_device.ol_waitevent, secs_to_jiffies(5)); post_status(&dm_device); } } @@ -1373,7 +1373,8 @@ static int dm_thread_func(void *dm_dev) struct hv_dynmem_device *dm = dm_dev; while (!kthread_should_stop()) { - wait_for_completion_interruptible_timeout(&dm_device.config_event, 1 * HZ); + wait_for_completion_interruptible_timeout(&dm_device.config_event, + secs_to_jiffies(1)); /* * The host expects us to post information on the memory * pressure every second. @@ -1748,7 +1749,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5 * HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1806,7 +1807,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5 * HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index d35b60c06114..29e01247a087 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -655,7 +655,7 @@ void hv_kvp_onchannelcallback(void *context) if (host_negotiatied == NEGO_NOT_STARTED) { host_negotiatied = NEGO_IN_PROGRESS; schedule_delayed_work(&kvp_host_handshake_work, - HV_UTIL_NEGO_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_NEGO_TIMEOUT)); } return; } @@ -724,7 +724,7 @@ void hv_kvp_onchannelcallback(void *context) */ schedule_work(&kvp_sendkey_work); schedule_delayed_work(&kvp_timeout_work, - HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_TIMEOUT)); return; diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 0d2184be1691..86d87486ed40 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -193,7 +193,8 @@ static void vss_send_op(void) vss_transaction.state = 
HVUTIL_USERSPACE_REQ; schedule_delayed_work(&vss_timeout_work, op == VSS_OP_FREEZE ? - VSS_FREEZE_TIMEOUT * HZ : HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(VSS_FREEZE_TIMEOUT) : + secs_to_jiffies(HV_UTIL_TIMEOUT)); rc = hvutil_transport_send(hvt, vss_msg, sizeof(*vss_msg), NULL); if (rc) { diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 6d89d37b069a..2892b8da20a5 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2507,7 +2507,7 @@ static int vmbus_bus_resume(struct device *dev) vmbus_request_offers(); if (wait_for_completion_timeout( - &vmbus_connection.ready_for_resume_event, 10 * HZ) == 0) + &vmbus_connection.ready_for_resume_event, secs_to_jiffies(10)) == 0) pr_err("Some vmbus device is missing after suspending?\n"); /* Reset the event for the next suspend. */ From a9640fcdd400463442846677e62b8208b81cb031 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 5 Nov 2024 09:14:04 +0100 Subject: [PATCH 085/807] tools/hv: terminate fcopy daemon if read from uio fails Terminate endless loop if reading fails, to avoid flooding syslog. This happens if the state of "Guest services" integration service is changed from "enabled" to "disabled" at runtime in the VM settings. In this case pread returns EIO. Also handle an interrupted system call, and continue in this case.
Signed-off-by: Olaf Hering Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20241105081437.15689-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241105081437.15689-1-olaf@aepfle.de> --- tools/hv/hv_fcopy_uio_daemon.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c index 12743d7f164f..0198321d14a2 100644 --- a/tools/hv/hv_fcopy_uio_daemon.c +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -466,8 +466,10 @@ int main(int argc, char *argv[]) */ ret = pread(fcopy_fd, &tmp, sizeof(int), 0); if (ret < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; syslog(LOG_ERR, "pread failed: %s", strerror(errno)); - continue; + goto close; } len = HV_RING_SIZE; From 96e052d1473843d644ceba2adf46d3d2180b8ca7 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 6 Nov 2024 07:42:46 -0800 Subject: [PATCH 086/807] Drivers: hv: util: Don't force error code to ENODEV in util_probe() If the util_init function call in util_probe() returns an error code, util_probe() always returns ENODEV, and the error code from the util_init function is lost. The error message output in the caller, vmbus_probe(), doesn't show the real error code. Fix this by just returning the error code from the util_init function. There doesn't seem to be a reason to force ENODEV, as other errors such as ENOMEM can already be returned from util_probe(). And the code in call_driver_probe() implies that ENODEV should mean that a matching driver wasn't found, which is not the case here.
Suggested-by: Dexuan Cui Signed-off-by: Michael Kelley Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241106154247.2271-2-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20241106154247.2271-2-mhklinux@outlook.com> --- drivers/hv/hv_util.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index c4f525325790..370722220134 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -590,10 +590,8 @@ static int util_probe(struct hv_device *dev, srv->channel = dev->channel; if (srv->util_init) { ret = srv->util_init(srv); - if (ret) { - ret = -ENODEV; + if (ret) goto error1; - } } /* From 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 6 Nov 2024 07:42:47 -0800 Subject: [PATCH 087/807] Drivers: hv: util: Avoid accessing a ringbuffer not initialized yet If the KVP (or VSS) daemon starts before the VMBus channel's ringbuffer is fully initialized, we can hit the panic below: hv_utils: Registering HyperV Utility Driver hv_vmbus: registering driver hv_utils ... BUG: kernel NULL pointer dereference, address: 0000000000000000 CPU: 44 UID: 0 PID: 2552 Comm: hv_kvp_daemon Tainted: G E 6.11.0-rc3+ #1 RIP: 0010:hv_pkt_iter_first+0x12/0xd0 Call Trace: ... vmbus_recvpacket hv_kvp_onchannelcallback vmbus_on_event tasklet_action_common tasklet_action handle_softirqs irq_exit_rcu sysvec_hyperv_stimer0 asm_sysvec_hyperv_stimer0 ... 
kvp_register_done hvt_op_read vfs_read ksys_read __x64_sys_read This can happen because the KVP/VSS channel callback can be invoked even before the channel is fully opened: 1) as soon as hv_kvp_init() -> hvutil_transport_init() creates /dev/vmbus/hv_kvp, the kvp daemon can open the device file immediately and register itself to the driver by writing a message KVP_OP_REGISTER1 to the file (which is handled by kvp_on_msg() ->kvp_handle_handshake()) and reading the file for the driver's response, which is handled by hvt_op_read(), which calls hvt->on_read(), i.e. kvp_register_done(). 2) the problem with kvp_register_done() is that it can cause the channel callback to be called even before the channel is fully opened, and when the channel callback is starting to run, util_probe()-> vmbus_open() may have not initialized the ringbuffer yet, so the callback can hit the panic of NULL pointer dereference. To reproduce the panic consistently, we can add a "ssleep(10)" for KVP in __vmbus_open(), just before the first hv_ringbuffer_init(), and then we unload and reload the driver hv_utils, and run the daemon manually within the 10 seconds. Fix the panic by reordering the steps in util_probe() so the char dev entry used by the KVP or VSS daemon is not created until after vmbus_open() has completed. This reordering prevents the race condition from happening. 
Reported-by: Dexuan Cui Fixes: e0fa3e5e7df6 ("Drivers: hv: utils: fix a race on userspace daemons registration") Cc: stable@vger.kernel.org Signed-off-by: Michael Kelley Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241106154247.2271-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20241106154247.2271-3-mhklinux@outlook.com> --- drivers/hv/hv_kvp.c | 6 ++++++ drivers/hv/hv_snapshot.c | 6 ++++++ drivers/hv/hv_util.c | 9 +++++++++ drivers/hv/hyperv_vmbus.h | 2 ++ include/linux/hyperv.h | 1 + 5 files changed, 24 insertions(+) diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index 29e01247a087..7400a5a4d2bd 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -767,6 +767,12 @@ hv_kvp_init(struct hv_util_service *srv) */ kvp_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_kvp_init_transport(void) +{ hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL, kvp_on_msg, kvp_on_reset); if (!hvt) diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 86d87486ed40..bde637a96c37 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -389,6 +389,12 @@ hv_vss_init(struct hv_util_service *srv) */ vss_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_vss_init_transport(void) +{ hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL, vss_on_msg, vss_on_reset); if (!hvt) { diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 370722220134..36ee89c0358b 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -141,6 +141,7 @@ static struct hv_util_service util_heartbeat = { static struct hv_util_service util_kvp = { .util_cb = hv_kvp_onchannelcallback, .util_init = hv_kvp_init, + .util_init_transport = hv_kvp_init_transport, .util_pre_suspend = hv_kvp_pre_suspend, .util_pre_resume = hv_kvp_pre_resume, .util_deinit = hv_kvp_deinit, @@ -149,6 +150,7 @@ static struct hv_util_service util_kvp = { static struct hv_util_service util_vss = { .util_cb = 
hv_vss_onchannelcallback, .util_init = hv_vss_init, + .util_init_transport = hv_vss_init_transport, .util_pre_suspend = hv_vss_pre_suspend, .util_pre_resume = hv_vss_pre_resume, .util_deinit = hv_vss_deinit, @@ -611,6 +613,13 @@ static int util_probe(struct hv_device *dev, if (ret) goto error; + if (srv->util_init_transport) { + ret = srv->util_init_transport(); + if (ret) { + vmbus_close(dev->channel); + goto error; + } + } return 0; error: diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index d2856023d53c..52cb744b4d7f 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -370,12 +370,14 @@ void vmbus_on_event(unsigned long data); void vmbus_on_msg_dpc(unsigned long data); int hv_kvp_init(struct hv_util_service *srv); +int hv_kvp_init_transport(void); void hv_kvp_deinit(void); int hv_kvp_pre_suspend(void); int hv_kvp_pre_resume(void); void hv_kvp_onchannelcallback(void *context); int hv_vss_init(struct hv_util_service *srv); +int hv_vss_init_transport(void); void hv_vss_deinit(void); int hv_vss_pre_suspend(void); int hv_vss_pre_resume(void); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 22c22fb91042..02a226bcf0ed 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1559,6 +1559,7 @@ struct hv_util_service { void *channel; void (*util_cb)(void *); int (*util_init)(struct hv_util_service *); + int (*util_init_transport)(void); void (*util_deinit)(void); int (*util_pre_suspend)(void); int (*util_pre_resume)(void); From 07dfa6e821e1c58cbd0f195173dddbd593721f9b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 12 Nov 2024 16:04:01 +0100 Subject: [PATCH 088/807] hv/hv_kvp_daemon: Pass NIC name to hv_get_dns_info as well The reference implementation of hv_get_dns_info which is in the tree uses /etc/resolv.conf to get DNS servers and this does not require to know which NIC is queried. Distro specific implementations, however, may want to provide per-NIC, fine grained information. E.g. 
NetworkManager keeps track of DNS servers per connection. Similar to hv_get_dhcp_info, pass NIC name as a parameter to hv_get_dns_info script. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20241112150401.217094-1-vkuznets@redhat.com Signed-off-by: Wei Liu Message-ID: <20241112150401.217094-1-vkuznets@redhat.com> --- tools/hv/hv_kvp_daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index ae57bf69ad4a..296a7a62c54d 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -725,7 +725,7 @@ static void kvp_get_ipconfig_info(char *if_name, * . */ - sprintf(cmd, KVP_SCRIPTS_PATH "%s", "hv_get_dns_info"); + sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dns_info", if_name); /* * Execute the command to gather DNS info. From a4d024fe2e77063069c5f423f2f9be766450f0f9 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 2 Dec 2024 13:04:10 +0100 Subject: [PATCH 089/807] tools/hv: reduce resouce usage in hv_get_dns_info helper Remove the usage of cat. Replace the shell process with awk with 'exec'. Also use a generic shell because no bash specific features will be used. Signed-off-by: Olaf Hering Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241202120432.21115-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241202120432.21115-1-olaf@aepfle.de> --- tools/hv/hv_get_dns_info.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/hv/hv_get_dns_info.sh b/tools/hv/hv_get_dns_info.sh index 058c17b46ffc..268521234d4b 100755 --- a/tools/hv/hv_get_dns_info.sh +++ b/tools/hv/hv_get_dns_info.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # This example script parses /etc/resolv.conf to retrive DNS information. # In the interest of keeping the KVP daemon code free of distro specific @@ -10,4 +10,4 @@ # this script can be based on the Network Manager APIs for retrieving DNS # entries. 
-cat /etc/resolv.conf 2>/dev/null | awk '/^nameserver/ { print $2 }' +exec awk '/^nameserver/ { print $2 }' /etc/resolv.conf 2>/dev/null From becc7fe329c09a7744fa908fca83418fa94a45a0 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 2 Dec 2024 13:40:52 +0100 Subject: [PATCH 090/807] tools/hv: add a .gitignore file Remove generated files from 'git status' output after 'make -C tools/hv'. Signed-off-by: Olaf Hering Link: https://lore.kernel.org/r/20241202124107.28650-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241202124107.28650-1-olaf@aepfle.de> --- tools/hv/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/hv/.gitignore diff --git a/tools/hv/.gitignore b/tools/hv/.gitignore new file mode 100644 index 000000000000..0c5bc15d602f --- /dev/null +++ b/tools/hv/.gitignore @@ -0,0 +1,3 @@ +hv_fcopy_uio_daemon +hv_kvp_daemon +hv_vss_daemon From 175c71c2aceef173ae6d3dceb41edfc2ac0d5937 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Sun, 8 Dec 2024 23:47:17 +0000 Subject: [PATCH 091/807] tools/hv: reduce resource usage in hv_kvp_daemon hv_kvp_daemon uses popen(3) and system(3) as convenience helpers to launch external helpers. These helpers are invoked via a temporary shell process. There is no need to keep this temporary process around while the helper runs. Replace this temporary shell with the actual helper process via 'exec'. Signed-off-by: Olaf Hering Link: https://lore.kernel.org/linux-hyperv/20241202123520.27812-1-olaf@aepfle.de/ Signed-off-by: Wei Liu --- tools/hv/hv_kvp_daemon.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 296a7a62c54d..04ba035d67e9 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -725,7 +725,7 @@ static void kvp_get_ipconfig_info(char *if_name, * .
*/ - sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dns_info", if_name); + sprintf(cmd, "exec %s %s", KVP_SCRIPTS_PATH "hv_get_dns_info", if_name); /* * Execute the command to gather DNS info. @@ -742,7 +742,7 @@ static void kvp_get_ipconfig_info(char *if_name, * Enabled: DHCP enabled. */ - sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dhcp_info", if_name); + sprintf(cmd, "exec %s %s", KVP_SCRIPTS_PATH "hv_get_dhcp_info", if_name); file = popen(cmd, "r"); if (file == NULL) @@ -1606,8 +1606,9 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) * invoke the external script to do its magic. */ - str_len = snprintf(cmd, sizeof(cmd), KVP_SCRIPTS_PATH "%s %s %s", - "hv_set_ifconfig", if_filename, nm_filename); + str_len = snprintf(cmd, sizeof(cmd), "exec %s %s %s", + KVP_SCRIPTS_PATH "hv_set_ifconfig", + if_filename, nm_filename); /* * This is a little overcautious, but it's necessary to suppress some * false warnings from gcc 8.0.1. From afc6e39e824ad0e44b2af50a97885caec8d213d1 Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Mon, 9 Dec 2024 11:46:15 +0100 Subject: [PATCH 092/807] power: supply: gpio-charger: Fix set charge current limits Fix set charge current limits for devices which allow to set the lowest charge current limit to be greater zero. If requested charge current limit is below lowest limit, the index equals current_limit_map_size which leads to accessing memory beyond allocated memory. 
Fixes: be2919d8355e ("power: supply: gpio-charger: add charge-current-limit feature") Cc: stable@vger.kernel.org Signed-off-by: Dimitri Fedrau Link: https://lore.kernel.org/r/20241209-fix-charge-current-limit-v1-1-760d9b8f2af3@liebherr.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/gpio-charger.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/power/supply/gpio-charger.c b/drivers/power/supply/gpio-charger.c index 68212b39785b..6139f736ecbe 100644 --- a/drivers/power/supply/gpio-charger.c +++ b/drivers/power/supply/gpio-charger.c @@ -67,6 +67,14 @@ static int set_charge_current_limit(struct gpio_charger *gpio_charger, int val) if (gpio_charger->current_limit_map[i].limit_ua <= val) break; } + + /* + * If a valid charge current limit isn't found, default to smallest + * current limitation for safety reasons. + */ + if (i >= gpio_charger->current_limit_map_size) + i = gpio_charger->current_limit_map_size - 1; + mapping = gpio_charger->current_limit_map[i]; for (i = 0; i < ndescs; i++) { From e5f84d1cf562f7b45e28d6e5f6490626f870f81c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 8 Dec 2024 15:59:26 +0100 Subject: [PATCH 093/807] power: supply: cros_charge-control: add mutex for driver data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Concurrent accesses through sysfs may lead to inconsistent state in the priv data. Introduce a mutex to avoid this. 
Fixes: c6ed48ef5259 ("power: supply: add ChromeOS EC based charge control driver") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241208-cros_charge-control-v2-v1-1-8d168d0f08a3@weissschuh.net Signed-off-by: Sebastian Reichel --- drivers/power/supply/cros_charge-control.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/cros_charge-control.c b/drivers/power/supply/cros_charge-control.c index 17c53591ce19..58ca6d9ed613 100644 --- a/drivers/power/supply/cros_charge-control.c +++ b/drivers/power/supply/cros_charge-control.c @@ -7,8 +7,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -49,6 +51,7 @@ struct cros_chctl_priv { struct attribute *attributes[_CROS_CHCTL_ATTR_COUNT]; struct attribute_group group; + struct mutex lock; /* protects fields below and cros_ec */ enum power_supply_charge_behaviour current_behaviour; u8 current_start_threshold, current_end_threshold; }; @@ -85,6 +88,8 @@ static int cros_chctl_configure_ec(struct cros_chctl_priv *priv) { struct ec_params_charge_control req = {}; + lockdep_assert_held(&priv->lock); + req.cmd = EC_CHARGE_CONTROL_CMD_SET; switch (priv->current_behaviour) { @@ -159,6 +164,7 @@ static ssize_t charge_control_start_threshold_show(struct device *dev, struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_START_THRESHOLD); + guard(mutex)(&priv->lock); return sysfs_emit(buf, "%u\n", (unsigned int)priv->current_start_threshold); } @@ -169,6 +175,7 @@ static ssize_t charge_control_start_threshold_store(struct device *dev, struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_START_THRESHOLD); + guard(mutex)(&priv->lock); return cros_chctl_store_threshold(dev, priv, 0, buf, count); } @@ -178,6 +185,7 @@ static ssize_t charge_control_end_threshold_show(struct device *dev, struct devi struct cros_chctl_priv *priv = 
cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_END_THRESHOLD); + guard(mutex)(&priv->lock); return sysfs_emit(buf, "%u\n", (unsigned int)priv->current_end_threshold); } @@ -187,6 +195,7 @@ static ssize_t charge_control_end_threshold_store(struct device *dev, struct dev struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_END_THRESHOLD); + guard(mutex)(&priv->lock); return cros_chctl_store_threshold(dev, priv, 1, buf, count); } @@ -195,6 +204,7 @@ static ssize_t charge_behaviour_show(struct device *dev, struct device_attribute struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_CHARGE_BEHAVIOUR); + guard(mutex)(&priv->lock); return power_supply_charge_behaviour_show(dev, EC_CHARGE_CONTROL_BEHAVIOURS, priv->current_behaviour, buf); } @@ -210,6 +220,7 @@ static ssize_t charge_behaviour_store(struct device *dev, struct device_attribut if (ret < 0) return ret; + guard(mutex)(&priv->lock); priv->current_behaviour = ret; ret = cros_chctl_configure_ec(priv); @@ -290,6 +301,10 @@ static int cros_chctl_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; + ret = devm_mutex_init(dev, &priv->lock); + if (ret) + return ret; + ret = cros_ec_get_cmd_versions(cros_ec, EC_CMD_CHARGE_CONTROL); if (ret < 0) return ret; @@ -327,7 +342,8 @@ static int cros_chctl_probe(struct platform_device *pdev) priv->current_end_threshold = 100; /* Bring EC into well-known state */ - ret = cros_chctl_configure_ec(priv); + scoped_guard(mutex, &priv->lock) + ret = cros_chctl_configure_ec(priv); if (ret < 0) return ret; From e65a1b7fad0e112573eea7d64d4ab4fc513b8695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 8 Dec 2024 15:59:27 +0100 Subject: [PATCH 094/807] power: supply: cros_charge-control: allow start_threshold == end_threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow setting the start and stop thresholds to the same value. 
There is no reason to disallow it. Suggested-by: Thomas Koch Fixes: c6ed48ef5259 ("power: supply: add ChromeOS EC based charge control driver") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241208-cros_charge-control-v2-v1-2-8d168d0f08a3@weissschuh.net Signed-off-by: Sebastian Reichel --- drivers/power/supply/cros_charge-control.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/cros_charge-control.c b/drivers/power/supply/cros_charge-control.c index 58ca6d9ed613..108b121db442 100644 --- a/drivers/power/supply/cros_charge-control.c +++ b/drivers/power/supply/cros_charge-control.c @@ -139,11 +139,11 @@ static ssize_t cros_chctl_store_threshold(struct device *dev, struct cros_chctl_ return -EINVAL; if (is_end_threshold) { - if (val <= priv->current_start_threshold) + if (val < priv->current_start_threshold) return -EINVAL; priv->current_end_threshold = val; } else { - if (val >= priv->current_end_threshold) + if (val > priv->current_end_threshold) return -EINVAL; priv->current_start_threshold = val; } From c28dc9fc24f5fa802d44ef7620a511035bdd803e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 8 Dec 2024 15:59:28 +0100 Subject: [PATCH 095/807] power: supply: cros_charge-control: hide start threshold on v2 cmd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ECs implementing the v2 command will not stop charging when the end threshold is reached. Instead they will begin discharging until the start threshold is reached, leading to permanent charge and discharge cycles. This defeats the point of the charge control mechanism. Avoid the issue by hiding the start threshold on v2 systems. Instead on those systems program the EC with start == end which forces the EC to reach and stay at that level. v1 does not support thresholds and v3 works correctly, at least judging from the code. 
Reported-by: Thomas Koch Fixes: c6ed48ef5259 ("power: supply: add ChromeOS EC based charge control driver") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241208-cros_charge-control-v2-v1-3-8d168d0f08a3@weissschuh.net Signed-off-by: Sebastian Reichel --- drivers/power/supply/cros_charge-control.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/power/supply/cros_charge-control.c b/drivers/power/supply/cros_charge-control.c index 108b121db442..9b0a7500296b 100644 --- a/drivers/power/supply/cros_charge-control.c +++ b/drivers/power/supply/cros_charge-control.c @@ -139,6 +139,10 @@ static ssize_t cros_chctl_store_threshold(struct device *dev, struct cros_chctl_ return -EINVAL; if (is_end_threshold) { + /* Start threshold is not exposed, use fixed value */ + if (priv->cmd_version == 2) + priv->current_start_threshold = val == 100 ? 0 : val; + if (val < priv->current_start_threshold) return -EINVAL; priv->current_end_threshold = val; @@ -234,12 +238,10 @@ static umode_t cros_chtl_attr_is_visible(struct kobject *kobj, struct attribute { struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(attr, n); - if (priv->cmd_version < 2) { - if (n == CROS_CHCTL_ATTR_START_THRESHOLD) - return 0; - if (n == CROS_CHCTL_ATTR_END_THRESHOLD) - return 0; - } + if (n == CROS_CHCTL_ATTR_START_THRESHOLD && priv->cmd_version < 3) + return 0; + else if (n == CROS_CHCTL_ATTR_END_THRESHOLD && priv->cmd_version < 2) + return 0; return attr->mode; } From 919bfa9b2dbf3bc0c478afd4e44445836381dacb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 10 Dec 2024 03:25:57 +0000 Subject: [PATCH 096/807] cpufreq/amd-pstate: Detect preferred core support before driver registration Booting with amd-pstate on 3rd Generation EPYC system incorrectly enabled ITMT support despite the system not supporting Preferred Core ranking. 
amd_pstate_init_prefcore() called during amd_pstate*_cpu_init() requires "amd_pstate_prefcore" to be set correctly; however, the preferred core support is detected only after driver registration which is too late. Swap the function calls around to detect preferred core support before registering the driver via amd_pstate_register_driver(). This ensures amd_pstate*_cpu_init() sees the correct value of "amd_pstate_prefcore" considering the platform support. Fixes: 279f838a61f9 ("x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()") Fixes: ff2653ded4d9 ("cpufreq/amd-pstate: Move registration after static function call update") Signed-off-by: K Prateek Nayak Acked-by: Mario Limonciello Link: https://lore.kernel.org/r/20241210032557.754-1-kprateek.nayak@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index d7630bab2516..8b36450bbdf6 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1869,18 +1869,18 @@ static int __init amd_pstate_init(void) static_call_update(amd_pstate_update_perf, shmem_update_perf); } - ret = amd_pstate_register_driver(cppc_state); - if (ret) { - pr_err("failed to register with return %d\n", ret); - return ret; - } - if (amd_pstate_prefcore) { ret = amd_detect_prefcore(&amd_pstate_prefcore); if (ret) return ret; } + ret = amd_pstate_register_driver(cppc_state); + if (ret) { + pr_err("failed to register with return %d\n", ret); + return ret; + } + dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { ret = sysfs_create_group(&dev_root->kobj, &amd_pstate_global_attr_group); From 8644b48714dca8bf2f42a4ff8311de8efc9bd8c3 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 14 May 2024 10:15:14 +0300 Subject: [PATCH 097/807] thunderbolt: Add support for Intel Panther Lake-M/P Intel Panther Lake-M/P has the same integrated Thunderbolt/USB4
controller as Lunar Lake. Add these PCI IDs to the driver list of supported devices. Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/nhi.c | 8 ++++++++ drivers/thunderbolt/nhi.h | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 1257dd3ce7e6..f3a2264e012b 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -1520,6 +1520,14 @@ static struct pci_device_id nhi_ids[] = { .driver_data = (kernel_ulong_t)&icl_nhi_ops }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_LNL_NHI1), .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_M_NHI0), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_M_NHI1), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_P_NHI0), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_P_NHI1), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_80G_NHI) }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_40G_NHI) }, diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 7a07c7c1a9c2..16744f25a9a0 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -92,6 +92,10 @@ extern const struct tb_nhi_ops icl_nhi_ops; #define PCI_DEVICE_ID_INTEL_RPL_NHI1 0xa76d #define PCI_DEVICE_ID_INTEL_LNL_NHI0 0xa833 #define PCI_DEVICE_ID_INTEL_LNL_NHI1 0xa834 +#define PCI_DEVICE_ID_INTEL_PTL_M_NHI0 0xe333 +#define PCI_DEVICE_ID_INTEL_PTL_M_NHI1 0xe334 +#define PCI_DEVICE_ID_INTEL_PTL_P_NHI0 0xe433 +#define PCI_DEVICE_ID_INTEL_PTL_P_NHI1 0xe434 #define PCI_CLASS_SERIAL_USB_USB4 0x0c0340 From a4048c83fd87c65657a4acb17d639092d4b6133d Mon Sep 17 00:00:00 2001 From: Anumula Murali Mohan Reddy Date: Tue, 3 Dec 2024 19:30:53 +0530 Subject: [PATCH 098/807] RDMA/core: Fix ENODEV error for iWARP 
test over vlan If traffic is over vlan, cma_validate_port() fails to match net_device ifindex with bound_if_index and results in ENODEV error. As iWARP gid table is static, it contains entry corresponding to only one net device which is either real netdev or vlan netdev for cases like siw attached to a vlan interface. This patch fixes the issue by assigning bound_if_index with net device index, if real net device obtained from bound if index matches with net device retrieved from gid table Fixes: f8ef1be816bf ("RDMA/cma: Avoid GID lookups on iWARP devices") Link: https://lore.kernel.org/all/ZzNgdrjo1kSCGbRz@chelsio.com/ Signed-off-by: Anumula Murali Mohan Reddy Signed-off-by: Potnuri Bharat Teja Link: https://patch.msgid.link/20241203140052.3985-1-anumula@chelsio.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 64ace0b968f0..91db10515d74 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -690,6 +690,7 @@ cma_validate_port(struct ib_device *device, u32 port, int bound_if_index = dev_addr->bound_dev_if; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; + struct net_device *pdev = NULL; if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out; @@ -714,6 +715,21 @@ cma_validate_port(struct ib_device *device, u32 port, rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); + if (ndev->ifindex != bound_if_index) { + pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index); + if (pdev) { + if (is_vlan_dev(pdev)) { + pdev = vlan_dev_real_dev(pdev); + if (ndev->ifindex == pdev->ifindex) + bound_if_index = pdev->ifindex; + } + if (is_vlan_dev(ndev)) { + pdev = vlan_dev_real_dev(ndev); + if (bound_if_index == pdev->ifindex) + bound_if_index = ndev->ifindex; + } + } + } if (!net_eq(dev_net(ndev), dev_addr->net) || ndev->ifindex != bound_if_index) 
{ rdma_put_gid_attr(sgid_attr); From efb113fc30e7b805f7375d269b93bb4593d11d97 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Nov 2024 17:23:10 +0100 Subject: [PATCH 099/807] drm: rework FB_CORE dependency The 'select FB_CORE' statement moved from CONFIG_DRM to DRM_CLIENT_LIB, but there are now configurations that have code calling into fb_core as built-in even though the client_lib itself is a loadable module: x86_64-linux-ld: drivers/gpu/drm/drm_fbdev_shmem.o: in function `drm_fbdev_shmem_driver_fbdev_probe': drm_fbdev_shmem.c:(.text+0x1fc): undefined reference to `fb_deferred_io_init' x86_64-linux-ld: drivers/gpu/drm/drm_fbdev_shmem.o: in function `drm_fbdev_shmem_fb_destroy': drm_fbdev_shmem.c:(.text+0x2e1): undefined reference to `fb_deferred_io_cleanup' In addition to DRM_CLIENT_LIB, the 'select' needs to be at least in two more parts, DRM_KMS_HELPER and DRM_GEM_SHMEM_HELPER, so add those here. v3: - Remove FB_CORE from DRM_KMS_HELPER to avoid circular dependency Fixes: dadd28d4142f ("drm/client: Add client-lib module") Signed-off-by: Arnd Bergmann Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20241115162323.3555229-1-arnd@kernel.org --- drivers/gpu/drm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 5504721007cc..a0690049b292 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -372,6 +372,7 @@ config DRM_GEM_DMA_HELPER config DRM_GEM_SHMEM_HELPER tristate depends on DRM && MMU + select FB_CORE if DRM_FBDEV_EMULATION select FB_SYSMEM_HELPERS_DEFERRED if DRM_FBDEV_EMULATION help Choose this if you need the GEM shmem helper functions From c1043cdb019ed4d053d673e62b553a5cea1a287d Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Sat, 7 Dec 2024 21:26:55 -0300 Subject: [PATCH 100/807] alienware-wmi: Fix X Series and G Series quirks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Devices that are known to support the WMI thermal interface do not support the legacy LED control interface. Make `.num_zones = 0` and avoid calling alienware_zone_init() if that's the case. Fixes: 9f6c43041552 ("alienware-wmi: added platform profile support") Fixes: 1c1eb70e7d23 ("alienware-wmi: extends the list of supported models") Suggested-by: Armin Wolf Reviewed-by: Armin Wolf Signed-off-by: Kurt Borja Link: https://lore.kernel.org/r/20241208002652.5885-4-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/dell/alienware-wmi.c b/drivers/platform/x86/dell/alienware-wmi.c index 77465ed9b449..e69bf9a7b6c8 100644 --- a/drivers/platform/x86/dell/alienware-wmi.c +++ b/drivers/platform/x86/dell/alienware-wmi.c @@ -190,7 +190,7 @@ static struct quirk_entry quirk_asm201 = { }; static struct quirk_entry quirk_g_series = { - .num_zones = 2, + .num_zones = 0, .hdmi_mux = 0, .amplifier = 0, .deepslp = 0, @@ -199,7 +199,7 @@ static struct quirk_entry quirk_g_series = { }; static struct quirk_entry quirk_x_series = { - .num_zones = 2, + .num_zones = 0, .hdmi_mux = 0, .amplifier = 0, .deepslp = 0, @@ -687,6 +687,9 @@ static void alienware_zone_exit(struct platform_device *dev) { u8 zone; + if (!quirks->num_zones) + return; + sysfs_remove_group(&dev->dev.kobj, &zone_attribute_group); led_classdev_unregister(&global_led); if (zone_dev_attrs) { @@ -1229,9 +1232,11 @@ static int __init alienware_wmi_init(void) goto fail_prep_thermal_profile; } - ret = alienware_zone_init(platform_device); - if (ret) - goto fail_prep_zones; + if (quirks->num_zones > 0) { + ret = alienware_zone_init(platform_device); + if (ret) + goto fail_prep_zones; + } return 0; From 54a8cada2f3d7efb4a7920807473d89c442d9c45 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Sat, 7 Dec 2024 21:30:15 -0300 Subject: 
[PATCH 101/807] alienware-wmi: Adds support to Alienware m16 R1 AMD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds support to Alienware m16 R1 AMD. Tested-by: Cihan Ozakca Signed-off-by: Kurt Borja Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20241208003013.6490-3-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/dell/alienware-wmi.c b/drivers/platform/x86/dell/alienware-wmi.c index e69bf9a7b6c8..341d01d3e3e4 100644 --- a/drivers/platform/x86/dell/alienware-wmi.c +++ b/drivers/platform/x86/dell/alienware-wmi.c @@ -241,6 +241,15 @@ static const struct dmi_system_id alienware_quirks[] __initconst = { }, .driver_data = &quirk_asm201, }, + { + .callback = dmi_matched, + .ident = "Alienware m16 R1 AMD", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m16 R1 AMD"), + }, + .driver_data = &quirk_x_series, + }, { .callback = dmi_matched, .ident = "Alienware m17 R5", From 9244524d60ddea55f4df54c51200e8fef2032447 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:33 +0900 Subject: [PATCH 102/807] p2sb: Factor out p2sb_read_from_cache() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare for the following fix, factor out the code to read the P2SB resource from the cache to the new function p2sb_read_from_cache(). 
Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-2-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index d51eb0db0626..a685781d1272 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -172,6 +172,22 @@ static int p2sb_cache_resources(void) return ret; } +static int p2sb_read_from_cache(struct pci_bus *bus, unsigned int devfn, + struct resource *mem) +{ + struct p2sb_res_cache *cache = &p2sb_resources[PCI_FUNC(devfn)]; + + if (cache->bus_dev_id != bus->dev.id) + return -ENODEV; + + if (!p2sb_valid_resource(&cache->res)) + return -ENOENT; + + memcpy(mem, &cache->res, sizeof(*mem)); + + return 0; +} + /** * p2sb_bar - Get Primary to Sideband (P2SB) bridge device BAR * @bus: PCI bus to communicate with @@ -188,8 +204,6 @@ static int p2sb_cache_resources(void) */ int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem) { - struct p2sb_res_cache *cache; - bus = p2sb_get_bus(bus); if (!bus) return -ENODEV; @@ -197,15 +211,7 @@ int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem) if (!devfn) p2sb_get_devfn(&devfn); - cache = &p2sb_resources[PCI_FUNC(devfn)]; - if (cache->bus_dev_id != bus->dev.id) - return -ENODEV; - - if (!p2sb_valid_resource(&cache->res)) - return -ENOENT; - - memcpy(mem, &cache->res, sizeof(*mem)); - return 0; + return p2sb_read_from_cache(bus, devfn, mem); } EXPORT_SYMBOL_GPL(p2sb_bar); From ae3e6ebc5ab046d434c05c58a3e3f7e94441fec2 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:34 +0900 Subject: [PATCH 103/807] p2sb: Introduce the global flag p2sb_hidden_by_bios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare for 
the following fix, introduce the global flag p2sb_hidden_by_bios. Check if the BIOS hides the P2SB device and store the result in the flag. This allows to refer to the check result across functions. Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-3-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index a685781d1272..630068e01f7e 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -43,6 +43,7 @@ struct p2sb_res_cache { }; static struct p2sb_res_cache p2sb_resources[NR_P2SB_RES_CACHE]; +static bool p2sb_hidden_by_bios; static void p2sb_get_devfn(unsigned int *devfn) { @@ -158,13 +159,14 @@ static int p2sb_cache_resources(void) * Unhide the P2SB device here, if needed. */ pci_bus_read_config_dword(bus, devfn_p2sb, P2SBC, &value); - if (value & P2SBC_HIDE) + p2sb_hidden_by_bios = value & P2SBC_HIDE; + if (p2sb_hidden_by_bios) pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, 0); ret = p2sb_scan_and_cache(bus, devfn_p2sb); /* Hide the P2SB device, if it was hidden */ - if (value & P2SBC_HIDE) + if (p2sb_hidden_by_bios) pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, P2SBC_HIDE); pci_unlock_rescan_remove(); From 0286070c74ee48391fc07f7f617460479472d221 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:35 +0900 Subject: [PATCH 104/807] p2sb: Move P2SB hide and unhide code to p2sb_scan_and_cache() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare for the following fix, move the code to hide and unhide the P2SB device from p2sb_cache_resources() to p2sb_scan_and_cache(). 
Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-4-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index 630068e01f7e..46c108bbcbba 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -98,6 +98,14 @@ static void p2sb_scan_and_cache_devfn(struct pci_bus *bus, unsigned int devfn) static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) { + /* + * The BIOS prevents the P2SB device from being enumerated by the PCI + * subsystem, so we need to unhide and hide it back to lookup the BAR. + * Unhide the P2SB device here, if needed. + */ + if (p2sb_hidden_by_bios) + pci_bus_write_config_dword(bus, devfn, P2SBC, 0); + /* Scan the P2SB device and cache its BAR0 */ p2sb_scan_and_cache_devfn(bus, devfn); @@ -105,6 +113,10 @@ static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) if (devfn == P2SB_DEVFN_GOLDMONT) p2sb_scan_and_cache_devfn(bus, SPI_DEVFN_GOLDMONT); + /* Hide the P2SB device, if it was hidden */ + if (p2sb_hidden_by_bios) + pci_bus_write_config_dword(bus, devfn, P2SBC, P2SBC_HIDE); + if (!p2sb_valid_resource(&p2sb_resources[PCI_FUNC(devfn)].res)) return -ENOENT; @@ -153,22 +165,11 @@ static int p2sb_cache_resources(void) */ pci_lock_rescan_remove(); - /* - * The BIOS prevents the P2SB device from being enumerated by the PCI - * subsystem, so we need to unhide and hide it back to lookup the BAR. - * Unhide the P2SB device here, if needed. 
- */ pci_bus_read_config_dword(bus, devfn_p2sb, P2SBC, &value); p2sb_hidden_by_bios = value & P2SBC_HIDE; - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, 0); ret = p2sb_scan_and_cache(bus, devfn_p2sb); - /* Hide the P2SB device, if it was hidden */ - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, P2SBC_HIDE); - pci_unlock_rescan_remove(); return ret; From 360c400d0f568636c1b98d1d5f9f49aa3d420c70 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:36 +0900 Subject: [PATCH 105/807] p2sb: Do not scan and remove the P2SB device when it is unhidden MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When drivers access P2SB device resources, it calls p2sb_bar(). Before the commit 5913320eb0b3 ("platform/x86: p2sb: Allow p2sb_bar() calls during PCI device probe"), p2sb_bar() obtained the resources and then called pci_stop_and_remove_bus_device() for clean up. Then the P2SB device disappeared. The commit 5913320eb0b3 introduced the P2SB device resource cache feature in the boot process. During the resource cache, pci_stop_and_remove_bus_device() is called for the P2SB device, then the P2SB device disappears regardless of whether p2sb_bar() is called or not. Such P2SB device disappearance caused a confusion [1]. To avoid the confusion, avoid the pci_stop_and_remove_bus_device() call when the BIOS does not hide the P2SB device. For that purpose, cache the P2SB device resources only if the BIOS hides the P2SB device. Call p2sb_scan_and_cache() only if p2sb_hidden_by_bios is true. This allows removing two branches from p2sb_scan_and_cache(). When p2sb_bar() is called, get the resources from the cache if the P2SB device is hidden. Otherwise, read the resources from the unhidden P2SB device. 
Reported-by: Daniel Walker (danielwa) Closes: https://lore.kernel.org/lkml/ZzTI+biIUTvFT6NC@goliath/ [1] Fixes: 5913320eb0b3 ("platform/x86: p2sb: Allow p2sb_bar() calls during PCI device probe") Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-5-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 42 +++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index 46c108bbcbba..cbbb0f809704 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -101,10 +101,8 @@ static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) /* * The BIOS prevents the P2SB device from being enumerated by the PCI * subsystem, so we need to unhide and hide it back to lookup the BAR. - * Unhide the P2SB device here, if needed. */ - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn, P2SBC, 0); + pci_bus_write_config_dword(bus, devfn, P2SBC, 0); /* Scan the P2SB device and cache its BAR0 */ p2sb_scan_and_cache_devfn(bus, devfn); @@ -113,9 +111,7 @@ static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) if (devfn == P2SB_DEVFN_GOLDMONT) p2sb_scan_and_cache_devfn(bus, SPI_DEVFN_GOLDMONT); - /* Hide the P2SB device, if it was hidden */ - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn, P2SBC, P2SBC_HIDE); + pci_bus_write_config_dword(bus, devfn, P2SBC, P2SBC_HIDE); if (!p2sb_valid_resource(&p2sb_resources[PCI_FUNC(devfn)].res)) return -ENOENT; @@ -142,7 +138,7 @@ static int p2sb_cache_resources(void) u32 value = P2SBC_HIDE; struct pci_bus *bus; u16 class; - int ret; + int ret = 0; /* Get devfn for P2SB device itself */ p2sb_get_devfn(&devfn_p2sb); @@ -168,7 +164,12 @@ static int p2sb_cache_resources(void) pci_bus_read_config_dword(bus, devfn_p2sb, P2SBC, &value); p2sb_hidden_by_bios 
= value & P2SBC_HIDE; - ret = p2sb_scan_and_cache(bus, devfn_p2sb); + /* + * If the BIOS does not hide the P2SB device then its resources + * are accesilble. Cache them only if the P2SB device is hidden. + */ + if (p2sb_hidden_by_bios) + ret = p2sb_scan_and_cache(bus, devfn_p2sb); pci_unlock_rescan_remove(); @@ -191,6 +192,26 @@ static int p2sb_read_from_cache(struct pci_bus *bus, unsigned int devfn, return 0; } +static int p2sb_read_from_dev(struct pci_bus *bus, unsigned int devfn, + struct resource *mem) +{ + struct pci_dev *pdev; + int ret = 0; + + pdev = pci_get_slot(bus, devfn); + if (!pdev) + return -ENODEV; + + if (p2sb_valid_resource(pci_resource_n(pdev, 0))) + p2sb_read_bar0(pdev, mem); + else + ret = -ENOENT; + + pci_dev_put(pdev); + + return ret; +} + /** * p2sb_bar - Get Primary to Sideband (P2SB) bridge device BAR * @bus: PCI bus to communicate with @@ -214,7 +235,10 @@ int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem) if (!devfn) p2sb_get_devfn(&devfn); - return p2sb_read_from_cache(bus, devfn, mem); + if (p2sb_hidden_by_bios) + return p2sb_read_from_cache(bus, devfn, mem); + + return p2sb_read_from_dev(bus, devfn, mem); } EXPORT_SYMBOL_GPL(p2sb_bar); From 41856638e6c4ed51d8aa9e54f70059d1e357b46e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 29 Nov 2024 17:39:27 +0100 Subject: [PATCH 106/807] s390/mm: Fix DirectMap accounting With uncoupling of physical and virtual address spaces population of the identity mapping was changed to use the type POPULATE_IDENTITY instead of POPULATE_DIRECT. This breaks DirectMap accounting: > cat /proc/meminfo DirectMap4k: 55296 kB DirectMap1M: 18446744073709496320 kB Adjust all locations of update_page_count() in vmem.c to use POPULATE_IDENTITY instead of POPULATE_DIRECT as well. 
With this accounting is correct again: > cat /proc/meminfo DirectMap4k: 54264 kB DirectMap1M: 8334336 kB Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") Cc: stable@vger.kernel.org Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/boot/vmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 145035f84a0e..3fa28db2fe59 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -306,7 +306,7 @@ static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long e pages++; } } - if (mode == POPULATE_DIRECT) + if (mode == POPULATE_IDENTITY) update_page_count(PG_DIRECT_MAP_4K, pages); } @@ -339,7 +339,7 @@ static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long e } pgtable_pte_populate(pmd, addr, next, mode); } - if (mode == POPULATE_DIRECT) + if (mode == POPULATE_IDENTITY) update_page_count(PG_DIRECT_MAP_1M, pages); } @@ -372,7 +372,7 @@ static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long e } pgtable_pmd_populate(pud, addr, next, mode); } - if (mode == POPULATE_DIRECT) + if (mode == POPULATE_IDENTITY) update_page_count(PG_DIRECT_MAP_2G, pages); } From a56335c85b592cb2833db0a71f7112b7d9f0d56b Mon Sep 17 00:00:00 2001 From: Prathamesh Shete Date: Mon, 9 Dec 2024 15:40:09 +0530 Subject: [PATCH 107/807] mmc: sdhci-tegra: Remove SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC quirk Value 0 in ADMA length descriptor is interpreted as 65536 on new Tegra chips, remove SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC quirk to make sure max ADMA2 length is 65536. 
Fixes: 4346b7c7941d ("mmc: tegra: Add Tegra186 support") Cc: stable@vger.kernel.org Signed-off-by: Prathamesh Shete Acked-by: Thierry Reding Acked-by: Adrian Hunter Message-ID: <20241209101009.22710-1-pshete@nvidia.com> Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-tegra.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 4d402b601883..b2f5c3f8b839 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -1525,7 +1525,6 @@ static const struct sdhci_pltfm_data sdhci_tegra186_pdata = { .quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | SDHCI_QUIRK_SINGLE_POWER_WRITE | SDHCI_QUIRK_NO_HISPD_BIT | - SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | SDHCI_QUIRK2_ISSUE_CMD_DAT_RESET_TOGETHER, From f3d87abe11ed04d1b23a474a212f0e5deeb50892 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Tue, 3 Dec 2024 11:34:42 +0900 Subject: [PATCH 108/807] mmc: mtk-sd: disable wakeup in .remove() and in the error path of .probe() Current implementation leaves pdev->dev as a wakeup source. Add a device_init_wakeup(&pdev->dev, false) call in the .remove() function and in the error path of the .probe() function. 
Signed-off-by: Joe Hattori Fixes: 527f36f5efa4 ("mmc: mediatek: add support for SDIO eint wakup IRQ") Cc: stable@vger.kernel.org Message-ID: <20241203023442.2434018-1-joe@pf.is.s.u-tokyo.ac.jp> Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index efb0d2d5716b..af445d3f8e2a 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -3070,6 +3070,7 @@ release_clk: msdc_gate_clock(host); platform_set_drvdata(pdev, NULL); release_mem: + device_init_wakeup(&pdev->dev, false); if (host->dma.gpd) dma_free_coherent(&pdev->dev, 2 * sizeof(struct mt_gpdma_desc), @@ -3103,6 +3104,7 @@ static void msdc_drv_remove(struct platform_device *pdev) host->dma.gpd, host->dma.gpd_addr); dma_free_coherent(&pdev->dev, MAX_BD_NUM * sizeof(struct mt_bdma_desc), host->dma.bd, host->dma.bd_addr); + device_init_wakeup(&pdev->dev, false); } static void msdc_save_reg(struct msdc_host *host) From 50a062a7620051c09adacd6d140ebd56881a333b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:34 -0600 Subject: [PATCH 109/807] cpufreq/amd-pstate: Store the boost numerator as highest perf again commit ad4caad58d91d ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()") changed the semantics for highest perf and commit 18d9b52271213 ("cpufreq/amd-pstate: Use nominal perf for limits when boost is disabled") worked around those semantic changes. This however is a confusing result and furthermore makes it awkward to change frequency limits and boost due to the scaling differences. Restore the boost numerator to highest perf again. Suggested-by: Dhananjay Ugwekar Reviewed-by: Gautham R. 
Shenoy Fixes: ad4caad58d91 ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()") Link: https://lore.kernel.org/r/20241209185248.16301-2-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- Documentation/admin-guide/pm/amd-pstate.rst | 4 +--- drivers/cpufreq/amd-pstate.c | 25 ++++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index 210a808b74ec..412423c54f25 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -251,9 +251,7 @@ performance supported in `AMD CPPC Performance Capability `_). In some ASICs, the highest CPPC performance is not the one in the ``_CPC`` table, so we need to expose it to sysfs. If boost is not active, but still supported, this maximum frequency will be larger than the one in -``cpuinfo``. On systems that support preferred core, the driver will have -different values for some cores than others and this will reflect the values -advertised by the platform at bootup. +``cpuinfo``. This attribute is read-only. 
``amd_pstate_lowest_nonlinear_freq`` diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 8b36450bbdf6..ab6fe9c2150c 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -374,15 +374,19 @@ static inline int amd_pstate_cppc_enable(bool enable) static int msr_init_perf(struct amd_cpudata *cpudata) { - u64 cap1; + u64 cap1, numerator; int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &cap1); if (ret) return ret; - WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); - WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1)); + ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); + if (ret) + return ret; + + WRITE_ONCE(cpudata->highest_perf, numerator); + WRITE_ONCE(cpudata->max_limit_perf, numerator); WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); @@ -394,13 +398,18 @@ static int msr_init_perf(struct amd_cpudata *cpudata) static int shmem_init_perf(struct amd_cpudata *cpudata) { struct cppc_perf_caps cppc_perf; + u64 numerator; int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); if (ret) return ret; - WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); - WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf); + ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); + if (ret) + return ret; + + WRITE_ONCE(cpudata->highest_perf, numerator); + WRITE_ONCE(cpudata->max_limit_perf, numerator); WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); WRITE_ONCE(cpudata->lowest_nonlinear_perf, cppc_perf.lowest_nonlinear_perf); @@ -889,7 +898,6 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) { int ret; u32 min_freq, max_freq; - u64 numerator; u32 nominal_perf, nominal_freq; u32 lowest_nonlinear_perf, lowest_nonlinear_freq; u32 boost_ratio, lowest_nonlinear_ratio; @@ -911,10 +919,7 @@ static 
int amd_pstate_init_freq(struct amd_cpudata *cpudata) nominal_perf = READ_ONCE(cpudata->nominal_perf); - ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); - if (ret) - return ret; - boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf); + boost_ratio = div_u64(cpudata->highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); From 2993b29b2a98f2bc9d55dfd37ef39f56a2908748 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:35 -0600 Subject: [PATCH 110/807] cpufreq/amd-pstate: Use boost numerator for upper bound of frequencies commit 18d9b5227121 ("cpufreq/amd-pstate: Use nominal perf for limits when boost is disabled") introduced different semantics for min/max limits based upon whether the user turned off boost from sysfs. This however is not necessary when the highest perf value is the boost numerator. Suggested-by: Dhananjay Ugwekar Reviewed-by: Gautham R. 
Shenoy Fixes: 18d9b5227121 ("cpufreq/amd-pstate: Use nominal perf for limits when boost is disabled") Link: https://lore.kernel.org/r/20241209185248.16301-3-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index ab6fe9c2150c..66e5dfc711c0 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -570,16 +570,13 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) { - u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf; + u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf, max_freq; struct amd_cpudata *cpudata = policy->driver_data; - if (cpudata->boost_supported && !policy->boost_enabled) - max_perf = READ_ONCE(cpudata->nominal_perf); - else - max_perf = READ_ONCE(cpudata->highest_perf); - - max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); - min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); + max_perf = READ_ONCE(cpudata->highest_perf); + max_freq = READ_ONCE(cpudata->max_freq); + max_limit_perf = div_u64(policy->max * max_perf, max_freq); + min_limit_perf = div_u64(policy->min * max_perf, max_freq); lowest_perf = READ_ONCE(cpudata->lowest_perf); if (min_limit_perf < lowest_perf) From 5d009e024056ded20c5bb1583146b833b23bbd5a Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 6 Dec 2024 08:52:30 +0800 Subject: [PATCH 111/807] of: Fix refcount leakage for OF node returned by __of_get_dma_parent() __of_get_dma_parent() returns OF device node @args.np, but the node's refcount is increased twice, by both of_parse_phandle_with_args() and of_node_get(), so causes refcount leakage for the node. Fix by directly returning the node got by of_parse_phandle_with_args(). 
Fixes: f83a6e5dea6c ("of: address: Add support for the parent DMA bus") Cc: stable@vger.kernel.org Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241206-of_core_fix-v1-4-dc28ed56bec3@quicinc.com Signed-off-by: Rob Herring (Arm) --- drivers/of/address.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/address.c b/drivers/of/address.c index 5b7ee3ed5296..c1f1c810e810 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -620,7 +620,7 @@ struct device_node *__of_get_dma_parent(const struct device_node *np) if (ret < 0) return of_get_parent(np); - return of_node_get(args.np); + return args.np; } #endif From fec3edc47d5cfc2dd296a5141df887bf567944db Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 9 Dec 2024 21:24:59 +0800 Subject: [PATCH 112/807] of/irq: Fix interrupt-map cell length check in of_irq_parse_imap_parent() On a malformed interrupt-map property which is shorter than expected by 1 cell, we may read bogus data past the end of the property instead of returning an error in of_irq_parse_imap_parent(). Decrement the remaining length when skipping over the interrupt parent phandle cell. 
Fixes: 935df1bd40d4 ("of/irq: Factor out parsing of interrupt-map parent phandle+args from of_irq_parse_raw()") Cc: stable@vger.kernel.org Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241209-of_irq_fix-v1-1-782f1419c8a1@quicinc.com [rh: reword commit msg] Signed-off-by: Rob Herring (Arm) --- drivers/of/irq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 67fc0ceaa5f5..43cf60479b9e 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -111,6 +111,7 @@ const __be32 *of_irq_parse_imap_parent(const __be32 *imap, int len, struct of_ph else np = of_find_node_by_phandle(be32_to_cpup(imap)); imap++; + len--; /* Check if not found */ if (!np) { From 0f7ca6f69354e0c3923bbc28c92d0ecab4d50a3e Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 9 Dec 2024 21:25:02 +0800 Subject: [PATCH 113/807] of/irq: Fix using uninitialized variable @addr_len in API of_irq_parse_one() of_irq_parse_one() may use uninitialized variable @addr_len as shown below: // @addr_len is uninitialized int addr_len; // This operation does not touch @addr_len if it fails. addr = of_get_property(device, "reg", &addr_len); // Use uninitialized @addr_len if the operation fails. if (addr_len > sizeof(addr_buf)) addr_len = sizeof(addr_buf); // Check the operation result here. if (addr) memcpy(addr_buf, addr, addr_len); Fix by initializing @addr_len before the operation. 
Fixes: b739dffa5d57 ("of/irq: Prevent device address out-of-bounds read in interrupt map walk") Cc: stable@vger.kernel.org Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241209-of_irq_fix-v1-4-782f1419c8a1@quicinc.com Signed-off-by: Rob Herring (Arm) --- drivers/of/irq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 43cf60479b9e..98b1cf78ecac 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -355,6 +355,7 @@ int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_ar return of_irq_parse_oldworld(device, index, out_irq); /* Get the reg property (if any) */ + addr_len = 0; addr = of_get_property(device, "reg", &addr_len); /* Prevent out-of-bounds read in case of longer interrupt parent address size */ From da4d8c83358163df9a4addaeba0ef8bcb03b22e8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 15 Nov 2024 09:00:32 -0800 Subject: [PATCH 114/807] cxl/pci: Fix potential bogus return value upon successful probing If cxl_pci_ras_unmask() returns non-zero, cxl_pci_probe() will end up returning that value, instead of zero. 
Fixes: 248529edc86f ("cxl: add RAS status unmasking for CXL") Reviewed-by: Fan Ni Signed-off-by: Davidlohr Bueso Reviewed-by: Ira Weiny Link: https://patch.msgid.link/20241115170032.108445-1-dave@stgolabs.net Signed-off-by: Dave Jiang --- drivers/cxl/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0241d1d7133a..26ab06c9deff 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1032,8 +1032,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; - rc = cxl_pci_ras_unmask(pdev); - if (rc) + if (cxl_pci_ras_unmask(pdev)) dev_dbg(&pdev->dev, "No RAS reporting unmasked\n"); pci_save_state(pdev); From 09ceba3a93450b652ae6910b6f65be99885f4437 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 29 Nov 2024 21:28:25 +0800 Subject: [PATCH 115/807] cxl/pci: Check dport->regs.rcd_pcie_cap availability before accessing RCD Upstream Port's PCI Express Capability is a component registers block stored in RCD Upstream Port RCRB. CXL PCI driver helps to map it during the RCD probing, but mapping failure is allowed for component registers blocks in CXL PCI driver. dport->regs.rcd_pcie_cap is used to store the virtual address of the RCD Upstream Port's PCI Express Capability, add a dport->regs.rcd_pcie_cap check in rcd_pcie_cap_emit() just in case a user accesses an invalid address via RCD sysfs.
Fixes: c5eaec79fa43 ("cxl/pci: Add sysfs attribute for CXL 1.1 device link status") Signed-off-by: Li Ming Reviewed-by: Alison Schofield Reviewed-by: Dan Williams Link: https://patch.msgid.link/20241129132825.569237-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 26ab06c9deff..6d94ff4a4f1a 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -836,6 +836,9 @@ static ssize_t rcd_pcie_cap_emit(struct device *dev, u16 offset, char *buf, size if (!root_dev) return -ENXIO; + if (!dport->regs.rcd_pcie_cap) + return -ENXIO; + guard(device)(root_dev); if (!root_dev->driver) return -ENXIO; From 76467a94810c2aa4dd3096903291ac6df30c399e Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Mon, 9 Dec 2024 15:33:02 -0800 Subject: [PATCH 116/807] cxl/region: Fix region creation for greater than x2 switches The cxl_port_setup_targets() algorithm fails to identify valid target list ordering in the presence of 4-way and above switches resulting in 'cxl create-region' failures of the form: $ cxl create-region -d decoder0.0 -g 1024 -s 2G -t ram -w 8 -m mem4 mem1 mem6 mem3 mem2 mem5 mem7 mem0 cxl region: create_region: region0: failed to set target7 to mem0 cxl region: cmd_create_region: created 0 regions [kernel debug message] check_last_peer:1213: cxl region0: pci0000:0c:port1: cannot host mem6:decoder7.0 at 2 bus_remove_device:574: bus: 'cxl': remove device region0 QEMU can create this failing topology: ACPI0017:00 [root0] | HB_0 [port1] / \ RP_0 RP_1 | | USP [port2] USP [port3] / / \ \ / / \ \ DSP DSP DSP DSP DSP DSP DSP DSP | | | | | | | | mem4 mem6 mem2 mem7 mem1 mem3 mem5 mem0 Pos: 0 2 4 6 1 3 5 7 HB: Host Bridge RP: Root Port USP: Upstream Port DSP: Downstream Port ...with the following command steps: $ qemu-system-x86_64 -machine q35,cxl=on,accel=tcg \ -smp cpus=8 \ -m 8G \ -hda /home/work/vm-images/centos-stream8-02.qcow2 \ -object 
memory-backend-ram,size=4G,id=m0 \ -object memory-backend-ram,size=4G,id=m1 \ -object memory-backend-ram,size=2G,id=cxl-mem0 \ -object memory-backend-ram,size=2G,id=cxl-mem1 \ -object memory-backend-ram,size=2G,id=cxl-mem2 \ -object memory-backend-ram,size=2G,id=cxl-mem3 \ -object memory-backend-ram,size=2G,id=cxl-mem4 \ -object memory-backend-ram,size=2G,id=cxl-mem5 \ -object memory-backend-ram,size=2G,id=cxl-mem6 \ -object memory-backend-ram,size=2G,id=cxl-mem7 \ -numa node,memdev=m0,cpus=0-3,nodeid=0 \ -numa node,memdev=m1,cpus=4-7,nodeid=1 \ -netdev user,id=net0,hostfwd=tcp::2222-:22 \ -device virtio-net-pci,netdev=net0 \ -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \ -device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \ -device cxl-rp,port=1,bus=cxl.1,id=root_port1,chassis=0,slot=1 \ -device cxl-upstream,bus=root_port0,id=us0 \ -device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \ -device cxl-type3,bus=swport0,volatile-memdev=cxl-mem0,id=cxl-vmem0 \ -device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \ -device cxl-type3,bus=swport1,volatile-memdev=cxl-mem1,id=cxl-vmem1 \ -device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \ -device cxl-type3,bus=swport2,volatile-memdev=cxl-mem2,id=cxl-vmem2 \ -device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \ -device cxl-type3,bus=swport3,volatile-memdev=cxl-mem3,id=cxl-vmem3 \ -device cxl-upstream,bus=root_port1,id=us1 \ -device cxl-downstream,port=4,bus=us1,id=swport4,chassis=0,slot=8 \ -device cxl-type3,bus=swport4,volatile-memdev=cxl-mem4,id=cxl-vmem4 \ -device cxl-downstream,port=5,bus=us1,id=swport5,chassis=0,slot=9 \ -device cxl-type3,bus=swport5,volatile-memdev=cxl-mem5,id=cxl-vmem5 \ -device cxl-downstream,port=6,bus=us1,id=swport6,chassis=0,slot=10 \ -device cxl-type3,bus=swport6,volatile-memdev=cxl-mem6,id=cxl-vmem6 \ -device cxl-downstream,port=7,bus=us1,id=swport7,chassis=0,slot=11 \ -device 
cxl-type3,bus=swport7,volatile-memdev=cxl-mem7,id=cxl-vmem7 \ -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=32G & In Guest OS: $ cxl create-region -d decoder0.0 -g 1024 -s 2G -t ram -w 8 -m mem4 mem1 mem6 mem3 mem2 mem5 mem7 mem0 Fix the method to calculate @distance by iteratively multiplying the number of targets per switch port. This also follows the algorithm recommended here [1]. Fixes: 27b3f8d13830 ("cxl/region: Program target lists") Link: http://lore.kernel.org/6538824b52349_7258329466@dwillia2-xfh.jf.intel.com.notmuch [1] Signed-off-by: Huaisheng Ye Tested-by: Li Zhijian [djbw: add a comment explaining 'distance'] Signed-off-by: Dan Williams Link: https://patch.msgid.link/173378716722.1270362.9546805175813426729.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index d77899650798..b98b1ccffd1c 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1295,6 +1295,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region_params *p = &cxlr->params; struct cxl_decoder *cxld = cxl_rr->decoder; struct cxl_switch_decoder *cxlsd; + struct cxl_port *iter = port; u16 eig, peig; u8 eiw, peiw; @@ -1311,16 +1312,26 @@ static int cxl_port_setup_targets(struct cxl_port *port, cxlsd = to_cxl_switch_decoder(&cxld->dev); if (cxl_rr->nr_targets_set) { - int i, distance; + int i, distance = 1; + struct cxl_region_ref *cxl_rr_iter; /* - * Passthrough decoders impose no distance requirements between - * peers + * The "distance" between peer downstream ports represents which + * endpoint positions in the region interleave a given port can + * host. + * + * For example, at the root of a hierarchy the distance is + * always 1 as every index targets a different host-bridge.
At + * each subsequent switch level those ports map every Nth region + * position where N is the width of the switch == distance. */ - if (cxl_rr->nr_targets == 1) - distance = 0; - else - distance = p->nr_targets / cxl_rr->nr_targets; + do { + cxl_rr_iter = cxl_rr_load(iter, cxlr); + distance *= cxl_rr_iter->nr_targets; + iter = to_cxl_port(iter->dev.parent); + } while (!is_cxl_root(iter)); + distance *= cxlrd->cxlsd.cxld.interleave_ways; + for (i = 0; i < cxl_rr->nr_targets_set; i++) if (ep->dport == cxlsd->target[i]) { rc = check_last_peer(cxled, ep, cxl_rr, From 2872e21c47c359b902e53faf7e749c8ea682f7f7 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Wed, 4 Dec 2024 16:22:47 +0100 Subject: [PATCH 117/807] MAINTAINERS: align Danilo's maintainer entries Some entries use my kernel.org address, while others use my Red Hat one. Since this is a bit of an inconvenience for me, align them to all use the same (kernel.org) address. Acked-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20241204152248.8644-1-dakr@kernel.org --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b1..c669c5bd61e7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7345,7 +7345,7 @@ F: drivers/gpu/drm/panel/panel-novatek-nt36672a.c DRM DRIVER FOR NVIDIA GEFORCE/QUADRO GPUS M: Karol Herbst M: Lyude Paul -M: Danilo Krummrich +M: Danilo Krummrich L: dri-devel@lists.freedesktop.org L: nouveau@lists.freedesktop.org S: Supported @@ -8922,7 +8922,7 @@ F: include/linux/arm_ffa.h FIRMWARE LOADER (request_firmware) M: Luis Chamberlain M: Russ Weight -M: Danilo Krummrich +M: Danilo Krummrich L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/firmware_class/ From b8f614207b0d5e4abd6df8d5cb3cc11f009d1d93 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 9 Dec 2024 09:29:24 -0600 Subject: [PATCH 118/807] scx: Fix maximal BPF selftest prog maximal.bpf.c is still
dispatching to and consuming from SCX_DSQ_GLOBAL. Let's have it use its own DSQ to avoid any runtime errors. Signed-off-by: David Vernet Tested-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/maximal.bpf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c index 4c005fa71810..430f5e13bf55 100644 --- a/tools/testing/selftests/sched_ext/maximal.bpf.c +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -12,6 +12,8 @@ char _license[] SEC("license") = "GPL"; +#define DSQ_ID 0 + s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { @@ -20,7 +22,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) { - scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) @@ -28,7 +30,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) { - scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); + scx_bpf_dsq_move_to_local(DSQ_ID); } void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) @@ -123,7 +125,7 @@ void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) { - return 0; + return scx_bpf_create_dsq(DSQ_ID, -1); } void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) From 2d2f25405a87cfa270ea7b5bb03a612c1a16020a Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 9 Dec 2024 18:45:01 +0100 Subject: [PATCH 119/807] MAINTAINERS: add self as reviewer for sched_ext Add myself as a reviewer for sched_ext, as I am actively working on this project and would like to help 
review relevant patches and address any related kernel issues. Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b1..41bae8792a77 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20902,6 +20902,7 @@ F: kernel/sched/ SCHEDULER - SCHED_EXT R: Tejun Heo R: David Vernet +R: Andrea Righi L: linux-kernel@vger.kernel.org S: Maintained W: https://github.com/sched-ext/scx From eb1dd15fb26d9ad85204f444ef03f29f9049eb1e Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Wed, 4 Dec 2024 13:04:41 +0200 Subject: [PATCH 120/807] cgroup/cpuset: Remove stale text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Task's cpuset pointer was removed by commit 8793d854edbc ("Task Control Groups: make cpusets a client of cgroups") Paragraph "The task_lock() exception ...." was removed by commit 2df167a300d7 ("cgroups: update comments in cpuset.c") Remove stale text: We also require taking task_lock() when dereferencing a task's cpuset pointer. See "The task_lock() exception", at the end of this comment. Accessing a task's cpuset should be done in accordance with the guidelines for accessing subsystem state in kernel/cgroup.c and reformat. Co-developed-by: Michal Koutný Co-developed-by: Waiman Long Signed-off-by: Costa Shulyupin Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f321ed515f3a..9e2abd6a38a5 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -197,10 +197,8 @@ static struct cpuset top_cpuset = { /* * There are two global locks guarding cpuset structures - cpuset_mutex and - * callback_lock. We also require taking task_lock() when dereferencing a - * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. 
The cpuset code uses only cpuset_mutex. Other kernel subsystems - * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset + * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel + * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset * structures. Note that cpuset_mutex needs to be a mutex as it is used in * paths that rely on priority inheritance (e.g. scheduler - on RT) for * correctness. @@ -229,9 +227,6 @@ static struct cpuset top_cpuset = { * The cpuset_common_seq_show() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. - * - * Accessing a task's cpuset should be done in accordance with the - * guidelines for accessing subsystem state in kernel/cgroup.c */ static DEFINE_MUTEX(cpuset_mutex); From e34f1717ef0632fcec5cb827e5e0e9f223d70c9b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 10:25:51 -0600 Subject: [PATCH 121/807] thunderbolt: Don't display nvm_version unless upgrade supported The read will never succeed if NVM wasn't initialized due to an unknown format. Add a new callback for visibility to only show when supported. 
Cc: stable@vger.kernel.org Fixes: aef9c693e7e5 ("thunderbolt: Move vendor specific NVM handling into nvm.c") Reported-by: Richard Hughes Closes: https://github.com/fwupd/fwupd/issues/8200 Signed-off-by: Mario Limonciello Signed-off-by: Mika Westerberg --- drivers/thunderbolt/retimer.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/thunderbolt/retimer.c b/drivers/thunderbolt/retimer.c index 89d2919d0193..eeb64433ebbc 100644 --- a/drivers/thunderbolt/retimer.c +++ b/drivers/thunderbolt/retimer.c @@ -103,6 +103,7 @@ static int tb_retimer_nvm_add(struct tb_retimer *rt) err_nvm: dev_dbg(&rt->dev, "NVM upgrade disabled\n"); + rt->no_nvm_upgrade = true; if (!IS_ERR(nvm)) tb_nvm_free(nvm); @@ -182,8 +183,6 @@ static ssize_t nvm_authenticate_show(struct device *dev, if (!rt->nvm) ret = -EAGAIN; - else if (rt->no_nvm_upgrade) - ret = -EOPNOTSUPP; else ret = sysfs_emit(buf, "%#x\n", rt->auth_status); @@ -323,8 +322,6 @@ static ssize_t nvm_version_show(struct device *dev, if (!rt->nvm) ret = -EAGAIN; - else if (rt->no_nvm_upgrade) - ret = -EOPNOTSUPP; else ret = sysfs_emit(buf, "%x.%x\n", rt->nvm->major, rt->nvm->minor); @@ -342,6 +339,19 @@ static ssize_t vendor_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(vendor); +static umode_t retimer_is_visible(struct kobject *kobj, struct attribute *attr, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct tb_retimer *rt = tb_to_retimer(dev); + + if (attr == &dev_attr_nvm_authenticate.attr || + attr == &dev_attr_nvm_version.attr) + return rt->no_nvm_upgrade ? 
0 : attr->mode; + + return attr->mode; +} + static struct attribute *retimer_attrs[] = { &dev_attr_device.attr, &dev_attr_nvm_authenticate.attr, @@ -351,6 +361,7 @@ static struct attribute *retimer_attrs[] = { }; static const struct attribute_group retimer_group = { + .is_visible = retimer_is_visible, .attrs = retimer_attrs, }; From fdad4fb7c506bea8b419f70ff2163d99962e8ede Mon Sep 17 00:00:00 2001 From: Daniel Swanemar Date: Mon, 4 Nov 2024 14:42:17 +0100 Subject: [PATCH 122/807] USB: serial: option: add TCL IK512 MBIM & ECM Add the following TCL IK512 compositions: 0x0530: Modem + Diag + AT + MBIM T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 3 Spd=10000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=1bbb ProdID=0530 Rev=05.04 S: Manufacturer=TCL S: Product=TCL 5G USB Dongle S: SerialNumber=3136b91a C: #Ifs= 5 Cfg#= 1 Atr=80 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=86(I) Atr=03(Int.) 
MxPS= 64 Ivl=32ms I: If#= 4 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms 0x0640: ECM + Modem + Diag + AT T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 4 Spd=10000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=1bbb ProdID=0640 Rev=05.04 S: Manufacturer=TCL S: Product=TCL 5G USB Dongle S: SerialNumber=3136b91a C: #Ifs= 5 Cfg#= 1 Atr=80 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=06 Prot=00 Driver=cdc_ether E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=03(Int.) 
MxPS= 10 Ivl=32ms Signed-off-by: Daniel Swanemar Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 9ba5584061c8..437960002bc3 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2385,6 +2385,10 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, + { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0530, 0xff), /* TCL IK512 MBIM */ + .driver_info = NCTRL(1) }, + { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0640, 0xff), /* TCL IK512 ECM */ + .driver_info = NCTRL(3) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, option_ids); From 724d461e44dfc0815624d2a9792f2f2beb7ee46d Mon Sep 17 00:00:00 2001 From: Michal Hrusecky Date: Tue, 19 Nov 2024 14:00:18 +0100 Subject: [PATCH 123/807] USB: serial: option: add MeiG Smart SLM770A Update the USB serial option driver to support MeiG Smart SLM770A. ID 2dee:4d57 Marvell Mobile Composite Device Bus T: Bus=02 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=2dee ProdID=4d57 Rev= 1.00 S: Manufacturer=Marvell S: Product=Mobile Composite Device Bus C:* #Ifs= 6 Cfg#= 1 Atr=c0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=e0(wlcon) Sub=01 Prot=03 I:* If#= 0 Alt= 0 #EPs= 1 Cls=e0(wlcon) Sub=01 Prot=03 Driver=rndis_host E: Ad=87(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0c(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=00 Prot=00 Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0b(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=88(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0a(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=89(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0e(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Tested successfully connecting to the Internet via rndis interface after dialing via AT commands on If#=3 or If#=4. Not sure of the purpose of the other serial interfaces. Signed-off-by: Michal Hrusecky Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 437960002bc3..a807101548e7 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -625,6 +625,8 @@ static void option_instat_callback(struct urb *urb); #define MEIGSMART_PRODUCT_SRM825L 0x4d22 /* MeiG Smart SLM320 based on UNISOC UIS8910 */ #define MEIGSMART_PRODUCT_SLM320 0x4d41 +/* MeiG Smart SLM770A based on ASR1803 */ +#define MEIGSMART_PRODUCT_SLM770A 0x4d57 /* Device flags */ @@ -2382,6 +2384,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM320, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM770A, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, 
MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, From aa954ae08262bb5cd6ab18dd56a0b58c1315db8b Mon Sep 17 00:00:00 2001 From: Mank Wang Date: Fri, 22 Nov 2024 09:06:00 +0000 Subject: [PATCH 124/807] USB: serial: option: add Netprisma LCUK54 modules for WWAN Ready LCUK54-WRD's pid/vid 0x3731/0x010a 0x3731/0x010c LCUK54-WWD's pid/vid 0x3731/0x010b 0x3731/0x010d Above products use the exact same interface layout and option driver: MBIM + GNSS + DIAG + NMEA + AT + QDSS + DPL T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=02 Dev#= 5 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=3731 ProdID=0101 Rev= 5.04 S: Manufacturer=NetPrisma S: Product=LCUK54-WRD S: SerialNumber=feeba631 C:* #Ifs= 8 Cfg#= 1 Atr=a0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=87(I) Atr=03(Int.) 
MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8f(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Mank Wang [ johan: use lower case hex notation ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index a807101548e7..e897c723b041 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2377,6 +2377,18 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for Golbal EDU */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0x00, 0x40) }, + { 
USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(OPPO_VENDOR_ID, OPPO_PRODUCT_R11, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x40) }, From f07dfa6a1b65034a5c3ba3a555950d972f252757 Mon Sep 17 00:00:00 2001 From: Jack Wu Date: Thu, 28 Nov 2024 10:22:27 +0800 Subject: [PATCH 125/807] USB: serial: option: add MediaTek T7XX compositions Add the MediaTek T7XX compositions: T: Bus=03 Lev=01 Prnt=01 Port=05 Cnt=01 Dev#= 74 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0e8d ProdID=7129 Rev= 0.01 S: Manufacturer=MediaTek Inc. S: Product=USB DATA CARD S: SerialNumber=004402459035402 C:* #Ifs=10 Cfg#= 1 Atr=a0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=00 Prot=00 Driver=option E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 7 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 8 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=08(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 9 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=8a(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=09(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms ------------------------------- | If Number | Function | ------------------------------- | 2 | USB AP Log Port | ------------------------------- | 3 | USB AP GNSS Port| ------------------------------- | 4 | USB AP META Port| ------------------------------- | 5 | ADB port | ------------------------------- | 6 | USB MD AT Port | ------------------------------ | 7 | USB MD META Port| ------------------------------- | 8 | USB NTZ Port | ------------------------------- | 9 | USB Debug port | ------------------------------- Signed-off-by: Jack Wu Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index e897c723b041..dcedb88ad7c1 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2249,6 +2249,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(2) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x7127, 0xff, 0x00, 0x00), .driver_info = NCTRL(2) | NCTRL(3) | NCTRL(4) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x7129, 0xff, 0x00, 0x00), /* MediaTek T7XX */ + .driver_info = NCTRL(2) | NCTRL(3) | NCTRL(4) }, { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MEN200) }, { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MPL200), .driver_info = RSVD(1) | RSVD(4) }, From 
8366e64a4454481339e7c56a8ad280161f2e441d Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Mon, 9 Dec 2024 16:32:54 +0100 Subject: [PATCH 126/807] USB: serial: option: add Telit FE910C04 rmnet compositions Add the following Telit FE910C04 compositions: 0x10c0: rmnet + tty (AT/NMEA) + tty (AT) + tty (diag) T: Bus=02 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 13 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10c0 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FE910 S: SerialNumber=f71b8b32 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10c4: rmnet + tty (AT) + tty (AT) + tty (diag) T: Bus=02 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 14 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10c4 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FE910 S: SerialNumber=f71b8b32 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) 
Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10c8: rmnet + tty (AT) + tty (diag) + DPL (data packet logging) + adb T: Bus=02 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 17 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10c8 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FE910 S: SerialNumber=f71b8b32 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=42 Prot=01 Driver=(none) E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Daniele Palmas Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index dcedb88ad7c1..64317b390d22 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1397,6 +1397,12 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10aa, 0xff), /* Telit FN920C04 (MBIM) */ .driver_info = NCTRL(3) | RSVD(4) | RSVD(5) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c0, 0xff), /* Telit FE910C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c4, 0xff), /* Telit FE910C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c8, 0xff), /* Telit FE910C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), From b44679c63e4d3ac820998b6bd59fba89a72ad3e7 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 9 Dec 2024 19:42:39 +0800 Subject: [PATCH 127/807] iomap: pass byte granular end position to iomap_add_to_ioend This is a preparatory patch for fixing zero padding issues in concurrent append write scenarios. In the following patches, we need to obtain byte-granular writeback end position for io_size trimming after EOF handling. Due to concurrent writeback and truncate operations, inode size may shrink. Resampling inode size would force writeback code to handle the newly appeared post-EOF blocks, which is undesirable. 
As Dave explained in [1]: "Really, the issue is that writeback mappings have to be able to handle the range being mapped suddenly appear to be beyond EOF. This behaviour is a longstanding writeback constraint, and is what iomap_writepage_handle_eof() is attempting to handle. We handle this by only sampling i_size_read() whilst we have the folio locked and can determine the action we should take with that folio (i.e. nothing, partial zeroing, or skip altogether). Once we've made the decision that the folio is within EOF and taken action on it (i.e. moved the folio to writeback state), we cannot then resample the inode size because a truncate may have started and changed the inode size." To avoid resampling inode size after EOF handling, we convert end_pos to byte-granular writeback position and return it from EOF handling function. Since iomap_set_range_dirty() can handle unaligned lengths, this conversion has no impact on it. However, iomap_find_dirty_range() requires aligned start and end range to find dirty blocks within the given range, so the end position needs to be rounded up when passed to it. 
LINK [1]: https://lore.kernel.org/linux-xfs/Z1Gg0pAa54MoeYME@localhost.localdomain/ Signed-off-by: Long Li Link: https://lore.kernel.org/r/20241209114241.3725722-2-leo.lilong@huawei.com Reviewed-by: Brian Foster Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 955f19e27e47..bcc7831d03af 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1774,7 +1774,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) */ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, loff_t pos, unsigned len) + struct inode *inode, loff_t pos, loff_t end_pos, + unsigned len) { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); @@ -1800,8 +1801,8 @@ new_ioend: static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, u64 pos, unsigned dirty_len, - unsigned *count) + struct inode *inode, u64 pos, u64 end_pos, + unsigned dirty_len, unsigned *count) { int error; @@ -1826,7 +1827,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, break; default: error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, - map_len); + end_pos, map_len); if (!error) (*count)++; break; @@ -1897,11 +1898,11 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, * remaining memory is zeroed when mapped, and writes to that * region are not written out to the file. * - * Also adjust the writeback range to skip all blocks entirely - * beyond i_size. + * Also adjust the end_pos to the end of file and skip writeback + * for all blocks entirely beyond i_size. 
*/ folio_zero_segment(folio, poff, folio_size(folio)); - *end_pos = round_up(isize, i_blocksize(inode)); + *end_pos = isize; } return true; @@ -1914,6 +1915,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct inode *inode = folio->mapping->host; u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); + u64 end_aligned = 0; unsigned count = 0; int error = 0; u32 rlen; @@ -1955,9 +1957,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, /* * Walk through the folio to find dirty areas to write back. */ - while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { + end_aligned = round_up(end_pos, i_blocksize(inode)); + while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, - pos, rlen, &count); + pos, end_pos, rlen, &count); if (error) break; pos += rlen; From 51d20d1dacbec589d459e11fc88fbca419f84a99 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 9 Dec 2024 19:42:40 +0800 Subject: [PATCH 128/807] iomap: fix zero padding data issue in concurrent append writes During concurrent append writes to XFS filesystem, zero padding data may appear in the file after power failure. This happens due to imprecise disk size updates when handling write completion. 
Consider this scenario with concurrent append writes to the same file: Thread 1: Thread 2: ------------ ----------- write [A, A+B] update inode size to A+B submit I/O [A, A+BS] write [A+B, A+B+C] update inode size to A+B+C After reboot: 1) with A+B+C < A+BS, the file has zero padding in range [A+B, A+B+C] |< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000| ^ ^ ^ A A+B A+B+C (EOF) 2) with A+B+C > A+BS, the file has zero padding in range [A+B, A+BS] |< Block Size (BS) >|< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000|00000000000000000000000000000000| ^ ^ ^ ^ A A+B A+BS A+B+C (EOF) D = Valid Data 0 = Zero Padding The issue stems from disk size being set to min(io_offset + io_size, inode->i_size) at I/O completion. Since io_offset+io_size has block size granularity, it may exceed the actual valid file data size. In the case of concurrent append writes, inode->i_size may be larger than the actual range of valid file data written to disk, leading to inaccurate disk size updates. This patch modifies the meaning of io_size to represent the size of valid data within EOF in an ioend. If the ioend spans beyond i_size, io_size will be trimmed to provide the file with more accurate size information. This is particularly useful for on-disk size updates at completion time. After this change, ioends that span i_size will not grow or merge with other ioends in concurrent scenarios. However, these cases that need growth/merging rarely occur and there seems to be no noticeable performance impact. Although rounding up io_size could enable ioend growth/merging in these scenarios, we decided to keep the code simple after discussion [1]. Another benefit is that it makes the xfs_ioend_is_append() check more accurate, which can reduce unnecessary end bio callbacks of xfs_end_bio() in certain scenarios, such as repeated writes at the file tail without extending the file size.
Link [1]: https://patchwork.kernel.org/project/xfs/patch/20241113091907.56937-1-leo.lilong@huawei.com Fixes: ae259a9c8593 ("fs: introduce iomap infrastructure") # goes further back than this Signed-off-by: Long Li Link: https://lore.kernel.org/r/20241209114241.3725722-3-leo.lilong@huawei.com Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 45 ++++++++++++++++++++++++++++++++++++++++++ include/linux/iomap.h | 2 +- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index bcc7831d03af..54dc27d92781 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1794,7 +1794,52 @@ new_ioend: if (ifs) atomic_add(len, &ifs->write_bytes_pending); + + /* + * Clamp io_offset and io_size to the incore EOF so that ondisk + * file size updates in the ioend completion are byte-accurate. + * This avoids recovering files with zeroed tail regions when + * writeback races with appending writes: + * + * Thread 1: Thread 2: + * ------------ ----------- + * write [A, A+B] + * update inode size to A+B + * submit I/O [A, A+BS] + * write [A+B, A+B+C] + * update inode size to A+B+C + * + * + * + * After reboot: + * 1) with A+B+C < A+BS, the file has zero padding in range + * [A+B, A+B+C] + * + * |< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000| + * ^ ^ ^ + * A A+B A+B+C + * (EOF) + * + * 2) with A+B+C > A+BS, the file has zero padding in range + * [A+B, A+BS] + * + * |< Block Size (BS) >|< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| + * ^ ^ ^ ^ + * A A+B A+BS A+B+C + * (EOF) + * + * D = Valid Data + * 0 = Zero Padding + * + * Note that this defeats the ability to chain the ioends of + * appending writes. 
+ */ wpc->ioend->io_size += len; + if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) + wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; + wbc_account_cgroup_owner(wbc, folio, len); return 0; } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5675af6b740c..75bf54e76f3b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -335,7 +335,7 @@ struct iomap_ioend { u16 io_type; u16 io_flags; /* IOMAP_F_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of the extent */ + size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ From 220326c4650a0ef7db3bfcae903f758555ecb973 Mon Sep 17 00:00:00 2001 From: Huy Minh Date: Tue, 10 Dec 2024 22:45:00 +0700 Subject: [PATCH 129/807] platform/x86: touchscreen_dmi: Add info for SARY Tab 3 tablet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no info about the OEM behind the tablet, only online stores listing. This tablet uses an Intel Atom x5-Z8300, 4GB of RAM & 64GB of storage. 
Signed-off-by: Huy Minh Link: https://lore.kernel.org/r/20241210154500.32124-1-buingoc67@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/touchscreen_dmi.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index 0a39f68c641d..bdc19cd8d3ed 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -855,6 +855,23 @@ static const struct ts_dmi_data rwc_nanote_next_data = { .properties = rwc_nanote_next_props, }; +static const struct property_entry sary_tab_3_props[] = { + PROPERTY_ENTRY_U32("touchscreen-size-x", 1730), + PROPERTY_ENTRY_U32("touchscreen-size-y", 1151), + PROPERTY_ENTRY_BOOL("touchscreen-inverted-x"), + PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), + PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), + PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-sary-tab-3.fw"), + PROPERTY_ENTRY_U32("silead,max-fingers", 10), + PROPERTY_ENTRY_BOOL("silead,home-button"), + { } +}; + +static const struct ts_dmi_data sary_tab_3_data = { + .acpi_name = "MSSL1680:00", + .properties = sary_tab_3_props, +}; + static const struct property_entry schneider_sct101ctm_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1715), PROPERTY_ENTRY_U32("touchscreen-size-y", 1140), @@ -1615,6 +1632,15 @@ const struct dmi_system_id touchscreen_dmi_table[] = { DMI_MATCH(DMI_BIOS_VERSION, "S8A70R100-V005"), }, }, + { + /* SARY Tab 3 */ + .driver_data = (void *)&sary_tab_3_data, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "SARY"), + DMI_MATCH(DMI_PRODUCT_NAME, "C210C"), + DMI_MATCH(DMI_PRODUCT_SKU, "TAB3"), + }, + }, { /* Schneider SCT101CTM */ .driver_data = (void *)&schneider_sct101ctm_data, From 6c0a473fc5f89dabbed0af605a09370b533aa856 Mon Sep 17 00:00:00 2001 From: Jithu Joseph Date: Tue, 10 Dec 2024 12:31:52 -0800 Subject: [PATCH 130/807] platform/x86/intel/ifs: Add Clearwater Forest to CPU support 
list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Clearwater Forest (INTEL_ATOM_DARKMONT_X) to the x86 match table of Intel In Field Scan (IFS) driver, enabling IFS functionality on this processor. Signed-off-by: Jithu Joseph Link: https://lore.kernel.org/r/20241210203152.1136463-1-jithu.joseph@intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/ifs/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/intel/ifs/core.c b/drivers/platform/x86/intel/ifs/core.c index bc252b883210..1ae50702bdb7 100644 --- a/drivers/platform/x86/intel/ifs/core.c +++ b/drivers/platform/x86/intel/ifs/core.c @@ -20,6 +20,7 @@ static const struct x86_cpu_id ifs_cpu_ids[] __initconst = { X86_MATCH(INTEL_GRANITERAPIDS_X, ARRAY_GEN0), X86_MATCH(INTEL_GRANITERAPIDS_D, ARRAY_GEN0), X86_MATCH(INTEL_ATOM_CRESTMONT_X, ARRAY_GEN1), + X86_MATCH(INTEL_ATOM_DARKMONT_X, ARRAY_GEN1), {} }; MODULE_DEVICE_TABLE(x86cpu, ifs_cpu_ids); From 83848e37f6ee80f60b04139fefdfa1bde4aaa826 Mon Sep 17 00:00:00 2001 From: Xi Pardee Date: Tue, 10 Dec 2024 13:26:41 -0800 Subject: [PATCH 131/807] platform/x86/intel/vsec: Add support for Panther Lake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Panther Lake PMT telemetry support. 
Signed-off-by: Xi Pardee Link: https://lore.kernel.org/r/20241210212646.239211-1-xi.pardee@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 9e0f8e38178c..e54b6a2a1681 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -423,6 +423,7 @@ static const struct intel_vsec_platform_info lnl_info = { #define PCI_DEVICE_ID_INTEL_VSEC_RPL 0xa77d #define PCI_DEVICE_ID_INTEL_VSEC_TGL 0x9a0d #define PCI_DEVICE_ID_INTEL_VSEC_LNL_M 0x647d +#define PCI_DEVICE_ID_INTEL_VSEC_PTL 0xb07d static const struct pci_device_id intel_vsec_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, VSEC_ADL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_DG1, &dg1_info) }, @@ -432,6 +433,7 @@ static const struct pci_device_id intel_vsec_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, VSEC_RPL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_TGL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_LNL_M, &lnl_info) }, + { PCI_DEVICE_DATA(INTEL, VSEC_PTL, &mtl_info) }, { } }; MODULE_DEVICE_TABLE(pci, intel_vsec_pci_ids); From 48ca421268735c60ea0d4c2e19610b224d5c8656 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Wed, 11 Dec 2024 17:39:45 +0900 Subject: [PATCH 132/807] MAINTAINERS: add me as reviewer for sched_ext Add me as a reviewer for sched_ext. I have been actively working on the project and would like to help review patches and address related kernel issues/features. 
Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 41bae8792a77..cbfa4e97a3ff 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20903,6 +20903,7 @@ SCHEDULER - SCHED_EXT R: Tejun Heo R: David Vernet R: Andrea Righi +R: Changwoo Min L: linux-kernel@vger.kernel.org S: Maintained W: https://github.com/sched-ext/scx From 9b496a8bbed9cc292b0dfd796f38ec58b6d0375f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 5 Dec 2024 14:51:01 -0500 Subject: [PATCH 133/807] cgroup/cpuset: Prevent leakage of isolated CPUs into sched domains Isolated CPUs are not allowed to be used in a non-isolated partition. The only exception is the top cpuset which is allowed to contain boot time isolated CPUs. Commit ccac8e8de99c ("cgroup/cpuset: Fix remote root partition creation problem") introduces a simplified scheme of including only partition roots in sched domain generation. However, it does not properly account for this exception case. This can result in leakage of isolated CPUs into a sched domain. Fix it by making sure that isolated CPUs are excluded from the top cpuset before generating sched domains. Also update the way the boot time isolated CPUs are handled in test_cpuset_prs.sh to make sure that those isolated CPUs are really isolated instead of just skipping them in the tests. 
Fixes: ccac8e8de99c ("cgroup/cpuset: Fix remote root partition creation problem") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 10 +++++- .../selftests/cgroup/test_cpuset_prs.sh | 33 +++++++++++-------- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 9e2abd6a38a5..7ea559fb0cbf 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -885,7 +885,15 @@ v2: */ if (cgrpv2) { for (i = 0; i < ndoms; i++) { - cpumask_copy(doms[i], csa[i]->effective_cpus); + /* + * The top cpuset may contain some boot time isolated + * CPUs that need to be excluded from the sched domain. + */ + if (csa[i] == &top_cpuset) + cpumask_and(doms[i], csa[i]->effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + else + cpumask_copy(doms[i], csa[i]->effective_cpus); if (dattr) dattr[i] = SD_ATTR_INIT; } diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 03c1bdaed2c3..400a696a0d21 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -86,15 +86,15 @@ echo "" > test/cpuset.cpus # # If isolated CPUs have been reserved at boot time (as shown in -# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-7 +# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-8 # that will be used by this script for testing purpose. If not, some of -# the tests may fail incorrectly. These isolated CPUs will also be removed -# before being compared with the expected results. +# the tests may fail incorrectly. These pre-isolated CPUs should stay in +# an isolated state throughout the testing process for now. 
# BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated) if [[ -n "$BOOT_ISOLCPUS" ]] then - [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 7 ]] && + [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 8 ]] && skip_test "Pre-isolated CPUs ($BOOT_ISOLCPUS) overlap CPUs to be tested" echo "Pre-isolated CPUs: $BOOT_ISOLCPUS" fi @@ -683,15 +683,19 @@ check_isolcpus() EXPECT_VAL2=$EXPECT_VAL fi + # + # Appending pre-isolated CPUs + # Even though CPU #8 isn't used for testing, it can't be pre-isolated + # to make appending those CPUs easier. + # + [[ -n "$BOOT_ISOLCPUS" ]] && { + EXPECT_VAL=${EXPECT_VAL:+${EXPECT_VAL},}${BOOT_ISOLCPUS} + EXPECT_VAL2=${EXPECT_VAL2:+${EXPECT_VAL2},}${BOOT_ISOLCPUS} + } + # # Check cpuset.cpus.isolated cpumask # - if [[ -z "$BOOT_ISOLCPUS" ]] - then - ISOLCPUS=$(cat $ISCPUS) - else - ISOLCPUS=$(cat $ISCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") - fi [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { # Take a 50ms pause and try again pause 0.05 @@ -731,8 +735,6 @@ check_isolcpus() fi done [[ "$ISOLCPUS" = *- ]] && ISOLCPUS=${ISOLCPUS}$LASTISOLCPU - [[ -n "BOOT_ISOLCPUS" ]] && - ISOLCPUS=$(echo $ISOLCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") [[ "$EXPECT_VAL" = "$ISOLCPUS" ]] } @@ -836,8 +838,11 @@ run_state_test() # if available [[ -n "$ICPUS" ]] && { check_isolcpus $ICPUS - [[ $? -ne 0 ]] && test_fail $I "isolated CPU" \ - "Expect $ICPUS, get $ISOLCPUS instead" + [[ $? -ne 0 ]] && { + [[ -n "$BOOT_ISOLCPUS" ]] && ICPUS=${ICPUS},${BOOT_ISOLCPUS} + test_fail $I "isolated CPU" \ + "Expect $ICPUS, get $ISOLCPUS instead" + } } reset_cgroup_states # From 2dd59fe0e19e1ab955259978082b62e5751924c7 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Fri, 17 May 2024 08:58:00 -0700 Subject: [PATCH 134/807] media: dvb-frontends: dib3000mb: fix uninit-value in dib3000_write_reg Syzbot reports [1] an uninitialized value issue found by KMSAN in dib3000_read_reg(). 
Local u8 rb[2] is used in i2c_transfer() as a read buffer; in case that call fails, the buffer may end up with some undefined values. Since no elaborate error handling is expected in dib3000_write_reg(), simply zero out rb buffer to mitigate the problem. [1] Syzkaller report dvb-usb: bulk message failed: -22 (6/0) ===================================================== BUG: KMSAN: uninit-value in dib3000mb_attach+0x2d8/0x3c0 drivers/media/dvb-frontends/dib3000mb.c:758 dib3000mb_attach+0x2d8/0x3c0 drivers/media/dvb-frontends/dib3000mb.c:758 dibusb_dib3000mb_frontend_attach+0x155/0x2f0 drivers/media/usb/dvb-usb/dibusb-mb.c:31 dvb_usb_adapter_frontend_init+0xed/0x9a0 drivers/media/usb/dvb-usb/dvb-usb-dvb.c:290 dvb_usb_adapter_init drivers/media/usb/dvb-usb/dvb-usb-init.c:90 [inline] dvb_usb_init drivers/media/usb/dvb-usb/dvb-usb-init.c:186 [inline] dvb_usb_device_init+0x25a8/0x3760 drivers/media/usb/dvb-usb/dvb-usb-init.c:310 dibusb_probe+0x46/0x250 drivers/media/usb/dvb-usb/dibusb-mb.c:110 ... Local variable rb created at: dib3000_read_reg+0x86/0x4e0 drivers/media/dvb-frontends/dib3000mb.c:54 dib3000mb_attach+0x123/0x3c0 drivers/media/dvb-frontends/dib3000mb.c:758 ... 
Fixes: 74340b0a8bc6 ("V4L/DVB (4457): Remove dib3000-common-module") Reported-by: syzbot+c88fc0ebe0d5935c70da@syzkaller.appspotmail.com Signed-off-by: Nikita Zhandarovich Link: https://lore.kernel.org/r/20240517155800.9881-1-n.zhandarovich@fintech.ru Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb-frontends/dib3000mb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/dvb-frontends/dib3000mb.c b/drivers/media/dvb-frontends/dib3000mb.c index 822639f11c04..63bc7b74bc8b 100644 --- a/drivers/media/dvb-frontends/dib3000mb.c +++ b/drivers/media/dvb-frontends/dib3000mb.c @@ -51,7 +51,7 @@ MODULE_PARM_DESC(debug, "set debugging level (1=info,2=xfer,4=setfe,8=getfe (|-a static int dib3000_read_reg(struct dib3000_state *state, u16 reg) { u8 wb[] = { ((reg >> 8) | 0x80) & 0xff, reg & 0xff }; - u8 rb[2]; + u8 rb[2] = {}; struct i2c_msg msg[] = { { .addr = state->config.demod_address, .flags = 0, .buf = wb, .len = 2 }, { .addr = state->config.demod_address, .flags = I2C_M_RD, .buf = rb, .len = 2 }, From 5fa49dd8e521a42379e5e41fcf2c92edaaec0a8b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 9 Dec 2024 17:43:48 +0100 Subject: [PATCH 135/807] s390/ipl: Fix never less than zero warning DEFINE_IPL_ATTR_STR_RW() macro produces "unsigned 'len' is never less than zero." warning when sys_vmcmd_on_*_store() callbacks are defined. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412081614.5uel8F6W-lkp@intel.com/ Fixes: 247576bf624a ("s390/ipl: Do not accept z/VM CP diag X'008' cmds longer than max length") Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/kernel/ipl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index edbb52ce3f1e..7d12a1305fc9 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -270,7 +270,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ if (len >= sizeof(_value)) \ return -E2BIG; \ len = strscpy(_value, buf, sizeof(_value)); \ - if (len < 0) \ + if ((ssize_t)len < 0) \ return len; \ strim(_value); \ return len; \ From 18b2093f4598d8ee67a8153badc93f0fa7686b8a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2024 11:01:51 -1000 Subject: [PATCH 136/807] sched_ext: Fix invalid irq restore in scx_ops_bypass() While adding outer irqsave/restore locking, 0e7ffff1b811 ("scx: Fix raciness in scx_ops_bypass()") forgot to convert an inner rq_unlock_irqrestore() to rq_unlock() which could re-enable IRQ prematurely leading to the following warning: raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 1 PID: 96 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x30/0x40 ... Sched_ext: create_dsq (enabling) pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : warn_bogus_irq_restore+0x30/0x40 lr : warn_bogus_irq_restore+0x30/0x40 ... Call trace: warn_bogus_irq_restore+0x30/0x40 (P) warn_bogus_irq_restore+0x30/0x40 (L) scx_ops_bypass+0x224/0x3b8 scx_ops_enable.isra.0+0x2c8/0xaa8 bpf_scx_reg+0x18/0x30 ... irq event stamp: 33739 hardirqs last enabled at (33739): [] scx_ops_bypass+0x174/0x3b8 hardirqs last disabled at (33738): [] _raw_spin_lock_irqsave+0xb4/0xd8 Drop the stray _irqrestore(). 
Signed-off-by: Tejun Heo Reported-by: Ihor Solodrai Link: http://lkml.kernel.org/r/qC39k3UsonrBYD_SmuxHnZIQLsuuccoCrkiqb_BT7DvH945A1_LZwE4g-5Pu9FcCtqZt4lY1HhIPi0homRuNWxkgo1rgP3bkxa0donw8kV4=@pm.me Fixes: 0e7ffff1b811 ("scx: Fix raciness in scx_ops_bypass()") Cc: stable@vger.kernel.org # v6.12 --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7fff1d045477..98519e6d0dcd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4763,7 +4763,7 @@ static void scx_ops_bypass(bool bypass) * sees scx_rq_bypassing() before moving tasks to SCX. */ if (!scx_enabled()) { - rq_unlock_irqrestore(rq, &rf); + rq_unlock(rq, &rf); continue; } From ebefac5647968679f6ef5803e5d35a71997d20fa Mon Sep 17 00:00:00 2001 From: Robert Beckett Date: Tue, 12 Nov 2024 19:50:00 +0000 Subject: [PATCH 137/807] nvme-pci: 512 byte aligned dma pool segment quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We initially introduced a quick fix limiting the queue depth to 1 as experimentation showed that it fixed data corruption on 64GB steamdecks. Further experimentation revealed corruption only happens when the last PRP data element aligns to the end of the page boundary. The device appears to treat this as a PRP chain to a new list instead of the data element that it actually is. This implementation is in violation of the spec. Encountering this errata with the Linux driver requires the host request a 128k transfer and coincidentally be handed the last small pool dma buffer within a page. The QD1 quirk effectively works around this because the last data PRP always was at a 248 byte offset from the page start, so it never appeared at the end of the page, but comes at the expense of throttling IO and wasting the remainder of the PRP page beyond 256 bytes.
Also to note, the MDTS on these devices is small enough that the "large" prp pool can hold enough PRP elements to never reach the end, so that pool is not a problem either. Introduce a new quirk to ensure the small pool is always aligned such that the last PRP element can't appear at the end of the page. This comes at the expense of wasting 256 bytes per small pool page allocated. Link: https://lore.kernel.org/linux-nvme/20241113043151.GA20077@lst.de/T/#u Fixes: 83bdfcbdbe5d ("nvme-pci: qdepth 1 quirk") Cc: Paweł Anikiel Signed-off-by: Robert Beckett Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 5 +++++ drivers/nvme/host/pci.c | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 611b02c8a8b3..c4bb8dfe1a45 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -173,6 +173,11 @@ enum nvme_quirks { * MSI (but not MSI-X) interrupts are broken and never fire. */ NVME_QUIRK_BROKEN_MSI = (1 << 21), + + /* + * Align dma pool segment size to 512 bytes + */ + NVME_QUIRK_DMAPOOL_ALIGN_512 = (1 << 22), }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1a5ba80f1811..e2634f437f33 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2834,15 +2834,20 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_setup_prp_pools(struct nvme_dev *dev) { + size_t small_align = 256; + dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0); if (!dev->prp_page_pool) return -ENOMEM; + if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) + small_align = 512; + /* Optimisation for I/Os between 4k and 128k */ dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, - 256, 256, 0); + 256, small_align, 0); if (!dev->prp_small_pool) { dma_pool_destroy(dev->prp_page_pool); return -ENOMEM; @@ -3607,7 +3612,7 @@ static const struct pci_device_id
nvme_id_table[] = { { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */ - .driver_data = NVME_QUIRK_QDEPTH_ONE }, + .driver_data = NVME_QUIRK_DMAPOOL_ALIGN_512, }, { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_BOGUS_NID, }, From 9cb189a882738c1d28b349d4e7c6a1ef9b3d8f87 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 4 Dec 2024 17:26:19 +0100 Subject: [PATCH 138/807] udmabuf: fix racy memfd sealing check The current check_memfd_seals() is racy: Since we first do check_memfd_seals() and then udmabuf_pin_folios() without holding any relevant lock across both, F_SEAL_WRITE can be set in between. This is problematic because we can end up holding pins to pages in a write-sealed memfd. Fix it using the inode lock, that's probably the easiest way. In the future, we might want to consider moving this logic into memfd, especially if anyone else wants to use memfd_pin_folios(). 
Reported-by: Julian Orth Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219106 Closes: https://lore.kernel.org/r/CAG48ez0w8HrFEZtJkfmkVKFDhE5aP7nz=obrimeTgpD+StkV9w@mail.gmail.com Fixes: fbb0de795078 ("Add udmabuf misc device") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Acked-by: Joel Fernandes (Google) Acked-by: Vivek Kasireddy Signed-off-by: Vivek Kasireddy Link: https://patchwork.freedesktop.org/patch/msgid/20241204-udmabuf-fixes-v2-1-23887289de1c@google.com --- drivers/dma-buf/udmabuf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 8ce1f074c2d3..c1d8c2766d6d 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -436,14 +436,19 @@ static long udmabuf_create(struct miscdevice *device, goto err; } + /* + * Take the inode lock to protect against concurrent + * memfd_add_seals(), which takes this lock in write mode. + */ + inode_lock_shared(file_inode(memfd)); ret = check_memfd_seals(memfd); - if (ret < 0) { - fput(memfd); - goto err; - } + if (ret) + goto out_unlock; ret = udmabuf_pin_folios(ubuf, memfd, list[i].offset, list[i].size, folios); +out_unlock: + inode_unlock_shared(file_inode(memfd)); fput(memfd); if (ret) goto err; From 0a16e24e34f28210f68195259456c73462518597 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 4 Dec 2024 17:26:20 +0100 Subject: [PATCH 139/807] udmabuf: also check for F_SEAL_FUTURE_WRITE When F_SEAL_FUTURE_WRITE was introduced, it was overlooked that udmabuf must reject memfds with this flag, just like ones with F_SEAL_WRITE. Fix it by adding F_SEAL_FUTURE_WRITE to SEALS_DENIED. 
Fixes: ab3948f58ff8 ("mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd") Cc: stable@vger.kernel.org Acked-by: Vivek Kasireddy Signed-off-by: Jann Horn Reviewed-by: Joel Fernandes (Google) Signed-off-by: Vivek Kasireddy Link: https://patchwork.freedesktop.org/patch/msgid/20241204-udmabuf-fixes-v2-2-23887289de1c@google.com --- drivers/dma-buf/udmabuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index c1d8c2766d6d..b330b99fcc76 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -297,7 +297,7 @@ static const struct dma_buf_ops udmabuf_ops = { }; #define SEALS_WANTED (F_SEAL_SHRINK) -#define SEALS_DENIED (F_SEAL_WRITE) +#define SEALS_DENIED (F_SEAL_WRITE|F_SEAL_FUTURE_WRITE) static int check_memfd_seals(struct file *memfd) { From f49856f525acd5bef52ae28b7da2e001bbe7439e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 4 Dec 2024 17:26:21 +0100 Subject: [PATCH 140/807] udmabuf: fix memory leak on last export_udmabuf() error path In export_udmabuf(), if dma_buf_fd() fails because the FD table is full, a dma_buf owning the udmabuf has already been created; but the error handling in udmabuf_create() will tear down the udmabuf without doing anything about the containing dma_buf. This leaves a dma_buf in memory that contains a dangling pointer; though that doesn't seem to lead to anything bad except a memory leak. Fix it by moving the dma_buf_fd() call out of export_udmabuf() so that we can give it different error handling. Note that the shape of this code changed a lot in commit 5e72b2b41a21 ("udmabuf: convert udmabuf driver to use folios"); but the memory leak seems to have existed since the introduction of udmabuf. 
Fixes: fbb0de795078 ("Add udmabuf misc device") Acked-by: Vivek Kasireddy Signed-off-by: Jann Horn Signed-off-by: Vivek Kasireddy Link: https://patchwork.freedesktop.org/patch/msgid/20241204-udmabuf-fixes-v2-3-23887289de1c@google.com --- drivers/dma-buf/udmabuf.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index b330b99fcc76..cc7398cc17d6 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -317,12 +317,10 @@ static int check_memfd_seals(struct file *memfd) return 0; } -static int export_udmabuf(struct udmabuf *ubuf, - struct miscdevice *device, - u32 flags) +static struct dma_buf *export_udmabuf(struct udmabuf *ubuf, + struct miscdevice *device) { DEFINE_DMA_BUF_EXPORT_INFO(exp_info); - struct dma_buf *buf; ubuf->device = device; exp_info.ops = &udmabuf_ops; @@ -330,11 +328,7 @@ static int export_udmabuf(struct udmabuf *ubuf, exp_info.priv = ubuf; exp_info.flags = O_RDWR; - buf = dma_buf_export(&exp_info); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - return dma_buf_fd(buf, flags); + return dma_buf_export(&exp_info); } static long udmabuf_pin_folios(struct udmabuf *ubuf, struct file *memfd, @@ -391,6 +385,7 @@ static long udmabuf_create(struct miscdevice *device, struct folio **folios = NULL; pgoff_t pgcnt = 0, pglimit; struct udmabuf *ubuf; + struct dma_buf *dmabuf; long ret = -EINVAL; u32 i, flags; @@ -455,9 +450,20 @@ out_unlock: } flags = head->flags & UDMABUF_FLAGS_CLOEXEC ? O_CLOEXEC : 0; - ret = export_udmabuf(ubuf, device, flags); - if (ret < 0) + dmabuf = export_udmabuf(ubuf, device); + if (IS_ERR(dmabuf)) { + ret = PTR_ERR(dmabuf); goto err; + } + /* + * Ownership of ubuf is held by the dmabuf from here. + * If the following dma_buf_fd() fails, dma_buf_put() cleans up both the + * dmabuf and the ubuf (through udmabuf_ops.release). 
+ */ + + ret = dma_buf_fd(dmabuf, flags); + if (ret < 0) + dma_buf_put(dmabuf); kvfree(folios); return ret; From 7a4f541873734f41f9645ec147cfae72ef3ffd00 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 11 Dec 2024 12:55:56 -0800 Subject: [PATCH 141/807] fuse: fix direct io folio offset and length calculation For the direct io case, the pages from userspace may be part of a huge folio, even if all folios in the page cache for fuse are small. Fix the logic for calculating the offset and length of the folio for the direct io case, which currently incorrectly assumes that all folios encountered are one page size. Fixes: 3b97c3652d91 ("fuse: convert direct io to use folios") Signed-off-by: Joanne Koong Reviewed-by: Jingbo Xu Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 88d0946b5bc9..15b08d6a5739 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1557,18 +1557,22 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, nbytes += ret; - ret += start; - /* Currently, all folios in FUSE are one page */ - nfolios = DIV_ROUND_UP(ret, PAGE_SIZE); + nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE); - ap->descs[ap->num_folios].offset = start; - fuse_folio_descs_length_init(ap->descs, ap->num_folios, nfolios); - for (i = 0; i < nfolios; i++) - ap->folios[i + ap->num_folios] = page_folio(pages[i]); + for (i = 0; i < nfolios; i++) { + struct folio *folio = page_folio(pages[i]); + unsigned int offset = start + + (folio_page_idx(folio, pages[i]) << PAGE_SHIFT); + unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start); + + ap->descs[ap->num_folios].offset = offset; + ap->descs[ap->num_folios].length = len; + ap->folios[ap->num_folios] = folio; + start = 0; + ret -= len; + ap->num_folios++; + } - ap->num_folios += nfolios; - ap->descs[ap->num_folios - 1].length -= - (PAGE_SIZE - ret) & 
(PAGE_SIZE - 1); nr_pages += nfolios; } kfree(pages); From 0cff90dec63da908fb16d9ea2872ebbcd2d18e6a Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Sun, 17 Nov 2024 17:03:25 +0000 Subject: [PATCH 142/807] dma-buf: Fix __dma_buf_debugfs_list_del argument for !CONFIG_DEBUG_FS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The arguments for __dma_buf_debugfs_list_del do not match for both the CONFIG_DEBUG_FS case and the !CONFIG_DEBUG_FS case. The !CONFIG_DEBUG_FS case should take a struct dma_buf *, but it's currently struct file *. This can lead to the build error: error: passing argument 1 of ‘__dma_buf_debugfs_list_del’ from incompatible pointer type [-Werror=incompatible-pointer-types] dma-buf.c:63:53: note: expected ‘struct file *’ but argument is of type ‘struct dma_buf *’ 63 | static void __dma_buf_debugfs_list_del(struct file *file) Fixes: bfc7bc539392 ("dma-buf: Do not build debugfs related code when !CONFIG_DEBUG_FS") Signed-off-by: T.J. Mercier Reviewed-by: Tvrtko Ursulin Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20241117170326.1971113-1-tjmercier@google.com --- drivers/dma-buf/dma-buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 5ad0e9e2e1b9..84bc32134862 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -60,7 +60,7 @@ static void __dma_buf_debugfs_list_add(struct dma_buf *dmabuf) { } -static void __dma_buf_debugfs_list_del(struct file *file) +static void __dma_buf_debugfs_list_del(struct dma_buf *dmabuf) { } #endif From b10a1e5643e505c367c7e16aa6d8a9a0dc07354b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 3 Dec 2024 15:28:21 +0800 Subject: [PATCH 143/807] erofs: fix rare pcluster memory leak after unmounting There may still exist some pcluster with valid reference counts during unmounting. 
Instead of introducing another synchronization primitive, just try again as unmounting is relatively rare. This approach is similar to z_erofs_cache_invalidate_folio(). It was also reported by syzbot as a UAF due to commit f5ad9f9a603f ("erofs: free pclusters if no cached folio is attached"): BUG: KASAN: slab-use-after-free in do_raw_spin_trylock+0x72/0x1f0 kernel/locking/spinlock_debug.c:123 .. queued_spin_trylock include/asm-generic/qspinlock.h:92 [inline] do_raw_spin_trylock+0x72/0x1f0 kernel/locking/spinlock_debug.c:123 __raw_spin_trylock include/linux/spinlock_api_smp.h:89 [inline] _raw_spin_trylock+0x20/0x80 kernel/locking/spinlock.c:138 spin_trylock include/linux/spinlock.h:361 [inline] z_erofs_put_pcluster fs/erofs/zdata.c:959 [inline] z_erofs_decompress_pcluster fs/erofs/zdata.c:1403 [inline] z_erofs_decompress_queue+0x3798/0x3ef0 fs/erofs/zdata.c:1425 z_erofs_decompressqueue_work+0x99/0xe0 fs/erofs/zdata.c:1437 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa68/0x1840 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f2/0x390 kernel/kthread.c:389 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 However, it seems a long outstanding memory leak. Fix it now. 
Fixes: f5ad9f9a603f ("erofs: free pclusters if no cached folio is attached") Reported-by: syzbot+7ff87b095e7ca0c5ac39@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/674c1235.050a0220.ad585.0032.GAE@google.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241203072821.1885740-1-hsiangkao@linux.alibaba.com --- fs/erofs/zutil.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 75704f58ecfa..0dd65cefce33 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -230,9 +230,10 @@ void erofs_shrinker_unregister(struct super_block *sb) struct erofs_sb_info *const sbi = EROFS_SB(sb); mutex_lock(&sbi->umount_mutex); - /* clean up all remaining pclusters in memory */ - z_erofs_shrink_scan(sbi, ~0UL); - + while (!xa_empty(&sbi->managed_pslots)) { + z_erofs_shrink_scan(sbi, ~0UL); + cond_resched(); + } spin_lock(&erofs_sb_list_lock); list_del(&sbi->list); spin_unlock(&erofs_sb_list_lock); From 1a2180f6859c73c674809f9f82e36c94084682ba Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 27 Nov 2024 16:52:36 +0800 Subject: [PATCH 144/807] erofs: fix PSI memstall accounting Max Kellermann recently reported psi_group_cpu.tasks[NR_MEMSTALL] is incorrect in the 6.11.9 kernel. The root cause appears to be that, since the problematic commit, bio can be NULL, causing psi_memstall_leave() to be skipped in z_erofs_submit_queue(). 
Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+8tvSowiJADW2RuKyofL_CSkm_SuyZA7ME5vMLWmL6pqw@mail.gmail.com Fixes: 9e2f9d34dd12 ("erofs: handle overlapped pclusters out of crafted images properly") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241127085236.3538334-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 01f147505487..19ef4ff2a134 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1792,9 +1792,9 @@ drain_io: erofs_fscache_submit_bio(bio); else submit_bio(bio); - if (memstall) - psi_memstall_leave(&pflags); } + if (memstall) + psi_memstall_leave(&pflags); /* * although background is preferred, no one is pending for submission. From 6d1917045ef4f584593ad30b3dbb887d95fc331f Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Wed, 11 Dec 2024 16:09:18 +0800 Subject: [PATCH 145/807] MAINTAINERS: erofs: update Yue Hu's email address The current email address is no longer valid, use my gmail instead. Signed-off-by: Yue Hu Acked-by: Gao Xiang Acked-by: Chao Yu Link: https://lore.kernel.org/r/20241211080918.8512-1-zbestahu@163.com Signed-off-by: Gao Xiang --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b1..ec84534ff87a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8451,7 +8451,7 @@ F: include/video/s1d13xxxfb.h EROFS FILE SYSTEM M: Gao Xiang M: Chao Yu -R: Yue Hu +R: Yue Hu R: Jeffle Xu R: Sandeep Dhavale L: linux-erofs@lists.ozlabs.org From e2de3c1bf6a0c99b089bd706a62da8f988918858 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Dec 2024 21:35:01 +0800 Subject: [PATCH 146/807] erofs: add erofs_sb_free() helper Unify the common parts of erofs_fc_free() and erofs_kill_sb() as erofs_sb_free(). Thus, fput() in erofs_fc_get_tree() is no longer needed, too. 
Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212133504.2047178-1-hsiangkao@linux.alibaba.com --- fs/erofs/super.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index c235a8e4315e..de8e3ecc6381 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -703,16 +703,19 @@ static int erofs_fc_get_tree(struct fs_context *fc) GET_TREE_BDEV_QUIET_LOOKUP : 0); #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE if (ret == -ENOTBLK) { + struct file *file; + if (!fc->source) return invalf(fc, "No source specified"); - sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); - if (IS_ERR(sbi->fdev)) - return PTR_ERR(sbi->fdev); + + file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + sbi->fdev = file; if (S_ISREG(file_inode(sbi->fdev)->i_mode) && sbi->fdev->f_mapping->a_ops->read_folio) return get_tree_nodev(fc, erofs_fc_fill_super); - fput(sbi->fdev); } #endif return ret; @@ -763,17 +766,22 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs) kfree(devs); } +static void erofs_sb_free(struct erofs_sb_info *sbi) +{ + erofs_free_dev_context(sbi->devs); + kfree(sbi->fsid); + kfree(sbi->domain_id); + if (sbi->fdev) + fput(sbi->fdev); + kfree(sbi); +} + static void erofs_fc_free(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; - if (!sbi) - return; - - erofs_free_dev_context(sbi->devs); - kfree(sbi->fsid); - kfree(sbi->domain_id); - kfree(sbi); + if (sbi) /* free here if an error occurs before transferring to sb */ + erofs_sb_free(sbi); } static const struct fs_context_operations erofs_context_ops = { @@ -813,15 +821,9 @@ static void erofs_kill_sb(struct super_block *sb) kill_anon_super(sb); else kill_block_super(sb); - - erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_fs(sb); - kfree(sbi->fsid); - kfree(sbi->domain_id); - if 
(sbi->fdev) - fput(sbi->fdev); - kfree(sbi); + erofs_sb_free(sbi); sb->s_fs_info = NULL; } From f9244fb55f37356f75c739c57323d9422d7aa0f8 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 7 Nov 2024 16:17:00 +0100 Subject: [PATCH 147/807] xen/netfront: fix crash when removing device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When removing a netfront device directly after a suspend/resume cycle it might happen that the queues have not been setup again, causing a crash during the attempt to stop the queues another time. Fix that by checking the queues are existing before trying to stop them. This is XSA-465 / CVE-2024-53240. Reported-by: Marek Marczykowski-Górecki Fixes: d50b7914fae0 ("xen-netfront: Fix NULL sring after live migration") Signed-off-by: Juergen Gross --- drivers/net/xen-netfront.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 4265c1cd0ff7..63fe51d0e64d 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -867,7 +867,7 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev static int xennet_close(struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); - unsigned int num_queues = dev->real_num_tx_queues; + unsigned int num_queues = np->queues ? 
dev->real_num_tx_queues : 0; unsigned int i; struct netfront_queue *queue; netif_tx_stop_all_queues(np->netdev); @@ -882,6 +882,9 @@ static void xennet_destroy_queues(struct netfront_info *info) { unsigned int i; + if (!info->queues) + return; + for (i = 0; i < info->netdev->real_num_tx_queues; i++) { struct netfront_queue *queue = &info->queues[i]; From efbcd61d9bebb771c836a3b8bfced8165633db7c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 08:29:48 +0200 Subject: [PATCH 148/807] x86: make get_cpu_vendor() accessible from Xen code In order to be able to differentiate between AMD and Intel based systems for very early hypercalls without having to rely on the Xen hypercall page, make get_cpu_vendor() non-static. Refactor early_cpu_init() for the same reason by splitting out the loop initializing cpu_devs() into an externally callable function. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/cpu/common.c | 38 ++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c0975815980c..20e6009381ed 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -230,6 +230,8 @@ static inline unsigned long long l1tf_pfn_limit(void) return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT); } +void init_cpu_devs(void); +void get_cpu_vendor(struct cpuinfo_x86 *c); extern void early_cpu_init(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a5c28975c608..3e9037690814 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -867,7 +867,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); } -static void 
get_cpu_vendor(struct cpuinfo_x86 *c) +void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; @@ -1649,15 +1649,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) detect_nopl(); } -void __init early_cpu_init(void) +void __init init_cpu_devs(void) { const struct cpu_dev *const *cdev; int count = 0; -#ifdef CONFIG_PROCESSOR_SELECT - pr_info("KERNEL supported cpus:\n"); -#endif - for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { const struct cpu_dev *cpudev = *cdev; @@ -1665,20 +1661,30 @@ void __init early_cpu_init(void) break; cpu_devs[count] = cpudev; count++; + } +} + +void __init early_cpu_init(void) +{ +#ifdef CONFIG_PROCESSOR_SELECT + unsigned int i, j; + + pr_info("KERNEL supported cpus:\n"); +#endif + + init_cpu_devs(); #ifdef CONFIG_PROCESSOR_SELECT - { - unsigned int j; - - for (j = 0; j < 2; j++) { - if (!cpudev->c_ident[j]) - continue; - pr_info(" %s %s\n", cpudev->c_vendor, - cpudev->c_ident[j]); - } + for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) { + for (j = 0; j < 2; j++) { + if (!cpu_devs[i]->c_ident[j]) + continue; + pr_info(" %s %s\n", cpu_devs[i]->c_vendor, + cpu_devs[i]->c_ident[j]); } -#endif } +#endif + early_identify_cpu(&boot_cpu_data); } From dda014ba59331dee4f3b773a020e109932f4bd24 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 29 Nov 2024 15:47:49 +0100 Subject: [PATCH 149/807] objtool/x86: allow syscall instruction The syscall instruction is used in Xen PV mode for doing hypercalls. Allow syscall to be used in the kernel in case it is tagged with an unwind hint for objtool. This is part of XSA-466 / CVE-2024-53241. 
Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra --- tools/objtool/check.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4ce176ad411f..76060da755b5 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3820,9 +3820,12 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, break; case INSN_CONTEXT_SWITCH: - if (func && (!next_insn || !next_insn->hint)) { - WARN_INSN(insn, "unsupported instruction in callable function"); - return 1; + if (func) { + if (!next_insn || !next_insn->hint) { + WARN_INSN(insn, "unsupported instruction in callable function"); + return 1; + } + break; } return 0; From 0ef8047b737d7480a5d4c46d956e97c190f13050 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 29 Nov 2024 16:15:54 +0100 Subject: [PATCH 150/807] x86/static-call: provide a way to do very early static-call updates Add static_call_update_early() for updating static-call targets in very early boot. This will be needed for support of Xen guest type specific hypercall functions. This is part of XSA-466 / CVE-2024-53241. 
Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra Co-developed-by: Josh Poimboeuf --- arch/x86/include/asm/static_call.h | 15 ++++++++++++ arch/x86/include/asm/sync_core.h | 6 ++--- arch/x86/kernel/static_call.c | 9 +++++++ include/linux/compiler.h | 39 +++++++++++++++++++++--------- include/linux/static_call.h | 1 + kernel/static_call_inline.c | 2 +- 6 files changed, 56 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index 125c407e2abe..41502bd2afd6 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -65,4 +65,19 @@ extern bool __static_call_fixup(void *tramp, u8 op, void *dest); +extern void __static_call_update_early(void *tramp, void *func); + +#define static_call_update_early(name, _func) \ +({ \ + typeof(&STATIC_CALL_TRAMP(name)) __F = (_func); \ + if (static_call_initialized) { \ + __static_call_update(&STATIC_CALL_KEY(name), \ + STATIC_CALL_TRAMP_ADDR(name), __F);\ + } else { \ + WRITE_ONCE(STATIC_CALL_KEY(name).func, _func); \ + __static_call_update_early(STATIC_CALL_TRAMP_ADDR(name),\ + __F); \ + } \ +}) + #endif /* _ASM_STATIC_CALL_H */ diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index ab7382f92aff..96bda43538ee 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -8,7 +8,7 @@ #include #ifdef CONFIG_X86_32 -static inline void iret_to_self(void) +static __always_inline void iret_to_self(void) { asm volatile ( "pushfl\n\t" @@ -19,7 +19,7 @@ static inline void iret_to_self(void) : ASM_CALL_CONSTRAINT : : "memory"); } #else -static inline void iret_to_self(void) +static __always_inline void iret_to_self(void) { unsigned int tmp; @@ -55,7 +55,7 @@ static inline void iret_to_self(void) * Like all of Linux's memory ordering operations, this is a * compiler barrier as well. 
*/ -static inline void sync_core(void) +static __always_inline void sync_core(void) { /* * The SERIALIZE instruction is the most straightforward way to diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 4eefaac64c6c..9eed0c144dad 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -172,6 +172,15 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) } EXPORT_SYMBOL_GPL(arch_static_call_transform); +noinstr void __static_call_update_early(void *tramp, void *func) +{ + BUG_ON(system_state != SYSTEM_BOOTING); + BUG_ON(!early_boot_irqs_disabled); + BUG_ON(static_call_initialized); + __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE); + sync_core(); +} + #ifdef CONFIG_MITIGATION_RETHUNK /* * This is called by apply_returns() to fix up static call trampolines, diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 469a64dd6495..240c632c5b95 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -216,18 +216,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ -/* - * Force the compiler to emit 'sym' as a symbol, so that we can reference - * it from inline assembler. Necessary in case 'sym' could be inlined - * otherwise, or eliminated entirely due to lack of references that are - * visible to the compiler. 
- */ -#define ___ADDRESSABLE(sym, __attrs) \ - static void * __used __attrs \ - __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; -#define __ADDRESSABLE(sym) \ - ___ADDRESSABLE(sym, __section(".discard.addressable")) - /** * offset_to_ptr - convert a relative memory offset to an absolute pointer * @off: the address of the 32-bit offset value @@ -239,6 +227,33 @@ static inline void *offset_to_ptr(const int *off) #endif /* __ASSEMBLY__ */ +#ifdef CONFIG_64BIT +#define ARCH_SEL(a,b) a +#else +#define ARCH_SEL(a,b) b +#endif + +/* + * Force the compiler to emit 'sym' as a symbol, so that we can reference + * it from inline assembler. Necessary in case 'sym' could be inlined + * otherwise, or eliminated entirely due to lack of references that are + * visible to the compiler. + */ +#define ___ADDRESSABLE(sym, __attrs) \ + static void * __used __attrs \ + __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; + +#define __ADDRESSABLE(sym) \ + ___ADDRESSABLE(sym, __section(".discard.addressable")) + +#define __ADDRESSABLE_ASM(sym) \ + .pushsection .discard.addressable,"aw"; \ + .align ARCH_SEL(8,4); \ + ARCH_SEL(.quad, .long) __stringify(sym); \ + .popsection; + +#define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) + #ifdef __CHECKER__ #define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) #else /* __CHECKER__ */ diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 141e6b176a1b..785980af8972 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,6 +138,7 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include +extern int static_call_initialized; /* * Either @site or @tramp can be NULL. 
*/ diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index 5259cda486d0..bb7d066a7c39 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -15,7 +15,7 @@ extern struct static_call_site __start_static_call_sites[], extern struct static_call_tramp_key __start_static_call_tramp_key[], __stop_static_call_tramp_key[]; -static int static_call_initialized; +int static_call_initialized; /* * Must be called before early_initcall() to be effective. From a2796dff62d6c6bfc5fbebdf2bee0d5ac0438906 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 16 Oct 2024 10:40:26 +0200 Subject: [PATCH 151/807] x86/xen: don't do PV iret hypercall through hypercall page Instead of jumping to the Xen hypercall page for doing the iret hypercall, directly code the required sequence in xen-asm.S. This is done in preparation of no longer using hypercall page at all, as it has shown to cause problems with speculation mitigations. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- arch/x86/xen/xen-asm.S | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 83189cf5cdce..ca6edfe4c14b 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -176,7 +176,6 @@ SYM_CODE_START(xen_early_idt_handler_array) SYM_CODE_END(xen_early_idt_handler_array) __FINIT -hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* * Xen64 iret frame: * @@ -186,17 +185,28 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 * cs * rip <-- standard iret frame * - * flags + * flags <-- xen_iret must push from here on * - * rcx } - * r11 }<-- pushed by hypercall page - * rsp->rax } + * rcx + * r11 + * rsp->rax */ +.macro xen_hypercall_iret + pushq $0 /* Flags */ + push %rcx + push %r11 + push %rax + mov $__HYPERVISOR_iret, %eax + syscall /* Do the IRET. 
*/ +#ifdef CONFIG_MITIGATION_SLS + int3 +#endif +.endm + SYM_CODE_START(xen_iret) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR - pushq $0 - jmp hypercall_iret + xen_hypercall_iret SYM_CODE_END(xen_iret) /* @@ -301,8 +311,7 @@ SYM_CODE_START(xen_entry_SYSENTER_compat) ENDBR lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax - pushq $0 - jmp hypercall_iret + xen_hypercall_iret SYM_CODE_END(xen_entry_SYSENTER_compat) SYM_CODE_END(xen_entry_SYSCALL_compat) From e1e1af9148dc4c866eda3fb59cd6ec3c7ea34b1d Mon Sep 17 00:00:00 2001 From: Zhang Zekun Date: Fri, 25 Oct 2024 15:34:08 +0800 Subject: [PATCH 152/807] drm/panel: himax-hx83102: Add a check to prevent NULL pointer dereference drm_mode_duplicate() could return NULL due to lack of memory, which will then cause a NULL pointer dereference. Add a check to prevent it. Fixes: 0ef94554dc40 ("drm/panel: himax-hx83102: Break out as separate driver") Signed-off-by: Zhang Zekun Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20241025073408.27481-3-zhangzekun11@huawei.com Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241025073408.27481-3-zhangzekun11@huawei.com --- drivers/gpu/drm/panel/panel-himax-hx83102.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/panel/panel-himax-hx83102.c b/drivers/gpu/drm/panel/panel-himax-hx83102.c index 8b48bba18131..3644a7544b93 100644 --- a/drivers/gpu/drm/panel/panel-himax-hx83102.c +++ b/drivers/gpu/drm/panel/panel-himax-hx83102.c @@ -565,6 +565,8 @@ static int hx83102_get_modes(struct drm_panel *panel, struct drm_display_mode *mode; mode = drm_mode_duplicate(connector->dev, m); + if (!mode) + return -ENOMEM; mode->type = DRM_MODE_TYPE_DRIVER | DRM_MODE_TYPE_PREFERRED; drm_mode_set_name(mode); From f8fd0968eff52cf092c0d517d17507ea2f6e5ea5 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 29 Oct 2024 20:39:57 +0800 Subject: [PATCH 153/807] drm/panel: novatek-nt35950: fix return value check in nt35950_probe()
mipi_dsi_device_register_full() never returns NULL pointer, it will return ERR_PTR() when it fails, so replace the check with IS_ERR(). Fixes: 623a3531e9cf ("drm/panel: Add driver for Novatek NT35950 DSI DriverIC panels") Signed-off-by: Yang Yingliang Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20241029123957.1588-1-yangyingliang@huaweicloud.com Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241029123957.1588-1-yangyingliang@huaweicloud.com --- drivers/gpu/drm/panel/panel-novatek-nt35950.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-novatek-nt35950.c b/drivers/gpu/drm/panel/panel-novatek-nt35950.c index b036208f9356..08b22b592ab0 100644 --- a/drivers/gpu/drm/panel/panel-novatek-nt35950.c +++ b/drivers/gpu/drm/panel/panel-novatek-nt35950.c @@ -481,9 +481,9 @@ static int nt35950_probe(struct mipi_dsi_device *dsi) return dev_err_probe(dev, -EPROBE_DEFER, "Cannot get secondary DSI host\n"); nt->dsi[1] = mipi_dsi_device_register_full(dsi_r_host, info); - if (!nt->dsi[1]) { + if (IS_ERR(nt->dsi[1])) { dev_err(dev, "Cannot get secondary DSI node\n"); - return -ENODEV; + return PTR_ERR(nt->dsi[1]); } num_dsis++; } From 406dd4c7984a457567ca652455d5efad81983f02 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 24 Nov 2024 23:48:07 +0100 Subject: [PATCH 154/807] drm/panel: st7701: Add prepare_prev_first flag to drm_panel The DSI host must be enabled for the panel to be initialized in prepare(). Set the prepare_prev_first flag to guarantee this. This fixes the panel operation on NXP i.MX8MP SoC / Samsung DSIM DSI host. 
Fixes: 849b2e3ff969 ("drm/panel: Add Sitronix ST7701 panel driver") Signed-off-by: Marek Vasut Reviewed-by: Jessica Zhang Link: https://lore.kernel.org/r/20241124224812.150263-1-marex@denx.de Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241124224812.150263-1-marex@denx.de --- drivers/gpu/drm/panel/panel-sitronix-st7701.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/panel-sitronix-st7701.c b/drivers/gpu/drm/panel/panel-sitronix-st7701.c index eef03d04e0cd..1f72ef7ca74c 100644 --- a/drivers/gpu/drm/panel/panel-sitronix-st7701.c +++ b/drivers/gpu/drm/panel/panel-sitronix-st7701.c @@ -1177,6 +1177,7 @@ static int st7701_probe(struct device *dev, int connector_type) return dev_err_probe(dev, ret, "Failed to get orientation\n"); drm_panel_init(&st7701->panel, dev, &st7701_funcs, connector_type); + st7701->panel.prepare_prev_first = true; /** * Once sleep out has been issued, ST7701 IC required to wait 120ms From d2bd3fcb825725a59c8880070b1206b1710922bd Mon Sep 17 00:00:00 2001 From: Michael Trimarchi Date: Thu, 5 Dec 2024 17:29:58 +0100 Subject: [PATCH 155/807] drm/panel: synaptics-r63353: Fix regulator unbalance The shutdown function can be called when the display is already unprepared. For example, during reboot this triggers a kernel warning. Calling drm_panel_unprepare() instead allows us to avoid triggering the kernel warning.
Fixes: 2e87bad7cd33 ("drm/panel: Add Synaptics R63353 panel driver") Tested-by: Dario Binacchi Signed-off-by: Michael Trimarchi Signed-off-by: Dario Binacchi Reviewed-by: Neil Armstrong Reviewed-by: Jessica Zhang Link: https://lore.kernel.org/r/20241205163002.1804784-1-dario.binacchi@amarulasolutions.com Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241205163002.1804784-1-dario.binacchi@amarulasolutions.com --- drivers/gpu/drm/panel/panel-synaptics-r63353.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-synaptics-r63353.c b/drivers/gpu/drm/panel/panel-synaptics-r63353.c index 169c629746c7..17349825543f 100644 --- a/drivers/gpu/drm/panel/panel-synaptics-r63353.c +++ b/drivers/gpu/drm/panel/panel-synaptics-r63353.c @@ -325,7 +325,7 @@ static void r63353_panel_shutdown(struct mipi_dsi_device *dsi) { struct r63353_panel *rpanel = mipi_dsi_get_drvdata(dsi); - r63353_panel_unprepare(&rpanel->base); + drm_panel_unprepare(&rpanel->base); } static const struct r63353_desc sharp_ls068b3sx02_data = { From 0e8c52091633b354b12d0c29a27a22077584c111 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 12 Dec 2024 13:29:42 +0200 Subject: [PATCH 156/807] wifi: iwlwifi: fix CRF name for Bz We had BE201 hard coded. Look at the RF_ID and decide based on its value. 
Fixes: 6795a37161fb ("wifi: iwlwifi: Print a specific device name.") Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241212132940.b9eebda1ca60.I36791a134ed5e538e059418eb6520761da97b44c@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/cfg/bz.c | 1 + .../net/wireless/intel/iwlwifi/iwl-config.h | 1 + drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 41 +++++++++++++++++-- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/bz.c b/drivers/net/wireless/intel/iwlwifi/cfg/bz.c index cd1fe8490ae5..1c43f283ac4a 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/bz.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/bz.c @@ -161,6 +161,7 @@ const struct iwl_cfg_trans_params iwl_gl_trans_cfg = { const char iwl_bz_name[] = "Intel(R) TBD Bz device"; const char iwl_fm_name[] = "Intel(R) Wi-Fi 7 BE201 320MHz"; +const char iwl_wh_name[] = "Intel(R) Wi-Fi 7 BE211 320MHz"; const char iwl_gl_name[] = "Intel(R) Wi-Fi 7 BE200 320MHz"; const char iwl_mtp_name[] = "Intel(R) Wi-Fi 7 BE202 160MHz"; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index 34c91deca57b..17721bb47e25 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -545,6 +545,7 @@ extern const char iwl_ax231_name[]; extern const char iwl_ax411_name[]; extern const char iwl_bz_name[]; extern const char iwl_fm_name[]; +extern const char iwl_wh_name[]; extern const char iwl_gl_name[]; extern const char iwl_mtp_name[]; extern const char iwl_sc_name[]; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 805fb249a0c6..8fb2aa282242 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1106,18 +1106,53 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info 
iwl_dev_info_table[] = { iwlax210_2ax_cfg_so_jf_b0, iwl9462_name), /* Bz */ -/* FIXME: need to change the naming according to the actual CRF */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax201_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax211_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_fm_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_wh_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax201_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax211_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_fm_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, - iwl_cfg_bz, iwl_fm_name), + iwl_cfg_bz, iwl_wh_name), /* Ga (Gl) */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, From aa21f333c86c8a09d39189de87abb0153d338190 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 11 Dec 2024 13:11:17 +0100 Subject: [PATCH 157/807] 
fs: fix is_mnt_ns_file() Commit 1fa08aece425 ("nsfs: convert to path_from_stashed() helper") reused nsfs dentry's d_fsdata, which no longer contains a pointer to proc_ns_operations. Fix the remaining use in is_mnt_ns_file(). Fixes: 1fa08aece425 ("nsfs: convert to path_from_stashed() helper") Cc: stable@vger.kernel.org # v6.9 Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20241211121118.85268-1-mszeredi@redhat.com Acked-by: Al Viro Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 23e81c2a1e3f..6eec7794f707 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2055,9 +2055,15 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static bool is_mnt_ns_file(struct dentry *dentry) { + struct ns_common *ns; + /* Is this a proxy for a mount namespace? */ - return dentry->d_op == &ns_dentry_operations && - dentry->d_fsdata == &mntns_operations; + if (dentry->d_op != &ns_dentry_operations) + return false; + + ns = d_inode(dentry)->i_private; + + return ns->ops == &mntns_operations; } struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) From b83accfec0811421df065f820e73ca8df7f6439a Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 12 Dec 2024 08:27:05 -0800 Subject: [PATCH 158/807] MAINTAINERS: wifi: ath: add Jeff Johnson as maintainer The "ATHEROS ATH GENERIC UTILITIES" entry shares the same git tree as the ATH10K, ATH11K, and ATH12K entries which I already maintain, so add me to that entry as well. 
Signed-off-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241212-ath-maintainer-v1-1-7ea5e86780a8@kernel.org --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e6e71b05710b..158740a571b3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3608,6 +3608,7 @@ F: drivers/phy/qualcomm/phy-ath79-usb.c ATHEROS ATH GENERIC UTILITIES M: Kalle Valo +M: Jeff Johnson L: linux-wireless@vger.kernel.org S: Supported F: drivers/net/wireless/ath/* From f2893c0804d86230ffb8f1c8703fdbb18648abc8 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Thu, 5 Dec 2024 19:41:51 +0800 Subject: [PATCH 159/807] dm array: fix releasing a faulty array block twice in dm_array_cursor_end When dm_bm_read_lock() fails due to locking or checksum errors, it releases the faulty block implicitly while leaving an invalid output pointer behind. The caller of dm_bm_read_lock() should not operate on this invalid dm_block pointer, or it will lead to undefined results. For example, the dm_array_cursor incorrectly caches the invalid pointer on reading a faulty array block, causing a double release in dm_array_cursor_end(), then hitting the BUG_ON in dm-bufio cache_put(). Reproduce steps: 1. initialize a cache device dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 65536 linear /dev/sdc 8192" dmsetup create corig --table "0 524288 linear /dev/sdc $262144" dd if=/dev/zero of=/dev/mapper/cmeta bs=4k count=1 dmsetup create cache --table "0 524288 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0" 2. wipe the second array block offline dmsetup remove cache cmeta cdata corig mapping_root=$(dd if=/dev/sdc bs=1c count=8 skip=192 \ 2>/dev/null | hexdump -e '1/8 "%u\n"') ablock=$(dd if=/dev/sdc bs=1c count=8 skip=$((4096*mapping_root+2056)) \ 2>/dev/null | hexdump -e '1/8 "%u\n"') dd if=/dev/zero of=/dev/sdc bs=4k count=1 seek=$ablock 3.
try reopen the cache device dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 65536 linear /dev/sdc 8192" dmsetup create corig --table "0 524288 linear /dev/sdc $262144" dmsetup create cache --table "0 524288 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0" Kernel logs: (snip) device-mapper: array: array_block_check failed: blocknr 0 != wanted 10 device-mapper: block manager: array validator check failed for block 10 device-mapper: array: get_ablock failed device-mapper: cache metadata: dm_array_cursor_next for mapping failed ------------[ cut here ]------------ kernel BUG at drivers/md/dm-bufio.c:638! Fix by setting the cached block pointer to NULL on errors. In addition to the reproducer described above, this fix can be verified using the "array_cursor/damaged" test in dm-unit: dm-unit run /pdata/array_cursor/damaged --kernel-dir Signed-off-by: Ming-Hung Tsai Fixes: fdd1315aa5f0 ("dm array: introduce cursor api") Reviewed-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-array.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 157c9bd2fed7..4866ff56125f 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -917,23 +917,27 @@ static int load_ablock(struct dm_array_cursor *c) if (c->block) unlock_ablock(c->info, c->block); - c->block = NULL; - c->ab = NULL; c->index = 0; r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le); if (r) { DMERR("dm_btree_cursor_get_value failed"); - dm_btree_cursor_end(&c->cursor); + goto out; } else { r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab); if (r) { DMERR("get_ablock failed"); - dm_btree_cursor_end(&c->cursor); + goto out; } } + return 0; + +out: + dm_btree_cursor_end(&c->cursor); + c->block = NULL; + c->ab = NULL; return r; } From 
626f128ee9c4133b1cfce4be2b34a1508949370e Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Thu, 5 Dec 2024 19:41:52 +0800 Subject: [PATCH 160/807] dm array: fix unreleased btree blocks on closing a faulty array cursor The cached block pointer in dm_array_cursor might be NULL if it reaches an unreadable array block, or the array is empty. Therefore, dm_array_cursor_end() should call dm_btree_cursor_end() unconditionally, to prevent leaving unreleased btree blocks. This fix can be verified using the "array_cursor/iterate/empty" test in dm-unit: dm-unit run /pdata/array_cursor/iterate/empty --kernel-dir Signed-off-by: Ming-Hung Tsai Fixes: fdd1315aa5f0 ("dm array: introduce cursor api") Reviewed-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-array.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 4866ff56125f..0850dfdffc8c 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -960,10 +960,10 @@ EXPORT_SYMBOL_GPL(dm_array_cursor_begin); void dm_array_cursor_end(struct dm_array_cursor *c) { - if (c->block) { + if (c->block) unlock_ablock(c->info, c->block); - dm_btree_cursor_end(&c->cursor); - } + + dm_btree_cursor_end(&c->cursor); } EXPORT_SYMBOL_GPL(dm_array_cursor_end); From 0bb1968da2737ba68fd63857d1af2b301a18d3bf Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Thu, 5 Dec 2024 19:41:53 +0800 Subject: [PATCH 161/807] dm array: fix cursor index when skipping across block boundaries dm_array_cursor_skip() seeks to the target position by loading array blocks iteratively until the specified number of entries to skip is reached. When seeking across block boundaries, it uses dm_array_cursor_next() to step into the next block. 
dm_array_cursor_skip() must first move the cursor index to the end of the current block; otherwise, the cursor position could incorrectly remain in the same block, causing the actual number of skipped entries to be much smaller than expected. This bug affects cache resizing in v2 metadata and could lead to data loss if the fast device is shrunk during the first-time resume. For example: 1. create a cache metadata consists of 32768 blocks, with a dirty block assigned to the second bitmap block. cache_restore v1.0 is required. cat <> cmeta.xml EOF dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" cache_restore -i cmeta.xml -o /dev/mapper/cmeta --metadata-version=2 2. bring up the cache while attempt to discard all the blocks belonging to the second bitmap block (block# 32576 to 32767). The last command is expected to fail, but it actually succeeds. dmsetup create cdata --table "0 2084864 linear /dev/sdc 8192" dmsetup create corig --table "0 65536 linear /dev/sdc 2105344" dmsetup create cache --table "0 65536 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 64 2 metadata2 writeback smq \ 2 migration_threshold 0" In addition to the reproducer described above, this fix can be verified using the "array_cursor/skip" tests in dm-unit: dm-unit run /pdata/array_cursor/skip/ --kernel-dir Signed-off-by: Ming-Hung Tsai Fixes: 9b696229aa7d ("dm persistent data: add cursor skip functions to the cursor APIs") Reviewed-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-array.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 0850dfdffc8c..8f8792e55806 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -1003,6 +1003,7 @@ int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count) } count -= remaining; + c->index += (remaining - 1); r = dm_array_cursor_next(c); } while (!r); From 
78f2560fc9fa5ccaaf23ac78edb732c08bad7a92 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 3 Dec 2024 00:01:10 +0100 Subject: [PATCH 162/807] fuse: Set *nbytesp=0 in fuse_get_user_pages on allocation failure In fuse_get_user_pages(), set *nbytesp to 0 when struct page **pages allocation fails. This prevents the caller (fuse_direct_io) from making incorrect assumptions that could lead to NULL pointer dereferences when processing the request reply. Previously, *nbytesp was left unmodified on allocation failure, which could cause issues if the caller assumed pages had been added to ap->descs[] when they hadn't. Reported-by: syzbot+87b8e6ed25dbc41759f7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=87b8e6ed25dbc41759f7 Fixes: 3b97c3652d91 ("fuse: convert direct io to use folios") Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Tested-by: Dmitry Antipov Tested-by: David Howells Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 15b08d6a5739..7d92a5479998 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1541,8 +1541,10 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, */ struct page **pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); - if (!pages) - return -ENOMEM; + if (!pages) { + ret = -ENOMEM; + goto out; + } while (nbytes < *nbytesp && nr_pages < max_pages) { unsigned nfolios, i; @@ -1588,6 +1590,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, else ap->args.out_pages = true; +out: *nbytesp = nbytes; return ret < 0 ? 
ret : 0; From 8b55f8818900c99dd4f55a59a103f5b29e41eb2c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 18 Oct 2024 15:14:42 +0000 Subject: [PATCH 163/807] media: mediatek: vcodec: mark vdec_vp9_slice_map_counts_eob_coef noinline With KASAN enabled, clang fails to optimize the inline version of vdec_vp9_slice_map_counts_eob_coef() properly, leading to kilobytes of temporary values spilled to the stack: drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c:1526:12: error: stack frame size (2160) exceeds limit (2048) in 'vdec_vp9_slice_update_prob' [-Werror,-Wframe-larger-than] This seems to affect all versions of clang including the latest (clang-20), but the degree of stack overhead is different per release. Marking the function as noinline_for_stack is harmless here and avoids the problem completely. Signed-off-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Signed-off-by: Sebastian Fricke Signed-off-by: Mauro Carvalho Chehab --- .../mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c index eea709d93820..47c302745c1d 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c @@ -1188,7 +1188,8 @@ err: return ret; } -static +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack void vdec_vp9_slice_map_counts_eob_coef(unsigned int i, unsigned int j, unsigned int k, struct vdec_vp9_slice_frame_counts *counts, struct v4l2_vp9_frame_symbol_counts *counts_helper) From 080b2e7b5e9ad23343e4b11f0751e4c724a78958 Mon Sep 17 00:00:00 2001 From: Krzysztof Karas Date: Thu, 12 Dec 2024 11:00:41 +0000 Subject: [PATCH 164/807] drm/display: use ERR_PTR on DP tunnel manager creation fail Instead of returning a 
generic NULL on error from drm_dp_tunnel_mgr_create(), use error pointers with informative codes to align the function with the stub that is executed when CONFIG_DRM_DISPLAY_DP_TUNNEL is unset. This will also trigger IS_ERR() in the current caller (intel_dp_tunnel_mgr_init()) instead of bypassing it via NULL pointer. v2: use error codes inside drm_dp_tunnel_mgr_create() instead of handling on caller's side (Michal, Imre) v3: fixup commit message and add "CC"/"Fixes" lines (Andi), mention aligning function code with stub Fixes: 91888b5b1ad2 ("drm/i915/dp: Add support for DP tunnel BW allocation") Cc: Imre Deak Cc: # v6.9+ Signed-off-by: Krzysztof Karas Reviewed-by: Andi Shyti Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/7q4fpnmmztmchczjewgm6igy55qt6jsm7tfd4fl4ucfq6yg2oy@q4lxtsu6445c --- drivers/gpu/drm/display/drm_dp_tunnel.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/display/drm_dp_tunnel.c b/drivers/gpu/drm/display/drm_dp_tunnel.c index 48b2df120086..90fe07a89260 100644 --- a/drivers/gpu/drm/display/drm_dp_tunnel.c +++ b/drivers/gpu/drm/display/drm_dp_tunnel.c @@ -1896,8 +1896,8 @@ static void destroy_mgr(struct drm_dp_tunnel_mgr *mgr) * * Creates a DP tunnel manager for @dev. * - * Returns a pointer to the tunnel manager if created successfully or NULL in - * case of an error. + * Returns a pointer to the tunnel manager if created successfully or error + * pointer in case of failure. 
*/ struct drm_dp_tunnel_mgr * drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) @@ -1907,7 +1907,7 @@ drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) mgr = kzalloc(sizeof(*mgr), GFP_KERNEL); if (!mgr) - return NULL; + return ERR_PTR(-ENOMEM); mgr->dev = dev; init_waitqueue_head(&mgr->bw_req_queue); @@ -1916,7 +1916,7 @@ drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) if (!mgr->groups) { kfree(mgr); - return NULL; + return ERR_PTR(-ENOMEM); } #ifdef CONFIG_DRM_DISPLAY_DP_TUNNEL_STATE_DEBUG @@ -1927,7 +1927,7 @@ drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) if (!init_group(mgr, &mgr->groups[i])) { destroy_mgr(mgr); - return NULL; + return ERR_PTR(-ENOMEM); } mgr->group_count++; From 9398332f23fab10c5ec57c168b44e72997d6318e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 29 Nov 2024 06:26:28 +0200 Subject: [PATCH 165/807] drm/modes: Avoid divide by zero harder in drm_mode_vrefresh() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_mode_vrefresh() is trying to avoid divide by zero by checking whether htotal or vtotal are zero. But we may still end up with a div-by-zero of vtotal*htotal*... 
Cc: stable@vger.kernel.org Reported-by: syzbot+622bba18029bcde672e1@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=622bba18029bcde672e1 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20241129042629.18280-2-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula --- drivers/gpu/drm/drm_modes.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c index 6ba167a33461..71573b85d924 100644 --- a/drivers/gpu/drm/drm_modes.c +++ b/drivers/gpu/drm/drm_modes.c @@ -1287,14 +1287,11 @@ EXPORT_SYMBOL(drm_mode_set_name); */ int drm_mode_vrefresh(const struct drm_display_mode *mode) { - unsigned int num, den; + unsigned int num = 1, den = 1; if (mode->htotal == 0 || mode->vtotal == 0) return 0; - num = mode->clock; - den = mode->htotal * mode->vtotal; - if (mode->flags & DRM_MODE_FLAG_INTERLACE) num *= 2; if (mode->flags & DRM_MODE_FLAG_DBLSCAN) @@ -1302,6 +1299,12 @@ int drm_mode_vrefresh(const struct drm_display_mode *mode) if (mode->vscan > 1) den *= mode->vscan; + if (check_mul_overflow(mode->clock, num, &num)) + return 0; + + if (check_mul_overflow(mode->htotal * mode->vtotal, den, &den)) + return 0; + return DIV_ROUND_CLOSEST_ULL(mul_u32_u32(num, 1000), den); } EXPORT_SYMBOL(drm_mode_vrefresh); From 429fde2d81bcef0ebab002215358955704586457 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Dec 2024 22:22:47 +0000 Subject: [PATCH 166/807] net: tun: fix tun_napi_alloc_frags() syzbot reported the following crash [1] Issue came with the blamed commit. Instead of going through all the iov components, we keep using the first one and end up with a malformed skb. [1] kernel BUG at net/core/skbuff.c:2849 ! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 6230 Comm: syz-executor132 Not tainted 6.13.0-rc1-syzkaller-00407-g96b6fcc0ee41 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 RIP: 0010:__pskb_pull_tail+0x1568/0x1570 net/core/skbuff.c:2848 Code: 38 c1 0f 8c 32 f1 ff ff 4c 89 f7 e8 92 96 74 f8 e9 25 f1 ff ff e8 e8 ae 09 f8 48 8b 5c 24 08 e9 eb fb ff ff e8 d9 ae 09 f8 90 <0f> 0b 66 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 0018:ffffc90004cbef30 EFLAGS: 00010293 RAX: ffffffff8995c347 RBX: 00000000fffffff2 RCX: ffff88802cf45a00 RDX: 0000000000000000 RSI: 00000000fffffff2 RDI: 0000000000000000 RBP: ffff88807df0c06a R08: ffffffff8995b084 R09: 1ffff1100fbe185c R10: dffffc0000000000 R11: ffffed100fbe185d R12: ffff888076e85d50 R13: ffff888076e85c80 R14: ffff888076e85cf4 R15: ffff888076e85c80 FS: 00007f0dca6ea6c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f0dca6ead58 CR3: 00000000119da000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_cow_data+0x2da/0xcb0 net/core/skbuff.c:5284 tipc_aead_decrypt net/tipc/crypto.c:894 [inline] tipc_crypto_rcv+0x402/0x24e0 net/tipc/crypto.c:1844 tipc_rcv+0x57e/0x12a0 net/tipc/node.c:2109 tipc_l2_rcv_msg+0x2bd/0x450 net/tipc/bearer.c:668 __netif_receive_skb_list_ptype net/core/dev.c:5720 [inline] __netif_receive_skb_list_core+0x8b7/0x980 net/core/dev.c:5762 __netif_receive_skb_list net/core/dev.c:5814 [inline] netif_receive_skb_list_internal+0xa51/0xe30 net/core/dev.c:5905 gro_normal_list include/net/gro.h:515 [inline] napi_complete_done+0x2b5/0x870 net/core/dev.c:6256 napi_complete include/linux/netdevice.h:567 [inline] tun_get_user+0x2ea0/0x4890 drivers/net/tun.c:1982 tun_chr_write_iter+0x10d/0x1f0 drivers/net/tun.c:2057 do_iter_readv_writev+0x600/0x880 
vfs_writev+0x376/0xba0 fs/read_write.c:1050 do_writev+0x1b6/0x360 fs/read_write.c:1096 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: de4f5fed3f23 ("iov_iter: add iter_iovec() helper") Reported-by: syzbot+4f66250f6663c0c1d67e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/675b61aa.050a0220.599f4.00bb.GAE@google.com/T/#u Cc: stable@vger.kernel.org Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Reviewed-by: Jens Axboe Acked-by: Willem de Bruijn Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20241212222247.724674-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index d7a865ef370b..e816aaba8e5f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1481,7 +1481,7 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { - const struct iovec *iov = iter_iov(it); + const struct iovec *iov = iter_iov(it) + i; size_t fragsz = iov->iov_len; struct page *page; void *frag; From 2b33eb8f1b3e8c2f87cfdbc8cc117f6bdfabc6ec Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:16 +0800 Subject: [PATCH 167/807] net/smc: protect link down work from execute after lgr freed Link down work may be scheduled before the lgr is freed but execute after the lgr is freed, which may result in a crash. So it is necessary to hold a reference before scheduling link down work, and to put the reference after the work has executed or been canceled. The relevant crash call stack is as follows: list_del corruption. prev->next should be ffffb638c9c0fe20, but was 0000000000000000 ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:51! 
invalid opcode: 0000 [#1] SMP NOPTI CPU: 6 PID: 978112 Comm: kworker/6:119 Kdump: loaded Tainted: G #1 Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 2221b89 04/01/2014 Workqueue: events smc_link_down_work [smc] RIP: 0010:__list_del_entry_valid.cold+0x31/0x47 RSP: 0018:ffffb638c9c0fdd8 EFLAGS: 00010086 RAX: 0000000000000054 RBX: ffff942fb75e5128 RCX: 0000000000000000 RDX: ffff943520930aa0 RSI: ffff94352091fc80 RDI: ffff94352091fc80 RBP: 0000000000000000 R08: 0000000000000000 R09: ffffb638c9c0fc38 R10: ffffb638c9c0fc30 R11: ffffffffa015eb28 R12: 0000000000000002 R13: ffffb638c9c0fe20 R14: 0000000000000001 R15: ffff942f9cd051c0 FS: 0000000000000000(0000) GS:ffff943520900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4f25214000 CR3: 000000025fbae004 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: rwsem_down_write_slowpath+0x17e/0x470 smc_link_down_work+0x3c/0x60 [smc] process_one_work+0x1ac/0x350 worker_thread+0x49/0x2f0 ? rescuer_thread+0x360/0x360 kthread+0x118/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x1f/0x30 Fixes: 541afa10c126 ("net/smc: add smcr_port_err() and smcr_link_down() processing") Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 500952c2e67b..3b125d348b4a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1818,7 +1818,9 @@ void smcr_link_down_cond_sched(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); - schedule_work(&lnk->link_down_wrk); + smcr_link_hold(lnk); /* smcr_link_put in link_down_wrk */ + if (!schedule_work(&lnk->link_down_wrk)) + smcr_link_put(lnk); } } @@ -1850,11 +1852,14 @@ static void smc_link_down_work(struct work_struct *work) struct smc_link_group *lgr = link->lgr; if (list_empty(&lgr->list)) - return; + goto out; wake_up_all(&lgr->llc_msg_waiter); down_write(&lgr->llc_conf_mutex); smcr_link_down(link); up_write(&lgr->llc_conf_mutex); + +out: + smcr_link_put(link); /* smcr_link_hold by schedulers of link_down_work */ } static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, From 679e9ddcf90dbdf98aaaa71a492454654b627bcb Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:17 +0800 Subject: [PATCH 168/807] net/smc: check sndbuf_space again after NOSPACE flag is set in smc_poll When an application sends more data than sndbuf_space, there is a chance that the application will sleep in epoll_wait and never be woken up again. This is caused by a race between smc_poll and smc_cdc_tx_handler. application tasklet smc_tx_sendmsg(len > sndbuf_space) | epoll_wait for EPOLL_OUT,timeout=0 | smc_poll | if (!smc->conn.sndbuf_space) | | smc_cdc_tx_handler | atomic_add sndbuf_space | smc_tx_sndbuf_nonfull | if (!test_bit SOCK_NOSPACE) | do not sk_write_space; set_bit SOCK_NOSPACE; | return mask=0; | Application will sleep in epoll_wait as smc_poll returns 0. And smc_cdc_tx_handler will not call sk_write_space because SOCK_NOSPACE has not been set. 
If there is no inflight cdc msg, sk_write_space will not be called any more, and application will sleep in epoll_wait forever. So check sndbuf_space again after NOSPACE flag is set to break the race. Fixes: 8dce2786a290 ("net/smc: smc_poll improvements") Signed-off-by: Guangguan Wang Suggested-by: Paolo Abeni Signed-off-by: David S. Miller --- net/smc/af_smc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9e6c69d18581..92448f2c362c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2881,6 +2881,13 @@ __poll_t smc_poll(struct file *file, struct socket *sock, } else { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + if (sk->sk_state != SMC_INIT) { + /* Race breaker the same way as tcp_poll(). */ + smp_mb__after_atomic(); + if (atomic_read(&smc->conn.sndbuf_space)) + mask |= EPOLLOUT | EPOLLWRNORM; + } } if (atomic_read(&smc->conn.bytes_to_rcv)) mask |= EPOLLIN | EPOLLRDNORM; From a29e220d3c8edbf0e1beb0f028878a4a85966556 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:18 +0800 Subject: [PATCH 169/807] net/smc: check iparea_offset and ipv6_prefixes_cnt when receiving proposal msg When receiving proposal msg in server, the field iparea_offset and the field ipv6_prefixes_cnt in proposal msg are from the remote client and can not be fully trusted. Especially the field iparea_offset, once exceed the max value, there has the chance to access wrong address, and crash may happen. This patch checks iparea_offset and ipv6_prefixes_cnt before using them. Fixes: e7b7a64a8493 ("smc: support variable CLC proposal messages") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 6 +++++- net/smc/smc_clc.c | 4 ++++ net/smc/smc_clc.h | 6 +++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 92448f2c362c..9a74c9693f09 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2032,6 +2032,8 @@ static int smc_listen_prfx_check(struct smc_sock *new_smc, if (pclc->hdr.typev1 == SMC_TYPE_N) return 0; pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx) + return -EPROTO; if (smc_clc_prfx_match(newclcsock, pclc_prfx)) return SMC_CLC_DECL_DIFFPREFIX; @@ -2221,7 +2223,9 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, int rc = 0; /* check if ISM V1 is available */ - if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) + if (!(ini->smcd_version & SMC_V1) || + !smcd_indicated(ini->smc_type_v1) || + !pclc_smcd) goto not_found; ini->is_smcd = true; /* prepare ISM check */ ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 33fa787c28eb..66a43b97eede 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -354,6 +354,10 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) v2_ext = smc_get_clc_v2_ext(pclc); pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx || + pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) + return false; + if (hdr->version == SMC_V1) { if (hdr->typev1 == SMC_TYPE_N) return false; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 5fd6f5b8ef03..ac8de6a177fa 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -336,8 +336,12 @@ struct smc_clc_msg_decline_v2 { /* clc decline message */ static inline struct smc_clc_msg_proposal_prefix * smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) { + u16 offset = ntohs(pclc->iparea_offset); + + if (offset > sizeof(struct smc_clc_msg_smcd)) + return NULL; return (struct smc_clc_msg_proposal_prefix *) - ((u8 *)pclc + sizeof(*pclc) + 
ntohs(pclc->iparea_offset)); + ((u8 *)pclc + sizeof(*pclc) + offset); } static inline bool smcr_indicated(int smc_type) From 7863c9f3d24ba49dbead7e03dfbe40deb5888fdf Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:19 +0800 Subject: [PATCH 170/807] net/smc: check v2_ext_offset/eid_cnt/ism_gid_cnt when receiving proposal msg When receiving proposal msg in server, the fields v2_ext_offset/ eid_cnt/ism_gid_cnt in proposal msg are from the remote client and can not be fully trusted. Especially the field v2_ext_offset, once exceed the max value, there has the chance to access wrong address, and crash may happen. This patch checks the fields v2_ext_offset/eid_cnt/ism_gid_cnt before using them. Fixes: 8c3dca341aea ("net/smc: build and send V2 CLC proposal") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 3 ++- net/smc/smc_clc.c | 8 +++++++- net/smc/smc_clc.h | 8 +++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9a74c9693f09..5d96f9de5b5d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2276,7 +2276,8 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, goto not_found; smc_v2_ext = smc_get_clc_v2_ext(pclc); - if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) + if (!smc_v2_ext || + !smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) goto not_found; /* prepare RDMA check */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 66a43b97eede..f721d03efcbd 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -352,7 +352,6 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) struct smc_clc_msg_hdr *hdr = &pclc->hdr; struct smc_clc_v2_extension *v2_ext; - v2_ext = smc_get_clc_v2_ext(pclc); pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (!pclc_prfx || pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) @@ -369,6 +368,13 @@ 
static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) sizeof(struct smc_clc_msg_trail)) return false; } else { + v2_ext = smc_get_clc_v2_ext(pclc); + if ((hdr->typev2 != SMC_TYPE_N && + (!v2_ext || v2_ext->hdr.eid_cnt > SMC_CLC_MAX_UEID)) || + (smcd_indicated(hdr->typev2) && + v2_ext->hdr.ism_gid_cnt > SMCD_CLC_MAX_V2_GID_ENTRIES)) + return false; + if (ntohs(hdr->length) != sizeof(*pclc) + sizeof(struct smc_clc_msg_smcd) + diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index ac8de6a177fa..23afa4df862e 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -380,8 +380,14 @@ static inline struct smc_clc_v2_extension * smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) { struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop); + u16 max_offset; - if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset)) + max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_smcd) - + offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); + + if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset) || + ntohs(prop_smcd->v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_v2_extension *) From 9ab332deb671d8f7e66d82a2ff2b3f715bc3a4ad Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:20 +0800 Subject: [PATCH 171/807] net/smc: check smcd_v2_ext_offset when receiving proposal msg When receiving a proposal msg on the server, the field smcd_v2_ext_offset in the proposal msg comes from the remote client and cannot be fully trusted. Once the value of smcd_v2_ext_offset exceeds the max value, a wrong address may be accessed and a crash may happen. This patch checks the value of smcd_v2_ext_offset before using it. Fixes: 5c21c4ccafe8 ("net/smc: determine accepted ISM devices") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S.
Miller --- net/smc/af_smc.c | 2 ++ net/smc/smc_clc.h | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5d96f9de5b5d..6cc7b846cff1 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2147,6 +2147,8 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, pclc_smcd = smc_get_clc_msg_smcd(pclc); smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); + if (!pclc_smcd || !smc_v2_ext || !smcd_v2_ext) + goto not_found; mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) { diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 23afa4df862e..767289925410 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -400,9 +400,15 @@ smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) static inline struct smc_clc_smcd_v2_extension * smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) { + u16 max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_smcd_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_v2_extension, hdr) - + offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset); + if (!prop_v2ext) return NULL; - if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)) + if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) || + ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_smcd_v2_extension *) From c5b8ee5022a19464783058dc6042e8eefa34e8cd Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:21 +0800 Subject: [PATCH 172/807] net/smc: check return value of sock_recvmsg when draining clc data When receiving clc msg, the field length in smc_clc_msg_hdr indicates the length of msg should be received from network and the value should not be fully trusted as it is from the network. 
Once the value of length exceeds the value of buflen in function smc_clc_wait_msg, it may run into an endless loop when trying to drain the remaining data exceeding buflen. This patch checks the return value of sock_recvmsg when draining data, to avoid an endless loop while draining. Fixes: fb4f79264c0f ("net/smc: tolerate future SMCD versions") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index f721d03efcbd..521f5df80e10 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -774,6 +774,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, SMC_CLC_RECV_BUF_LEN : datlen; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); len = sock_recvmsg(smc->clcsock, &msg, krflags); + if (len < recvlen) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } datlen -= len; } if (clcm->type == SMC_CLC_DECLINE) { From 2d5df3a680ffdaf606baa10636bdb1daf757832e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 12 Dec 2024 18:55:45 +0200 Subject: [PATCH 173/807] net: mscc: ocelot: fix incorrect IFH SRC_PORT field in ocelot_ifh_set_basic() Packets injected by the CPU should have a SRC_PORT field equal to the CPU port module index in the Analyzer block (ocelot->num_phys_ports). The blamed commit copied the ocelot_ifh_set_basic() call incorrectly from ocelot_xmit_common() in net/dsa/tag_ocelot.c. Instead of calling with "x", it calls with BIT_ULL(x), but the field is not a port mask, but rather a single port index.
[ side note: this is the technical debt of code duplication :( ] The error used to be silent and doesn't appear to have other user-visible manifestations, but with new changes in the packing library, it now fails loudly as follows: ------------[ cut here ]------------ Cannot store 0x40 inside bits 46-43 - will truncate sja1105 spi2.0: xmit timed out WARNING: CPU: 1 PID: 102 at lib/packing.c:98 __pack+0x90/0x198 sja1105 spi2.0: timed out polling for tstamp CPU: 1 UID: 0 PID: 102 Comm: felix_xmit Tainted: G W N 6.13.0-rc1-00372-gf706b85d972d-dirty #2605 Call trace: __pack+0x90/0x198 (P) __pack+0x90/0x198 (L) packing+0x78/0x98 ocelot_ifh_set_basic+0x260/0x368 ocelot_port_inject_frame+0xa8/0x250 felix_port_deferred_xmit+0x14c/0x258 kthread_worker_fn+0x134/0x350 kthread+0x114/0x138 The code path pertains to the ocelot switchdev driver and to the felix secondary DSA tag protocol, ocelot-8021q. Here seen with ocelot-8021q. The messenger (packing) is not really to blame, so fix the original commit instead. 
Fixes: e1b9e80236c5 ("net: mscc: ocelot: fix QoS class for injected packets with "ocelot-8021q"") Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241212165546.879567-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 3d72aa7b1305..ef93df520887 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1432,7 +1432,7 @@ void ocelot_ifh_set_basic(void *ifh, struct ocelot *ocelot, int port, memset(ifh, 0, OCELOT_TAG_LEN); ocelot_ifh_set_bypass(ifh, 1); - ocelot_ifh_set_src(ifh, BIT_ULL(ocelot->num_phys_ports)); + ocelot_ifh_set_src(ifh, ocelot->num_phys_ports); ocelot_ifh_set_dest(ifh, BIT_ULL(port)); ocelot_ifh_set_qos_class(ifh, qos_class); ocelot_ifh_set_tag_type(ifh, tag_type); From ee76746387f6233bdfa93d7406990f923641568f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Dec 2024 17:25:18 +0000 Subject: [PATCH 174/807] netdevsim: prevent bad user input in nsim_dev_health_break_write() If either a zero count or a large one is provided, kernel can crash. 
Fixes: 82c93a87bf8b ("netdevsim: implement couple of testing devlink health reporters") Reported-by: syzbot+ea40e4294e58b0292f74@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/675c6862.050a0220.37aaf.00b1.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Jiri Pirko Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213172518.2415666-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/health.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/netdevsim/health.c b/drivers/net/netdevsim/health.c index 70e8bdf34be9..688f05316b5e 100644 --- a/drivers/net/netdevsim/health.c +++ b/drivers/net/netdevsim/health.c @@ -149,6 +149,8 @@ static ssize_t nsim_dev_health_break_write(struct file *file, char *break_msg; int err; + if (count == 0 || count > PAGE_SIZE) + return -EINVAL; break_msg = memdup_user_nul(data, count); if (IS_ERR(break_msg)) return PTR_ERR(break_msg); From 663ad7481f068057f6f692c5368c47150e855370 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 13 Dec 2024 13:07:11 +0000 Subject: [PATCH 175/807] tools/net/ynl: fix sub-message key lookup for nested attributes Use the correct attribute space for sub-message key lookup in nested attributes when adding attributes. This fixes rt_link where the "kind" key and "data" sub-message are nested attributes in "linkinfo". 
For example: ./tools/net/ynl/cli.py \ --create \ --spec Documentation/netlink/specs/rt_link.yaml \ --do newlink \ --json '{"link": 99, "linkinfo": { "kind": "vlan", "data": {"id": 4 } } }' Signed-off-by: Donald Hunter Fixes: ab463c4342d1 ("tools/net/ynl: Add support for encoding sub-messages") Link: https://patch.msgid.link/20241213130711.40267-1-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 01ec01a90e76..eea29359a899 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -556,10 +556,10 @@ class YnlFamily(SpecFamily): if attr["type"] == 'nest': nl_type |= Netlink.NLA_F_NESTED attr_payload = b'' - sub_attrs = SpaceAttrs(self.attr_sets[space], value, search_attrs) + sub_space = attr['nested-attributes'] + sub_attrs = SpaceAttrs(self.attr_sets[sub_space], value, search_attrs) for subname, subvalue in value.items(): - attr_payload += self._add_attr(attr['nested-attributes'], - subname, subvalue, sub_attrs) + attr_payload += self._add_attr(sub_space, subname, subvalue, sub_attrs) elif attr["type"] == 'flag': if not value: # If value is absent or false then skip attribute creation. From 9590d32e090ea2751e131ae5273859ca22f5ac14 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 12 Dec 2024 13:31:55 -0800 Subject: [PATCH 176/807] ionic: Fix netdev notifier unregister on failure If register_netdev() fails, then the driver leaks the netdev notifier. Fix this by calling ionic_lif_unregister() on register_netdev() failure. This will also call ionic_lif_unregister_phc() if it has already been registered. 
Fixes: 30b87ab4c0b3 ("ionic: remove lif list concept") Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241212213157.12212-2-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 40496587b2b3..3d3f936779f7 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -3869,8 +3869,8 @@ int ionic_lif_register(struct ionic_lif *lif) /* only register LIF0 for now */ err = register_netdev(lif->netdev); if (err) { - dev_err(lif->ionic->dev, "Cannot register net device, aborting\n"); - ionic_lif_unregister_phc(lif); + dev_err(lif->ionic->dev, "Cannot register net device: %d, aborting\n", err); + ionic_lif_unregister(lif); return err; } From 746e6ae2e202b062b9deee7bd86d94937997ecd7 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 12 Dec 2024 13:31:56 -0800 Subject: [PATCH 177/807] ionic: no double destroy workqueue There are some FW error handling paths that can cause us to try to destroy the workqueue more than once, so let's be sure we're checking for that. The case where this popped up was in an AER event where the handlers got called in such a way that ionic_reset_prepare() and thus ionic_dev_teardown() got called twice in a row. The second time through the workqueue was already destroyed, and destroy_workqueue() choked on the bad wq pointer. We didn't hit this in AER handler testing before because at that time we weren't using a private workqueue. Later we replaced the use of the system workqueue with our own private workqueue but hadn't rerun the AER handler testing since then. 
Fixes: 9e25450da700 ("ionic: add private workqueue per-device") Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241212213157.12212-3-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.c b/drivers/net/ethernet/pensando/ionic/ionic_dev.c index 9e42d599840d..57edcde9e6f8 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.c @@ -277,7 +277,10 @@ void ionic_dev_teardown(struct ionic *ionic) idev->phy_cmb_pages = 0; idev->cmb_npages = 0; - destroy_workqueue(ionic->wq); + if (ionic->wq) { + destroy_workqueue(ionic->wq); + ionic->wq = NULL; + } mutex_destroy(&idev->cmb_inuse_lock); } From b096d62ba1323391b2db98b7704e2468cf3b1588 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 12 Dec 2024 13:31:57 -0800 Subject: [PATCH 178/807] ionic: use ee->offset when returning sprom data Some calls into ionic_get_module_eeprom() don't use a single full buffer size, but instead multiple calls with an offset. Teach our driver to use the offset correctly so we can respond appropriately to the caller. 
Fixes: 4d03e00a2140 ("ionic: Add initial ethtool support") Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241212213157.12212-4-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index dda22fa4448c..9b7f78b6cdb1 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -961,8 +961,8 @@ static int ionic_get_module_eeprom(struct net_device *netdev, len = min_t(u32, sizeof(xcvr->sprom), ee->len); do { - memcpy(data, xcvr->sprom, len); - memcpy(tbuf, xcvr->sprom, len); + memcpy(data, &xcvr->sprom[ee->offset], len); + memcpy(tbuf, &xcvr->sprom[ee->offset], len); /* Let's make sure we got a consistent copy */ if (!memcmp(data, tbuf, len)) From 282da38b465395c930687974627c24f47ddce5ff Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 10 Dec 2024 12:35:34 +0100 Subject: [PATCH 179/807] s390/mm: Consider KMSAN modules metadata for paging levels The calculation determining whether to use three- or four-level paging didn't account for KMSAN modules metadata. Include this metadata in the virtual memory size calculation to ensure correct paging mode selection and avoiding potentially unnecessary physical memory size limitations. 
Fixes: 65ca73f9fb36 ("s390/mm: define KMSAN metadata for vmalloc and modules") Acked-by: Heiko Carstens Reviewed-by: Alexander Gordeev Reviewed-by: Ilya Leoshkevich Signed-off-by: Vasily Gorbik Signed-off-by: Alexander Gordeev --- arch/s390/boot/startup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index abe6e6c0ab98..6087d38c7235 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -234,6 +234,8 @@ static unsigned long get_vmem_size(unsigned long identity_size, vsize = round_up(SZ_2G + max_mappable, rte_size) + round_up(vmemmap_size, rte_size) + FIXMAP_SIZE + MODULES_LEN + KASLR_LEN; + if (IS_ENABLED(CONFIG_KMSAN)) + vsize += MODULES_LEN * 2; return size_add(vsize, vmalloc_size); } From 922b4b955a03d19fea98938f33ef0e62d01f5159 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Thu, 12 Dec 2024 11:25:58 +0500 Subject: [PATCH 180/807] net: renesas: rswitch: rework ts tags management The existing linked list based implementation of how ts tags are assigned and managed is unsafe against concurrency and corner cases: - element addition in tx processing can race against element removal in ts queue completion, - element removal in ts queue completion can race against element removal in device close, - if a large number of frames gets added to tx queue without ts queue completions in between, elements with duplicate tag values can get added. Use a different implementation, based on per-port used tags bitmaps and saved skb arrays. 
Safety for addition in tx processing vs removal in ts completion is provided by: tag = find_first_zero_bit(...); smp_mb(); <write rdev->ts_skb[tag]> set_bit(...); vs <read rdev->ts_skb[tag]> smp_mb(); clear_bit(...); Safety for removal in ts completion vs removal in device close is provided by using atomic read-and-clear for rdev->ts_skb[tag]: ts_skb = xchg(&rdev->ts_skb[tag], NULL); if (ts_skb) <handle it> Fixes: 33f5d733b589 ("net: renesas: rswitch: Improve TX timestamp accuracy") Signed-off-by: Nikita Yushchenko Link: https://patch.msgid.link/20241212062558.436455-1-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 68 ++++++++++++++------------ drivers/net/ethernet/renesas/rswitch.h | 13 ++--- 2 files changed, 39 insertions(+), 42 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index dbbbf024e7ab..9ac6e2aad18f 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -547,7 +547,6 @@ static int rswitch_gwca_ts_queue_alloc(struct rswitch_private *priv) desc = &gq->ts_ring[gq->ring_size]; desc->desc.die_dt = DT_LINKFIX; rswitch_desc_set_dptr(&desc->desc, gq->ring_dma); - INIT_LIST_HEAD(&priv->gwca.ts_info_list); return 0; } @@ -1003,9 +1002,10 @@ static int rswitch_gwca_request_irqs(struct rswitch_private *priv) static void rswitch_ts(struct rswitch_private *priv) { struct rswitch_gwca_queue *gq = &priv->gwca.ts_queue; - struct rswitch_gwca_ts_info *ts_info, *ts_info2; struct skb_shared_hwtstamps shhwtstamps; struct rswitch_ts_desc *desc; + struct rswitch_device *rdev; + struct sk_buff *ts_skb; struct timespec64 ts; unsigned int num; u32 tag, port; @@ -1015,23 +1015,28 @@ static void rswitch_ts(struct rswitch_private *priv) dma_rmb(); port = TS_DESC_DPN(__le32_to_cpu(desc->desc.dptrl)); + if (unlikely(port >= RSWITCH_NUM_PORTS)) + goto next; + rdev = priv->rdev[port]; + tag = TS_DESC_TSUN(__le32_to_cpu(desc->desc.dptrl)); + if (unlikely(tag >=
TS_TAGS_PER_PORT)) + goto next; + ts_skb = xchg(&rdev->ts_skb[tag], NULL); + smp_mb(); /* order rdev->ts_skb[] read before bitmap update */ + clear_bit(tag, rdev->ts_skb_used); - list_for_each_entry_safe(ts_info, ts_info2, &priv->gwca.ts_info_list, list) { - if (!(ts_info->port == port && ts_info->tag == tag)) - continue; + if (unlikely(!ts_skb)) + goto next; - memset(&shhwtstamps, 0, sizeof(shhwtstamps)); - ts.tv_sec = __le32_to_cpu(desc->ts_sec); - ts.tv_nsec = __le32_to_cpu(desc->ts_nsec & cpu_to_le32(0x3fffffff)); - shhwtstamps.hwtstamp = timespec64_to_ktime(ts); - skb_tstamp_tx(ts_info->skb, &shhwtstamps); - dev_consume_skb_irq(ts_info->skb); - list_del(&ts_info->list); - kfree(ts_info); - break; - } + memset(&shhwtstamps, 0, sizeof(shhwtstamps)); + ts.tv_sec = __le32_to_cpu(desc->ts_sec); + ts.tv_nsec = __le32_to_cpu(desc->ts_nsec & cpu_to_le32(0x3fffffff)); + shhwtstamps.hwtstamp = timespec64_to_ktime(ts); + skb_tstamp_tx(ts_skb, &shhwtstamps); + dev_consume_skb_irq(ts_skb); +next: gq->cur = rswitch_next_queue_index(gq, true, 1); desc = &gq->ts_ring[gq->cur]; } @@ -1576,8 +1581,9 @@ static int rswitch_open(struct net_device *ndev) static int rswitch_stop(struct net_device *ndev) { struct rswitch_device *rdev = netdev_priv(ndev); - struct rswitch_gwca_ts_info *ts_info, *ts_info2; + struct sk_buff *ts_skb; unsigned long flags; + unsigned int tag; netif_tx_stop_all_queues(ndev); @@ -1594,12 +1600,13 @@ static int rswitch_stop(struct net_device *ndev) if (bitmap_empty(rdev->priv->opened_ports, RSWITCH_NUM_PORTS)) iowrite32(GWCA_TS_IRQ_BIT, rdev->priv->addr + GWTSDID); - list_for_each_entry_safe(ts_info, ts_info2, &rdev->priv->gwca.ts_info_list, list) { - if (ts_info->port != rdev->port) - continue; - dev_kfree_skb_irq(ts_info->skb); - list_del(&ts_info->list); - kfree(ts_info); + for (tag = find_first_bit(rdev->ts_skb_used, TS_TAGS_PER_PORT); + tag < TS_TAGS_PER_PORT; + tag = find_next_bit(rdev->ts_skb_used, TS_TAGS_PER_PORT, tag + 1)) { + ts_skb = 
xchg(&rdev->ts_skb[tag], NULL); + clear_bit(tag, rdev->ts_skb_used); + if (ts_skb) + dev_kfree_skb(ts_skb); } return 0; @@ -1612,20 +1619,17 @@ static bool rswitch_ext_desc_set_info1(struct rswitch_device *rdev, desc->info1 = cpu_to_le64(INFO1_DV(BIT(rdev->etha->index)) | INFO1_IPV(GWCA_IPV_NUM) | INFO1_FMT); if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) { - struct rswitch_gwca_ts_info *ts_info; + unsigned int tag; - ts_info = kzalloc(sizeof(*ts_info), GFP_ATOMIC); - if (!ts_info) + tag = find_first_zero_bit(rdev->ts_skb_used, TS_TAGS_PER_PORT); + if (tag == TS_TAGS_PER_PORT) return false; + smp_mb(); /* order bitmap read before rdev->ts_skb[] write */ + rdev->ts_skb[tag] = skb_get(skb); + set_bit(tag, rdev->ts_skb_used); skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; - rdev->ts_tag++; - desc->info1 |= cpu_to_le64(INFO1_TSUN(rdev->ts_tag) | INFO1_TXC); - - ts_info->skb = skb_get(skb); - ts_info->port = rdev->port; - ts_info->tag = rdev->ts_tag; - list_add_tail(&ts_info->list, &rdev->priv->gwca.ts_info_list); + desc->info1 |= cpu_to_le64(INFO1_TSUN(tag) | INFO1_TXC); skb_tx_timestamp(skb); } diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index e020800dcc57..d8d4ed7d7f8b 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -972,14 +972,6 @@ struct rswitch_gwca_queue { }; }; -struct rswitch_gwca_ts_info { - struct sk_buff *skb; - struct list_head list; - - int port; - u8 tag; -}; - #define RSWITCH_NUM_IRQ_REGS (RSWITCH_MAX_NUM_QUEUES / BITS_PER_TYPE(u32)) struct rswitch_gwca { unsigned int index; @@ -989,7 +981,6 @@ struct rswitch_gwca { struct rswitch_gwca_queue *queues; int num_queues; struct rswitch_gwca_queue ts_queue; - struct list_head ts_info_list; DECLARE_BITMAP(used, RSWITCH_MAX_NUM_QUEUES); u32 tx_irq_bits[RSWITCH_NUM_IRQ_REGS]; u32 rx_irq_bits[RSWITCH_NUM_IRQ_REGS]; @@ -997,6 +988,7 @@ struct rswitch_gwca { }; #define NUM_QUEUES_PER_NDEV 2 +#define 
TS_TAGS_PER_PORT 256 struct rswitch_device { struct rswitch_private *priv; struct net_device *ndev; @@ -1004,7 +996,8 @@ struct rswitch_device { void __iomem *addr; struct rswitch_gwca_queue *tx_queue; struct rswitch_gwca_queue *rx_queue; - u8 ts_tag; + struct sk_buff *ts_skb[TS_TAGS_PER_PORT]; + DECLARE_BITMAP(ts_skb_used, TS_TAGS_PER_PORT); bool disabled; int port; From 900f83cf376bdaf798b6f5dcb2eae0c822e908b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thi=C3=A9baud=20Weksteen?= Date: Thu, 5 Dec 2024 12:09:19 +1100 Subject: [PATCH 181/807] selinux: ignore unknown extended permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When evaluating extended permissions, ignore unknown permissions instead of calling BUG(). This commit ensures that future permissions can be added without interfering with older kernels. Cc: stable@vger.kernel.org Fixes: fa1aa143ac4a ("selinux: extended permissions for ioctls") Signed-off-by: Thiébaud Weksteen Signed-off-by: Paul Moore --- security/selinux/ss/services.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 971c45d576ba..3d5c563cfc4c 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -979,7 +979,10 @@ void services_compute_xperms_decision(struct extended_perms_decision *xpermd, return; break; default: - BUG(); + pr_warn_once( + "SELinux: unknown extended permission (%u) will be ignored\n", + node->datum.u.xperms->specified); + return; } if (node->key.specified == AVTAB_XPERMS_ALLOWED) { @@ -998,7 +1001,8 @@ void services_compute_xperms_decision(struct extended_perms_decision *xpermd, &node->datum.u.xperms->perms, xpermd->dontaudit); } else { - BUG(); + pr_warn_once("SELinux: unknown specified key (%u)\n", + node->key.specified); } } From 83c47d9e0ce79b5d7c0b21b9f35402dbde0fa15c Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Sat, 14 Dec 2024 12:16:45 
+0900 Subject: [PATCH 182/807] ksmbd: count all requests in req_running counter This changes the semantics of req_running to count all in-flight requests on a given connection, rather than the number of elements in the conn->requests list. The latter is used only in smb2_cancel, and the counter is not used. Signed-off-by: Marios Makassikis Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index c14dd72e1b30..be9656d524a4 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -120,8 +120,8 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work) if (conn->ops->get_cmd_val(work) != SMB2_CANCEL_HE) requests_queue = &conn->requests; + atomic_inc(&conn->req_running); if (requests_queue) { - atomic_inc(&conn->req_running); spin_lock(&conn->request_lock); list_add_tail(&work->request_entry, requests_queue); spin_unlock(&conn->request_lock); @@ -132,11 +132,12 @@ void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; + atomic_dec(&conn->req_running); + if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) return; - atomic_dec(&conn->req_running); spin_lock(&conn->request_lock); list_del_init(&work->request_entry); spin_unlock(&conn->request_lock); From 43fb7bce8866e793275c4f9f25af6a37745f3416 Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Sat, 14 Dec 2024 12:17:23 +0900 Subject: [PATCH 183/807] ksmbd: fix broken transfers when exceeding max simultaneous operations Since commit 0a77d947f599 ("ksmbd: check outstanding simultaneous SMB operations"), ksmbd enforces a maximum number of simultaneous operations for a connection. The problem is that reaching the limit causes ksmbd to close the socket, and the client has no indication that it should have slowed down.
This behaviour can be reproduced by setting "smb2 max credits = 128" (or lower), and transferring a large file (25GB). smbclient fails as below: $ smbclient //192.168.1.254/testshare -U user%pass smb: \> put file.bin cli_push returned NT_STATUS_USER_SESSION_DELETED putting file file.bin as \file.bin smb2cli_req_compound_submit: Insufficient credits. 0 available, 1 needed NT_STATUS_INTERNAL_ERROR closing remote file \file.bin smb: \> smb2cli_req_compound_submit: Insufficient credits. 0 available, 1 needed Windows clients fail with 0x8007003b (with smaller files even). Fix this by delaying reading from the socket until there's room to allocate a request. This effectively applies backpressure on the client, so the transfer completes, albeit at a slower rate. Fixes: 0a77d947f599 ("ksmbd: check outstanding simultaneous SMB operations") Signed-off-by: Marios Makassikis Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 13 +++++++++++-- fs/smb/server/connection.h | 1 - fs/smb/server/server.c | 7 +------ fs/smb/server/server.h | 1 + fs/smb/server/transport_ipc.c | 5 ++++- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index be9656d524a4..f8a40f65db6a 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -70,7 +70,6 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); atomic_set(&conn->refcnt, 1); - atomic_set(&conn->mux_smb_requests, 0); conn->total_credits = 1; conn->outstanding_credits = 0; @@ -133,6 +132,8 @@ void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; atomic_dec(&conn->req_running); + if (waitqueue_active(&conn->req_running_q)) + wake_up(&conn->req_running_q); if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) @@ -309,7 +310,7 @@ int ksmbd_conn_handler_loop(void *p) { struct ksmbd_conn *conn = 
(struct ksmbd_conn *)p; struct ksmbd_transport *t = conn->transport; - unsigned int pdu_size, max_allowed_pdu_size; + unsigned int pdu_size, max_allowed_pdu_size, max_req; char hdr_buf[4] = {0,}; int size; @@ -319,6 +320,7 @@ int ksmbd_conn_handler_loop(void *p) if (t->ops->prepare && t->ops->prepare(t)) goto out; + max_req = server_conf.max_inflight_req; conn->last_active = jiffies; set_freezable(); while (ksmbd_conn_alive(conn)) { @@ -328,6 +330,13 @@ int ksmbd_conn_handler_loop(void *p) kvfree(conn->request_buf); conn->request_buf = NULL; +recheck: + if (atomic_read(&conn->req_running) + 1 > max_req) { + wait_event_interruptible(conn->req_running_q, + atomic_read(&conn->req_running) < max_req); + goto recheck; + } + size = t->ops->read(t, hdr_buf, sizeof(hdr_buf), -1); if (size != sizeof(hdr_buf)) break; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 8ddd5a3c7baf..b379ae4fdcdf 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -107,7 +107,6 @@ struct ksmbd_conn { __le16 signing_algorithm; bool binding; atomic_t refcnt; - atomic_t mux_smb_requests; }; struct ksmbd_conn_ops { diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 3ba95bd8edeb..601e7fcbcf1e 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -270,7 +270,6 @@ static void handle_ksmbd_work(struct work_struct *wk) ksmbd_conn_try_dequeue_request(work); ksmbd_free_work_struct(work); - atomic_dec(&conn->mux_smb_requests); /* * Checking waitqueue to dropping pending requests on * disconnection. 
waitqueue_active is safe because it @@ -300,11 +299,6 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) if (err) return 0; - if (atomic_inc_return(&conn->mux_smb_requests) >= conn->vals->max_credits) { - atomic_dec_return(&conn->mux_smb_requests); - return -ENOSPC; - } - work = ksmbd_alloc_work_struct(); if (!work) { pr_err("allocation for work failed\n"); @@ -367,6 +361,7 @@ static int server_conf_init(void) server_conf.auth_mechs |= KSMBD_AUTH_KRB5 | KSMBD_AUTH_MSKRB5; #endif + server_conf.max_inflight_req = SMB2_MAX_CREDITS; return 0; } diff --git a/fs/smb/server/server.h b/fs/smb/server/server.h index 4fc529335271..94187628ff08 100644 --- a/fs/smb/server/server.h +++ b/fs/smb/server/server.h @@ -42,6 +42,7 @@ struct ksmbd_server_config { struct smb_sid domain_sid; unsigned int auth_mechs; unsigned int max_connections; + unsigned int max_inflight_req; char *conf[SERVER_CONF_WORK_GROUP + 1]; struct task_struct *dh_task; diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 48cda3350e5a..befaf42b84cc 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -319,8 +319,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_write_size(req->smb2_max_write); if (req->smb2_max_trans) init_smb2_max_trans_size(req->smb2_max_trans); - if (req->smb2_max_credits) + if (req->smb2_max_credits) { init_smb2_max_credits(req->smb2_max_credits); + server_conf.max_inflight_req = + req->smb2_max_credits; + } if (req->smbd_max_io_size) init_smbd_max_io_size(req->smbd_max_io_size); From fe4ed2f09b492e3507615a053814daa8fafdecb1 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 14 Dec 2024 12:19:03 +0900 Subject: [PATCH 184/807] ksmbd: conn lock to serialize smb2 negotiate If a client sends parallel smb2 negotiate requests on the same connection, ksmbd_conn can be racy. smb2 negotiate handling that is not performance-related can be serialized with the conn lock.
Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 803b35b89513..23e21845f928 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1097,6 +1097,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) return rc; } + ksmbd_conn_lock(conn); smb2_buf_len = get_rfc1002_len(work->request_buf); smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects); if (smb2_neg_size > smb2_buf_len) { @@ -1247,6 +1248,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_conn_set_need_negotiate(conn); err_out: + ksmbd_conn_unlock(conn); if (rc) rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; From 24740385cb0d6d22ab7fa7adf36546d5b3cdcf73 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Fri, 15 Nov 2024 11:54:40 +0200 Subject: [PATCH 185/807] thunderbolt: Improve redrive mode handling When USB-C monitor is connected directly to Intel Barlow Ridge host, it goes into "redrive" mode that basically routes the DisplayPort signals directly from the GPU to the USB-C monitor without any tunneling needed. However, the host router must be powered on for this to work. Aaron reported that there are a couple of cases where this will not work with the current code: - Booting with USB-C monitor plugged in. - Plugging in USB-C monitor when the host router is in sleep state (runtime suspended). - Plugging in USB-C device while the system is in system sleep state. In all these cases once the host router is runtime suspended the picture on the connected USB-C display disappears too. This is certainly not what the user expected. For this reason improve the redrive mode handling to keep the host router from runtime suspending when we detect that any of the above cases is happening.
Fixes: a75e0684efe5 ("thunderbolt: Keep the domain powered when USB4 port is in redrive mode") Reported-by: Aaron Rainbolt Closes: https://lore.kernel.org/linux-usb/20241009220118.70bfedd0@kf-ir16/ Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/tb.c | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c index 4f777788e917..a7c6919fbf97 100644 --- a/drivers/thunderbolt/tb.c +++ b/drivers/thunderbolt/tb.c @@ -2059,6 +2059,37 @@ static void tb_exit_redrive(struct tb_port *port) } } +static void tb_switch_enter_redrive(struct tb_switch *sw) +{ + struct tb_port *port; + + tb_switch_for_each_port(sw, port) + tb_enter_redrive(port); +} + +/* + * Called during system and runtime suspend to forcefully exit redrive + * mode without querying whether the resource is available. + */ +static void tb_switch_exit_redrive(struct tb_switch *sw) +{ + struct tb_port *port; + + if (!(sw->quirks & QUIRK_KEEP_POWER_IN_DP_REDRIVE)) + return; + + tb_switch_for_each_port(sw, port) { + if (!tb_port_is_dpin(port)) + continue; + + if (port->redrive) { + port->redrive = false; + pm_runtime_put(&sw->dev); + tb_port_dbg(port, "exit redrive mode\n"); + } + } +} + static void tb_dp_resource_unavailable(struct tb *tb, struct tb_port *port) { struct tb_port *in, *out; @@ -2909,6 +2940,7 @@ static int tb_start(struct tb *tb, bool reset) tb_create_usb3_tunnels(tb->root_switch); /* Add DP IN resources for the root switch */ tb_add_dp_resources(tb->root_switch); + tb_switch_enter_redrive(tb->root_switch); /* Make the discovered switches available to the userspace */ device_for_each_child(&tb->root_switch->dev, NULL, tb_scan_finalize_switch); @@ -2924,6 +2956,7 @@ static int tb_suspend_noirq(struct tb *tb) tb_dbg(tb, "suspending...\n"); tb_disconnect_and_release_dp(tb); + tb_switch_exit_redrive(tb->root_switch); tb_switch_suspend(tb->root_switch, false); tcm->hotplug_active = false; 
/* signal tb_handle_hotplug to quit */ tb_dbg(tb, "suspend finished\n"); @@ -3016,6 +3049,7 @@ static int tb_resume_noirq(struct tb *tb) tb_dbg(tb, "tunnels restarted, sleeping for 100ms\n"); msleep(100); } + tb_switch_enter_redrive(tb->root_switch); /* Allow tb_handle_hotplug to progress events */ tcm->hotplug_active = true; tb_dbg(tb, "resume finished\n"); @@ -3079,6 +3113,12 @@ static int tb_runtime_suspend(struct tb *tb) struct tb_cm *tcm = tb_priv(tb); mutex_lock(&tb->lock); + /* + * The below call only releases DP resources to allow exiting and + * re-entering redrive mode. + */ + tb_disconnect_and_release_dp(tb); + tb_switch_exit_redrive(tb->root_switch); tb_switch_suspend(tb->root_switch, true); tcm->hotplug_active = false; mutex_unlock(&tb->lock); @@ -3110,6 +3150,7 @@ static int tb_runtime_resume(struct tb *tb) tb_restore_children(tb->root_switch); list_for_each_entry_safe(tunnel, n, &tcm->tunnel_list, list) tb_tunnel_restart(tunnel); + tb_switch_enter_redrive(tb->root_switch); tcm->hotplug_active = true; mutex_unlock(&tb->lock); From a60b990798eb17433d0283788280422b1bd94b18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 14 Dec 2024 12:50:18 +0100 Subject: [PATCH 186/807] PCI/MSI: Handle lack of irqdomain gracefully Alexandre observed a warning emitted from pci_msi_setup_msi_irqs() on a RISCV platform which does not provide PCI/MSI support: WARNING: CPU: 1 PID: 1 at drivers/pci/msi/msi.h:121 pci_msi_setup_msi_irqs+0x2c/0x32 __pci_enable_msix_range+0x30c/0x596 pci_msi_setup_msi_irqs+0x2c/0x32 pci_alloc_irq_vectors_affinity+0xb8/0xe2 RISCV uses hierarchical interrupt domains and correctly does not implement the legacy fallback. The warning triggers from the legacy fallback stub. That warning is bogus as the PCI/MSI layer knows whether a PCI/MSI parent domain is associated with the device or not. There is a check for MSI-X, which has a legacy assumption. 
But that legacy fallback assumption is only valid when legacy support is enabled; otherwise the check should simply return -ENOTSUPP. Loongarch tripped over the same problem and blindly enabled legacy support without implementing the legacy fallbacks. There are weak implementations which return an error, so the problem was papered over. Correct pci_msi_domain_supports() to evaluate the legacy mode and add the missing supported check into the MSI enable path to complete it. Fixes: d2a463b29741 ("PCI/MSI: Reject multi-MSI early") Reported-by: Alexandre Ghiti Signed-off-by: Thomas Gleixner Tested-by: Alexandre Ghiti Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/87ed2a8ow5.ffs@tglx --- drivers/pci/msi/irqdomain.c | 7 +++++-- drivers/pci/msi/msi.c | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 569125726b3e..d7ba8795d60f 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -350,8 +350,11 @@ bool pci_msi_domain_supports(struct pci_dev *pdev, unsigned int feature_mask, domain = dev_get_msi_domain(&pdev->dev); - if (!domain || !irq_domain_is_hierarchy(domain)) - return mode == ALLOW_LEGACY; + if (!domain || !irq_domain_is_hierarchy(domain)) { + if (IS_ENABLED(CONFIG_PCI_MSI_ARCH_FALLBACKS)) + return mode == ALLOW_LEGACY; + return false; + } if (!irq_domain_is_msi_parent(domain)) { /* diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index 3a45879d85db..2f647cac4cae 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -433,6 +433,10 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, if (WARN_ON_ONCE(dev->msi_enabled)) return -EINVAL; + /* Test for the availability of MSI support */ + if (!pci_msi_domain_supports(dev, 0, ALLOW_LEGACY)) + return -ENOTSUPP; + nvec = pci_msi_vec_count(dev); if (nvec < 0) return nvec; From 88438444fdddd0244c8b2697713adcca3e71599e Mon Sep 17 00:00:00 2001 From: Venkata
Prasad Potturu Date: Fri, 13 Dec 2024 11:41:46 +0530 Subject: [PATCH 187/807] ASoC: amd: ps: Fix for enabling DMIC on acp63 platform via _DSD entry Add condition check to register ACP PDM sound card by reading _WOV acpi entry. Fixes: 0386d765f27a ("ASoC: amd: ps: refactor acp device configuration read logic") Signed-off-by: Venkata Prasad Potturu Link: https://patch.msgid.link/20241213061147.1060451-1-venkataprasad.potturu@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/ps/pci-ps.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/sound/soc/amd/ps/pci-ps.c b/sound/soc/amd/ps/pci-ps.c index 823a69bf778b..4575326d0635 100644 --- a/sound/soc/amd/ps/pci-ps.c +++ b/sound/soc/amd/ps/pci-ps.c @@ -375,11 +375,18 @@ static int get_acp63_device_config(struct pci_dev *pci, struct acp63_dev_data *a { struct acpi_device *pdm_dev; const union acpi_object *obj; + acpi_handle handle; + acpi_integer dmic_status; u32 config; bool is_dmic_dev = false; bool is_sdw_dev = false; + bool wov_en, dmic_en; int ret; + /* IF WOV entry not found, enable dmic based on acp-audio-device-type entry*/ + wov_en = true; + dmic_en = false; + config = readl(acp_data->acp63_base + ACP_PIN_CONFIG); switch (config) { case ACP_CONFIG_4: @@ -412,10 +419,18 @@ static int get_acp63_device_config(struct pci_dev *pci, struct acp63_dev_data *a if (!acpi_dev_get_property(pdm_dev, "acp-audio-device-type", ACPI_TYPE_INTEGER, &obj) && obj->integer.value == ACP_DMIC_DEV) - is_dmic_dev = true; + dmic_en = true; } + + handle = ACPI_HANDLE(&pci->dev); + ret = acpi_evaluate_integer(handle, "_WOV", NULL, &dmic_status); + if (!ACPI_FAILURE(ret)) + wov_en = dmic_status; } + if (dmic_en && wov_en) + is_dmic_dev = true; + if (acp_data->is_sdw_config) { ret = acp_scan_sdw_devices(&pci->dev, ACP63_SDW_ADDR); if (!ret && acp_data->info.link_mask) From 7b00af2c5414dc01e0718deef7ead81102867636 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 16 Dec 2024 20:53:08 +0800 Subject: [PATCH 
188/807] erofs: use `struct erofs_device_info` for the primary device Instead of just listing each one directly in `struct erofs_sb_info` except that we still use `sb->s_bdev` for the primary block device. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241216125310.930933-2-hsiangkao@linux.alibaba.com --- fs/erofs/data.c | 12 ++++-------- fs/erofs/fscache.c | 6 +++--- fs/erofs/internal.h | 8 ++------ fs/erofs/super.c | 27 +++++++++++++-------------- 4 files changed, 22 insertions(+), 31 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1c49f8962021..622017c65958 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -56,10 +56,10 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) buf->file = NULL; if (erofs_is_fileio_mode(sbi)) { - buf->file = sbi->fdev; /* some fs like FUSE needs it */ + buf->file = sbi->dif0.file; /* some fs like FUSE needs it */ buf->mapping = buf->file->f_mapping; } else if (erofs_is_fscache_mode(sb)) - buf->mapping = sbi->s_fscache->inode->i_mapping; + buf->mapping = sbi->dif0.fscache->inode->i_mapping; else buf->mapping = sb->s_bdev->bd_mapping; } @@ -201,12 +201,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) erofs_off_t startoff, length; int id; - map->m_bdev = sb->s_bdev; - map->m_daxdev = EROFS_SB(sb)->dax_dev; - map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; - map->m_fscache = EROFS_SB(sb)->s_fscache; - map->m_fp = EROFS_SB(sb)->fdev; - + erofs_fill_from_devinfo(map, &EROFS_SB(sb)->dif0); + map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); dif = idr_find(&devs->tree, map->m_deviceid - 1); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index fda16eedafb5..ce7e38c82719 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -657,7 +657,7 @@ int erofs_fscache_register_fs(struct super_block *sb) if (IS_ERR(fscache)) return PTR_ERR(fscache); - sbi->s_fscache = fscache; + 
sbi->dif0.fscache = fscache; return 0; } @@ -665,14 +665,14 @@ void erofs_fscache_unregister_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - erofs_fscache_unregister_cookie(sbi->s_fscache); + erofs_fscache_unregister_cookie(sbi->dif0.fscache); if (sbi->domain) erofs_fscache_domain_put(sbi->domain); else fscache_relinquish_volume(sbi->volume, NULL, false); - sbi->s_fscache = NULL; + sbi->dif0.fscache = NULL; sbi->volume = NULL; sbi->domain = NULL; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1c847c30a918..3e8d71d516f4 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -107,6 +107,7 @@ struct erofs_xattr_prefix_item { }; struct erofs_sb_info { + struct erofs_device_info dif0; struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ @@ -124,13 +125,9 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ - struct file *fdev; struct inode *packed_inode; struct erofs_dev_context *devs; - struct dax_device *dax_dev; - u64 dax_part_off; u64 total_blocks; - u32 primarydevice_blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR @@ -166,7 +163,6 @@ struct erofs_sb_info { /* fscache support */ struct fscache_volume *volume; - struct erofs_fscache *s_fscache; struct erofs_domain *domain; char *fsid; char *domain_id; @@ -187,7 +183,7 @@ struct erofs_sb_info { static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) { - return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev; + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file; } static inline bool erofs_is_fscache_mode(struct super_block *sb) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index de8e3ecc6381..9044907354e1 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -203,7 +203,7 @@ static int erofs_scan_devices(struct super_block *sb, struct erofs_device_info *dif; int id, err = 0; - sbi->total_blocks = 
sbi->primarydevice_blocks; + sbi->total_blocks = sbi->dif0.blocks; if (!erofs_sb_has_device_table(sbi)) ondisk_extradevs = 0; else @@ -307,7 +307,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); + sbi->dif0.blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -602,9 +602,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -EINVAL; } - sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off, - NULL, NULL); + sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev, + &sbi->dif0.dax_part_off, NULL, NULL); } err = erofs_read_superblock(sb); @@ -627,7 +626,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) } if (test_opt(&sbi->opt, DAX_ALWAYS)) { - if (!sbi->dax_dev) { + if (!sbi->dif0.dax_dev) { errorfc(fc, "DAX unsupported by block device. 
Turning off DAX."); clear_opt(&sbi->opt, DAX_ALWAYS); } else if (sbi->blkszbits != PAGE_SHIFT) { @@ -707,14 +706,13 @@ static int erofs_fc_get_tree(struct fs_context *fc) if (!fc->source) return invalf(fc, "No source specified"); - file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); if (IS_ERR(file)) return PTR_ERR(file); - sbi->fdev = file; + sbi->dif0.file = file; - if (S_ISREG(file_inode(sbi->fdev)->i_mode) && - sbi->fdev->f_mapping->a_ops->read_folio) + if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) && + sbi->dif0.file->f_mapping->a_ops->read_folio) return get_tree_nodev(fc, erofs_fc_fill_super); } #endif @@ -771,8 +769,8 @@ static void erofs_sb_free(struct erofs_sb_info *sbi) erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree(sbi->domain_id); - if (sbi->fdev) - fput(sbi->fdev); + if (sbi->dif0.file) + fput(sbi->dif0.file); kfree(sbi); } @@ -817,11 +815,12 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev) + if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || + sbi->dif0.file) kill_anon_super(sb); else kill_block_super(sb); - fs_put_dax(sbi->dax_dev, NULL); + fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); sb->s_fs_info = NULL; From f8d920a402aec3482931cb5f1539ed438740fc49 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 13 Dec 2024 07:54:01 +0800 Subject: [PATCH 189/807] erofs: reference `struct erofs_device_info` for erofs_map_dev Record `m_sb` and `m_dif` to replace `m_fscache`, `m_daxdev`, `m_fp` and `m_dax_part_off` in order to simplify the codebase. Note that `m_bdev` is still left since it can be assigned from `sb->s_bdev` directly. 
Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212235401.2857246-1-hsiangkao@linux.alibaba.com --- fs/erofs/data.c | 26 ++++++++++---------------- fs/erofs/fileio.c | 2 +- fs/erofs/fscache.c | 4 ++-- fs/erofs/internal.h | 6 ++---- 4 files changed, 15 insertions(+), 23 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 622017c65958..0cd6b5c4df98 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -179,19 +179,13 @@ out: } static void erofs_fill_from_devinfo(struct erofs_map_dev *map, - struct erofs_device_info *dif) + struct super_block *sb, struct erofs_device_info *dif) { + map->m_sb = sb; + map->m_dif = dif; map->m_bdev = NULL; - map->m_fp = NULL; - if (dif->file) { - if (S_ISBLK(file_inode(dif->file)->i_mode)) - map->m_bdev = file_bdev(dif->file); - else - map->m_fp = dif->file; - } - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) + map->m_bdev = file_bdev(dif->file); } int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) @@ -201,7 +195,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) erofs_off_t startoff, length; int id; - erofs_fill_from_devinfo(map, &EROFS_SB(sb)->dif0); + erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); @@ -215,7 +209,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - erofs_fill_from_devinfo(map, dif); + erofs_fill_from_devinfo(map, sb, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); @@ -228,7 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - erofs_fill_from_devinfo(map, dif); + 
erofs_fill_from_devinfo(map, sb, dif); break; } } @@ -298,7 +292,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->offset = map.m_la; if (flags & IOMAP_DAX) - iomap->dax_dev = mdev.m_daxdev; + iomap->dax_dev = mdev.m_dif->dax_dev; else iomap->bdev = mdev.m_bdev; iomap->length = map.m_llen; @@ -327,7 +321,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_MAPPED; iomap->addr = mdev.m_pa; if (flags & IOMAP_DAX) - iomap->addr += mdev.m_dax_part_off; + iomap->addr += mdev.m_dif->dax_part_off; } return 0; } diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 3af96b1e2c2a..a61b8faec651 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -67,7 +67,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) GFP_KERNEL | __GFP_NOFAIL); bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); - rq->iocb.ki_filp = mdev->m_fp; + rq->iocb.ki_filp = mdev->m_dif->file; return rq; } diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index ce7e38c82719..ce3d8737df85 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -198,7 +198,7 @@ struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL); bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ); - io->io.private = mdev->m_fscache->cookie; + io->io.private = mdev->m_dif->fscache->cookie; io->io.end_io = erofs_fscache_bio_endio; refcount_set(&io->io.ref, 1); return &io->bio; @@ -316,7 +316,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) if (!io) return -ENOMEM; iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count); - ret = erofs_fscache_read_io_async(mdev.m_fscache->cookie, + ret = erofs_fscache_read_io_async(mdev.m_dif->fscache->cookie, mdev.m_pa + (pos - map.m_la), io); erofs_fscache_req_io_put(io); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 
3e8d71d516f4..7cc8e1be04e8 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -353,11 +353,9 @@ enum { }; struct erofs_map_dev { - struct erofs_fscache *m_fscache; + struct super_block *m_sb; + struct erofs_device_info *m_dif; struct block_device *m_bdev; - struct dax_device *m_daxdev; - struct file *m_fp; - u64 m_dax_part_off; erofs_off_t m_pa; unsigned int m_deviceid; From 6422cde1b0d5a31b206b263417c1c2b3c80fe82c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Dec 2024 21:43:36 +0800 Subject: [PATCH 190/807] erofs: use buffered I/O for file-backed mounts by default For many use cases (e.g. container images are just fetched from remote), performance will be impacted if underlay page cache is up-to-date but direct i/o flushes dirty pages first. Instead, let's use buffered I/O by default to keep in sync with loop devices and add a (re)mount option to explicitly give a try to use direct I/O if supported by the underlying files. The container startup time is improved as below: [workload] docker.io/library/workpress:latest unpack 1st run non-1st runs EROFS snapshotter buffered I/O file 4.586404265s 0.308s 0.198s EROFS snapshotter direct I/O file 4.581742849s 2.238s 0.222s EROFS snapshotter loop 4.596023152s 0.346s 0.201s Overlayfs snapshotter 5.382851037s 0.206s 0.214s Fixes: fb176750266a ("erofs: add file-backed mount support") Cc: Derek McGowan Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212134336.2059899-1-hsiangkao@linux.alibaba.com --- fs/erofs/fileio.c | 7 +++++-- fs/erofs/internal.h | 1 + fs/erofs/super.c | 23 +++++++++++++++-------- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index a61b8faec651..33f8539dda4a 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -9,6 +9,7 @@ struct erofs_fileio_rq { struct bio_vec bvecs[BIO_MAX_VECS]; struct bio bio; struct kiocb iocb; + struct super_block *sb; }; struct erofs_fileio { @@ -52,8 +53,9 @@ static 
void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; rq->iocb.ki_ioprio = get_current_ioprio(); rq->iocb.ki_complete = erofs_fileio_ki_complete; - rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) ? - IOCB_DIRECT : 0; + if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) && + rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) + rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); @@ -68,6 +70,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); rq->iocb.ki_filp = mdev->m_dif->file; + rq->sb = mdev->m_sb; return rq; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 7cc8e1be04e8..686d835eb533 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -176,6 +176,7 @@ struct erofs_sb_info { #define EROFS_MOUNT_POSIX_ACL 0x00000020 #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 +#define EROFS_MOUNT_DIRECT_IO 0x00000100 #define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 9044907354e1..f5956474bfde 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -364,14 +364,8 @@ static void erofs_default_options(struct erofs_sb_info *sbi) } enum { - Opt_user_xattr, - Opt_acl, - Opt_cache_strategy, - Opt_dax, - Opt_dax_enum, - Opt_device, - Opt_fsid, - Opt_domain_id, + Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_err }; @@ -398,6 +392,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), fsparam_string("domain_id", 
Opt_domain_id), + fsparam_flag_no("directio", Opt_directio), {} }; @@ -511,6 +506,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); break; #endif + case Opt_directio: +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (result.boolean) + set_opt(&sbi->opt, DIRECT_IO); + else + clear_opt(&sbi->opt, DIRECT_IO); +#else + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); +#endif + break; default: return -ENOPARAM; } @@ -948,6 +953,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); + if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO)) + seq_puts(seq, ",directio"); #ifdef CONFIG_EROFS_FS_ONDEMAND if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); From 38651476e46e088598354510502c383e932e2297 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 11 Dec 2024 14:09:27 +0530 Subject: [PATCH 191/807] RDMA/bnxt_re: Fix the check for 9060 condition The check for 9060 condition should only be made for legacy chips. 
Fixes: 9152e0b722b2 ("RDMA/bnxt_re: HW workarounds for handling specific conditions") Reviewed-by: Kashyap Desai Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-2-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 72f35070f671..093bfb748cdf 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2669,10 +2669,12 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, bnxt_qplib_add_flush_qp(qp); } else { /* Before we complete, do WA 9060 */ - if (do_wa9060(qp, cq, cq_cons, sq->swq_last, - cqe_sq_cons)) { - *lib_qp = qp; - goto out; + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->cctx)) { + if (do_wa9060(qp, cq, cq_cons, sq->swq_last, + cqe_sq_cons)) { + *lib_qp = qp; + goto out; + } } if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) { cqe->status = CQ_REQ_STATUS_OK; From 798653a0ee30d3cd495099282751c0f248614ae7 Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Wed, 11 Dec 2024 14:09:28 +0530 Subject: [PATCH 192/807] RDMA/bnxt_re: Add check for path mtu in modify_qp When RDMA app configures path MTU, add a check in modify_qp verb to make sure that it doesn't go beyond interface MTU. If this check fails, driver will fail the modify_qp verb. 
Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Reviewed-by: Kalesh AP Signed-off-by: Saravanan Vajravel Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-3-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 26 +++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 215074c0860b..a609e1635a3d 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2162,18 +2162,20 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, } } - if (qp_attr_mask & IB_QP_PATH_MTU) { - qp->qplib_qp.modify_flags |= - CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; - qp->qplib_qp.path_mtu = __from_ib_mtu(qp_attr->path_mtu); - qp->qplib_qp.mtu = ib_mtu_enum_to_int(qp_attr->path_mtu); - } else if (qp_attr->qp_state == IB_QPS_RTR) { - qp->qplib_qp.modify_flags |= - CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; - qp->qplib_qp.path_mtu = - __from_ib_mtu(iboe_get_mtu(rdev->netdev->mtu)); - qp->qplib_qp.mtu = - ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu)); + if (qp_attr->qp_state == IB_QPS_RTR) { + enum ib_mtu qpmtu; + + qpmtu = iboe_get_mtu(rdev->netdev->mtu); + if (qp_attr_mask & IB_QP_PATH_MTU) { + if (ib_mtu_enum_to_int(qp_attr->path_mtu) > + ib_mtu_enum_to_int(qpmtu)) + return -EINVAL; + qpmtu = qp_attr->path_mtu; + } + + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; + qp->qplib_qp.path_mtu = __from_ib_mtu(qpmtu); + qp->qplib_qp.mtu = ib_mtu_enum_to_int(qpmtu); } if (qp_attr_mask & IB_QP_TIMEOUT) { From da2132e683954e7ddda3cd674e866a847b7389eb Mon Sep 17 00:00:00 2001 From: Damodharam Ammepalli Date: Wed, 11 Dec 2024 14:09:29 +0530 Subject: [PATCH 193/807] RDMA/bnxt_re: Fix setting mandatory attributes for modify_qp Firmware expects "min_rnr_timer" as a mandatory attribute in MODIFY_QP 
command during the RTR-RTS transition. This needs to be enforced by the driver which is missing while setting bnxt_set_mandatory_attributes that sends these flags as part of modify_qp optimization. Fixes: 82c32d219272 ("RDMA/bnxt_re: Add support for optimized modify QP") Reviewed-by: Rukhsana Ansari Reviewed-by: Kalesh AP Signed-off-by: Damodharam Ammepalli Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-4-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 13 +++++++++++-- drivers/infiniband/hw/bnxt_re/qplib_res.h | 5 +++++ drivers/infiniband/hw/bnxt_re/roce_hsi.h | 1 + 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 093bfb748cdf..5169804e6f12 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1285,7 +1285,8 @@ static void __filter_modify_flags(struct bnxt_qplib_qp *qp) } } -static void bnxt_set_mandatory_attributes(struct bnxt_qplib_qp *qp, +static void bnxt_set_mandatory_attributes(struct bnxt_qplib_res *res, + struct bnxt_qplib_qp *qp, struct cmdq_modify_qp *req) { u32 mandatory_flags = 0; @@ -1300,6 +1301,14 @@ static void bnxt_set_mandatory_attributes(struct bnxt_qplib_qp *qp, mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; } + if (_is_min_rnr_in_rtr_rts_mandatory(res->dattr->dev_cap_flags2) && + (qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_RTR && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTS)) { + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_RC) + mandatory_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER; + } + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_UD || qp->type == CMDQ_MODIFY_QP_QP_TYPE_GSI) mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_QKEY; @@ -1340,7 +1349,7 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) /* Set mandatory attributes for INIT -> RTR and RTR -> RTS 
transition */ if (_is_optimize_modify_qp_supported(res->dattr->dev_cap_flags2) && is_optimized_state_transition(qp)) - bnxt_set_mandatory_attributes(qp, &req); + bnxt_set_mandatory_attributes(res, qp, &req); } bmask = qp->modify_flags; req.modify_mask = cpu_to_le32(qp->modify_flags); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 21fb148713a6..cbfc49a1a56d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -584,6 +584,11 @@ static inline bool _is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2) return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED; } +static inline bool _is_min_rnr_in_rtr_rts_mandatory(u16 dev_cap_ext_flags2) +{ + return !!(dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED); +} + static inline bool _is_cq_coalescing_supported(u16 dev_cap_ext_flags2) { return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_CQ_COALESCING_SUPPORTED; diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index a98fc9c2313e..0ee60fdc18b3 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -2215,6 +2215,7 @@ struct creq_query_func_resp_sb { #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE (0x2UL << 4) #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \ CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE + #define CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED 0x1000UL __le16 max_xp_qp_size; __le16 create_qp_batch_size; __le16 destroy_qp_batch_size; From 34db8ec931b84d1426423f263b1927539e73b397 Mon Sep 17 00:00:00 2001 From: Hongguang Gao Date: Wed, 11 Dec 2024 14:09:30 +0530 Subject: [PATCH 194/807] RDMA/bnxt_re: Fix to export port num to ib_query_qp Current driver implementation doesn't populate the port_num field in query_qp. 
Adding the code to convert internal firmware port id to ibv defined port number and export it. Reviewed-by: Saravanan Vajravel Reviewed-by: Kalesh AP Signed-off-by: Hongguang Gao Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-5-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 1 + drivers/infiniband/hw/bnxt_re/ib_verbs.h | 4 ++++ drivers/infiniband/hw/bnxt_re/qplib_fp.c | 1 + drivers/infiniband/hw/bnxt_re/qplib_fp.h | 1 + 4 files changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index a609e1635a3d..bcb7cfc63d09 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2325,6 +2325,7 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->retry_cnt = qplib_qp->retry_cnt; qp_attr->rnr_retry = qplib_qp->rnr_retry; qp_attr->min_rnr_timer = qplib_qp->min_rnr_timer; + qp_attr->port_num = __to_ib_port_num(qplib_qp->port_id); qp_attr->rq_psn = qplib_qp->rq.psn; qp_attr->max_rd_atomic = qplib_qp->max_rd_atomic; qp_attr->sq_psn = qplib_qp->sq.psn; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index ac59f1d73b15..fbb16a411d6a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -268,6 +268,10 @@ void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +static inline u32 __to_ib_port_num(u16 port_id) +{ + return (u32)port_id + 1; +} unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp); void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp, unsigned long flags); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 5169804e6f12..d8a2a929bbe3 100644 --- 
a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1532,6 +1532,7 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp->dest_qpn = le32_to_cpu(sb->dest_qp_id); memcpy(qp->smac, sb->src_mac, 6); qp->vlan_id = le16_to_cpu(sb->vlan_pcp_vlan_dei_vlan_id); + qp->port_id = le16_to_cpu(sb->port_id); bail: dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 19e279871f10..0660101b5310 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -298,6 +298,7 @@ struct bnxt_qplib_qp { u32 dest_qpn; u8 smac[6]; u16 vlan_id; + u16 port_id; u8 nw_type; struct bnxt_qplib_ah ah; From 7179fe0074a3c962e43a9e51169304c4911989ed Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 11 Dec 2024 14:09:31 +0530 Subject: [PATCH 195/807] RDMA/bnxt_re: Fix reporting hw_ver in query_device Driver currently populates subsystem_device id in the "hw_ver" field of ib_attr structure in query_device. Updated to populate PCI revision ID. 
Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Reviewed-by: Preethi G Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-6-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index bcb7cfc63d09..e3d26bd6de05 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -199,7 +199,7 @@ int bnxt_re_query_device(struct ib_device *ibdev, ib_attr->vendor_id = rdev->en_dev->pdev->vendor; ib_attr->vendor_part_id = rdev->en_dev->pdev->device; - ib_attr->hw_ver = rdev->en_dev->pdev->subsystem_device; + ib_attr->hw_ver = rdev->en_dev->pdev->revision; ib_attr->max_qp = dev_attr->max_qp; ib_attr->max_qp_wr = dev_attr->max_qp_wqes; ib_attr->device_cap_flags = From 7c449ef0fdce540bfb235a2d93e7184864c3388b Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 16 Dec 2024 22:08:20 +0800 Subject: [PATCH 196/807] ASoC: Intel: sof_sdw: Fix DMI match for Lenovo 21Q6 and 21Q7 Update the DMI match for a Lenovo laptop to the new DMI identifier. This laptop ships with a different DMI identifier to what was expected, and now has two identifiers. 
Signed-off-by: Richard Fitzgerald Fixes: 83c062ae81e8 ("ASoC: Intel: sof_sdw: Add quirks for some new Lenovo laptops") Signed-off-by: Bard Liao Link: https://patch.msgid.link/20241216140821.153670-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 810be7c949a5..e20ab6bfa5dd 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -641,9 +641,17 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = { .callback = sof_sdw_quirk_cb, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "233B") + DMI_MATCH(DMI_PRODUCT_NAME, "21Q6") }, - .driver_data = (void *)(SOC_SDW_SIDECAR_AMPS), + .driver_data = (void *)(SOC_SDW_SIDECAR_AMPS | SOC_SDW_CODEC_MIC), + }, + { + .callback = sof_sdw_quirk_cb, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "21Q7") + }, + .driver_data = (void *)(SOC_SDW_SIDECAR_AMPS | SOC_SDW_CODEC_MIC), }, /* ArrowLake devices */ From ba7d47a54bf23a7201bdd2978e16b04fc1cb1f6e Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 16 Dec 2024 22:08:21 +0800 Subject: [PATCH 197/807] ASoC: Intel: sof_sdw: Fix DMI match for Lenovo 21QA and 21QB Update the DMI match for a Lenovo laptop to the new DMI identifier. This laptop ships with a different DMI identifier to what was expected, and now has two identifiers. 
Signed-off-by: Richard Fitzgerald Fixes: ea657f6b24e1 ("ASoC: Intel: sof_sdw: Add quirk for cs42l43 system using host DMICs") Signed-off-by: Bard Liao Link: https://patch.msgid.link/20241216140821.153670-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index e20ab6bfa5dd..667103027f7e 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -632,7 +632,16 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = { .callback = sof_sdw_quirk_cb, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "233C") + DMI_MATCH(DMI_PRODUCT_NAME, "21QB") + }, + /* Note this quirk excludes the CODEC mic */ + .driver_data = (void *)(SOC_SDW_CODEC_MIC), + }, + { + .callback = sof_sdw_quirk_cb, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "21QA") }, /* Note this quirk excludes the CODEC mic */ .driver_data = (void *)(SOC_SDW_CODEC_MIC), From 6f4a0fd03ce856c6d9811429b9969b4f27e2eaee Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 11 Dec 2024 11:54:02 +0800 Subject: [PATCH 198/807] ASoC: dt-bindings: realtek,rt5645: Fix CPVDD voltage comment Both the ALC5645 and ALC5650 datasheets specify a recommended voltage of 1.8V for CPVDD, not 3.5V. Fix the comment. 
Cc: Matthias Brugger Fixes: 26aa19174f0d ("ASoC: dt-bindings: rt5645: add suppliers") Fixes: 83d43ab0a1cb ("ASoC: dt-bindings: realtek,rt5645: Convert to dtschema") Signed-off-by: Chen-Yu Tsai Acked-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20241211035403.4157760-1-wenst@chromium.org Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/realtek,rt5645.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml b/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml index 13f09f1bc800..0a698798c22b 100644 --- a/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml +++ b/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml @@ -51,7 +51,7 @@ properties: description: Power supply for AVDD, providing 1.8V. cpvdd-supply: - description: Power supply for CPVDD, providing 3.5V. + description: Power supply for CPVDD, providing 1.8V. hp-detect-gpios: description: From 65c8c78cc74d5bcbc43f1f785a004796a2d78360 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 12 Dec 2024 21:13:10 +0100 Subject: [PATCH 199/807] thermal/thresholds: Fix uapi header macros leading to a compilation error The macros giving the direction of the crossing thresholds use the BIT macro which is not exported to the userspace. Consequently, when a userspace program includes the header, it fails to compile. Replace the macros by their literal values to allow the compilation of userspace programs using this header. Fixes: 445936f9e258 ("thermal: core: Add user thresholds support") Signed-off-by: Daniel Lezcano Link: https://patch.msgid.link/20241212201311.4143196-1-daniel.lezcano@linaro.org [ rjw: Add Fixes: ] Signed-off-by: Rafael J.
Wysocki --- include/uapi/linux/thermal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index ba8604bdf206..349718c271eb 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -3,8 +3,8 @@ #define _UAPI_LINUX_THERMAL_H #define THERMAL_NAME_LENGTH 20 -#define THERMAL_THRESHOLD_WAY_UP BIT(0) -#define THERMAL_THRESHOLD_WAY_DOWN BIT(1) +#define THERMAL_THRESHOLD_WAY_UP 0x1 +#define THERMAL_THRESHOLD_WAY_DOWN 0x2 enum thermal_device_mode { THERMAL_DEVICE_DISABLED = 0, From 1fb5cf0d165afc3be76ec754d1b1013515c3896a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 6 Dec 2024 18:24:02 +0100 Subject: [PATCH 200/807] Revert "arm64: dts: qcom: x1e78100-t14s: enable otg on usb-c ports" This reverts commit 1a48dd7b9ac809d1bd0fd2fef509abba83433846. A recent change enabling OTG mode on the Lenovo ThinkPad T14s USB-C ports can break SuperSpeed device hotplugging. The host controller is enumerated, but the device is not: xhci-hcd xhci-hcd.5.auto: xHCI Host Controller xhci-hcd xhci-hcd.5.auto: new USB bus registered, assigned bus number 3 xhci-hcd xhci-hcd.5.auto: hcc params 0x0110ffc5 hci version 0x110 quirks 0x000080a000000810 xhci-hcd xhci-hcd.5.auto: irq 247, io mem 0x0a800000 xhci-hcd xhci-hcd.5.auto: xHCI Host Controller xhci-hcd xhci-hcd.5.auto: new USB bus registered, assigned bus number 4 xhci-hcd xhci-hcd.5.auto: Host supports USB 3.1 Enhanced SuperSpeed hub 3-0:1.0: USB hub found hub 3-0:1.0: 1 port detected hub 4-0:1.0: USB hub found hub 4-0:1.0: 1 port detected Once this happens on either of the two ports, no amount of disconnecting and reconnecting makes the SuperSpeed device be enumerated, while FullSpeed device enumeration still works. With retimer (and orientation detection) support not even merged yet, let's revert at least until we have stable host mode in mainline. 
Fixes: 1a48dd7b9ac8 ("arm64: dts: qcom: x1e78100-t14s: enable otg on usb-c ports") Cc: Jonathan Marek Signed-off-by: Johan Hovold Reviewed-by: Abel Vesa Link: https://lore.kernel.org/r/20241206172402.20724-1-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- .../arm64/boot/dts/qcom/x1e78100-lenovo-thinkpad-t14s.dts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/x1e78100-lenovo-thinkpad-t14s.dts b/arch/arm64/boot/dts/qcom/x1e78100-lenovo-thinkpad-t14s.dts index 975550139e10..66513fc8e67a 100644 --- a/arch/arm64/boot/dts/qcom/x1e78100-lenovo-thinkpad-t14s.dts +++ b/arch/arm64/boot/dts/qcom/x1e78100-lenovo-thinkpad-t14s.dts @@ -773,6 +773,10 @@ status = "okay"; }; +&usb_1_ss0_dwc3 { + dr_mode = "host"; +}; + &usb_1_ss0_dwc3_hs { remote-endpoint = <&pmic_glink_ss0_hs_in>; }; @@ -801,6 +805,10 @@ status = "okay"; }; +&usb_1_ss1_dwc3 { + dr_mode = "host"; +}; + &usb_1_ss1_dwc3_hs { remote-endpoint = <&pmic_glink_ss1_hs_in>; }; From fb8e7b33c2174e00dfa411361eeed21eeaf3634b Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Wed, 13 Nov 2024 00:05:08 -0800 Subject: [PATCH 201/807] arm64: dts: qcom: x1e80100: Fix up BAR space size for PCIe6a As per memory map table, the region for PCIe6a is 64MByte. Hence, set the size of 32 bit non-prefetchable memory region beginning on address 0x70300000 as 0x3d00000 so that BAR space assigned to BAR registers can be allocated from 0x70300000 to 0x74000000. 
Fixes: 7af141850012 ("arm64: dts: qcom: x1e80100: Fix up BAR spaces") Cc: stable@vger.kernel.org Signed-off-by: Qiang Yu Reviewed-by: Johan Hovold Link: https://lore.kernel.org/r/20241113080508.3458849-1-quic_qianyu@quicinc.com Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index 88805629ed2b..f1a1e63f8ebc 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -2924,7 +2924,7 @@ #address-cells = <3>; #size-cells = <2>; ranges = <0x01000000 0x0 0x00000000 0x0 0x70200000 0x0 0x100000>, - <0x02000000 0x0 0x70300000 0x0 0x70300000 0x0 0x1d00000>; + <0x02000000 0x0 0x70300000 0x0 0x70300000 0x0 0x3d00000>; bus-range = <0x00 0xff>; dma-coherent; From cc252bb592638e0f7aea40d580186c36d89526b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Dec 2024 13:53:35 -0500 Subject: [PATCH 202/807] fgraph: Still initialize idle shadow stacks when starting A bug was discovered where the idle shadow stacks were not initialized for offline CPUs when starting function graph tracer, and when they came online they were not traced due to the missing shadow stack. To fix this, the idle task shadow stack initialization was moved to using the CPU hotplug callbacks. But it removed the initialization when the function graph was enabled. The problem here is that the hotplug callbacks are called when the CPUs come online, but the idle shadow stack initialization only happens if function graph is currently active. This caused the online CPUs to not get their shadow stack initialized. The idle shadow stack initialization still needs to be done when the function graph is registered, as they will not be allocated if function graph is not registered. 
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241211135335.094ba282@batman.local.home Fixes: 2c02f7375e65 ("fgraph: Use CPU hotplug mechanism to initialize idle shadow stacks") Reported-by: Linus Walleij Tested-by: Linus Walleij Closes: https://lore.kernel.org/all/CACRpkdaTBrHwRbbrphVy-=SeDz6MSsXhTKypOtLrTQ+DgGAOcQ@mail.gmail.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 0bf78517b5d4..ddedcb50917f 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1215,7 +1215,7 @@ void fgraph_update_pid_func(void) static int start_graph_tracing(void) { unsigned long **ret_stack_list; - int ret; + int ret, cpu; ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE, sizeof(*ret_stack_list), GFP_KERNEL); @@ -1223,6 +1223,12 @@ static int start_graph_tracing(void) if (!ret_stack_list) return -ENOMEM; + /* The cpu_boot init_task->ret_stack will never be freed */ + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + } + do { ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); From 166438a432d76c68d3f0da60667248f3c2303d6c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 16:46:33 -0500 Subject: [PATCH 203/807] ftrace: Do not find "true_parent" if HAVE_DYNAMIC_FTRACE_WITH_ARGS is not set When function tracing and function graph tracing are both enabled (in different instances) the "parent" of some of the function tracing events is "return_to_handler" which is the trampoline used by function graph tracing. To fix this, ftrace_get_true_parent_ip() was introduced that returns the "true" parent ip instead of the trampoline. To do this, the ftrace_regs_get_stack_pointer() is used, which uses kernel_stack_pointer(). 
The problem is that microblaze does not implement kernel_stack_pointer() so when function graph tracing is enabled, the build fails. But microblaze also does not enable HAVE_DYNAMIC_FTRACE_WITH_ARGS. That option has to be enabled by the architecture to reliably get the values from the fregs parameter passed in. When that config is not set, the architecture can also pass in NULL, which is not tested for in that function and could cause the kernel to crash. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Al Viro Cc: Michal Simek Cc: Jeff Xie Link: https://lore.kernel.org/20241216164633.6df18e87@gandalf.local.home Fixes: 60b1f578b578 ("ftrace: Get the true parent ip for function tracer") Reported-by: Al Viro Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 74c353164ca1..d358c9935164 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -176,7 +176,8 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(&tr->array_buffer); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* fregs are guaranteed not to be NULL if HAVE_DYNAMIC_FTRACE_WITH_ARGS is set */ +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) static __always_inline unsigned long function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) { From d6fd6f8280f0257ba93f16900a0d3d3912f32c79 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 5 Dec 2024 16:49:51 +0100 Subject: [PATCH 204/807] ceph: fix memory leaks in __ceph_sync_read() In two `break` statements, the call to ceph_release_page_vector() was missing, leaking the allocation from ceph_alloc_page_vector().
Instead of adding the missing ceph_release_page_vector() calls, the Ceph maintainers preferred to transfer page ownership to the `ceph_osd_request` by passing `own_pages=true` to osd_req_op_extent_osd_data_pages(). This requires postponing the ceph_osdc_put_request() call until after the block that accesses the `pages`. Cc: stable@vger.kernel.org Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Fixes: f0fe1e54cfcf ("ceph: plumb in decryption during reads") Signed-off-by: Max Kellermann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4b8d59ebda00..ce342a5d4b8b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1127,7 +1127,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, osd_req_op_extent_osd_data_pages(req, 0, pages, read_len, offset_in_page(read_off), - false, false); + false, true); op = &req->r_ops[0]; if (sparse) { @@ -1186,8 +1186,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret = min_t(ssize_t, fret, len); } - ceph_osdc_put_request(req); - /* Short read but not EOF? Zero out the remainder. 
*/ if (ret >= 0 && ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); @@ -1221,7 +1219,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, break; } } - ceph_release_page_vector(pages, num_pages); + + ceph_osdc_put_request(req); if (ret < 0) { if (ret == -EBLOCKLISTED) From 550f7ca98ee028a606aa75705a7e77b1bd11720f Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 18 Nov 2024 23:28:28 +0100 Subject: [PATCH 205/807] ceph: give up on paths longer than PATH_MAX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the full path to be built by ceph_mdsc_build_path() happens to be longer than PATH_MAX, then this function will enter an endless (retry) loop, effectively blocking the whole task. Most of the machine becomes unusable, making this a very simple and effective DoS vulnerability. I cannot imagine why this retry was ever implemented, but it seems rather useless and harmful to me. Let's remove it and fail with ENAMETOOLONG instead. Cc: stable@vger.kernel.org Reported-by: Dario Weißer Signed-off-by: Max Kellermann Reviewed-by: Alex Markuze Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 219a2cc2bf3c..785fe489ef4b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2800,12 +2800,11 @@ retry: if (pos < 0) { /* - * A rename didn't occur, but somehow we didn't end up where - * we thought we would. Throw a warning and try again. + * The path is longer than PATH_MAX and this function + * cannot ever succeed. Creating paths that long is + * possible with Ceph, but Linux cannot use them. 
*/ - pr_warn_client(cl, "did not end path lookup where expected (pos = %d)\n", - pos); - goto retry; + return ERR_PTR(-ENAMETOOLONG); } *pbase = base; From 12eb22a5a609421b380c3c6ca887474fb2089b2c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 20 Nov 2024 16:43:51 +0100 Subject: [PATCH 206/807] ceph: validate snapdirname option length when mounting It becomes a path component, so it shouldn't exceed NAME_MAX characters. This was hardened in commit c152737be22b ("ceph: Use strscpy() instead of strcpy() in __get_snap_name()"), but no actual check was put in place. Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index de03cd6eb86e..4344e1f11806 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -431,6 +431,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, switch (token) { case Opt_snapdirname: + if (strlen(param->string) > NAME_MAX) + return invalfc(fc, "snapdirname too long"); kfree(fsopt->snapdir_name); fsopt->snapdir_name = param->string; param->string = NULL; From 9abee475803fab6ad59d4f4fc59c6a75374a7d9d Mon Sep 17 00:00:00 2001 From: Alex Markuze Date: Wed, 27 Nov 2024 15:34:10 +0200 Subject: [PATCH 207/807] ceph: improve error handling and short/overflow-read logic in __ceph_sync_read() This patch refines the read logic in __ceph_sync_read() to ensure more predictable and efficient behavior in various edge cases. - Return early if the requested read length is zero or if the file size (`i_size`) is zero. - Initialize the index variable (`idx`) where needed and reorder some code to ensure it is always set before use. - Improve error handling by checking for negative return values earlier. - Remove redundant encrypted file checks after failures. Only attempt filesystem-level decryption if the read succeeded. 
- Simplify leftover calculations to correctly handle cases where the read extends beyond the end of the file or stops short. This can be hit by continuously reading a file while, on another client, we keep truncating and writing new data into it. - This resolves multiple issues caused by integer and consequent buffer overflow (`pages` array being accessed beyond `num_pages`): - https://tracker.ceph.com/issues/67524 - https://tracker.ceph.com/issues/68980 - https://tracker.ceph.com/issues/68981 Cc: stable@vger.kernel.org Fixes: 1065da21e5df ("ceph: stop copying to iter at EOF on sync reads") Reported-by: Luis Henriques (SUSE) Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce342a5d4b8b..8e0400d461a2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1066,7 +1066,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, if (ceph_inode_is_shutdown(inode)) return -EIO; - if (!len) + if (!len || !i_size) return 0; /* * flush any page cache pages in this range. this @@ -1086,7 +1086,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, int num_pages; size_t page_off; bool more; - int idx; + int idx = 0; size_t left; struct ceph_osd_req_op *op; u64 read_off = off; @@ -1160,7 +1160,14 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, else if (ret == -ENOENT) ret = 0; - if (ret > 0 && IS_ENCRYPTED(inode)) { + if (ret < 0) { + ceph_osdc_put_request(req); + if (ret == -EBLOCKLISTED) + fsc->blocklisted = true; + break; + } + + if (IS_ENCRYPTED(inode)) { int fret; fret = ceph_fscrypt_decrypt_extents(inode, pages, @@ -1187,7 +1194,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, } /* Short read but not EOF? Zero out the remainder. 
*/ - if (ret >= 0 && ret < len && (off + ret < i_size)) { + if (ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); int zoff = page_off + ret; @@ -1197,13 +1204,11 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret += zlen; } - idx = 0; - if (ret <= 0) - left = 0; - else if (off + ret > i_size) - left = i_size - off; + if (off + ret > i_size) + left = (i_size > off) ? i_size - off : 0; else left = ret; + while (left > 0) { size_t plen, copied; @@ -1222,12 +1227,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ceph_osdc_put_request(req); - if (ret < 0) { - if (ret == -EBLOCKLISTED) - fsc->blocklisted = true; - break; - } - if (off >= i_size || !more) break; } From 66e0c4f91461d17d48071695271c824620bed4ef Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 6 Dec 2024 17:32:59 +0100 Subject: [PATCH 208/807] ceph: fix memory leak in ceph_direct_read_write() The bvecs array which is allocated in iter_get_bvecs_alloc() is leaked and pages remain pinned if ceph_alloc_sparse_ext_map() fails. There is no need to delay the allocation of sparse_ext map until after the bvecs array is set up, so fix this by moving sparse_ext allocation a bit earlier. Also, make a similar adjustment in __ceph_sync_read() for consistency (a leak of the same kind in __ceph_sync_read() has been addressed differently). 
Cc: stable@vger.kernel.org Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/file.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8e0400d461a2..67468d88f139 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1116,6 +1116,16 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, len = read_off + read_len - off; more = len < iov_iter_count(to); + op = &req->r_ops[0]; + if (sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + num_pages = calc_pages_for(read_off, read_len); page_off = offset_in_page(off); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); @@ -1129,16 +1139,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, offset_in_page(read_off), false, true); - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } - ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); @@ -1551,6 +1551,16 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } + op = &req->r_ops[0]; + if (sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, size); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); if (len < 0) { ceph_osdc_put_request(req); @@ -1560,6 +1570,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (len != size) osd_req_op_extent_update(req, 0, len); + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); + /* * To simplify error handling, allow AIO when IO within 
i_size * or IO can be satisfied by single OSD request. @@ -1591,17 +1603,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req->r_mtime = mtime; } - osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, size); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; From 18d44c5d062b97b97bb0162d9742440518958dc1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 7 Dec 2024 17:33:25 +0100 Subject: [PATCH 209/807] ceph: allocate sparse_ext map only for sparse reads If mounted with sparseread option, ceph_direct_read_write() ends up making an unnecessary allocation for O_DIRECT writes. Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/file.c | 2 +- net/ceph/osd_client.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 67468d88f139..851d70200c6b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1552,7 +1552,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } op = &req->r_ops[0]; - if (sparse) { + if (!write && sparse) { extent_cnt = __ceph_sparse_read_ext_count(inode, size); ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9b1168eb77ab..b24afec24138 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1173,6 +1173,8 @@ EXPORT_SYMBOL(ceph_osdc_new_request); int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) { + WARN_ON(op->op != CEPH_OSD_OP_SPARSE_READ); + op->extent.sparse_ext_cnt = cnt; op->extent.sparse_ext = kmalloc_array(cnt, sizeof(*op->extent.sparse_ext), From 74d7e038fd072635d21e4734e3223378e09168d3 Mon Sep 17 00:00:00 2001 From: Murad Masimov
Date: Mon, 16 Dec 2024 20:36:46 +0300 Subject: [PATCH 210/807] hwmon: (tmp513) Fix interpretation of values of Shunt Voltage and Limit Registers The values returned by the driver after processing the contents of the Shunt Voltage Register and the Shunt Limit Registers do not correspond to the TMP512/TMP513 specifications. A raw register value is converted to a signed integer value by a sign extension in accordance with the algorithm provided in the specification, but due to the off-by-one error in the sign bit index, the result is incorrect. Moreover, the PGA shift calculated with the tmp51x_get_pga_shift function is relevant only to the Shunt Voltage Register, but is also applied to the Shunt Limit Registers. According to the TMP512 and TMP513 datasheets, the Shunt Voltage Register (04h) is 13 to 16 bit two's complement integer value, depending on the PGA setting. The Shunt Positive (0Ch) and Negative (0Dh) Limit Registers are 16-bit two's complement integer values. Below are some examples: * Shunt Voltage Register If PGA = 8, and regval = 1000 0011 0000 0000, then the decimal value must be -32000, but the value calculated by the driver will be 33536. * Shunt Limit Register If regval = 1000 0011 0000 0000, then the decimal value must be -32000, but the value calculated by the driver will be 768, if PGA = 1. Fix sign bit index, and also correct misleading comment describing the tmp51x_get_pga_shift function. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
Fixes: 59dfa75e5d82 ("hwmon: Add driver for Texas Instruments TMP512/513 sensor chips.") Signed-off-by: Murad Masimov Link: https://lore.kernel.org/r/20241216173648.526-2-m.masimov@maxima.ru [groeck: Fixed description and multi-line alignments] Signed-off-by: Guenter Roeck --- drivers/hwmon/tmp513.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/tmp513.c b/drivers/hwmon/tmp513.c index 926d28cd3fab..d87fcea3ef24 100644 --- a/drivers/hwmon/tmp513.c +++ b/drivers/hwmon/tmp513.c @@ -182,7 +182,7 @@ struct tmp51x_data { struct regmap *regmap; }; -// Set the shift based on the gain 8=4, 4=3, 2=2, 1=1 +// Set the shift based on the gain: 8 -> 1, 4 -> 2, 2 -> 3, 1 -> 4 static inline u8 tmp51x_get_pga_shift(struct tmp51x_data *data) { return 5 - ffs(data->pga_gain); @@ -204,7 +204,9 @@ static int tmp51x_get_value(struct tmp51x_data *data, u8 reg, u8 pos, * 2's complement number shifted by one to four depending * on the pga gain setting. 1lsb = 10uV */ - *val = sign_extend32(regval, 17 - tmp51x_get_pga_shift(data)); + *val = sign_extend32(regval, + reg == TMP51X_SHUNT_CURRENT_RESULT ? + 16 - tmp51x_get_pga_shift(data) : 15); *val = DIV_ROUND_CLOSEST(*val * 10 * MILLI, data->shunt_uohms); break; case TMP51X_BUS_VOLTAGE_RESULT: From da1d0e6ba211baf6747db74c07700caddfd8a179 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Mon, 16 Dec 2024 20:36:47 +0300 Subject: [PATCH 211/807] hwmon: (tmp513) Fix Current Register value interpretation The value returned by the driver after processing the contents of the Current Register does not correspond to the TMP512/TMP513 specifications. A raw register value is converted to a signed integer value by a sign extension in accordance with the algorithm provided in the specification, but due to the off-by-one error in the sign bit index, the result is incorrect. Moreover, negative values will be reported as large positive due to missing sign extension from u32 to long. 
According to the TMP512 and TMP513 datasheets, the Current Register (07h) is a 16-bit two's complement integer value. E.g., if regval = 1000 0011 0000 0000, then the value must be (-32000 * lsb), but the driver will return (33536 * lsb). Fix off-by-one bug, and also cast data->curr_lsb_ua (which is of type u32) to long to prevent incorrect cast for negative values. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 59dfa75e5d82 ("hwmon: Add driver for Texas Instruments TMP512/513 sensor chips.") Signed-off-by: Murad Masimov Link: https://lore.kernel.org/r/20241216173648.526-3-m.masimov@maxima.ru [groeck: Fixed description line length] Signed-off-by: Guenter Roeck --- drivers/hwmon/tmp513.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/tmp513.c b/drivers/hwmon/tmp513.c index d87fcea3ef24..2846b1cc515d 100644 --- a/drivers/hwmon/tmp513.c +++ b/drivers/hwmon/tmp513.c @@ -222,7 +222,7 @@ static int tmp51x_get_value(struct tmp51x_data *data, u8 reg, u8 pos, break; case TMP51X_BUS_CURRENT_RESULT: // Current = (ShuntVoltage * CalibrationRegister) / 4096 - *val = sign_extend32(regval, 16) * data->curr_lsb_ua; + *val = sign_extend32(regval, 15) * (long)data->curr_lsb_ua; *val = DIV_ROUND_CLOSEST(*val, MILLI); break; case TMP51X_LOCAL_TEMP_RESULT: From dd471e25770e7e632f736b90db1e2080b2171668 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Mon, 16 Dec 2024 20:36:48 +0300 Subject: [PATCH 212/807] hwmon: (tmp513) Fix interpretation of values of Temperature Result and Limit Registers The values returned by the driver after processing the contents of the Temperature Result and the Temperature Limit Registers do not correspond to the TMP512/TMP513 specifications. A raw register value is converted to a signed integer value by a sign extension in accordance with the algorithm provided in the specification, but due to the off-by-one error in the sign bit index, the result is incorrect. 
According to the TMP512 and TMP513 datasheets, the Temperature Result (08h to 0Bh) and Limit (11h to 14h) Registers are 13-bit two's complement integer values, shifted left by 3 bits. The value is scaled by 0.0625 degrees Celsius per bit. E.g., if regval = 1 1110 0111 0000 000, the output should be -25 degrees, but the driver will return +487 degrees. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 59dfa75e5d82 ("hwmon: Add driver for Texas Instruments TMP512/513 sensor chips.") Signed-off-by: Murad Masimov Link: https://lore.kernel.org/r/20241216173648.526-4-m.masimov@maxima.ru [groeck: fixed description line length] Signed-off-by: Guenter Roeck --- drivers/hwmon/tmp513.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/tmp513.c b/drivers/hwmon/tmp513.c index 2846b1cc515d..1c2cb12071b8 100644 --- a/drivers/hwmon/tmp513.c +++ b/drivers/hwmon/tmp513.c @@ -234,7 +234,7 @@ static int tmp51x_get_value(struct tmp51x_data *data, u8 reg, u8 pos, case TMP51X_REMOTE_TEMP_LIMIT_2: case TMP513_REMOTE_TEMP_LIMIT_3: // 1lsb = 0.0625 degrees centigrade - *val = sign_extend32(regval, 16) >> TMP51X_TEMP_SHIFT; + *val = sign_extend32(regval, 15) >> TMP51X_TEMP_SHIFT; *val = DIV_ROUND_CLOSEST(*val * 625, 10); break; case TMP51X_N_FACTOR_AND_HYST_1: From 239d87327dcd361b0098038995f8908f3296864f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 12 Dec 2024 17:28:06 -0800 Subject: [PATCH 213/807] fortify: Hide run-time copy size from value range tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC performs value range tracking for variables as a way to provide better diagnostics. One place this is regularly seen is with warnings associated with bounds-checking, e.g. -Wstringop-overflow, -Wstringop-overread, -Warray-bounds, etc. 
In order to keep the signal-to-noise ratio high, warnings aren't emitted when a value range spans the entire value range representable by a given variable. For example: unsigned int len; char dst[8]; ... memcpy(dst, src, len); If len's value is unknown, it has the full "unsigned int" range of [0, UINT_MAX], and GCC's compile-time bounds checks against memcpy() will be ignored. However, when a code path has been able to narrow the range: if (len > 16) return; memcpy(dst, src, len); Then the range will be updated for the execution path. Above, len is now [0, 16] when reaching memcpy(), so depending on other optimizations, we might see a -Wstringop-overflow warning like: error: '__builtin_memcpy' writing between 9 and 16 bytes into region of size 8 [-Werror=stringop-overflow] When building with CONFIG_FORTIFY_SOURCE, the fortified run-time bounds checking can appear to narrow value ranges of lengths for memcpy(), depending on how the compiler constructs the execution paths during optimization passes, due to the checks against the field sizes. For example: if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) As intentionally designed, these checks only affect the kernel warnings emitted at run-time and do not block the potentially overflowing memcpy(), so GCC thinks it needs to produce a warning about the resulting value range that might be reaching the memcpy().
We have seen this manifest a few times now, with the most recent being with cpumasks: In function ‘bitmap_copy’, inlined from ‘cpumask_copy’ at ./include/linux/cpumask.h:839:2, inlined from ‘__padata_set_cpumasks’ at kernel/padata.c:730:2: ./include/linux/fortify-string.h:114:33: error: ‘__builtin_memcpy’ reading between 257 and 536870904 bytes from a region of size 256 [-Werror=stringop-overread] 114 | #define __underlying_memcpy __builtin_memcpy | ^ ./include/linux/fortify-string.h:633:9: note: in expansion of macro ‘__underlying_memcpy’ 633 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ ./include/linux/fortify-string.h:678:26: note: in expansion of macro ‘__fortify_memcpy_chk’ 678 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ ./include/linux/bitmap.h:259:17: note: in expansion of macro ‘memcpy’ 259 | memcpy(dst, src, len); | ^~~~~~ kernel/padata.c: In function ‘__padata_set_cpumasks’: kernel/padata.c:713:48: note: source object ‘pcpumask’ of size [0, 256] 713 | cpumask_var_t pcpumask, | ~~~~~~~~~~~~~~^~~~~~~~ This warning is _not_ emitted when CONFIG_FORTIFY_SOURCE is disabled, and with the recent -fdiagnostics-details we can confirm the origin of the warning is due to FORTIFY's bounds checking: ../include/linux/bitmap.h:259:17: note: in expansion of macro 'memcpy' 259 | memcpy(dst, src, len); | ^~~~~~ '__padata_set_cpumasks': events 1-2 ../include/linux/fortify-string.h:613:36: 612 | if (p_size_field != SIZE_MAX && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 613 | p_size != p_size_field && p_size_field < size) | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~ | | | (1) when the condition is evaluated to false | (2) when the condition is evaluated to true '__padata_set_cpumasks': event 3 114 | #define __underlying_memcpy __builtin_memcpy | ^ | | | (3) out of array bounds here Note that the cpumask warning started appearing since bitmap functions were recently marked __always_inline in commit ed8cd2b3bd9f ("bitmap: Switch from 
inline to __always_inline"), which allowed GCC to gain visibility into the variables as they passed through the FORTIFY implementation. In order to silence these false positives but keep otherwise deterministic compile-time warnings intact, hide the length variable from GCC with OPTIMIZER_HIDE_VAR() before calling the builtin memcpy. Additionally add a comment about why all the macro args have copies with const storage. Reported-by: "Thomas Weißschuh" Closes: https://lore.kernel.org/all/db7190c8-d17f-4a0d-bc2f-5903c79f36c2@t-8ch.de/ Reported-by: Nilay Shroff Closes: https://lore.kernel.org/all/20241112124127.1666300-1-nilay@linux.ibm.com/ Tested-by: Nilay Shroff Acked-by: Yury Norov Acked-by: Greg Kroah-Hartman Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 0d99bf11d260..e4ce1cae03bf 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -616,6 +616,12 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, return false; } +/* + * To work around what seems to be an optimizer bug, the macro arguments + * need to have const copies or the values end up changed by the time they + * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture + * __bos() results in const temp vars") for more details. + */ #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ @@ -623,6 +629,8 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ + /* Keep a mutable version of the size for the final copy.
*/ \ + size_t __copy_size = __fortify_size; \ fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, FORTIFY_FUNC_ ##op), \ @@ -630,7 +638,11 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ - __underlying_##op(p, q, __fortify_size); \ + /* Hide only the run-time size from value range tracking to */ \ + /* silence compile-time false positive bounds warnings. */ \ + if (!__builtin_constant_p(__copy_size)) \ + OPTIMIZER_HIDE_VAR(__copy_size); \ + __underlying_##op(p, q, __copy_size); \ }) /* From 0efbca0fec7d1665e19fa008ba95daab71f76f4d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 9 Dec 2024 12:06:39 +0100 Subject: [PATCH 214/807] nios2: Use str_yes_no() helper in show_cpuinfo() Remove hard-coded strings by using the str_yes_no() helper function. Signed-off-by: Thorsten Blum Signed-off-by: Dinh Nguyen --- arch/nios2/kernel/cpuinfo.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/nios2/kernel/cpuinfo.c b/arch/nios2/kernel/cpuinfo.c index 338849c430a5..7b1e8f9128e9 100644 --- a/arch/nios2/kernel/cpuinfo.c +++ b/arch/nios2/kernel/cpuinfo.c @@ -143,11 +143,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) " DIV:\t\t%s\n" " BMX:\t\t%s\n" " CDX:\t\t%s\n", - cpuinfo.has_mul ? "yes" : "no", - cpuinfo.has_mulx ? "yes" : "no", - cpuinfo.has_div ? "yes" : "no", - cpuinfo.has_bmx ? "yes" : "no", - cpuinfo.has_cdx ? 
"yes" : "no"); + str_yes_no(cpuinfo.has_mul), + str_yes_no(cpuinfo.has_mulx), + str_yes_no(cpuinfo.has_div), + str_yes_no(cpuinfo.has_bmx), + str_yes_no(cpuinfo.has_cdx)); seq_printf(m, "Icache:\t\t%ukB, line length: %u\n", From b1f3a2f5a742c1e939a73031bd31b9e557a2d77d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:40 -0800 Subject: [PATCH 215/807] netdev: fix repeated netlink messages in queue dump The context is supposed to record the next queue to dump, not last dumped. If the dump doesn't fit we will restart from the already-dumped queue, duplicating the message. Before this fix and with the selftest improvements later in this series we see: # ./run_kselftest.sh -t drivers/net:queues.py timeout set to 45 selftests: drivers/net: queues.py KTAP version 1 1..2 # Check| At /root/ksft-net-drv/drivers/net/./queues.py, line 32, in get_queues: # Check| ksft_eq(queues, expected) # Check failed 102 != 100 # Check| At /root/ksft-net-drv/drivers/net/./queues.py, line 32, in get_queues: # Check| ksft_eq(queues, expected) # Check failed 101 != 100 not ok 1 queues.get_queues ok 2 queues.addremove_queues # Totals: pass:1 fail:1 xfail:0 xpass:0 skip:0 error:0 not ok 1 selftests: drivers/net: queues.py # exit=1 With the fix: # ./ksft-net-drv/run_kselftest.sh -t drivers/net:queues.py timeout set to 45 selftests: drivers/net: queues.py KTAP version 1 1..2 ok 1 queues.get_queues ok 2 queues.addremove_queues # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 6b6171db7fc8 ("netdev-genl: Add netlink framework functions for queue") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213152244.3080955-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9527dd46e4dc..9f086b190619 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -488,24 +488,21 @@ 
netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, struct netdev_nl_dump_ctx *ctx) { int err = 0; - int i; if (!(netdev->flags & IFF_UP)) return err; - for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; - ctx->rxq_idx = i++; } - for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; - ctx->txq_idx = i++; } return err; From ecc391a541573da46b7ccc188105efedd40aef1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:41 -0800 Subject: [PATCH 216/807] netdev: fix repeated netlink messages in queue stats The context is supposed to record the next queue to dump, not last dumped. If the dump doesn't fit we will restart from the already-dumped queue, duplicating the message. 
Before this fix and with the selftest improvements later in this series we see: # ./run_kselftest.sh -t drivers/net:stats.py timeout set to 45 selftests: drivers/net: stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 45 != 44 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 45 != 44 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 45 != 44 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 45 != 44 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 103 != 100 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 103 != 100 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 102 != 100 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 102 != 100 missing queue keys not ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:4 fail:1 xfail:0 xpass:0 skip:0 error:0 With the fix: # ./ksft-net-drv/run_kselftest.sh -t drivers/net:stats.py timeout set to 45 
selftests: drivers/net: stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:5 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: ab63a2387cb9 ("netdev: add per-queue statistics") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213152244.3080955-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9f086b190619..1be8c7c21d19 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -668,7 +668,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->rxq_idx = i++; + ctx->rxq_idx = ++i; } i = ctx->txq_idx; while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { @@ -676,7 +676,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->txq_idx = i++; + ctx->txq_idx = ++i; } ctx->rxq_idx = 0; From 0518863407b8dcc7070fdbc1c015046d66777e78 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:42 -0800 Subject: [PATCH 217/807] selftests: net: support setting recv_size in YNL recv_size parameter allows constraining the buffer size for dumps. It's useful in testing kernel handling of dump continuation, IOW testing dumps which span multiple skbs. Let the tests set this parameter when initializing the YNL family. Keep the normal default, we don't want tests to unintentionally behave very differently than normal code. 
Reviewed-by: Joe Damato Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ynl.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py index a0d689d58c57..076a7e8dc3eb 100644 --- a/tools/testing/selftests/net/lib/py/ynl.py +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -32,23 +32,23 @@ except ModuleNotFoundError as e: # Set schema='' to avoid jsonschema validation, it's slow # class EthtoolFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('ethtool.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class RtnlFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class NetdevFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('netdev.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class NetshaperFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('net_shaper.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) From 1234810b1649e9d781aeafd4b23fb1fcfbf95d8f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:43 -0800 Subject: [PATCH 218/807] selftests: net-drv: queues: sanity check netlink dumps This test already catches a netlink bug fixed by this series, but only when running on HW with many queues. Make sure the netdevsim instance created has a lot of queues, and constrain the size of the recv_buffer used by netlink. While at it test both rx and tx queues. 
Reviewed-by: Joe Damato Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/queues.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 30f29096e27c..9c5473abbd78 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -8,25 +8,28 @@ from lib.py import cmd import glob -def sys_get_queues(ifname) -> int: - folders = glob.glob(f'/sys/class/net/{ifname}/queues/rx-*') +def sys_get_queues(ifname, qtype='rx') -> int: + folders = glob.glob(f'/sys/class/net/{ifname}/queues/{qtype}-*') return len(folders) -def nl_get_queues(cfg, nl): +def nl_get_queues(cfg, nl, qtype='rx'): queues = nl.queue_get({'ifindex': cfg.ifindex}, dump=True) if queues: - return len([q for q in queues if q['type'] == 'rx']) + return len([q for q in queues if q['type'] == qtype]) return None def get_queues(cfg, nl) -> None: - queues = nl_get_queues(cfg, nl) - if not queues: - raise KsftSkipEx('queue-get not supported by device') + snl = NetdevFamily(recv_size=4096) - expected = sys_get_queues(cfg.dev['ifname']) - ksft_eq(queues, expected) + for qtype in ['rx', 'tx']: + queues = nl_get_queues(cfg, snl, qtype) + if not queues: + raise KsftSkipEx('queue-get not supported by device') + + expected = sys_get_queues(cfg.dev['ifname'], qtype) + ksft_eq(queues, expected) def addremove_queues(cfg, nl) -> None: @@ -57,7 +60,7 @@ def addremove_queues(cfg, nl) -> None: def main() -> None: - with NetDrvEnv(__file__, queue_count=3) as cfg: + with NetDrvEnv(__file__, queue_count=100) as cfg: ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) ksft_exit() From 5712e323d4c3ad03bba4d28f83e80593171ac3f1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:44 -0800 Subject: 
[PATCH 219/807] selftests: net-drv: stats: sanity check netlink dumps Sanity check netlink dumps, to make sure dumps don't have repeated entries or gaps in IDs. Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/stats.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 63e3c045a3b2..031ac9def6c0 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -110,6 +110,23 @@ def qstat_by_ifindex(cfg) -> None: ksft_ge(triple[1][key], triple[0][key], comment="bad key: " + key) ksft_ge(triple[2][key], triple[1][key], comment="bad key: " + key) + # Sanity check the dumps + queues = NetdevFamily(recv_size=4096).qstats_get({"scope": "queue"}, dump=True) + # Reformat the output into {ifindex: {rx: [id, id, ...], tx: [id, id, ...]}} + parsed = {} + for entry in queues: + ifindex = entry["ifindex"] + if ifindex not in parsed: + parsed[ifindex] = {"rx":[], "tx": []} + parsed[ifindex][entry["queue-type"]].append(entry['queue-id']) + # Now, validate + for ifindex, queues in parsed.items(): + for qtype in ['rx', 'tx']: + ksft_eq(len(queues[qtype]), len(set(queues[qtype])), + comment="repeated queue keys") + ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, + comment="missing queue keys") + # Test invalid dumps # 0 is invalid with ksft_raises(NlError) as cm: @@ -158,7 +175,7 @@ def check_down(cfg) -> None: def main() -> None: - with NetDrvEnv(__file__) as cfg: + with NetDrvEnv(__file__, queue_count=100) as cfg: ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex, check_down], args=(cfg, )) From fbbd84af6ba70334335bdeba3ae536cf751c14c6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Dec 2024 12:47:27 +0300 Subject: [PATCH 220/807] chelsio/chtls: prevent 
potential integer overflow on 32bit The "gl->tot_len" variable is controlled by the user. It comes from process_responses(). On 32bit systems, the "gl->tot_len + sizeof(struct cpl_pass_accept_req) + sizeof(struct rss_header)" addition could have an integer wrapping bug. Use size_add() to prevent this. Fixes: a08943947873 ("crypto: chtls - Register chtls with net tls") Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Link: https://patch.msgid.link/c6bfb23c-2db2-4e1b-b8ab-ba3925c82ef5@stanley.mountain Signed-off-by: Jakub Kicinski --- .../net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c index 96fd31d75dfd..daa1ebaef511 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c @@ -346,8 +346,9 @@ static struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl, * driver. Once driver synthesizes cpl_pass_accept_req the skb will go * through the regular cpl_pass_accept_req processing in TOM. */ - skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) - - pktshift, GFP_ATOMIC); + skb = alloc_skb(size_add(gl->tot_len, + sizeof(struct cpl_pass_accept_req)) - + pktshift, GFP_ATOMIC); if (unlikely(!skb)) return NULL; __skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req) From e78c20f327bd94dabac68b98218dff069a8780f0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 Dec 2024 13:36:57 +0100 Subject: [PATCH 221/807] team: Fix feature exposure when no ports are present Small follow-up to align this to an equivalent behavior as the bond driver. The change in 3625920b62c3 ("teaming: fix vlan_features computing") removed the netdevice vlan_features when there is no team port attached, yet it leaves the full set of enc_features intact. 
Instead, leave the default features as pre 3625920b62c3, and recompute once we do have ports attached. Also, similarly as in bonding case, call the netdev_base_features() helper on the enc_features. Fixes: 3625920b62c3 ("teaming: fix vlan_features computing") Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241213123657.401868-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- drivers/net/team/team_core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index 69ea2c3c76bf..c7690adec8db 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -998,9 +998,13 @@ static void __team_compute_features(struct team *team) unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; - vlan_features = netdev_base_features(vlan_features); - rcu_read_lock(); + if (list_empty(&team->port_list)) + goto done; + + vlan_features = netdev_base_features(vlan_features); + enc_features = netdev_base_features(enc_features); + list_for_each_entry_rcu(port, &team->port_list, list) { vlan_features = netdev_increment_features(vlan_features, port->dev->vlan_features, @@ -1010,11 +1014,11 @@ static void __team_compute_features(struct team *team) port->dev->hw_enc_features, TEAM_ENC_FEATURES); - dst_release_flag &= port->dev->priv_flags; if (port->dev->hard_header_len > max_hard_header_len) max_hard_header_len = port->dev->hard_header_len; } +done: rcu_read_unlock(); team->dev->vlan_features = vlan_features; From 7203d10e93b6e6e1d19481ef7907de6a9133a467 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Dec 2024 17:28:11 +0300 Subject: [PATCH 222/807] net: hinic: Fix cleanup in create_rxqs/txqs() There is a check for NULL at the start of create_txqs() and create_rxqs() which tests if "nic_dev->txqs" is non-NULL.
The intention is that if the device is already open and the queues are already created then we don't create them a second time. However, the bug is that if we have an error in the create_txqs() then the pointer doesn't get set back to NULL. The NULL check at the start of the function will say that it's already open when it's not and the device can't be used. Set ->txqs back to NULL on cleanup on error. Fixes: c3e79baf1b03 ("net-next/hinic: Add logical Txq and Rxq") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Link: https://patch.msgid.link/0cc98faf-a0ed-4565-a55b-0fa2734bc205@stanley.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/huawei/hinic/hinic_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index 890f213da8d1..ae1f523d6841 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -172,6 +172,7 @@ err_init_txq: hinic_sq_dbgfs_uninit(nic_dev); devm_kfree(&netdev->dev, nic_dev->txqs); + nic_dev->txqs = NULL; return err; } @@ -268,6 +269,7 @@ err_init_rxq: hinic_rq_dbgfs_uninit(nic_dev); devm_kfree(&netdev->dev, nic_dev->rxqs); + nic_dev->rxqs = NULL; return err; } From b4845bb6383821a9516ce30af3a27dc873e37fd4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 11:00:52 +0200 Subject: [PATCH 223/807] x86/xen: add central hypercall functions Add generic hypercall functions usable for all normal (i.e. not iret) hypercalls. Depending on the guest type and the processor vendor, different functions need to be used, because the instruction required for entering the hypervisor differs: - PV guests need to use syscall - HVM/PVH guests on Intel need to use vmcall - HVM/PVH guests on AMD and Hygon need to use vmmcall As PVH guests need to issue hypercalls very early during boot, there is a 4th hypercall function needed for HVM/PVH which can be used on Intel and AMD processors.
It will check the vendor type and then set the Intel or AMD specific function to use via static_call(). This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra --- arch/x86/include/asm/xen/hypercall.h | 3 + arch/x86/xen/enlighten.c | 65 ++++++++++++++++++++++ arch/x86/xen/enlighten_hvm.c | 4 ++ arch/x86/xen/enlighten_pv.c | 4 +- arch/x86/xen/xen-asm.S | 23 ++++++++ arch/x86/xen/xen-head.S | 83 ++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 9 +++ 7 files changed, 190 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a2dd24947eb8..6b4dd4de08a6 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -88,6 +88,9 @@ struct xen_dm_op_buf; extern struct { char _entry[32]; } hypercall_page[]; +void xen_hypercall_func(void); +DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); + #define __HYPERCALL "call hypercall_page+%c[offset]" #define __HYPERCALL_ENTRY(x) \ [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0])) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 84e5adbd0925..1887435af2fb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,9 @@ EXPORT_SYMBOL_GPL(hypercall_page); +DEFINE_STATIC_CALL(xen_hypercall, xen_hypercall_hvm); +EXPORT_STATIC_CALL_TRAMP(xen_hypercall); + /* * Pointer to the xen_vcpu_info structure or * &HYPERVISOR_shared_info->vcpu_info[cpu]. 
See xen_hvm_init_shared_info @@ -68,6 +72,67 @@ EXPORT_SYMBOL(xen_start_flags); */ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; +static __ref void xen_get_vendor(void) +{ + init_cpu_devs(); + cpu_detect(&boot_cpu_data); + get_cpu_vendor(&boot_cpu_data); +} + +void xen_hypercall_setfunc(void) +{ + if (static_call_query(xen_hypercall) != xen_hypercall_hvm) + return; + + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) + static_call_update(xen_hypercall, xen_hypercall_amd); + else + static_call_update(xen_hypercall, xen_hypercall_intel); +} + +/* + * Evaluate processor vendor in order to select the correct hypercall + * function for HVM/PVH guests. + * Might be called very early in boot before vendor has been set by + * early_cpu_init(). + */ +noinstr void *__xen_hypercall_setfunc(void) +{ + void (*func)(void); + + /* + * Xen is supported only on CPUs with CPUID, so testing for + * X86_FEATURE_CPUID is a test for early_cpu_init() having been + * run. + * + * Note that __xen_hypercall_setfunc() is noinstr only due to a nasty + * dependency chain: it is being called via the xen_hypercall static + * call when running as a PVH or HVM guest. Hypercalls need to be + * noinstr due to PV guests using hypercalls in noinstr code. So we + * can safely tag the function body as "instrumentation ok", since + * the PV guest requirement is not of interest here (xen_get_vendor() + * calls noinstr functions, and static_call_update_early() might do + * so, too). 
+ */ + instrumentation_begin(); + + if (!boot_cpu_has(X86_FEATURE_CPUID)) + xen_get_vendor(); + + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) + func = xen_hypercall_amd; + else + func = xen_hypercall_intel; + + static_call_update_early(xen_hypercall, func); + + instrumentation_end(); + + return func; +} + static int xen_cpu_up_online(unsigned int cpu) { xen_init_lock_cpu(cpu); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 24d2957a4726..973a74fc966a 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -300,6 +300,10 @@ static uint32_t __init xen_platform_hvm(void) if (xen_pv_domain()) return 0; + /* Set correct hypercall function. */ + if (xen_domain) + xen_hypercall_setfunc(); + if (xen_pvh_domain() && nopv) { /* Guest booting via the Xen-PVH boot entry goes here */ pr_info("\"nopv\" parameter is ignored in PVH guest\n"); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index d6818c6cafda..a8eb7e0c473c 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1341,6 +1341,9 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_domain_type = XEN_PV_DOMAIN; xen_start_flags = xen_start_info->flags; + /* Interrupts are guaranteed to be off initially. 
*/ + early_boot_irqs_disabled = true; + static_call_update_early(xen_hypercall, xen_hypercall_pv); xen_setup_features(); @@ -1431,7 +1434,6 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); local_irq_disable(); - early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index ca6edfe4c14b..b518f36d1ca2 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -20,9 +20,32 @@ #include #include +#include #include <../entry/calling.h> .pushsection .noinstr.text, "ax" +/* + * PV hypercall interface to the hypervisor. + * + * Called via inline asm(), so better preserve %rcx and %r11. + * + * Input: + * %eax: hypercall number + * %rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall + * Output: %rax + */ +SYM_FUNC_START(xen_hypercall_pv) + ANNOTATE_NOENDBR + push %rcx + push %r11 + UNWIND_HINT_SAVE + syscall + UNWIND_HINT_RESTORE + pop %r11 + pop %rcx + RET +SYM_FUNC_END(xen_hypercall_pv) + /* * Disabling events is simply a matter of making the event mask * non-zero. diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7f6c69dbb816..c173ba6740e9 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -6,9 +6,11 @@ #include #include +#include #include #include +#include #include #include #include @@ -87,6 +89,87 @@ SYM_CODE_END(xen_cpu_bringup_again) #endif #endif + .pushsection .noinstr.text, "ax" +/* + * Xen hypercall interface to the hypervisor. + * + * Input: + * %eax: hypercall number + * 32-bit: + * %ebx, %ecx, %edx, %esi, %edi: args 1..5 for the hypercall + * 64-bit: + * %rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall + * Output: %[er]ax + */ +SYM_FUNC_START(xen_hypercall_hvm) + ENDBR + FRAME_BEGIN + /* Save all relevant registers (caller save and arguments). 
*/ +#ifdef CONFIG_X86_32 + push %eax + push %ebx + push %ecx + push %edx + push %esi + push %edi +#else + push %rax + push %rcx + push %rdx + push %rdi + push %rsi + push %r11 + push %r10 + push %r9 + push %r8 +#ifdef CONFIG_FRAME_POINTER + pushq $0 /* Dummy push for stack alignment. */ +#endif +#endif + /* Set the vendor specific function. */ + call __xen_hypercall_setfunc + /* Set ZF = 1 if AMD, Restore saved registers. */ +#ifdef CONFIG_X86_32 + lea xen_hypercall_amd, %ebx + cmp %eax, %ebx + pop %edi + pop %esi + pop %edx + pop %ecx + pop %ebx + pop %eax +#else + lea xen_hypercall_amd(%rip), %rbx + cmp %rax, %rbx +#ifdef CONFIG_FRAME_POINTER + pop %rax /* Dummy pop. */ +#endif + pop %r8 + pop %r9 + pop %r10 + pop %r11 + pop %rsi + pop %rdi + pop %rdx + pop %rcx + pop %rax +#endif + /* Use correct hypercall function. */ + jz xen_hypercall_amd + jmp xen_hypercall_intel +SYM_FUNC_END(xen_hypercall_hvm) + +SYM_FUNC_START(xen_hypercall_amd) + vmmcall + RET +SYM_FUNC_END(xen_hypercall_amd) + +SYM_FUNC_START(xen_hypercall_intel) + vmcall + RET +SYM_FUNC_END(xen_hypercall_intel) + .popsection + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index e1b782e823e6..63c13a2ccf55 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -326,4 +326,13 @@ static inline void xen_smp_intr_free_pv(unsigned int cpu) {} static inline void xen_smp_count_cpus(void) { } #endif /* CONFIG_SMP */ +#ifdef CONFIG_XEN_PV +void xen_hypercall_pv(void); +#endif +void xen_hypercall_hvm(void); +void xen_hypercall_amd(void); +void xen_hypercall_intel(void); +void xen_hypercall_setfunc(void); +void *__xen_hypercall_setfunc(void); + #endif /* XEN_OPS_H */ From b1c2cb86f4a7861480ad54bb9a58df3cbebf8e92 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 14:47:13 +0200 Subject: [PATCH 224/807] x86/xen: 
use new hypercall functions instead of hypercall page Call the Xen hypervisor via the new xen_hypercall_func static-call instead of the hypercall page. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra Co-developed-by: Josh Poimboeuf --- arch/x86/include/asm/xen/hypercall.h | 33 +++++++++++++++++----------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 6b4dd4de08a6..7d5f8ad66774 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -39,9 +39,11 @@ #include #include #include +#include #include +#include #include #include #include @@ -91,9 +93,17 @@ extern struct { char _entry[32]; } hypercall_page[]; void xen_hypercall_func(void); DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); -#define __HYPERCALL "call hypercall_page+%c[offset]" -#define __HYPERCALL_ENTRY(x) \ - [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0])) +#ifdef MODULE +#define __ADDRESSABLE_xen_hypercall +#else +#define __ADDRESSABLE_xen_hypercall __ADDRESSABLE_ASM_STR(__SCK__xen_hypercall) +#endif + +#define __HYPERCALL \ + __ADDRESSABLE_xen_hypercall \ + "call __SCT__xen_hypercall" + +#define __HYPERCALL_ENTRY(x) "a" (x) #ifdef CONFIG_X86_32 #define __HYPERCALL_RETREG "eax" @@ -151,7 +161,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_0ARG(); \ asm volatile (__HYPERCALL \ : __HYPERCALL_0PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER0); \ (type)__res; \ }) @@ -162,7 +172,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_1ARG(a1); \ asm volatile (__HYPERCALL \ : __HYPERCALL_1PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER1); \ (type)__res; \ }) @@ -173,7 +183,7 @@ DECLARE_STATIC_CALL(xen_hypercall, 
xen_hypercall_func); __HYPERCALL_2ARG(a1, a2); \ asm volatile (__HYPERCALL \ : __HYPERCALL_2PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER2); \ (type)__res; \ }) @@ -184,7 +194,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_3ARG(a1, a2, a3); \ asm volatile (__HYPERCALL \ : __HYPERCALL_3PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER3); \ (type)__res; \ }) @@ -195,7 +205,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_4ARG(a1, a2, a3, a4); \ asm volatile (__HYPERCALL \ : __HYPERCALL_4PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER4); \ (type)__res; \ }) @@ -209,12 +219,9 @@ xen_single_call(unsigned int call, __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); - if (call >= PAGE_SIZE / sizeof(hypercall_page[0])) - return -EINVAL; - - asm volatile(CALL_NOSPEC + asm volatile(__HYPERCALL : __HYPERCALL_5PARAM - : [thunk_target] "a" (&hypercall_page[call]) + : __HYPERCALL_ENTRY(call) : __HYPERCALL_CLOBBER5); return (long)__res; From 7fa0da5373685e7ed249af3fa317ab1e1ba8b0a6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 15:27:31 +0200 Subject: [PATCH 225/807] x86/xen: remove hypercall page The hypercall page is no longer needed. It can be removed, as from the Xen perspective it is optional. But, from Linux's perspective, it removes naked RET instructions that escape the speculative protections that Call Depth Tracking and/or Untrain Ret are trying to achieve. This is part of XSA-466 / CVE-2024-53241. 
Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Reviewed-by: Andrew Cooper Reviewed-by: Jan Beulich --- arch/x86/include/asm/xen/hypercall.h | 2 -- arch/x86/kernel/callthunks.c | 5 ----- arch/x86/kernel/vmlinux.lds.S | 4 ---- arch/x86/xen/enlighten.c | 2 -- arch/x86/xen/enlighten_hvm.c | 9 +-------- arch/x86/xen/enlighten_pvh.c | 7 ------- arch/x86/xen/xen-head.S | 24 ------------------------ 7 files changed, 1 insertion(+), 52 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 7d5f8ad66774..97771b9d33af 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -88,8 +88,6 @@ struct xen_dm_op_buf; * there aren't more than 5 arguments...) */ -extern struct { char _entry[32]; } hypercall_page[]; - void xen_hypercall_func(void); DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 465647456753..f17d16607882 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -142,11 +142,6 @@ static bool skip_addr(void *dest) if (dest >= (void *)relocate_kernel && dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) return true; -#endif -#ifdef CONFIG_XEN - if (dest >= (void *)hypercall_page && - dest < (void*)hypercall_page + PAGE_SIZE) - return true; #endif return false; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index fab3ac9a4574..6a17396c8174 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -519,14 +519,10 @@ INIT_PER_CPU(irq_stack_backing_store); * linker will never mark as relocatable. (Using just ABSOLUTE() is not * sufficient for that). 
*/ -#ifdef CONFIG_XEN #ifdef CONFIG_XEN_PV xen_elfnote_entry_value = ABSOLUTE(xen_elfnote_entry) + ABSOLUTE(startup_xen); #endif -xen_elfnote_hypercall_page_value = - ABSOLUTE(xen_elfnote_hypercall_page) + ABSOLUTE(hypercall_page); -#endif #ifdef CONFIG_PVH xen_elfnote_phys32_entry_value = ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1887435af2fb..43dcd8c7badc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -22,8 +22,6 @@ #include "xen-ops.h" -EXPORT_SYMBOL_GPL(hypercall_page); - DEFINE_STATIC_CALL(xen_hypercall, xen_hypercall_hvm); EXPORT_STATIC_CALL_TRAMP(xen_hypercall); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 973a74fc966a..fe57ff85d004 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -106,15 +106,8 @@ static void __init init_hvm_pv_info(void) /* PVH set up hypercall page in xen_prepare_pvh(). 
*/ if (xen_pvh_domain()) pv_info.name = "Xen PVH"; - else { - u64 pfn; - uint32_t msr; - + else pv_info.name = "Xen HVM"; - msr = cpuid_ebx(base + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - } xen_setup_features(); diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index bf68c329fc01..0e3d930bcb89 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -129,17 +129,10 @@ static void __init pvh_arch_setup(void) void __init xen_pvh_init(struct boot_params *boot_params) { - u32 msr; - u64 pfn; - xen_pvh = 1; xen_domain_type = XEN_HVM_DOMAIN; xen_start_flags = pvh_start_info.flags; - msr = cpuid_ebx(xen_cpuid_base() + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - x86_init.oem.arch_setup = pvh_arch_setup; x86_init.oem.banner = xen_banner; diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index c173ba6740e9..9252652afe59 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -22,28 +22,6 @@ #include #include -.pushsection .noinstr.text, "ax" - .balign PAGE_SIZE -SYM_CODE_START(hypercall_page) - .rept (PAGE_SIZE / 32) - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - ANNOTATE_UNRET_SAFE - ret - /* - * Xen will write the hypercall page, and sort out ENDBR. - */ - .skip 31, 0xcc - .endr - -#define HYPERCALL(n) \ - .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ - .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 -#include -#undef HYPERCALL -SYM_CODE_END(hypercall_page) -.popsection - #ifdef CONFIG_XEN_PV __INIT SYM_CODE_START(startup_xen) @@ -199,8 +177,6 @@ SYM_FUNC_END(xen_hypercall_intel) #else # define FEATURES_DOM0 0 #endif - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .globl xen_elfnote_hypercall_page; - xen_elfnote_hypercall_page: _ASM_PTR xen_elfnote_hypercall_page_value - .) 
ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0) ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") From 94901b7a74d82bfd30420f1d9d00898278fdc8bf Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 12 Dec 2024 22:00:15 +0900 Subject: [PATCH 226/807] rust: net::phy fix module autoloading The alias symbol name was renamed. Adjust module_phy_driver macro to create the proper symbol name to fix module autoloading. Fixes: 054a9cd395a7 ("modpost: rename alias symbol for MODULE_DEVICE_TABLE()") Signed-off-by: FUJITA Tomonori Link: https://patch.msgid.link/20241212130015.238863-1-fujita.tomonori@gmail.com Signed-off-by: Paolo Abeni --- rust/kernel/net/phy.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index b89c681d97c0..2fbfb6a94c11 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -860,7 +860,7 @@ impl DeviceMask { /// ]; /// #[cfg(MODULE)] /// #[no_mangle] -/// static __mod_mdio__phydev_device_table: [::kernel::bindings::mdio_device_id; 2] = _DEVICE_TABLE; +/// static __mod_device_table__mdio__phydev: [::kernel::bindings::mdio_device_id; 2] = _DEVICE_TABLE; /// ``` #[macro_export] macro_rules! module_phy_driver { @@ -883,7 +883,7 @@ macro_rules! 
module_phy_driver { #[cfg(MODULE)] #[no_mangle] - static __mod_mdio__phydev_device_table: [$crate::bindings::mdio_device_id; + static __mod_device_table__mdio__phydev: [$crate::bindings::mdio_device_id; $crate::module_phy_driver!(@count_devices $($dev),+) + 1] = _DEVICE_TABLE; }; From 212fbabe1dfecdda35bf5aaa900f745a3bab5ac4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 16 Dec 2024 19:28:24 +0000 Subject: [PATCH 227/807] KVM: arm64: Fix set_id_regs selftest for ASIDBITS becoming unwritable In commit 03c7527e97f7 ("KVM: arm64: Do not allow ID_AA64MMFR0_EL1.ASIDbits to be overridden") we made that bitfield in the ID registers unwritable however the change neglected to make the corresponding update to set_id_regs resulting in it failing: ok 56 ID_AA64MMFR0_EL1_BIGEND ==== Test Assertion Failure ==== aarch64/set_id_regs.c:434: masks[idx] & ftr_bits[j].mask == ftr_bits[j].mask pid=5566 tid=5566 errno=22 - Invalid argument 1 0x00000000004034a7: test_vm_ftr_id_regs at set_id_regs.c:434 2 0x0000000000401b53: main at set_id_regs.c:684 3 0x0000ffff8e6b7543: ?? ??:0 4 0x0000ffff8e6b7617: ?? ??:0 5 0x0000000000401e6f: _start at ??:? not ok 8 selftests: kvm: set_id_regs # exit=254 Remove ID_AA64MMFR0_EL1.ASIDBITS from the set of bitfields we test for writeability. 
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241216-kvm-arm64-fix-set-id-asidbits-v1-1-8b105b888fc3@kernel.org Acked-by: Marc Zyngier Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index a79b7f18452d..3a97c160b5fe 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -152,7 +152,6 @@ static const struct reg_ftr_bits ftr_id_aa64mmfr0_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, BIGENDEL0, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, SNSMEM, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, BIGEND, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ASIDBITS, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, PARANGE, 0), REG_FTR_END, }; From abcc2ddae5f82aa6cfca162e3db643dd33f0a2e8 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Wed, 27 Nov 2024 09:40:04 -0800 Subject: [PATCH 228/807] i915/guc: Reset engine utilization buffer before registration On GT reset, we store total busyness counts for all engines and re-register the utilization buffer with GuC. At that time we should reset the buffer, so that we don't get spurious busyness counts on subsequent queries. To repro this issue, run igt@perf_pmu@busy-hang followed by igt@perf_pmu@most-busy-idle-check-all for a couple iterations. 
Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20241127174006.190128-2-umesh.nerlige.ramappa@intel.com (cherry picked from commit abd318237fa6556c1e5225529af145ef15d5ff0d) Signed-off-by: Tvrtko Ursulin --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 9ede6f240d79..b1d0c66e166f 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1243,6 +1243,21 @@ static void __get_engine_usage_record(struct intel_engine_cs *engine, } while (++i < 6); } +static void __set_engine_usage_record(struct intel_engine_cs *engine, + u32 last_in, u32 id, u32 total) +{ + struct iosys_map rec_map = intel_guc_engine_usage_record_map(engine); + +#define record_write(map_, field_, val_) \ + iosys_map_wr_field(map_, 0, struct guc_engine_usage_record, field_, val_) + + record_write(&rec_map, last_switch_in_stamp, last_in); + record_write(&rec_map, current_context_index, id); + record_write(&rec_map, total_runtime, total); + +#undef record_write +} + static void guc_update_engine_gt_clks(struct intel_engine_cs *engine) { struct intel_engine_guc_stats *stats = &engine->stats.guc; @@ -1543,6 +1558,9 @@ err_trylock: static int guc_action_enable_usage_stats(struct intel_guc *guc) { + struct intel_gt *gt = guc_to_gt(guc); + struct intel_engine_cs *engine; + enum intel_engine_id id; u32 offset = intel_guc_engine_usage_offset(guc); u32 action[] = { INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF, @@ -1550,6 +1568,9 @@ static int guc_action_enable_usage_stats(struct intel_guc *guc) 0, }; + for_each_engine(engine, gt, id) + __set_engine_usage_record(engine, 0, 0xffffffff, 0); + return intel_guc_send(guc, action, 
ARRAY_SIZE(action)); } From 59a0b46788d58fdcee8d2f6b4e619d264a1799bf Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Wed, 27 Nov 2024 09:40:05 -0800 Subject: [PATCH 229/807] i915/guc: Ensure busyness counter increases monotonically Active busyness of an engine is calculated using gt timestamp and the context switch in time. While capturing the gt timestamp, it's possible that the context switches out. This race could result in an active busyness value that is greater than the actual context runtime value by a small amount. This leads to a negative delta and throws off busyness calculations for the user. If a subsequent count is smaller than the previous one, just return the previous one, since we expect the busyness to catch up. Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20241127174006.190128-3-umesh.nerlige.ramappa@intel.com (cherry picked from commit cf907f6d294217985e9dafd9985dce874e04ca37) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/intel_engine_types.h | 5 +++++ drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index ba55c059063d..fe1f85e5dda3 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -343,6 +343,11 @@ struct intel_engine_guc_stats { * @start_gt_clk: GT clock time of last idle to active transition. 
*/ u64 start_gt_clk; + + /** + * @total: The last value of total returned + */ + u64 total; }; union intel_engine_tlb_inv_reg { diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index b1d0c66e166f..9dcf76d440d8 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1378,9 +1378,12 @@ static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) total += intel_gt_clock_interval_to_ns(gt, clk); } + if (total > stats->total) + stats->total = total; + spin_unlock_irqrestore(&guc->timestamp.lock, flags); - return ns_to_ktime(total); + return ns_to_ktime(stats->total); } static void guc_enable_busyness_worker(struct intel_guc *guc) From 1622ed27d26ab4c234476be746aa55bcd39159dd Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Wed, 27 Nov 2024 09:40:06 -0800 Subject: [PATCH 230/807] i915/guc: Accumulate active runtime on gt reset On gt reset, if a context is running, then accumulate its active time into the busyness counter since there will be no chance for the context to switch out and update its run time. 
v2: Move comment right above the if (John) Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20241127174006.190128-4-umesh.nerlige.ramappa@intel.com (cherry picked from commit 7ed047da59cfa1acb558b95169d347acc8d85da1) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 9dcf76d440d8..c0bd730383f2 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1449,8 +1449,21 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) guc_update_pm_timestamp(guc, &unused); for_each_engine(engine, gt, id) { + struct intel_engine_guc_stats *stats = &engine->stats.guc; + guc_update_engine_gt_clks(engine); - engine->stats.guc.prev_total = 0; + + /* + * If resetting a running context, accumulate the active + * time as well since there will be no context switch. + */ + if (stats->running) { + u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk; + + stats->total_gt_clks += clk; + } + stats->prev_total = 0; + stats->running = 0; } spin_unlock_irqrestore(&guc->timestamp.lock, flags); From e21ebe51af688eb98fd6269240212a3c7300deea Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 17 Dec 2024 12:21:21 +0200 Subject: [PATCH 231/807] xhci: Turn NEC specific quirk for handling Stop Endpoint errors generic xHC hosts from several vendors have the same issue where endpoints start so slowly that a later queued 'Stop Endpoint' command may complete before endpoint is up and running. The 'Stop Endpoint' command fails with context state error as the endpoint still appears as stopped. 
See commit 42b758137601 ("usb: xhci: Limit Stop Endpoint retries") for details CC: stable@vger.kernel.org Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20241217102122.2316814-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 4cf5363875c7..09b05a62375e 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -1199,8 +1199,6 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id, * Keep retrying until the EP starts and stops again, on * chips where this is known to help. Wait for 100ms. */ - if (!(xhci->quirks & XHCI_NEC_HOST)) - break; if (time_is_before_jiffies(ep->stop_time + msecs_to_jiffies(100))) break; fallthrough; From b9252f80b807801056e67e3a672fb1be0ecb81d8 Mon Sep 17 00:00:00 2001 From: Niklas Neronin Date: Tue, 17 Dec 2024 12:21:22 +0200 Subject: [PATCH 232/807] usb: xhci: fix ring expansion regression in 6.13-rc1 The source and destination rings were incorrectly assigned during the ring linking process. The "source" ring, which contains the new segments, was not spliced into the "destination" ring, leading to incorrect ring expansion. 
Fixes: fe688e500613 ("usb: xhci: refactor xhci_link_rings() to use source and destination rings") Reported-by: Jeff Chua Closes: https://lore.kernel.org/lkml/CAAJw_ZtppNqC9XA=-WVQDr+vaAS=di7jo15CzSqONeX48H75MA@mail.gmail.com/ Signed-off-by: Niklas Neronin Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20241217102122.2316814-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 15db90c54a45..92703efda1f7 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -436,7 +436,7 @@ int xhci_ring_expansion(struct xhci_hcd *xhci, struct xhci_ring *ring, goto free_segments; } - xhci_link_rings(xhci, ring, &new_ring); + xhci_link_rings(xhci, &new_ring, ring); trace_xhci_ring_expansion(ring); xhci_dbg_trace(xhci, trace_xhci_dbg_ring_expansion, "ring expansion succeed, now has %d segments", From 70465acbb0ce1bb69447acf32f136c8153cda0de Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 2 Dec 2024 09:53:17 +0800 Subject: [PATCH 233/807] exfat: fix exfat_find_empty_entry() not returning error on failure On failure, "dentry" is the error code. If the error code indicates that there is no space, a new cluster may need to be allocated; for other errors, it should be returned directly. Only on success, "dentry" is the index of the directory entry, and it needs to be converted into the directory entry index within the cluster where it is located. 
Fixes: 8a3f5711ad74 ("exfat: reduce FAT chain traversal") Reported-by: syzbot+6f6c9397e0078ef60bce@syzkaller.appspotmail.com Tested-by: syzbot+6f6c9397e0078ef60bce@syzkaller.appspotmail.com Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 97d2774760fe..099f80645072 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -330,8 +330,8 @@ static int exfat_find_empty_entry(struct inode *inode, while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir, num_entries, es)) < 0) { - if (dentry == -EIO) - break; + if (dentry != -ENOSPC) + return dentry; if (exfat_check_max_dentries(inode)) return -ENOSPC; From 00a973e093e93690d433f1e1873ee52a6a6eca1f Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Wed, 4 Dec 2024 00:33:34 +0200 Subject: [PATCH 234/807] interconnect: qcom: icc-rpm: Set the count member before accessing the flex array The following UBSAN error is reported during boot on the db410c board on a clang-19 build: Internal error: UBSAN: array index out of bounds: 00000000f2005512 [#1] PREEMPT SMP ... pc : qnoc_probe+0x5f8/0x5fc ... The cause of the error is that the counter member was not set before accessing the annotated flexible array member, but after that. Fix this by initializing it earlier. 
Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/r/CA+G9fYs+2mBz1y2dAzxkj9-oiBJ2Acm1Sf1h2YQ3VmBqj_VX2g@mail.gmail.com Fixes: dd4904f3b924 ("interconnect: qcom: Annotate struct icc_onecell_data with __counted_by") Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20241203223334.233404-1-djakov@kernel.org Signed-off-by: Georgi Djakov --- drivers/interconnect/qcom/icc-rpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/interconnect/qcom/icc-rpm.c b/drivers/interconnect/qcom/icc-rpm.c index a8ed435f696c..ea1042d38128 100644 --- a/drivers/interconnect/qcom/icc-rpm.c +++ b/drivers/interconnect/qcom/icc-rpm.c @@ -503,6 +503,7 @@ int qnoc_probe(struct platform_device *pdev) GFP_KERNEL); if (!data) return -ENOMEM; + data->num_nodes = num_nodes; qp->num_intf_clks = cd_num; for (i = 0; i < cd_num; i++) @@ -597,7 +598,6 @@ regmap_done: data->nodes[i] = node; } - data->num_nodes = num_nodes; clk_bulk_disable_unprepare(qp->num_intf_clks, qp->intf_clks); From 44c5aa73ccd1e8a738fd011354ee8fb9fcda201a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 2 Dec 2024 17:57:23 +0100 Subject: [PATCH 235/807] interconnect: icc-clk: check return values of devm_kasprintf() devm_kasprintf() can fail and return NULL, add missing return value checks. 
Fixes: 0ac2a08f42ce ("interconnect: add clk-based icc provider support") Signed-off-by: Bartosz Golaszewski Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20241202165723.17292-1-brgl@bgdev.pl Signed-off-by: Georgi Djakov --- drivers/interconnect/icc-clk.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/interconnect/icc-clk.c b/drivers/interconnect/icc-clk.c index b956e4050f38..88f311c11020 100644 --- a/drivers/interconnect/icc-clk.c +++ b/drivers/interconnect/icc-clk.c @@ -116,6 +116,11 @@ struct icc_provider *icc_clk_register(struct device *dev, } node->name = devm_kasprintf(dev, GFP_KERNEL, "%s_master", data[i].name); + if (!node->name) { + ret = -ENOMEM; + goto err; + } + node->data = &qp->clocks[i]; icc_node_add(node, provider); /* link to the next node, slave */ @@ -129,6 +134,11 @@ struct icc_provider *icc_clk_register(struct device *dev, } node->name = devm_kasprintf(dev, GFP_KERNEL, "%s_slave", data[i].name); + if (!node->name) { + ret = -ENOMEM; + goto err; + } + /* no data for slave node */ icc_node_add(node, provider); onecell->nodes[j++] = node; From 7d2f320e12744e5906a4fab40381060a81d22c12 Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Fri, 13 Dec 2024 18:01:58 +0530 Subject: [PATCH 236/807] net: ethernet: oa_tc6: fix infinite loop error when tx credits becomes 0 SPI thread wakes up to perform SPI transfer whenever there is a TX skb from n/w stack or interrupt from MAC-PHY. Ethernet frame from TX skb is transferred based on the availability of tx credits in the MAC-PHY which is reported from the previous SPI transfer. Sometimes there is a possibility that TX skb is available to transmit but there are no tx credits from MAC-PHY. In this case, there will not be any SPI transfer but the thread will be running in an endless loop until tx credits are available again. So checking the availability of tx credits along with TX skb will prevent the above infinite loop. 
When the tx credits are available again, that will be notified through an interrupt, which will trigger the SPI transfer to get the available tx credits. Fixes: 53fbde8ab21e ("net: ethernet: oa_tc6: implement transmit path to transfer tx ethernet frames") Reviewed-by: Jacob Keller Signed-off-by: Parthiban Veerasooran Signed-off-by: Paolo Abeni --- drivers/net/ethernet/oa_tc6.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c index f9c0dcd965c2..4c8b0ca922b7 100644 --- a/drivers/net/ethernet/oa_tc6.c +++ b/drivers/net/ethernet/oa_tc6.c @@ -1111,8 +1111,9 @@ static int oa_tc6_spi_thread_handler(void *data) /* This kthread will be waken up if there is a tx skb or mac-phy * interrupt to perform spi transfer with tx chunks. */ - wait_event_interruptible(tc6->spi_wq, tc6->waiting_tx_skb || - tc6->int_flag || + wait_event_interruptible(tc6->spi_wq, tc6->int_flag || + (tc6->waiting_tx_skb && + tc6->tx_credits) || kthread_should_stop()); if (kthread_should_stop()) From e592b5110b3e9393881b0a019d86832bbf71a47f Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Fri, 13 Dec 2024 18:01:59 +0530 Subject: [PATCH 237/807] net: ethernet: oa_tc6: fix tx skb race condition between reference pointers There are two skb pointers to manage tx skb's enqueued from n/w stack. waiting_tx_skb pointer points to the tx skb which needs to be processed and ongoing_tx_skb pointer points to the tx skb which is being processed. SPI thread prepares the tx data chunks from the tx skb pointed by the ongoing_tx_skb pointer. When the tx skb pointed by the ongoing_tx_skb is processed, the tx skb pointed by the waiting_tx_skb is assigned to ongoing_tx_skb and the waiting_tx_skb pointer is assigned with NULL. Whenever there is a new tx skb from n/w stack, it will be assigned to waiting_tx_skb pointer if it is NULL. Enqueuing and processing of a tx skb are handled in two different threads. 
Consider a scenario where the SPI thread processed an ongoing_tx_skb and it moves next tx skb from waiting_tx_skb pointer to ongoing_tx_skb pointer without doing any NULL check. At this time, if the waiting_tx_skb pointer is NULL then ongoing_tx_skb pointer is also assigned with NULL. After that, if a new tx skb is assigned to waiting_tx_skb pointer by the n/w stack and there is a chance to overwrite the tx skb pointer with NULL in the SPI thread. Finally one of the tx skb will be left as unhandled, resulting packet missing and memory leak. - Consider the below scenario where the TXC reported from the previous transfer is 10 and ongoing_tx_skb holds an tx ethernet frame which can be transported in 20 TXCs and waiting_tx_skb is still NULL. tx_credits = 10; /* 21 are filled in the previous transfer */ ongoing_tx_skb = 20; waiting_tx_skb = NULL; /* Still NULL */ - So, (tc6->ongoing_tx_skb || tc6->waiting_tx_skb) becomes true. - After oa_tc6_prepare_spi_tx_buf_for_tx_skbs() ongoing_tx_skb = 10; waiting_tx_skb = NULL; /* Still NULL */ - Perform SPI transfer. - Process SPI rx buffer to get the TXC from footers. - Now let's assume previously filled 21 TXCs are freed so we are good to transport the next remaining 10 tx chunks from ongoing_tx_skb. tx_credits = 21; ongoing_tx_skb = 10; waiting_tx_skb = NULL; - So, (tc6->ongoing_tx_skb || tc6->waiting_tx_skb) becomes true again. - In the oa_tc6_prepare_spi_tx_buf_for_tx_skbs() ongoing_tx_skb = NULL; waiting_tx_skb = NULL; - Now the below bad case might happen, Thread1 (oa_tc6_start_xmit) Thread2 (oa_tc6_spi_thread_handler) --------------------------- ----------------------------------- - if waiting_tx_skb is NULL - if ongoing_tx_skb is NULL - ongoing_tx_skb = waiting_tx_skb - waiting_tx_skb = skb - waiting_tx_skb = NULL ... 
- ongoing_tx_skb = NULL - if waiting_tx_skb is NULL - waiting_tx_skb = skb To overcome the above issue, protect the moving of tx skb reference from waiting_tx_skb pointer to ongoing_tx_skb pointer and assigning new tx skb to waiting_tx_skb pointer, so that the other thread can't access the waiting_tx_skb pointer until the current thread completes moving the tx skb reference safely. Fixes: 53fbde8ab21e ("net: ethernet: oa_tc6: implement transmit path to transfer tx ethernet frames") Signed-off-by: Parthiban Veerasooran Reviewed-by: Larysa Zaremba Signed-off-by: Paolo Abeni --- drivers/net/ethernet/oa_tc6.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c index 4c8b0ca922b7..db200e4ec284 100644 --- a/drivers/net/ethernet/oa_tc6.c +++ b/drivers/net/ethernet/oa_tc6.c @@ -113,6 +113,7 @@ struct oa_tc6 { struct mii_bus *mdiobus; struct spi_device *spi; struct mutex spi_ctrl_lock; /* Protects spi control transfer */ + spinlock_t tx_skb_lock; /* Protects tx skb handling */ void *spi_ctrl_tx_buf; void *spi_ctrl_rx_buf; void *spi_data_tx_buf; @@ -1004,8 +1005,10 @@ static u16 oa_tc6_prepare_spi_tx_buf_for_tx_skbs(struct oa_tc6 *tc6) for (used_tx_credits = 0; used_tx_credits < tc6->tx_credits; used_tx_credits++) { if (!tc6->ongoing_tx_skb) { + spin_lock_bh(&tc6->tx_skb_lock); tc6->ongoing_tx_skb = tc6->waiting_tx_skb; tc6->waiting_tx_skb = NULL; + spin_unlock_bh(&tc6->tx_skb_lock); } if (!tc6->ongoing_tx_skb) break; @@ -1210,7 +1213,9 @@ netdev_tx_t oa_tc6_start_xmit(struct oa_tc6 *tc6, struct sk_buff *skb) return NETDEV_TX_OK; } + spin_lock_bh(&tc6->tx_skb_lock); tc6->waiting_tx_skb = skb; + spin_unlock_bh(&tc6->tx_skb_lock); /* Wake spi kthread to perform spi transfer */ wake_up_interruptible(&tc6->spi_wq); @@ -1240,6 +1245,7 @@ struct oa_tc6 *oa_tc6_init(struct spi_device *spi, struct net_device *netdev) tc6->netdev = netdev; SET_NETDEV_DEV(netdev, &spi->dev); mutex_init(&tc6->spi_ctrl_lock); + 
spin_lock_init(&tc6->tx_skb_lock); /* Set the SPI controller to pump at realtime priority */ tc6->spi->rt = true; From 0cb2c504d79e7caa3abade3f466750c82ad26f01 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sat, 14 Dec 2024 10:49:12 +0900 Subject: [PATCH 238/807] net: ethernet: bgmac-platform: fix an OF node reference leak The OF node obtained by of_parse_phandle() is not freed. Call of_node_put() to balance the refcount. This bug was found by an experimental static analysis tool that I am developing. Fixes: 1676aba5ef7e ("net: ethernet: bgmac: device tree phy enablement") Signed-off-by: Joe Hattori Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241214014912.2810315-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bgmac-platform.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c b/drivers/net/ethernet/broadcom/bgmac-platform.c index ecce23cecbea..4e266ce41180 100644 --- a/drivers/net/ethernet/broadcom/bgmac-platform.c +++ b/drivers/net/ethernet/broadcom/bgmac-platform.c @@ -171,6 +171,7 @@ static int platform_phy_connect(struct bgmac *bgmac) static int bgmac_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; + struct device_node *phy_node; struct bgmac *bgmac; struct resource *regs; int ret; @@ -236,7 +237,9 @@ static int bgmac_probe(struct platform_device *pdev) bgmac->cco_ctl_maskset = platform_bgmac_cco_ctl_maskset; bgmac->get_bus_clock = platform_bgmac_get_bus_clock; bgmac->cmn_maskset32 = platform_bgmac_cmn_maskset32; - if (of_parse_phandle(np, "phy-handle", 0)) { + phy_node = of_parse_phandle(np, "phy-handle", 0); + if (phy_node) { + of_node_put(phy_node); bgmac->phy_connect = platform_phy_connect; } else { bgmac->phy_connect = bgmac_phy_connect_direct; From 2b2fc0be98a828cf33a88a28e9745e8599fb05cf Mon Sep 17 00:00:00 2001 From: Zhang Kunbo Date: Tue, 17 Dec 2024 07:18:36 +0000 Subject: [PATCH 239/807] fs: 
fix missing declaration of init_files fs/file.c should include include/linux/init_task.h for declaration of init_files. This fixes the sparse warning: fs/file.c:501:21: warning: symbol 'init_files' was not declared. Should it be static? Signed-off-by: Zhang Kunbo Link: https://lore.kernel.org/r/20241217071836.2634868-1-zhangkunbo@huawei.com Signed-off-by: Christian Brauner --- fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/file.c b/fs/file.c index fb1011cf6b4a..25c6e53b03f8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "internal.h" From e8d0ba147d901022bcb69da8d8fd817f84e9f3ca Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 17 Dec 2024 11:10:19 +0200 Subject: [PATCH 240/807] ASoC: SOF: Intel: hda-dai: Do not release the link DMA on STOP The linkDMA should not be released on stop trigger since a stream re-start might happen without closing of the stream. This leaves a short time for other streams to 'steal' the linkDMA since it has been released. This issue is not easy to reproduce under normal conditions as usually after stop the stream is closed, or the same stream is restarted, but if another stream got in between the stop and start, like this: aplay -Dhw:0,3 -c2 -r48000 -fS32_LE /dev/zero -d 120 CTRL+z aplay -Dhw:0,0 -c2 -r48000 -fS32_LE /dev/zero -d 120 then the link DMA channels will be mixed up, resulting firmware error or crash. 
Fixes: ab5593793e90 ("ASoC: SOF: Intel: hda: Always clean up link DMA during stop") Cc: stable@vger.kernel.org Closes: https://github.com/thesofproject/sof/issues/9695 Signed-off-by: Peter Ujfalusi Reviewed-by: Ranjani Sridharan Reviewed-by: Liam Girdwood Reviewed-by: Bard Liao Link: https://patch.msgid.link/20241217091019.31798-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-dai.c | 25 +++++++++++++++++++------ sound/soc/sof/intel/hda.h | 2 -- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index c13f89b7065e..0db2a3e554fb 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -103,8 +103,10 @@ hda_dai_get_ops(struct snd_pcm_substream *substream, struct snd_soc_dai *cpu_dai return sdai->platform_private; } -int hda_link_dma_cleanup(struct snd_pcm_substream *substream, struct hdac_ext_stream *hext_stream, - struct snd_soc_dai *cpu_dai) +static int +hda_link_dma_cleanup(struct snd_pcm_substream *substream, + struct hdac_ext_stream *hext_stream, + struct snd_soc_dai *cpu_dai, bool release) { const struct hda_dai_widget_dma_ops *ops = hda_dai_get_ops(substream, cpu_dai); struct sof_intel_hda_stream *hda_stream; @@ -128,6 +130,17 @@ int hda_link_dma_cleanup(struct snd_pcm_substream *substream, struct hdac_ext_st snd_hdac_ext_bus_link_clear_stream_id(hlink, stream_tag); } + if (!release) { + /* + * Force stream reconfiguration without releasing the channel on + * subsequent stream restart (without free), including LinkDMA + * reset. 
+ * The stream is released via hda_dai_hw_free() + */ + hext_stream->link_prepared = 0; + return 0; + } + if (ops->release_hext_stream) ops->release_hext_stream(sdev, cpu_dai, substream); @@ -211,7 +224,7 @@ static int __maybe_unused hda_dai_hw_free(struct snd_pcm_substream *substream, if (!hext_stream) return 0; - return hda_link_dma_cleanup(substream, hext_stream, cpu_dai); + return hda_link_dma_cleanup(substream, hext_stream, cpu_dai, true); } static int __maybe_unused hda_dai_hw_params_data(struct snd_pcm_substream *substream, @@ -304,7 +317,8 @@ static int __maybe_unused hda_dai_trigger(struct snd_pcm_substream *substream, i switch (cmd) { case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_SUSPEND: - ret = hda_link_dma_cleanup(substream, hext_stream, dai); + ret = hda_link_dma_cleanup(substream, hext_stream, dai, + cmd == SNDRV_PCM_TRIGGER_STOP ? false : true); if (ret < 0) { dev_err(sdev->dev, "%s: failed to clean up link DMA\n", __func__); return ret; @@ -660,8 +674,7 @@ static int hda_dai_suspend(struct hdac_bus *bus) } ret = hda_link_dma_cleanup(hext_stream->link_substream, - hext_stream, - cpu_dai); + hext_stream, cpu_dai, true); if (ret < 0) return ret; } diff --git a/sound/soc/sof/intel/hda.h b/sound/soc/sof/intel/hda.h index 22bd9c3c8216..ee4ccc1a5490 100644 --- a/sound/soc/sof/intel/hda.h +++ b/sound/soc/sof/intel/hda.h @@ -1038,8 +1038,6 @@ const struct hda_dai_widget_dma_ops * hda_select_dai_widget_ops(struct snd_sof_dev *sdev, struct snd_sof_widget *swidget); int hda_dai_config(struct snd_soc_dapm_widget *w, unsigned int flags, struct snd_sof_dai_config_data *data); -int hda_link_dma_cleanup(struct snd_pcm_substream *substream, struct hdac_ext_stream *hext_stream, - struct snd_soc_dai *cpu_dai); static inline struct snd_sof_dev *widget_to_sdev(struct snd_soc_dapm_widget *w) { From 7ed2d91588779f0a2b27fd502ce2aaf1fab9b3ca Mon Sep 17 00:00:00 2001 From: Gianfranco Trad Date: Sun, 15 Dec 2024 02:17:34 +0100 Subject: [PATCH 241/807] qed: fix possible 
uninit pointer read in qed_mcp_nvm_info_populate() Coverity reports an uninit pointer read in qed_mcp_nvm_info_populate(). If EOPNOTSUPP is returned from qed_mcp_bist_nvm_get_num_images() ensure nvm_info.num_images is set to 0 to avoid possible uninit assignment to p_hwfn->nvm_info.image_att later on in out label. Closes: https://scan5.scan.coverity.com/#/project-view/63204/10063?selectedIssue=1636666 Suggested-by: Simon Horman Signed-off-by: Gianfranco Trad Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241215011733.351325-2-gianf.trad@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index b45efc272fdb..c7f497c36f66 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -3358,6 +3358,7 @@ int qed_mcp_nvm_info_populate(struct qed_hwfn *p_hwfn) p_ptt, &nvm_info.num_images); if (rc == -EOPNOTSUPP) { DP_INFO(p_hwfn, "DRV_MSG_CODE_BIST_TEST is not supported\n"); + nvm_info.num_images = 0; goto out; } else if (rc || !nvm_info.num_images) { DP_ERR(p_hwfn, "Failed getting number of images\n"); From a37eecb705f33726f1fb7cd2a67e514a15dfe693 Mon Sep 17 00:00:00 2001 From: Evgenii Shatokhin Date: Mon, 9 Dec 2024 10:46:59 +0300 Subject: [PATCH 242/807] pinctrl: mcp23s08: Fix sleeping in atomic context due to regmap locking If a device uses MCP23xxx IO expander to receive IRQs, the following bug can happen: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:283 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, ... preempt_count: 1, expected: 0 ... Call Trace: ... __might_resched+0x104/0x10e __might_sleep+0x3e/0x62 mutex_lock+0x20/0x4c regmap_lock_mutex+0x10/0x18 regmap_update_bits_base+0x2c/0x66 mcp23s08_irq_set_type+0x1ae/0x1d6 __irq_set_trigger+0x56/0x172 __setup_irq+0x1e6/0x646 request_threaded_irq+0xb6/0x160 ... 
We observed the problem while experimenting with a touchscreen driver which used MCP23017 IO expander (I2C). The regmap in the pinctrl-mcp23s08 driver uses a mutex for protection from concurrent accesses, which is the default for regmaps without .fast_io, .disable_locking, etc. mcp23s08_irq_set_type() calls regmap_update_bits_base(), and the latter locks the mutex. However, __setup_irq() locks desc->lock spinlock before calling these functions. As a result, the system tries to lock the mutex while holding the spinlock. It seems, the internal regmap locks are not needed in this driver at all. mcp->lock seems to protect the regmap from concurrent accesses already, except, probably, in mcp_pinconf_get/set. mcp23s08_irq_set_type() and mcp23s08_irq_mask/unmask() are called under chip_bus_lock(), which calls mcp23s08_irq_bus_lock(). The latter takes mcp->lock and enables regmap caching, so that the potentially slow I2C accesses are deferred until chip_bus_unlock(). The accesses to the regmap from mcp23s08_probe_one() do not need additional locking. In all remaining places where the regmap is accessed, except mcp_pinconf_get/set(), the driver already takes mcp->lock. This patch adds locking in mcp_pinconf_get/set() and disables internal locking in the regmap config. Among other things, it fixes the sleeping in atomic context described above. 
Fixes: 8f38910ba4f6 ("pinctrl: mcp23s08: switch to regmap caching") Cc: stable@vger.kernel.org Signed-off-by: Evgenii Shatokhin Link: https://lore.kernel.org/20241209074659.1442898-1-e.shatokhin@yadro.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-mcp23s08.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pinctrl/pinctrl-mcp23s08.c b/drivers/pinctrl/pinctrl-mcp23s08.c index d66c3a3e8429..b96e6368a956 100644 --- a/drivers/pinctrl/pinctrl-mcp23s08.c +++ b/drivers/pinctrl/pinctrl-mcp23s08.c @@ -86,6 +86,7 @@ const struct regmap_config mcp23x08_regmap = { .num_reg_defaults = ARRAY_SIZE(mcp23x08_defaults), .cache_type = REGCACHE_FLAT, .max_register = MCP_OLAT, + .disable_locking = true, /* mcp->lock protects the regmap */ }; EXPORT_SYMBOL_GPL(mcp23x08_regmap); @@ -132,6 +133,7 @@ const struct regmap_config mcp23x17_regmap = { .num_reg_defaults = ARRAY_SIZE(mcp23x17_defaults), .cache_type = REGCACHE_FLAT, .val_format_endian = REGMAP_ENDIAN_LITTLE, + .disable_locking = true, /* mcp->lock protects the regmap */ }; EXPORT_SYMBOL_GPL(mcp23x17_regmap); @@ -228,7 +230,9 @@ static int mcp_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, switch (param) { case PIN_CONFIG_BIAS_PULL_UP: + mutex_lock(&mcp->lock); ret = mcp_read(mcp, MCP_GPPU, &data); + mutex_unlock(&mcp->lock); if (ret < 0) return ret; status = (data & BIT(pin)) ? 1 : 0; @@ -257,7 +261,9 @@ static int mcp_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, switch (param) { case PIN_CONFIG_BIAS_PULL_UP: + mutex_lock(&mcp->lock); ret = mcp_set_bit(mcp, MCP_GPPU, pin, arg); + mutex_unlock(&mcp->lock); break; default: dev_dbg(mcp->dev, "Invalid config param %04x\n", param); From 69d803c40edeaf94089fbc8751c9b746cdc35044 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Mon, 16 Dec 2024 22:21:52 +0800 Subject: [PATCH 243/807] nfsd: Revert "nfsd: release svc_expkey/svc_export with rcu_work" This reverts commit f8c989a0c89a75d30f899a7cabdc14d72522bb8d. 
Before this commit, svc_export_put or expkey_put will call path_put with sync mode. After this commit, path_put will be called with async mode. And this can lead to unexpected results, as shown below. mkfs.xfs -f /dev/sda echo "/ *(rw,no_root_squash,fsid=0)" > /etc/exports echo "/mnt *(rw,no_root_squash,fsid=1)" >> /etc/exports exportfs -ra service nfs-server start mount -t nfs -o vers=4.0 127.0.0.1:/mnt /mnt1 mount /dev/sda /mnt/sda touch /mnt1/sda/file exportfs -r umount /mnt/sda # failed unexpectedly The touch will finally call nfsd_cross_mnt, add refcount to mount, and then add cache_head. Before this commit, exportfs -r will call cache_flush to cleanup all cache_head, and path_put in svc_export_put/expkey_put will be finished with sync mode. So, the latter umount will always succeed. However, after this commit, path_put will be called with async mode, the latter umount may fail, and if we add some delay, umount will succeed too. Personally I think this is a bug and should be fixed. We first revert the earlier bugfix patch, and then fix the original bug in a different way. 
Fixes: f8c989a0c89a ("nfsd: release svc_expkey/svc_export with rcu_work") Signed-off-by: Yang Erkun Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 31 ++++++------------------------- fs/nfsd/export.h | 4 ++-- 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index eacafe46e3b6..aa4712362b3b 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -40,24 +40,15 @@ #define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) -static void expkey_put_work(struct work_struct *work) +static void expkey_put(struct kref *ref) { - struct svc_expkey *key = - container_of(to_rcu_work(work), struct svc_expkey, ek_rcu_work); + struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); if (test_bit(CACHE_VALID, &key->h.flags) && !test_bit(CACHE_NEGATIVE, &key->h.flags)) path_put(&key->ek_path); auth_domain_put(key->ek_client); - kfree(key); -} - -static void expkey_put(struct kref *ref) -{ - struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); - - INIT_RCU_WORK(&key->ek_rcu_work, expkey_put_work); - queue_rcu_work(system_wq, &key->ek_rcu_work); + kfree_rcu(key, ek_rcu); } static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) @@ -364,26 +355,16 @@ static void export_stats_destroy(struct export_stats *stats) EXP_STATS_COUNTERS_NUM); } -static void svc_export_put_work(struct work_struct *work) +static void svc_export_put(struct kref *ref) { - struct svc_export *exp = - container_of(to_rcu_work(work), struct svc_export, ex_rcu_work); - + struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); export_stats_destroy(exp->ex_stats); kfree(exp->ex_stats); kfree(exp->ex_uuid); - kfree(exp); -} - -static void svc_export_put(struct kref *ref) -{ - struct svc_export *exp = container_of(ref, struct svc_export, h.ref); - - INIT_RCU_WORK(&exp->ex_rcu_work, 
svc_export_put_work); - queue_rcu_work(system_wq, &exp->ex_rcu_work); + kfree_rcu(exp, ex_rcu); } static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 6f2fbaae01fa..4d92b99c1ffd 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -75,7 +75,7 @@ struct svc_export { u32 ex_layout_types; struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; - struct rcu_work ex_rcu_work; + struct rcu_head ex_rcu; unsigned long ex_xprtsec_modes; struct export_stats *ex_stats; }; @@ -92,7 +92,7 @@ struct svc_expkey { u32 ek_fsid[6]; struct path ek_path; - struct rcu_work ek_rcu_work; + struct rcu_head ek_rcu; }; #define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) From 020b40f3562495f3c703a283ece145ffec19e82d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 08:21:46 -0700 Subject: [PATCH 244/807] io_uring: make ctx->timeout_lock a raw spinlock Chase reports that their tester complaints about a locking context mismatch: ============================= [ BUG: Invalid wait context ] 6.13.0-rc1-gf137f14b7ccb-dirty #9 Not tainted ----------------------------- syz.1.25198/182604 is trying to lock: ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: spin_lock_irq include/linux/spinlock.h:376 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe io_uring/io_uring.c:218 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 other info that might help us debug this: context-{5:5} 1 lock held by syz.1.25198/182604: #0: ffff88802b7d48c0 (&acct->lock){+.+.}-{2:2}, at: io_acct_cancel_pending_work+0x2d/0x6b0 io_uring/io-wq.c:1049 stack backtrace: CPU: 0 UID: 0 PID: 182604 Comm: syz.1.25198 Not tainted 6.13.0-rc1-gf137f14b7ccb-dirty #9 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x82/0xd0 
lib/dump_stack.c:120 print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline] check_wait_context kernel/locking/lockdep.c:4898 [inline] __lock_acquire+0x883/0x3c80 kernel/locking/lockdep.c:5176 lock_acquire.part.0+0x11b/0x370 kernel/locking/lockdep.c:5849 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:119 [inline] _raw_spin_lock_irq+0x36/0x50 kernel/locking/spinlock.c:170 spin_lock_irq include/linux/spinlock.h:376 [inline] io_match_task_safe io_uring/io_uring.c:218 [inline] io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 io_acct_cancel_pending_work+0xb8/0x6b0 io_uring/io-wq.c:1052 io_wq_cancel_pending_work io_uring/io-wq.c:1074 [inline] io_wq_cancel_cb+0xb0/0x390 io_uring/io-wq.c:1112 io_uring_try_cancel_requests+0x15e/0xd70 io_uring/io_uring.c:3062 io_uring_cancel_generic+0x6ec/0x8c0 io_uring/io_uring.c:3140 io_uring_files_cancel include/linux/io_uring.h:20 [inline] do_exit+0x494/0x27a0 kernel/exit.c:894 do_group_exit+0xb3/0x250 kernel/exit.c:1087 get_signal+0x1d77/0x1ef0 kernel/signal.c:3017 arch_do_signal_or_restart+0x79/0x5b0 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218 do_syscall_64+0xd8/0x250 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f which is because io_uring has ctx->timeout_lock nesting inside the io-wq acct lock, the latter of which is used from inside the scheduler and hence is a raw spinlock, while the former is a "normal" spinlock and can hence be sleeping on PREEMPT_RT. Change ctx->timeout_lock to be a raw spinlock to solve this nesting dependency on PREEMPT_RT=y. 
Reported-by: chase xd Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 10 ++++----- io_uring/timeout.c | 40 +++++++++++++++++----------------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 011860ade268..fd4cdb0860a2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -345,7 +345,7 @@ struct io_ring_ctx { /* timeouts */ struct { - spinlock_t timeout_lock; + raw_spinlock_t timeout_lock; struct list_head timeout_list; struct list_head ltimeout_list; unsigned cq_last_tm_flush; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 06ff41484e29..605625e932eb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -215,9 +215,9 @@ bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, struct io_ring_ctx *ctx = head->ctx; /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { matched = io_match_linked(head); } @@ -333,7 +333,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); - spin_lock_init(&ctx->timeout_lock); + raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); @@ -498,10 +498,10 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) 
io_prep_async_work(cur); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index f3d502717aeb..bbe58638eca7 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -74,10 +74,10 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) if (!io_timeout_finish(timeout, data)) { if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return; } } @@ -109,7 +109,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) u32 seq; struct io_timeout *timeout, *tmp; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { @@ -134,7 +134,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) @@ -200,9 +200,9 @@ void io_disarm_next(struct io_kiocb *req) } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); link = io_disarm_linked_timeout(req); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); } @@ -238,11 +238,11 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); 
atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); @@ -285,9 +285,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { struct io_kiocb *req; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); @@ -330,7 +330,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); prev = timeout->head; timeout->head = NULL; @@ -345,7 +345,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) } list_del(&timeout->list); timeout->prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; io_req_task_work_add(req); @@ -472,12 +472,12 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); else ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } if (ret < 0) @@ -572,7 +572,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct list_head *entry; u32 tail, off = timeout->off; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this @@ -611,7 
+611,7 @@ add: list_add(&timeout->list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } @@ -620,7 +620,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -633,7 +633,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } @@ -668,7 +668,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx * timeout_lockfirst to keep locking ordering. 
*/ spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); @@ -676,7 +676,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx io_kill_timeout(req, -ECANCELED)) canceled++; } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); return canceled != 0; } From 62e2a47ceab8f3f7d2e3f0e03fdd1c5e0059fd8b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 16 Dec 2024 19:28:06 -0500 Subject: [PATCH 245/807] NFS/pnfs: Fix a live lock between recalled layouts and layoutget When the server is recalling a layout, we should ignore the count of outstanding layoutget calls, since the server is expected to return either NFS4ERR_RECALLCONFLICT or NFS4ERR_RETURNCONFLICT for as long as the recall is outstanding. Currently, we may end up livelocking, causing the layout to eventually be forcibly revoked. 
Fixes: bf0291dd2267 ("pNFS: Ensure LAYOUTGET and LAYOUTRETURN are properly serialised") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0d16b383a452..5f582713bf05 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1308,7 +1308,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, enum pnfs_iomode *iomode) { /* Serialise LAYOUTGET/LAYOUTRETURN */ - if (atomic_read(&lo->plh_outstanding) != 0) + if (atomic_read(&lo->plh_outstanding) != 0 && lo->plh_return_seq == 0) return false; if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) return false; From bedb4e6088a886f587d2ea44e0c198c8ce2182c9 Mon Sep 17 00:00:00 2001 From: Zhang Kunbo Date: Tue, 17 Dec 2024 07:19:21 +0000 Subject: [PATCH 246/807] fs/nfs: fix missing declaration of nfs_idmap_cache_timeout fs/nfs/super.c should include fs/nfs/nfs4idmap.h for declaration of nfs_idmap_cache_timeout. This fixes the sparse warning: fs/nfs/super.c:1397:14: warning: symbol 'nfs_idmap_cache_timeout' was not declared. Should it be static? Signed-off-by: Zhang Kunbo Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ae5c5e39afa0..aeb715b4a690 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -73,6 +73,7 @@ #include "nfs.h" #include "netns.h" #include "sysfs.h" +#include "nfs4idmap.h" #define NFSDBG_FACILITY NFSDBG_VFS From 185e1b1d91e419445d3fd99c1c0376a970438acf Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Mon, 16 Dec 2024 11:25:38 +0900 Subject: [PATCH 247/807] platform/x86: mlx-platform: call pci_dev_put() to balance the refcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mlxplat_pci_fpga_device_init() calls pci_get_device() but does not release the refcount on error path. 
Call pci_dev_put() on the error path and in mlxplat_pci_fpga_device_exit() to fix this. This bug was found by an experimental static analysis tool that I am developing. Fixes: 02daa222fbdd ("platform: mellanox: Add initial support for PCIe based programming logic device") Signed-off-by: Joe Hattori Reviewed-by: Vadim Pasternak Link: https://lore.kernel.org/r/20241216022538.381209-1-joe@pf.is.s.u-tokyo.ac.jp Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/mlx-platform.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/mlx-platform.c b/drivers/platform/x86/mlx-platform.c index 671021cd1f59..9c7f30a47f1f 100644 --- a/drivers/platform/x86/mlx-platform.c +++ b/drivers/platform/x86/mlx-platform.c @@ -6237,6 +6237,7 @@ fail_pci_set_dma_mask: fail_pci_request_regions: pci_disable_device(pci_dev); fail_pci_enable_device: + pci_dev_put(pci_dev); return err; } @@ -6247,6 +6248,7 @@ mlxplat_pci_fpga_device_exit(struct pci_dev *pci_bridge, iounmap(pci_bridge_addr); pci_release_regions(pci_bridge); pci_disable_device(pci_bridge); + pci_dev_put(pci_bridge); } static int From a6629626c584200daf495cc9a740048b455addcd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:19 -0500 Subject: [PATCH 248/807] tracing: Fix test_event_printk() to process entire print argument The test_event_printk() analyzes print formats of trace events looking for cases where it may dereference a pointer that is not in the ring buffer which can possibly be a bug when the trace event is read from the ring buffer and the content of that pointer no longer exists. The function needs to accurately go from one print format argument to the next. It handles quotes and parenthesis that may be included in an argument. When it finds the start of the next argument, it uses a simple "c = strstr(fmt + i, ',')" to find the end of that argument! 
In order to include "%s" dereferencing, it needs to process the entire content of the print format argument and not just the content of the first ',' it finds. As there may be content like: ({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char *access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" }; union kvm_mmu_page_role role; role.word = REC->role; trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe %sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level, role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "", access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? "" : "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ? "unsync" : "sync", 0); saved_ptr; }) Which is an example of a full argument of an existing event. As the code already handles finding the next print format argument, process the argument at the end of it and not the start of it. This way it has both the start of the argument as well as the end of it. Add a helper function "process_pointer()" that will do the processing during the loop as well as at the end. It also makes the code cleaner and easier to read. 
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 84 ++++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 30 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 77e68efbd43e..14e160a5b905 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -265,8 +265,7 @@ static bool test_field(const char *fmt, struct trace_event_call *call) len = p - fmt; for (; field->type; field++) { - if (strncmp(field->name, fmt, len) || - field->name[len]) + if (strncmp(field->name, fmt, len) || field->name[len]) continue; array_descriptor = strchr(field->type, '['); /* This is an array and is OK to dereference. */ @@ -275,6 +274,32 @@ static bool test_field(const char *fmt, struct trace_event_call *call) return false; } +/* Return true if the argument pointer is safe */ +static bool process_pointer(const char *fmt, int len, struct trace_event_call *call) +{ + const char *r, *e, *a; + + e = fmt + len; + + /* Find the REC-> in the argument */ + r = strstr(fmt, "REC->"); + if (r && r < e) { + /* + * Addresses of events on the buffer, or an array on the buffer is + * OK to dereference. There's ways to fool this, but + * this is to catch common mistakes, not malicious code. 
+ */ + a = strchr(fmt, '&'); + if ((a && (a < r)) || test_field(r, call)) + return true; + } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) { + return true; + } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) { + return true; + } + return false; +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -285,12 +310,12 @@ static void test_event_printk(struct trace_event_call *call) { u64 dereference_flags = 0; bool first = true; - const char *fmt, *c, *r, *a; + const char *fmt; int parens = 0; char in_quote = 0; int start_arg = 0; int arg = 0; - int i; + int i, e; fmt = call->print_fmt; @@ -403,42 +428,41 @@ static void test_event_printk(struct trace_event_call *call) case ',': if (in_quote || parens) continue; + e = i; i++; while (isspace(fmt[i])) i++; - start_arg = i; - if (!(dereference_flags & (1ULL << arg))) - goto next_arg; - /* Find the REC-> in the argument */ - c = strchr(fmt + i, ','); - r = strstr(fmt + i, "REC->"); - if (r && (!c || r < c)) { - /* - * Addresses of events on the buffer, - * or an array on the buffer is - * OK to dereference. - * There's ways to fool this, but - * this is to catch common mistakes, - * not malicious code. - */ - a = strchr(fmt + i, '&'); - if ((a && (a < r)) || test_field(r, call)) - dereference_flags &= ~(1ULL << arg); - } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && - (!c || r < c)) { - dereference_flags &= ~(1ULL << arg); - } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && - (!c || r < c)) { - dereference_flags &= ~(1ULL << arg); + /* + * If start_arg is zero, then this is the start of the + * first argument. The processing of the argument happens + * when the end of the argument is found, as it needs to + * handle paranthesis and such. 
+ */ + if (!start_arg) { + start_arg = i; + /* Balance out the i++ in the for loop */ + i--; + continue; } - next_arg: - i--; + if (dereference_flags & (1ULL << arg)) { + if (process_pointer(fmt + start_arg, e - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } + + start_arg = i; arg++; + /* Balance out the i++ in the for loop */ + i--; } } + if (dereference_flags & (1ULL << arg)) { + if (process_pointer(fmt + start_arg, i - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } + /* * If you triggered the below warning, the trace event reported * uses an unsafe dereference pointer %p*. As the data stored From 917110481f6bc1c96b1e54b62bb114137fbc6d17 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:20 -0500 Subject: [PATCH 249/807] tracing: Add missing helper functions in event pointer dereference check The process_pointer() helper function looks to see if various trace event macros are used. These macros are for storing data in the event. This makes it safe to dereference as the dereference will then point into the event on the ring buffer where the content of the data stays with the event itself. A few helper functions were missing. Those were: __get_rel_dynamic_array() __get_dynamic_array_len() __get_rel_dynamic_array_len() __get_rel_sockaddr() Also add a helper function find_print_string() to not need to use a middle man variable to test if the string exists. 
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.521836792@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 14e160a5b905..df75c06bb23f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -274,6 +274,15 @@ static bool test_field(const char *fmt, struct trace_event_call *call) return false; } +/* Look for a string within an argument */ +static bool find_print_string(const char *arg, const char *str, const char *end) +{ + const char *r; + + r = strstr(arg, str); + return r && r < end; +} + /* Return true if the argument pointer is safe */ static bool process_pointer(const char *fmt, int len, struct trace_event_call *call) { @@ -292,9 +301,17 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c a = strchr(fmt, '&'); if ((a && (a < r)) || test_field(r, call)) return true; - } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) { + } else if (find_print_string(fmt, "__get_dynamic_array(", e)) { return true; - } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) { + } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) { + return true; + } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) { + return true; + } else if (find_print_string(fmt, "__get_sockaddr(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) { return true; } return false; From 65a25d9f7ac02e0cf361356e834d1c71d36acca9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 
2024 21:41:21 -0500 Subject: [PATCH 250/807] tracing: Add "%s" check in test_event_printk() The test_event_printk() code makes sure that when a trace event is registered, any dereferenced pointers from the event's TP_printk() are pointing to content in the ring buffer. But currently it does not handle "%s", as there are cases where the string pointer saved in the ring buffer points to a static string in the kernel that will never be freed. As that is a valid case, the pointer needs to be checked at runtime. Currently the runtime check is done via trace_check_vprintf(), but to not have to replicate everything in vsnprintf() it does some logic with the va_list that may not be reliable across architectures. In order to get rid of that logic, more work in the test_event_printk() needs to be done. Some of the strings can be validated at this time when it is obvious the string is valid because the string will be saved in the ring buffer content. Do all the validation of strings in the ring buffer at boot in test_event_printk(), and make sure that the field of the strings that point into the kernel are accessible. This will allow adding checks at runtime that will validate the fields themselves and not rely on parsing the TP_printk() format at runtime. 
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.685917008@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 104 ++++++++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index df75c06bb23f..521ad2fd1fe7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -244,19 +244,16 @@ int trace_event_get_offsets(struct trace_event_call *call) return tail->offset + tail->size; } -/* - * Check if the referenced field is an array and return true, - * as arrays are OK to dereference. - */ -static bool test_field(const char *fmt, struct trace_event_call *call) + +static struct trace_event_fields *find_event_field(const char *fmt, + struct trace_event_call *call) { struct trace_event_fields *field = call->class->fields_array; - const char *array_descriptor; const char *p = fmt; int len; if (!(len = str_has_prefix(fmt, "REC->"))) - return false; + return NULL; fmt += len; for (p = fmt; *p; p++) { if (!isalnum(*p) && *p != '_') @@ -267,11 +264,26 @@ static bool test_field(const char *fmt, struct trace_event_call *call) for (; field->type; field++) { if (strncmp(field->name, fmt, len) || field->name[len]) continue; - array_descriptor = strchr(field->type, '['); - /* This is an array and is OK to dereference. */ - return array_descriptor != NULL; + + return field; } - return false; + return NULL; +} + +/* + * Check if the referenced field is an array and return true, + * as arrays are OK to dereference. 
+ */ +static bool test_field(const char *fmt, struct trace_event_call *call) +{ + struct trace_event_fields *field; + + field = find_event_field(fmt, call); + if (!field) + return false; + + /* This is an array and is OK to dereference. */ + return strchr(field->type, '[') != NULL; } /* Look for a string within an argument */ @@ -317,6 +329,53 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c return false; } +/* Return true if the string is safe */ +static bool process_string(const char *fmt, int len, struct trace_event_call *call) +{ + const char *r, *e, *s; + + e = fmt + len; + + /* + * There are several helper functions that return strings. + * If the argument contains a function, then assume its field is valid. + * It is considered that the argument has a function if it has: + * alphanumeric or '_' before a parenthesis. + */ + s = fmt; + do { + r = strstr(s, "("); + if (!r || r >= e) + break; + for (int i = 1; r - i >= s; i++) { + char ch = *(r - i); + if (isspace(ch)) + continue; + if (isalnum(ch) || ch == '_') + return true; + /* Anything else, this isn't a function */ + break; + } + /* A function could be wrapped in parethesis, try the next one */ + s = r + 1; + } while (s < e); + + /* + * If there's any strings in the argument consider this arg OK as it + * could be: REC->field ? "foo" : "bar" and we don't want to get into + * verifying that logic here. 
+ */ + if (find_print_string(fmt, "\"", e)) + return true; + + /* Dereferenced strings are also valid like any other pointer */ + if (process_pointer(fmt, len, call)) + return true; + + /* Make sure the field is found, and consider it OK for now if it is */ + return find_event_field(fmt, call) != NULL; +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -326,6 +385,7 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c static void test_event_printk(struct trace_event_call *call) { u64 dereference_flags = 0; + u64 string_flags = 0; bool first = true; const char *fmt; int parens = 0; @@ -416,8 +476,16 @@ static void test_event_printk(struct trace_event_call *call) star = true; continue; } - if ((fmt[i + j] == 's') && star) - arg++; + if ((fmt[i + j] == 's')) { + if (star) + arg++; + if (WARN_ONCE(arg == 63, + "Too many args for event: %s", + trace_event_name(call))) + return; + dereference_flags |= 1ULL << arg; + string_flags |= 1ULL << arg; + } break; } break; @@ -464,7 +532,10 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (process_pointer(fmt + start_arg, e - start_arg, call)) + if (string_flags & (1ULL << arg)) { + if (process_string(fmt + start_arg, e - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(fmt + start_arg, e - start_arg, call)) dereference_flags &= ~(1ULL << arg); } @@ -476,7 +547,10 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (process_pointer(fmt + start_arg, i - start_arg, call)) + if (string_flags & (1ULL << arg)) { + if (process_string(fmt + start_arg, i - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(fmt + start_arg, i - start_arg, call)) dereference_flags &= ~(1ULL << arg); } From 
afd2627f727b89496d79a6b934a025fc916d4ded Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:22 -0500 Subject: [PATCH 251/807] tracing: Check "%s" dereference via the field and not the TP_printk format The TP_printk() portion of a trace event is executed at the time an event is read from the trace. This can happen seconds, minutes, hours, days, months, or possibly years after the event was recorded. If the print format contains a dereference to a string via "%s", and that string was allocated, there's a chance that string could be freed before it is read by the trace file. To protect against such bugs, there are two functions that verify the event. The first one is test_event_printk(), which is called when the event is created. It reads the TP_printk() format as well as its arguments to make sure nothing may be dereferencing a pointer that was not copied into the ring buffer along with the event. If it is, it will trigger a WARN_ON(). For strings that use "%s", it is not so easy. The string may not reside in the ring buffer but may still be valid. Strings that are static and part of the kernel proper, which will not be freed for the life of the running system, are safe to dereference. But whether it is a pointer to a static string or to something on the heap cannot be determined until the event is triggered. This brings us to the second function that tests for the bad dereferencing of strings, trace_check_vprintf(). It would walk through the printf format looking for "%s", and when it finds it, it would validate that the pointer is safe to read. If not, it would produce a WARN_ON() as well and write into the ring buffer "[UNSAFE-MEMORY]". The problem with this is how it used va_list to have vsnprintf() handle all the cases that it didn't need to check. 
Instead of re-implementing vsnprintf(), it would make a copy of the format up to the %s part, and call vsnprintf() with the current va_list ap variable, where the ap would then be ready to point at the string in question. For architectures that passed va_list by reference this was possible. For architectures that passed it by copy it was not. A test_can_verify() function was used to differentiate between the two, and if it wasn't possible, the verifier would be disabled. Even for architectures where this was feasible, it was a stretch to rely on such a method that is undocumented, and could cause issues later on with new optimizations of the compiler. Instead, the first function test_event_printk() was updated to look at "%s" as well. If the "%s" argument is a pointer outside the event in the ring buffer, it would find the field type of the event that is the problem and mark the structure with a new flag called "needs_test". The event itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that this event has a field that needs to be verified before the event can be printed using the printf format. When the event fields are created from the field type structure, the fields would copy the field type's "needs_test" value. Finally, before being printed, a new function ignore_event() is called which will check if the event has the TEST_STR flag set (if not, it returns false). If the flag is set, it then iterates through the event's fields looking for the ones that have the "needs_test" flag set. Then it uses the offset field from the field structure to find the pointer in the ring buffer event. It runs the tests to make sure that pointer is safe to print and if not, it triggers the WARN_ON() and also adds to the trace output that the event in question has an unsafe memory access. The ignore_event() makes the trace_check_vprintf() obsolete so it is removed. 
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 6 +- kernel/trace/trace.c | 255 ++++++++--------------------------- kernel/trace/trace.h | 6 +- kernel/trace/trace_events.c | 32 +++-- kernel/trace/trace_output.c | 6 +- 5 files changed, 88 insertions(+), 217 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2a5df5b62cfc..91b8ffbdfa8c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -273,7 +273,8 @@ struct trace_event_fields { const char *name; const int size; const int align; - const int is_signed; + const unsigned int is_signed:1; + unsigned int needs_test:1; const int filter_type; const int len; }; @@ -324,6 +325,7 @@ enum { TRACE_EVENT_FL_EPROBE_BIT, TRACE_EVENT_FL_FPROBE_BIT, TRACE_EVENT_FL_CUSTOM_BIT, + TRACE_EVENT_FL_TEST_STR_BIT, }; /* @@ -340,6 +342,7 @@ enum { * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) * This is set when the custom event has not been attached * to a tracepoint yet, then it is cleared when it is. 
+ * TEST_STR - The event has a "%s" that points to a string outside the event */ enum { TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), @@ -352,6 +355,7 @@ enum { TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), + TRACE_EVENT_FL_TEST_STR = (1 << TRACE_EVENT_FL_TEST_STR_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be62f0ea1814..7cc18b9bce27 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3611,17 +3611,12 @@ char *trace_iter_expand_format(struct trace_iterator *iter) } /* Returns true if the string is safe to dereference from an event */ -static bool trace_safe_str(struct trace_iterator *iter, const char *str, - bool star, int len) +static bool trace_safe_str(struct trace_iterator *iter, const char *str) { unsigned long addr = (unsigned long)str; struct trace_event *trace_event; struct trace_event_call *event; - /* Ignore strings with no length */ - if (star && !len) - return true; - /* OK if part of the event data */ if ((addr >= (unsigned long)iter->ent) && (addr < (unsigned long)iter->ent + iter->ent_size)) @@ -3661,181 +3656,69 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, return false; } -static DEFINE_STATIC_KEY_FALSE(trace_no_verify); - -static int test_can_verify_check(const char *fmt, ...) -{ - char buf[16]; - va_list ap; - int ret; - - /* - * The verifier is dependent on vsnprintf() modifies the va_list - * passed to it, where it is sent as a reference. Some architectures - * (like x86_32) passes it by value, which means that vsnprintf() - * does not modify the va_list passed to it, and the verifier - * would then need to be able to understand all the values that - * vsnprintf can use. If it is passed by value, then the verifier - * is disabled. 
- */ - va_start(ap, fmt); - vsnprintf(buf, 16, "%d", ap); - ret = va_arg(ap, int); - va_end(ap); - - return ret; -} - -static void test_can_verify(void) -{ - if (!test_can_verify_check("%d %d", 0, 1)) { - pr_info("trace event string verifier disabled\n"); - static_branch_inc(&trace_no_verify); - } -} - /** - * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer + * ignore_event - Check dereferenced fields while writing to the seq buffer * @iter: The iterator that holds the seq buffer and the event being printed - * @fmt: The format used to print the event - * @ap: The va_list holding the data to print from @fmt. * - * This writes the data into the @iter->seq buffer using the data from - * @fmt and @ap. If the format has a %s, then the source of the string - * is examined to make sure it is safe to print, otherwise it will - * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string - * pointer. + * At boot up, test_event_printk() will flag any event that dereferences + * a string with "%s" that does exist in the ring buffer. It may still + * be valid, as the string may point to a static string in the kernel + * rodata that never gets freed. But if the string pointer is pointing + * to something that was allocated, there's a chance that it can be freed + * by the time the user reads the trace. This would cause a bad memory + * access by the kernel and possibly crash the system. + * + * This function will check if the event has any fields flagged as needing + * to be checked at runtime and perform those checks. + * + * If it is found that a field is unsafe, it will write into the @iter->seq + * a message stating what was found to be unsafe. + * + * @return: true if the event is unsafe and should be ignored, + * false otherwise. 
*/ -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) +bool ignore_event(struct trace_iterator *iter) { - long text_delta = 0; - long data_delta = 0; - const char *p = fmt; - const char *str; - bool good; - int i, j; + struct ftrace_event_field *field; + struct trace_event *trace_event; + struct trace_event_call *event; + struct list_head *head; + struct trace_seq *seq; + const void *ptr; - if (WARN_ON_ONCE(!fmt)) - return; + trace_event = ftrace_find_event(iter->ent->type); - if (static_branch_unlikely(&trace_no_verify)) - goto print; + seq = &iter->seq; - /* - * When the kernel is booted with the tp_printk command line - * parameter, trace events go directly through to printk(). - * It also is checked by this function, but it does not - * have an associated trace_array (tr) for it. - */ - if (iter->tr) { - text_delta = iter->tr->text_delta; - data_delta = iter->tr->data_delta; + if (!trace_event) { + trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type); + return true; } - /* Don't bother checking when doing a ftrace_dump() */ - if (iter->fmt == static_fmt_buf) - goto print; + event = container_of(trace_event, struct trace_event_call, event); + if (!(event->flags & TRACE_EVENT_FL_TEST_STR)) + return false; - while (*p) { - bool star = false; - int len = 0; + head = trace_get_fields(event); + if (!head) { + trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n", + trace_event_name(event)); + return true; + } - j = 0; + /* Offsets are from the iter->ent that points to the raw event */ + ptr = iter->ent; - /* - * We only care about %s and variants - * as well as %p[sS] if delta is non-zero - */ - for (i = 0; p[i]; i++) { - if (i + 1 >= iter->fmt_size) { - /* - * If we can't expand the copy buffer, - * just print it. 
- */ - if (!trace_iter_expand_format(iter)) - goto print; - } + list_for_each_entry(field, head, link) { + const char *str; + bool good; - if (p[i] == '\\' && p[i+1]) { - i++; - continue; - } - if (p[i] == '%') { - /* Need to test cases like %08.*s */ - for (j = 1; p[i+j]; j++) { - if (isdigit(p[i+j]) || - p[i+j] == '.') - continue; - if (p[i+j] == '*') { - star = true; - continue; - } - break; - } - if (p[i+j] == 's') - break; - - if (text_delta && p[i+1] == 'p' && - ((p[i+2] == 's' || p[i+2] == 'S'))) - break; - - star = false; - } - j = 0; - } - /* If no %s found then just print normally */ - if (!p[i]) - break; - - /* Copy up to the %s, and print that */ - strncpy(iter->fmt, p, i); - iter->fmt[i] = '\0'; - trace_seq_vprintf(&iter->seq, iter->fmt, ap); - - /* Add delta to %pS pointers */ - if (p[i+1] == 'p') { - unsigned long addr; - char fmt[4]; - - fmt[0] = '%'; - fmt[1] = 'p'; - fmt[2] = p[i+2]; /* Either %ps or %pS */ - fmt[3] = '\0'; - - addr = va_arg(ap, unsigned long); - addr += text_delta; - trace_seq_printf(&iter->seq, fmt, (void *)addr); - - p += i + 3; + if (!field->needs_test) continue; - } - /* - * If iter->seq is full, the above call no longer guarantees - * that ap is in sync with fmt processing, and further calls - * to va_arg() can return wrong positional arguments. - * - * Ensure that ap is no longer used in this case. - */ - if (iter->seq.full) { - p = ""; - break; - } + str = *(const char **)(ptr + field->offset); - if (star) - len = va_arg(ap, int); - - /* The ap now points to the string data of the %s */ - str = va_arg(ap, const char *); - - good = trace_safe_str(iter, str, star, len); - - /* Could be from the last boot */ - if (data_delta && !good) { - str += data_delta; - good = trace_safe_str(iter, str, star, len); - } + good = trace_safe_str(iter, str); /* * If you hit this warning, it is likely that the @@ -3846,44 +3729,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, * instead. 
See samples/trace_events/trace-events-sample.h * for reference. */ - if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'", - fmt, seq_buf_str(&iter->seq.seq))) { - int ret; - - /* Try to safely read the string */ - if (star) { - if (len + 1 > iter->fmt_size) - len = iter->fmt_size - 1; - if (len < 0) - len = 0; - ret = copy_from_kernel_nofault(iter->fmt, str, len); - iter->fmt[len] = 0; - star = false; - } else { - ret = strncpy_from_kernel_nofault(iter->fmt, str, - iter->fmt_size); - } - if (ret < 0) - trace_seq_printf(&iter->seq, "(0x%px)", str); - else - trace_seq_printf(&iter->seq, "(0x%px:%s)", - str, iter->fmt); - str = "[UNSAFE-MEMORY]"; - strcpy(iter->fmt, "%s"); - } else { - strncpy(iter->fmt, p + i, j + 1); - iter->fmt[j+1] = '\0'; + if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'", + trace_event_name(event), field->name)) { + trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n", + trace_event_name(event), field->name); + return true; } - if (star) - trace_seq_printf(&iter->seq, iter->fmt, len, str); - else - trace_seq_printf(&iter->seq, iter->fmt, str); - - p += i + j + 1; } - print: - if (*p) - trace_seq_vprintf(&iter->seq, p, ap); + return false; } const char *trace_event_format(struct trace_iterator *iter, const char *fmt) @@ -10777,8 +10630,6 @@ __init static int tracer_alloc_buffers(void) register_snapshot_cmd(); - test_can_verify(); - return 0; out_free_pipe_cpumask: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 266740b4e121..9691b47b5f3d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -667,9 +667,8 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) __printf(2, 0); char *trace_iter_expand_format(struct trace_iterator *iter); +bool ignore_event(struct 
trace_iterator *iter); int trace_empty(struct trace_iterator *iter); @@ -1413,7 +1412,8 @@ struct ftrace_event_field { int filter_type; int offset; int size; - int is_signed; + unsigned int is_signed:1; + unsigned int needs_test:1; int len; }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 521ad2fd1fe7..1545cc8b49d0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -82,7 +82,7 @@ static int system_refcount_dec(struct event_subsystem *system) } static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) +__find_event_field(struct list_head *head, const char *name) { struct ftrace_event_field *field; @@ -114,7 +114,8 @@ trace_find_event_field(struct trace_event_call *call, char *name) static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, - int is_signed, int filter_type, int len) + int is_signed, int filter_type, int len, + int need_test) { struct ftrace_event_field *field; @@ -133,6 +134,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field->offset = offset; field->size = size; field->is_signed = is_signed; + field->needs_test = need_test; field->len = len; list_add(&field->link, head); @@ -151,13 +153,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type, 0); + is_signed, filter_type, 0, 0); } EXPORT_SYMBOL_GPL(trace_define_field); static int trace_define_field_ext(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, - int filter_type, int len) + int filter_type, int len, int need_test) { struct list_head *head; @@ -166,13 +168,13 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, 
size, - is_signed, filter_type, len); + is_signed, filter_type, len, need_test); } #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ - filter_type, 0); \ + filter_type, 0, 0); \ if (ret) \ return ret; @@ -181,7 +183,8 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER, 0); \ + is_signed_type(type), FILTER_OTHER, \ + 0, 0); \ if (ret) \ return ret; @@ -332,6 +335,7 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c /* Return true if the string is safe */ static bool process_string(const char *fmt, int len, struct trace_event_call *call) { + struct trace_event_fields *field; const char *r, *e, *s; e = fmt + len; @@ -372,8 +376,16 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca if (process_pointer(fmt, len, call)) return true; - /* Make sure the field is found, and consider it OK for now if it is */ - return find_event_field(fmt, call) != NULL; + /* Make sure the field is found */ + field = find_event_field(fmt, call); + if (!field) + return false; + + /* Test this field's string before printing the event */ + call->flags |= TRACE_EVENT_FL_TEST_STR; + field->needs_test = 1; + + return true; } /* @@ -2586,7 +2598,7 @@ event_define_fields(struct trace_event_call *call) ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, field->is_signed, field->filter_type, - field->len); + field->len, field->needs_test); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index da748b7cbc4d..03d56f711ad1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -317,10 +317,14 @@ EXPORT_SYMBOL(trace_raw_output_prep); void trace_event_printf(struct 
trace_iterator *iter, const char *fmt, ...) { + struct trace_seq *s = &iter->seq; va_list ap; + if (ignore_event(iter)) + return; + va_start(ap, fmt); - trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); + trace_seq_vprintf(s, trace_event_format(iter, fmt), ap); va_end(ap); } EXPORT_SYMBOL(trace_event_printf); From b6ccddd6fe1fd49c7a82b6fbed01cccad21a29c7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 11 Dec 2024 08:11:46 -0800 Subject: [PATCH 252/807] perf/x86/intel/uncore: Add Clearwater Forest support From the perspective of the uncore PMU, the Clearwater Forest is the same as the previous Sierra Forest. The only difference is the event list, which will be supported in the perf tool later. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241211161146.235253-1-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index d98fac567684..e7aba7349231 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1910,6 +1910,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_uncore_init), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &gnr_uncore_init), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &gnr_uncore_init), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &gnr_uncore_init), {}, }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); From b8c3a2502a205321fe66c356f4b70cabd8e1a5fc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 16 Dec 2024 12:45:02 -0800 Subject: [PATCH 253/807] perf/x86/intel/ds: Add PEBS format 6 The only difference between 5 and 6 is the new counters snapshotting group, without the following counters snapshotting enabling patches, it's impossible to utilize the feature in a PEBS record. It's safe to share the same code path with format 5. 
Add format 6, so the end user can at least utilize the legacy PEBS features. Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241216204505.748363-1-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 1a4b326ca2ce..6ba6549f26fa 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2517,6 +2517,7 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; + case 6: case 5: x86_pmu.pebs_ept = 1; fallthrough; From 4a077914578183ec397ad09f7156a357e00e5d72 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 12 Dec 2024 14:21:33 -0800 Subject: [PATCH 254/807] locking/rtmutex: Make sure we wake anything on the wake_q when we release the lock->wait_lock Bert reported seeing occasional boot hangs when running with PREEMPT_RT and bisected it down to commit 894d1b3db41c ("locking/mutex: Remove wakeups from under mutex::wait_lock"). It looks like I missed a few spots where we drop the wait_lock and potentially call into schedule without waking up the tasks on the wake_q structure. Since the tasks being woken are ww_mutex tasks they need to be able to run to release the mutex and unblock the task that currently is planning to wake them. Thus we can deadlock. So make sure we wake the wake_q tasks when we unlock the wait_lock. 
Closes: https://lore.kernel.org/lkml/20241211182502.2915-1-spasswolf@web.de Fixes: 894d1b3db41c ("locking/mutex: Remove wakeups from under mutex::wait_lock") Reported-by: Bert Karwatzki Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241212222138.2400498-1-jstultz@google.com --- kernel/locking/rtmutex.c | 18 ++++++++++++++++-- kernel/locking/rtmutex_api.c | 2 +- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index e858de203eb6..697a56d3d949 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1292,7 +1292,13 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, */ get_task_struct(owner); + preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); + /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */ + wake_up_q(wake_q); + wake_q_init(wake_q); + preempt_enable(); + res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); @@ -1596,6 +1602,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter + * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock * * Must be called with lock->wait_lock held and interrupts disabled */ @@ -1603,7 +1610,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + struct wake_q_head *wake_q) __releases(&lock->wait_lock) __acquires(&lock->wait_lock) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); @@ -1634,7 +1642,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = rt_mutex_owner(lock); else owner = NULL; + preempt_disable(); 
raw_spin_unlock_irq(&lock->wait_lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } + preempt_enable(); if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) rt_mutex_schedule(); @@ -1708,7 +1722,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q); if (likely(!ret)) - ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); + ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q); if (likely(!ret)) { /* acquired the lock */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 33ea31d6a7b3..191e4720e546 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -383,7 +383,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, raw_spin_lock_irq(&lock->wait_lock); /* sleep on the mutex */ set_current_state(TASK_INTERRUPTIBLE); - ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter); + ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL); /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. From 8fc38062be3f692ff8816da84fde71972530bcc4 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 16 Dec 2024 08:42:47 +0100 Subject: [PATCH 255/807] fbdev: Fix recursive dependencies wrt BACKLIGHT_CLASS_DEVICE Do not select BACKLIGHT_CLASS_DEVICE from FB_BACKLIGHT. The latter only controls backlight support within fbdev core code and data structures. Make fbdev drivers depend on BACKLIGHT_CLASS_DEVICE and let users select it explicitly. Fixes warnings about recursive dependencies, such as error: recursive dependency detected! 
symbol BACKLIGHT_CLASS_DEVICE is selected by FB_BACKLIGHT symbol FB_BACKLIGHT is selected by FB_SH_MOBILE_LCDC symbol FB_SH_MOBILE_LCDC depends on FB_DEVICE symbol FB_DEVICE depends on FB_CORE symbol FB_CORE is selected by DRM_GEM_DMA_HELPER symbol DRM_GEM_DMA_HELPER is selected by DRM_PANEL_ILITEK_ILI9341 symbol DRM_PANEL_ILITEK_ILI9341 depends on BACKLIGHT_CLASS_DEVICE BACKLIGHT_CLASS_DEVICE is user-selectable, so making drivers adapt to it is the correct approach in any case. For most drivers, backlight support is also configurable separately. v3: - Select BACKLIGHT_CLASS_DEVICE in PowerMac defconfigs (Christophe) - Fix PMAC_BACKLIGHT module dependency corner cases (Christophe) v2: - s/BACKLIGHT_DEVICE_CLASS/BACKLIGHT_CLASS_DEVICE (Helge) - Fix fbdev driver-dependency corner case (Arnd) Signed-off-by: Thomas Zimmermann Reviewed-by: Arnd Bergmann Link: https://patchwork.freedesktop.org/patch/msgid/20241216074450.8590-2-tzimmermann@suse.de --- arch/powerpc/configs/pmac32_defconfig | 1 + arch/powerpc/configs/ppc6xx_defconfig | 1 + drivers/auxdisplay/Kconfig | 2 +- drivers/macintosh/Kconfig | 1 + drivers/staging/fbtft/Kconfig | 1 + drivers/video/fbdev/Kconfig | 18 +++++++++++++----- drivers/video/fbdev/core/Kconfig | 3 +-- 7 files changed, 19 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 57ded82c2840..e8b3f67bf3f5 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -208,6 +208,7 @@ CONFIG_FB_ATY=y CONFIG_FB_ATY_CT=y CONFIG_FB_ATY_GX=y CONFIG_FB_3DFX=y +CONFIG_BACKLIGHT_CLASS_DEVICE=y # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 4d77e17541e9..ca0c90e95837 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -716,6 +716,7 @@ CONFIG_FB_TRIDENT=m CONFIG_FB_SM501=m 
CONFIG_FB_IBM_GXT4500=y CONFIG_LCD_PLATFORM=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y CONFIG_LOGO=y diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig index 21545ffba065..8934e6ad5772 100644 --- a/drivers/auxdisplay/Kconfig +++ b/drivers/auxdisplay/Kconfig @@ -489,7 +489,7 @@ config IMG_ASCII_LCD config HT16K33 tristate "Holtek Ht16K33 LED controller with keyscan" - depends on FB && I2C && INPUT + depends on FB && I2C && INPUT && BACKLIGHT_CLASS_DEVICE select FB_SYSMEM_HELPERS select INPUT_MATRIXKMAP select FB_BACKLIGHT diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig index fb38f684444f..d00e713c1092 100644 --- a/drivers/macintosh/Kconfig +++ b/drivers/macintosh/Kconfig @@ -120,6 +120,7 @@ config PMAC_MEDIABAY config PMAC_BACKLIGHT bool "Backlight control for LCD screens" depends on PPC_PMAC && ADB_PMU && FB = y && (BROKEN || !PPC64) + depends on BACKLIGHT_CLASS_DEVICE=y select FB_BACKLIGHT help Say Y here to enable Macintosh specific extensions of the generic diff --git a/drivers/staging/fbtft/Kconfig b/drivers/staging/fbtft/Kconfig index 77ab44362f16..dcf6a70455cc 100644 --- a/drivers/staging/fbtft/Kconfig +++ b/drivers/staging/fbtft/Kconfig @@ -3,6 +3,7 @@ menuconfig FB_TFT tristate "Support for small TFT LCD display modules" depends on FB && SPI depends on FB_DEVICE + depends on BACKLIGHT_CLASS_DEVICE depends on GPIOLIB || COMPILE_TEST select FB_BACKLIGHT select FB_SYSMEM_HELPERS_DEFERRED diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index de035071fedb..55c6686f091e 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -649,6 +649,7 @@ config FB_S1D13XXX config FB_ATMEL tristate "AT91 LCD Controller support" depends on FB && OF && HAVE_CLK && HAS_IOMEM + depends on BACKLIGHT_CLASS_DEVICE depends on HAVE_FB_ATMEL || COMPILE_TEST select FB_BACKLIGHT select FB_IOMEM_HELPERS @@ -660,7 +661,6 @@ config FB_ATMEL config 
FB_NVIDIA tristate "nVidia Framebuffer Support" depends on FB && PCI - select FB_BACKLIGHT if FB_NVIDIA_BACKLIGHT select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT @@ -700,6 +700,8 @@ config FB_NVIDIA_DEBUG config FB_NVIDIA_BACKLIGHT bool "Support for backlight control" depends on FB_NVIDIA + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_NVIDIA + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -707,7 +709,6 @@ config FB_NVIDIA_BACKLIGHT config FB_RIVA tristate "nVidia Riva support" depends on FB && PCI - select FB_BACKLIGHT if FB_RIVA_BACKLIGHT select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT @@ -747,6 +748,8 @@ config FB_RIVA_DEBUG config FB_RIVA_BACKLIGHT bool "Support for backlight control" depends on FB_RIVA + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_RIVA + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -934,7 +937,6 @@ config FB_MATROX_MAVEN config FB_RADEON tristate "ATI Radeon display support" depends on FB && PCI - select FB_BACKLIGHT if FB_RADEON_BACKLIGHT select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT @@ -960,6 +962,8 @@ config FB_RADEON_I2C config FB_RADEON_BACKLIGHT bool "Support for backlight control" depends on FB_RADEON + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_RADEON + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. 
@@ -975,7 +979,6 @@ config FB_RADEON_DEBUG config FB_ATY128 tristate "ATI Rage128 display support" depends on FB && PCI - select FB_BACKLIGHT if FB_ATY128_BACKLIGHT select FB_IOMEM_HELPERS select FB_MACMODES if PPC_PMAC help @@ -989,6 +992,8 @@ config FB_ATY128 config FB_ATY128_BACKLIGHT bool "Support for backlight control" depends on FB_ATY128 + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_ATY128 + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -999,7 +1004,6 @@ config FB_ATY select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT - select FB_BACKLIGHT if FB_ATY_BACKLIGHT select FB_IOMEM_FOPS select FB_MACMODES if PPC select FB_ATY_CT if SPARC64 && PCI @@ -1040,6 +1044,8 @@ config FB_ATY_GX config FB_ATY_BACKLIGHT bool "Support for backlight control" depends on FB_ATY + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_ATY + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. 
@@ -1528,6 +1534,7 @@ config FB_SH_MOBILE_LCDC depends on FB && HAVE_CLK && HAS_IOMEM depends on SUPERH || COMPILE_TEST depends on FB_DEVICE + depends on BACKLIGHT_CLASS_DEVICE select FB_BACKLIGHT select FB_DEFERRED_IO select FB_DMAMEM_HELPERS @@ -1793,6 +1800,7 @@ config FB_SSD1307 tristate "Solomon SSD1307 framebuffer support" depends on FB && I2C depends on GPIOLIB || COMPILE_TEST + depends on BACKLIGHT_CLASS_DEVICE select FB_BACKLIGHT select FB_SYSMEM_HELPERS_DEFERRED help diff --git a/drivers/video/fbdev/core/Kconfig b/drivers/video/fbdev/core/Kconfig index 0ab8848ba2f1..d554d8c543d4 100644 --- a/drivers/video/fbdev/core/Kconfig +++ b/drivers/video/fbdev/core/Kconfig @@ -183,9 +183,8 @@ config FB_SYSMEM_HELPERS_DEFERRED select FB_SYSMEM_HELPERS config FB_BACKLIGHT - tristate + bool depends on FB - select BACKLIGHT_CLASS_DEVICE config FB_MODE_HELPERS bool "Enable Video Mode Handling Helpers" From 8ce35bf0ef5a659f3a15237152770a7c1d13c996 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 16 Dec 2024 08:42:48 +0100 Subject: [PATCH 256/807] drm/fbdev: Select FB_CORE dependency for fbdev on DMA and TTM Select FB_CORE if GEM's DMA and TTM implementations support fbdev emulation. Fixes linker errors about missing symbols from the fbdev subsystem. Also see [1] for a related SHMEM fix. 
Fixes: dadd28d4142f ("drm/client: Add client-lib module") Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/series/141411/ # 1 Reviewed-by: Arnd Bergmann Link: https://patchwork.freedesktop.org/patch/msgid/20241216074450.8590-3-tzimmermann@suse.de --- drivers/gpu/drm/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index a0690049b292..ccee570eab7d 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -358,6 +358,7 @@ config DRM_TTM_HELPER tristate depends on DRM select DRM_TTM + select FB_CORE if DRM_FBDEV_EMULATION select FB_SYSMEM_HELPERS_DEFERRED if DRM_FBDEV_EMULATION help Helpers for ttm-based gem objects @@ -365,6 +366,7 @@ config DRM_TTM_HELPER config DRM_GEM_DMA_HELPER tristate depends on DRM + select FB_CORE if DRM_FBDEV_EMULATION select FB_DMAMEM_HELPERS_DEFERRED if DRM_FBDEV_EMULATION help Choose this if you need the GEM DMA helper functions From 2182e0f200d097805f2f6bc0042de8695c60f386 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Dec 2024 08:42:49 +0100 Subject: [PATCH 257/807] drm: rework FB_CORE dependency The 'select FB_CORE' statement moved from CONFIG_DRM to DRM_CLIENT_LIB, but there are now configurations that have code calling into fb_core as built-in even though the client_lib itself is a loadable module: x86_64-linux-ld: drivers/gpu/drm/drm_fb_helper.o: in function `drm_fb_helper_set_suspend': drm_fb_helper.c:(.text+0x2c6): undefined reference to `fb_set_suspend' x86_64-linux-ld: drivers/gpu/drm/drm_fb_helper.o: in function `drm_fb_helper_resume_worker': drm_fb_helper.c:(.text+0x2e1): undefined reference to `fb_set_suspend' In addition to DRM_CLIENT_LIB, the 'select' needs to be at least in DRM_KMS_HELPER and DRM_GEM_SHMEM_HELPER, so add it here. This patch is the KMS_HELPER part of [1]. 
Fixes: dadd28d4142f ("drm/client: Add client-lib module") Signed-off-by: Arnd Bergmann Reviewed-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/series/141411/ # 1 Link: https://patchwork.freedesktop.org/patch/msgid/20241216074450.8590-4-tzimmermann@suse.de Signed-off-by: Thomas Zimmermann --- drivers/gpu/drm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index ccee570eab7d..772fc7625639 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -99,6 +99,7 @@ config DRM_KUNIT_TEST config DRM_KMS_HELPER tristate depends on DRM + select FB_CORE if DRM_FBDEV_EMULATION help CRTC helpers for KMS drivers. From d75d72a858f0c00ca8ae161b48cdb403807be4de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Nov 2024 11:11:55 -0500 Subject: [PATCH 258/807] btrfs: fix improper generation check in snapshot delete We have been using the following check if (generation <= root->root_key.offset) to make decisions about whether or not to visit a node during snapshot delete. This is because for normal subvolumes this is set to 0, and for snapshots it's set to the creation generation. The idea being that if the generation of the node is less than or equal to our creation generation then we don't need to visit that node, because it doesn't belong to us, we can simply drop our reference and move on. However reloc roots don't have their generation stored in root->root_key.offset, instead that is the objectid of their corresponding fs root. This means we can incorrectly not walk into nodes that need to be dropped when deleting a reloc root. There are a variety of consequences to making the wrong choice in two distinct areas. visit_node_for_delete() 1. False positive. We think we are newer than the block when we really aren't. We don't visit the node and drop our reference to the node and carry on. This would result in leaked space. 2. False negative. 
We do decide to walk down into a block that we should have just dropped our reference to. However this means that the child node will have refs > 1, so we will switch to UPDATE_BACKREF, and then the subsequent walk_down_proc() will notice that btrfs_header_owner(node) != root->root_key.objectid and it'll break out of the loop, and then walk_up_proc() will drop our reference, so this appears to be ok. do_walk_down() 1. False positive. We are in UPDATE_BACKREF and incorrectly decide that we are done and don't need to update the backref for our lower nodes. This is another case that simply won't happen with relocation, as we only have to do UPDATE_BACKREF if the node below us was shared and didn't have FULL_BACKREF set, and since we don't own that node because we're a reloc root we actually won't end up in this case. 2. False negative. Again this is tricky because as described above, we simply wouldn't be here from relocation, because we don't own any of the nodes because we never set btrfs_header_owner() to the reloc root objectid, and we always use FULL_BACKREF, we never actually need to set FULL_BACKREF on any children. Having spent a lot of time stressing relocation/snapshot delete recently I've not seen this pop in practice. But this is objectively incorrect, so fix this to get the correct starting generation based on the root we're dropping to keep me from thinking there's a problem here. CC: stable@vger.kernel.org Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 19 +++++++++++++++++++ fs/btrfs/extent-tree.c | 6 +++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 307dedf95c70..2c341956a01c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -370,6 +370,25 @@ static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transi WRITE_ONCE(root->last_trans, transid); } +/* + * Return the generation this root started with. 
+ * + * Every normal root that is created with root->root_key.offset set to it's + * originating generation. If it is a snapshot it is the generation when the + * snapshot was created. + * + * However for TREE_RELOC roots root_key.offset is the objectid of the owning + * tree root. Thankfully we copy the root item of the owning tree root, which + * has it's last_snapshot set to what we would have root_key.offset set to, so + * return that if this is a TREE_RELOC root. + */ +static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root) +{ + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return btrfs_root_last_snapshot(&root->root_item); + return root->root_key.offset; +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 412e318e4a22..43a771f7bd7a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5285,7 +5285,7 @@ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control * * reference to it. 
*/ generation = btrfs_node_ptr_generation(eb, slot); - if (!wc->update_ref || generation <= root->root_key.offset) + if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) return false; /* @@ -5340,7 +5340,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, goto reada; if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ @@ -5683,7 +5683,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { + generation <= btrfs_root_origin_generation(root)) { wc->lookup_info = 1; return 1; } From 6c3864e055486fadb5b97793b57688082e14b43b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 07:26:32 +0100 Subject: [PATCH 259/807] btrfs: use bio_is_zone_append() in the completion handler Otherwise it won't catch bios turned into regular writes by the block level zone write plugging. The additional test it adds is for emulated zone append. 
Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation") CC: stable@vger.kernel.org # 6.12 Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 1f216d07eff6..011cc97be3b5 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -355,7 +355,7 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) btrfs_record_physical_zoned(bbio); btrfs_bio_end_io(bbio, bbio->bio.bi_status); } @@ -398,7 +398,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; btrfs_bio_end_io(bbio, bbio->bio.bi_status); @@ -412,7 +412,7 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); - } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + } else if (bio_is_zone_append(bio)) { stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } From be691b5e593f2cc8cef67bbc59c1fb91b74a86a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 07:26:33 +0100 Subject: [PATCH 260/807] btrfs: split bios to the fs sector size boundary Btrfs like other file systems can't really deal with I/O not aligned to its internal block size (which strangely is called sector size in btrfs, for historical reasons), but the block layer split helper doesn't even know about that. Round down the split boundary so that all I/Os are aligned. 
Fixes: d5e4377d5051 ("btrfs: split zone append bios in btrfs_submit_bio") CC: stable@vger.kernel.org # 6.12 Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 011cc97be3b5..78f5606baacb 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -649,8 +649,14 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) map_length = min(map_length, bbio->fs_info->max_zone_append_size); sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, &nr_segs, map_length); - if (sector_offset) - return sector_offset << SECTOR_SHIFT; + if (sector_offset) { + /* + * bio_split_rw_at() could split at a size smaller than our + * sectorsize and thus cause unaligned I/Os. Fix that by + * always rounding down to the nearest boundary. + */ + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + } return map_length; } From dfb92681a19e1d5172420baa242806414b3eff6f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 4 Dec 2024 13:30:46 +1030 Subject: [PATCH 261/807] btrfs: tree-checker: reject inline extent items with 0 ref count [BUG] There is a bug report in the mailing list where btrfs_run_delayed_refs() failed to drop the ref count for logical 25870311358464 num_bytes 2113536. The involved leaf dump looks like this: item 166 key (25870311358464 168 2113536) itemoff 10091 itemsize 50 extent refs 1 gen 84178 flags 1 ref#0: shared data backref parent 32399126528000 count 0 <<< ref#1: shared data backref parent 31808973717504 count 1 Notice the count number is 0. [CAUSE] There is no concrete evidence yet, but considering 0 -> 1 is also a single bit flipped, it's possible that hardware memory bitflip is involved, causing the on-disk extent tree to be corrupted. 
[FIX] To prevent us reading such corrupted extent item, or writing such damaged extent item back to disk, enhance the handling of BTRFS_EXTENT_DATA_REF_KEY and BTRFS_SHARED_DATA_REF_KEY keys for both inlined and key items, to detect such 0 ref count and reject them. CC: stable@vger.kernel.org # 5.4+ Link: https://lore.kernel.org/linux-btrfs/7c69dd49-c346-4806-86e7-e6f863a66f48@app.fastmail.com/ Reported-by: Frankie Fisher Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 148d8cefa40e..dfeee033f31f 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1527,6 +1527,11 @@ static int check_extent_item(struct extent_buffer *leaf, dref_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_extent_data_ref_count(leaf, dref); break; /* Contains parent bytenr and ref count */ @@ -1539,6 +1544,11 @@ static int check_extent_item(struct extent_buffer *leaf, inline_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; case BTRFS_EXTENT_OWNER_REF_KEY: @@ -1611,8 +1621,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, { u32 expect_item_size = 0; - if (key->type == BTRFS_SHARED_DATA_REF_KEY) + if (key->type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + + sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { 
+ extent_err(leaf, slot, + "invalid shared data backref count, should have non-zero value"); + return -EUCLEAN; + } + expect_item_size = sizeof(struct btrfs_shared_data_ref); + } if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, @@ -1689,6 +1709,11 @@ static int check_extent_data_ref(struct extent_buffer *leaf, offset, leaf->fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid extent data backref count, should have non-zero value"); + return -EUCLEAN; + } } return 0; } From 058387d9c6b70e225da82492e1e193635c3fac3f Mon Sep 17 00:00:00 2001 From: Willow Cunningham Date: Mon, 7 Oct 2024 17:29:54 -0400 Subject: [PATCH 262/807] arm64: dts: broadcom: Fix L2 linesize for Raspberry Pi 5 Set the cache-line-size parameter of the L2 cache for each core to the correct value of 64 bytes. Previously, the L2 cache line size was incorrectly set to 128 bytes for the Broadcom BCM2712. This causes validation tests for the Performance Application Programming Interface (PAPI) tool to fail as they depend on sysfs accurately reporting cache line sizes. The correct value of 64 bytes is stated in the official documentation of the ARM Cortex A-72, which is linked in the comments of arm64/boot/dts/broadcom/bcm2712.dtsi as the source for cache-line-size. 
Fixes: faa3381267d0 ("arm64: dts: broadcom: Add minimal support for Raspberry Pi 5") Signed-off-by: Willow Cunningham Link: https://lore.kernel.org/r/20241007212954.214724-1-willow.e.cunningham@maine.edu Signed-off-by: Florian Fainelli --- arch/arm64/boot/dts/broadcom/bcm2712.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/broadcom/bcm2712.dtsi b/arch/arm64/boot/dts/broadcom/bcm2712.dtsi index 6e5a984c1d4e..26a29e5e5078 100644 --- a/arch/arm64/boot/dts/broadcom/bcm2712.dtsi +++ b/arch/arm64/boot/dts/broadcom/bcm2712.dtsi @@ -67,7 +67,7 @@ l2_cache_l0: l2-cache-l0 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; @@ -91,7 +91,7 @@ l2_cache_l1: l2-cache-l1 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; @@ -115,7 +115,7 @@ l2_cache_l2: l2-cache-l2 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; @@ -139,7 +139,7 @@ l2_cache_l3: l2-cache-l3 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; From f4bf0b909a6bf64a2220a42a7c8b8c2ee1b77b89 Mon Sep 17 00:00:00 2001 From: Maksim Kiselev Date: Tue, 10 Dec 2024 11:30:27 +0300 Subject: [PATCH 263/807] clk: thead: Fix TH1520 emmc and shdci clock rate In accordance with LicheePi 4A BSP the clock that comes to emmc/sdhci is 198Mhz which is got through frequency division of source clock VIDEO PLL by 4 [1]. 
But now the AP_SUBSYS driver sets the CLK EMMC SDIO to the same frequency as the VIDEO PLL, equal to 792 MHz. This causes emmc/sdhci to work 4 times slower. Let's fix this issue by adding fixed factor clock that divides VIDEO PLL by 4 for emmc/sdhci. Link: https://github.com/revyos/thead-kernel/blob/7563179071a314f41cdcdbfd8cf6e101e73707f3/drivers/clk/thead/clk-light-fm.c#L454 Fixes: ae81b69fd2b1 ("clk: thead: Add support for T-Head TH1520 AP_SUBSYS clocks") Signed-off-by: Maksim Kiselev Link: https://lore.kernel.org/r/20241210083029.92620-1-bigunclemax@gmail.com Tested-by: Xi Ruoyao Reviewed-by: Drew Fustini Signed-off-by: Stephen Boyd --- drivers/clk/thead/clk-th1520-ap.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/clk/thead/clk-th1520-ap.c b/drivers/clk/thead/clk-th1520-ap.c index 17e32ae08720..1015fab95251 100644 --- a/drivers/clk/thead/clk-th1520-ap.c +++ b/drivers/clk/thead/clk-th1520-ap.c @@ -779,6 +779,13 @@ static struct ccu_div dpu1_clk = { }, }; +static CLK_FIXED_FACTOR_HW(emmc_sdio_ref_clk, "emmc-sdio-ref", + &video_pll_clk.common.hw, 4, 1, 0); + +static const struct clk_parent_data emmc_sdio_ref_clk_pd[] = { + { .hw = &emmc_sdio_ref_clk.hw }, +}; + static CCU_GATE(CLK_BROM, brom_clk, "brom", ahb2_cpusys_hclk_pd, 0x100, BIT(4), 0); static CCU_GATE(CLK_BMU, bmu_clk, "bmu", axi4_cpusys2_aclk_pd, 0x100, BIT(5), 0); static CCU_GATE(CLK_AON2CPU_A2X, aon2cpu_a2x_clk, "aon2cpu-a2x", axi4_cpusys2_aclk_pd, @@ -798,7 +805,7 @@ static CCU_GATE(CLK_PERISYS_APB4_HCLK, perisys_apb4_hclk, "perisys-apb4-hclk", p 0x150, BIT(12), 0); static CCU_GATE(CLK_NPU_AXI, npu_axi_clk, "npu-axi", axi_aclk_pd, 0x1c8, BIT(5), 0); static CCU_GATE(CLK_CPU2VP, cpu2vp_clk, "cpu2vp", axi_aclk_pd, 0x1e0, BIT(13), 0); -static CCU_GATE(CLK_EMMC_SDIO, emmc_sdio_clk, "emmc-sdio", video_pll_clk_pd, 0x204, BIT(30), 0); +static CCU_GATE(CLK_EMMC_SDIO, emmc_sdio_clk, "emmc-sdio", emmc_sdio_ref_clk_pd, 0x204, BIT(30), 0); static CCU_GATE(CLK_GMAC1, 
gmac1_clk, "gmac1", gmac_pll_clk_pd, 0x204, BIT(26), 0); static CCU_GATE(CLK_PADCTRL1, padctrl1_clk, "padctrl1", perisys_apb_pclk_pd, 0x204, BIT(24), 0); static CCU_GATE(CLK_DSMART, dsmart_clk, "dsmart", perisys_apb_pclk_pd, 0x204, BIT(23), 0); @@ -1059,6 +1066,10 @@ static int th1520_clk_probe(struct platform_device *pdev) return ret; priv->hws[CLK_PLL_GMAC_100M] = &gmac_pll_clk_100m.hw; + ret = devm_clk_hw_register(dev, &emmc_sdio_ref_clk.hw); + if (ret) + return ret; + ret = devm_of_clk_add_hw_provider(dev, of_clk_hw_onecell_get, priv); if (ret) return ret; From 93433c1d919775f8ac0f7893692f42e6731a5373 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Mon, 25 Nov 2024 15:58:54 -0800 Subject: [PATCH 264/807] idpf: add support for SW triggered interrupts SW triggered interrupts are guaranteed to fire after their timer expires, unlike Tx and Rx interrupts which will only fire after the timer expires _and_ a descriptor write back is available to be processed by the driver. Add the necessary fields, defines, and initializations to enable a SW triggered interrupt in the vector's dyn_ctl register. 
Reviewed-by: Madhu Chittim Signed-off-by: Joshua Hay Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_dev.c | 3 +++ drivers/net/ethernet/intel/idpf/idpf_txrx.h | 8 +++++++- drivers/net/ethernet/intel/idpf/idpf_vf_dev.c | 3 +++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_dev.c b/drivers/net/ethernet/intel/idpf/idpf_dev.c index 6c913a703df6..41e4bd49402a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_dev.c +++ b/drivers/net/ethernet/intel/idpf/idpf_dev.c @@ -101,6 +101,9 @@ static int idpf_intr_reg_init(struct idpf_vport *vport) intr->dyn_ctl_itridx_s = PF_GLINT_DYN_CTL_ITR_INDX_S; intr->dyn_ctl_intrvl_s = PF_GLINT_DYN_CTL_INTERVAL_S; intr->dyn_ctl_wb_on_itr_m = PF_GLINT_DYN_CTL_WB_ON_ITR_M; + intr->dyn_ctl_swint_trig_m = PF_GLINT_DYN_CTL_SWINT_TRIG_M; + intr->dyn_ctl_sw_itridx_ena_m = + PF_GLINT_DYN_CTL_SW_ITR_INDX_ENA_M; spacing = IDPF_ITR_IDX_SPACING(reg_vals[vec_id].itrn_index_spacing, IDPF_PF_ITR_IDX_SPACING); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 9c1fe84108ed..0f71a6f5557b 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -354,6 +354,8 @@ struct idpf_vec_regs { * @dyn_ctl_itridx_m: Mask for ITR index * @dyn_ctl_intrvl_s: Register bit offset for ITR interval * @dyn_ctl_wb_on_itr_m: Mask for WB on ITR feature + * @dyn_ctl_sw_itridx_ena_m: Mask for SW ITR index + * @dyn_ctl_swint_trig_m: Mask for dyn_ctl SW triggered interrupt enable * @rx_itr: RX ITR register * @tx_itr: TX ITR register * @icr_ena: Interrupt cause register offset @@ -367,6 +369,8 @@ struct idpf_intr_reg { u32 dyn_ctl_itridx_m; u32 dyn_ctl_intrvl_s; u32 dyn_ctl_wb_on_itr_m; + u32 dyn_ctl_sw_itridx_ena_m; + u32 dyn_ctl_swint_trig_m; void __iomem *rx_itr; void __iomem *tx_itr; void __iomem *icr_ena; @@ -437,7 +441,7 @@ struct idpf_q_vector { cpumask_var_t affinity_mask; 
__cacheline_group_end_aligned(cold); }; -libeth_cacheline_set_assert(struct idpf_q_vector, 112, +libeth_cacheline_set_assert(struct idpf_q_vector, 120, 24 + sizeof(struct napi_struct) + 2 * sizeof(struct dim), 8 + sizeof(cpumask_var_t)); @@ -471,6 +475,8 @@ struct idpf_tx_queue_stats { #define IDPF_ITR_IS_DYNAMIC(itr_mode) (itr_mode) #define IDPF_ITR_TX_DEF IDPF_ITR_20K #define IDPF_ITR_RX_DEF IDPF_ITR_20K +/* Index used for 'SW ITR' update in DYN_CTL register */ +#define IDPF_SW_ITR_UPDATE_IDX 2 /* Index used for 'No ITR' update in DYN_CTL register */ #define IDPF_NO_ITR_UPDATE_IDX 3 #define IDPF_ITR_IDX_SPACING(spacing, dflt) (spacing ? spacing : dflt) diff --git a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c index aad62e270ae4..aba828abcb17 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c +++ b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c @@ -101,6 +101,9 @@ static int idpf_vf_intr_reg_init(struct idpf_vport *vport) intr->dyn_ctl_itridx_s = VF_INT_DYN_CTLN_ITR_INDX_S; intr->dyn_ctl_intrvl_s = VF_INT_DYN_CTLN_INTERVAL_S; intr->dyn_ctl_wb_on_itr_m = VF_INT_DYN_CTLN_WB_ON_ITR_M; + intr->dyn_ctl_swint_trig_m = VF_INT_DYN_CTLN_SWINT_TRIG_M; + intr->dyn_ctl_sw_itridx_ena_m = + VF_INT_DYN_CTLN_SW_ITR_INDX_ENA_M; spacing = IDPF_ITR_IDX_SPACING(reg_vals[vec_id].itrn_index_spacing, IDPF_VF_ITR_IDX_SPACING); From 9048cf05a17a7bc26f0b8e2e53750b1237303970 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 17 Dec 2024 16:18:12 -0500 Subject: [PATCH 265/807] NFSD: fix management of pending async copies Currently the pending_async_copies count is decremented just before a struct nfsd4_copy is destroyed. After commit aa0ebd21df9c ("NFSD: Add nfsd4_copy time-to-live") nfsd4_copy structures sticks around for 10 lease periods after the COPY itself has completed, the pending_async_copies count stays high for a long time. 
This causes NFSD to avoid the use of background copy even though the actual background copy workload might no longer be running. In this patch, decrement pending_async_copies once async copy thread is done processing the copy work. Fixes: aa0ebd21df9c ("NFSD: Add nfsd4_copy time-to-live") Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f8a10f90bc7a..ad44ad49274f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1347,7 +1347,6 @@ static void nfs4_put_copy(struct nfsd4_copy *copy) { if (!refcount_dec_and_test(&copy->refcount)) return; - atomic_dec(&copy->cp_nn->pending_async_copies); kfree(copy->cp_src); kfree(copy); } @@ -1870,6 +1869,7 @@ do_callback: set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags); trace_nfsd_copy_async_done(copy); nfsd4_send_cb_offload(copy); + atomic_dec(&copy->cp_nn->pending_async_copies); return 0; } @@ -1927,19 +1927,19 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Arbitrary cap on number of pending async copy operations */ if (atomic_inc_return(&nn->pending_async_copies) > (int)rqstp->rq_pool->sp_nrthreads) - goto out_err; + goto out_dec_async_copy_err; async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); if (!async_copy->cp_src) - goto out_err; + goto out_dec_async_copy_err; if (!nfs4_init_copy_state(nn, copy)) - goto out_err; + goto out_dec_async_copy_err; memcpy(&result->cb_stateid, &copy->cp_stateid.cs_stid, sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); if (IS_ERR(async_copy->copy_task)) - goto out_err; + goto out_dec_async_copy_err; spin_lock(&async_copy->cp_clp->async_lock); list_add(&async_copy->copies, &async_copy->cp_clp->async_copies); @@ -1954,6 +1954,9 @@ out: trace_nfsd_copy_done(copy, status); release_copy_files(copy); 
return status; +out_dec_async_copy_err: + if (async_copy) + atomic_dec(&nn->pending_async_copies); out_err: if (nfsd4_ssc_is_inter(copy)) { /* From 0c1683c681681c14f4389e3bfa8de10baf242ba8 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Mon, 25 Nov 2024 15:58:55 -0800 Subject: [PATCH 266/807] idpf: trigger SW interrupt when exiting wb_on_itr mode There is a race condition between exiting wb_on_itr and completion write backs. For example, we are in wb_on_itr mode and a Tx completion is generated by HW, ready to be written back, as we are re-enabling interrupts: HW SW | | | | idpf_tx_splitq_clean_all | | napi_complete_done | | | tx_completion_wb | idpf_vport_intr_update_itr_ena_irq That tx_completion_wb happens before the vector is fully re-enabled. Continuing with this example, it is a UDP stream and the tx_completion_wb is the last one in the flow (there are no rx packets). Because the HW generated the completion before the interrupt is fully enabled, the HW will not fire the interrupt once the timer expires and the write back will not happen. NAPI poll won't be called. We have indicated we're back in interrupt mode but nothing else will trigger the interrupt. Therefore, the completion goes unprocessed, triggering a Tx timeout. To mitigate this, fire a SW triggered interrupt upon exiting wb_on_itr. This interrupt will catch the rogue completion and avoid the timeout. Add logic to set the appropriate bits in the vector's dyn_ctl register. 
Fixes: 9c4a27da0ecc ("idpf: enable WB_ON_ITR") Reviewed-by: Madhu Chittim Signed-off-by: Joshua Hay Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 29 ++++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 34f4118c7bc0..2fa9c36e33c9 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3604,21 +3604,31 @@ static void idpf_vport_intr_dis_irq_all(struct idpf_vport *vport) /** * idpf_vport_intr_buildreg_itr - Enable default interrupt generation settings * @q_vector: pointer to q_vector - * @type: itr index - * @itr: itr value */ -static u32 idpf_vport_intr_buildreg_itr(struct idpf_q_vector *q_vector, - const int type, u16 itr) +static u32 idpf_vport_intr_buildreg_itr(struct idpf_q_vector *q_vector) { - u32 itr_val; + u32 itr_val = q_vector->intr_reg.dyn_ctl_intena_m; + int type = IDPF_NO_ITR_UPDATE_IDX; + u16 itr = 0; + + if (q_vector->wb_on_itr) { + /* + * Trigger a software interrupt when exiting wb_on_itr, to make + * sure we catch any pending write backs that might have been + * missed due to interrupt state transition. 
+ */ + itr_val |= q_vector->intr_reg.dyn_ctl_swint_trig_m | + q_vector->intr_reg.dyn_ctl_sw_itridx_ena_m; + type = IDPF_SW_ITR_UPDATE_IDX; + itr = IDPF_ITR_20K; + } itr &= IDPF_ITR_MASK; /* Don't clear PBA because that can cause lost interrupts that * came in while we were cleaning/polling */ - itr_val = q_vector->intr_reg.dyn_ctl_intena_m | - (type << q_vector->intr_reg.dyn_ctl_itridx_s) | - (itr << (q_vector->intr_reg.dyn_ctl_intrvl_s - 1)); + itr_val |= (type << q_vector->intr_reg.dyn_ctl_itridx_s) | + (itr << (q_vector->intr_reg.dyn_ctl_intrvl_s - 1)); return itr_val; } @@ -3716,9 +3726,8 @@ void idpf_vport_intr_update_itr_ena_irq(struct idpf_q_vector *q_vector) /* net_dim() updates ITR out-of-band using a work item */ idpf_net_dim(q_vector); + intval = idpf_vport_intr_buildreg_itr(q_vector); q_vector->wb_on_itr = false; - intval = idpf_vport_intr_buildreg_itr(q_vector, - IDPF_NO_ITR_UPDATE_IDX, 0); writel(intval, q_vector->intr_reg.dyn_ctl); } From aef25be35d23ec768eed08bfcf7ca3cf9685bc28 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 21 Nov 2024 11:22:18 -0700 Subject: [PATCH 267/807] hexagon: Disable constant extender optimization for LLVM prior to 19.1.0 The Hexagon-specific constant extender optimization in LLVM may crash on Linux kernel code [1], such as fs/bcache/btree_io.c after commit 32ed4a620c54 ("bcachefs: Btree path tracepoints") in 6.12: clang: llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp:745: bool (anonymous namespace)::HexagonConstExtenders::ExtRoot::operator<(const HCE::ExtRoot &) const: Assertion `ThisB->getParent() == OtherB->getParent()' failed. Stack dump: 0. Program arguments: clang --target=hexagon-linux-musl ... fs/bcachefs/btree_io.c 1. parser at end of file 2. Code generation 3. Running pass 'Function Pass Manager' on module 'fs/bcachefs/btree_io.c'. 4. Running pass 'Hexagon constant-extender optimization' on function '@__btree_node_lock_nopath' Without assertions enabled, there is just a hang during compilation. 
This has been resolved in LLVM main (20.0.0) [2] and backported to LLVM 19.1.0 but the kernel supports LLVM 13.0.1 and newer, so disable the constant extender optimization using the '-mllvm' option when using a toolchain that is not fixed. Cc: stable@vger.kernel.org Link: https://github.com/llvm/llvm-project/issues/99714 [1] Link: https://github.com/llvm/llvm-project/commit/68df06a0b2998765cb0a41353fcf0919bbf57ddb [2] Link: https://github.com/llvm/llvm-project/commit/2ab8d93061581edad3501561722ebd5632d73892 [3] Reviewed-by: Brian Cain Signed-off-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- arch/hexagon/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/hexagon/Makefile b/arch/hexagon/Makefile index 92d005958dfb..ff172cbe5881 100644 --- a/arch/hexagon/Makefile +++ b/arch/hexagon/Makefile @@ -32,3 +32,9 @@ KBUILD_LDFLAGS += $(ldflags-y) TIR_NAME := r19 KBUILD_CFLAGS += -ffixed-$(TIR_NAME) -DTHREADINFO_REG=$(TIR_NAME) -D__linux__ KBUILD_AFLAGS += -DTHREADINFO_REG=$(TIR_NAME) + +# Disable HexagonConstExtenders pass for LLVM versions prior to 19.1.0 +# https://github.com/llvm/llvm-project/issues/99714 +ifneq ($(call clang-min-version, 190100),y) +KBUILD_CFLAGS += -mllvm -hexagon-cext=false +endif From 498d5b14db8c9118be139f668720c67bea2dc344 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Wed, 11 Dec 2024 23:01:43 -0800 Subject: [PATCH 268/807] riscv: selftests: Fix warnings pointer masking test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling the pointer masking tests with -Wall this warning is present: pointer_masking.c: In function ‘test_tagged_addr_abi_sysctl’: pointer_masking.c:203:9: warning: ignoring return value of ‘pwrite’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 203 | pwrite(fd, &value, 1, 0); | ^~~~~~~~~~~~~~~~~~~~~~~~ pointer_masking.c:208:9: warning: ignoring return value of ‘pwrite’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 
208 | pwrite(fd, &value, 1, 0); I came across this on riscv64-linux-gnu-gcc (Ubuntu 11.4.0-1ubuntu1~22.04). Fix this by checking that the number of bytes written equal the expected number of bytes written. Fixes: 7470b5afd150 ("riscv: selftests: Add a pointer masking test") Signed-off-by: Charlie Jenkins Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20241211-fix_warnings_pointer_masking_tests-v6-1-c7ae708fbd2f@rivosinc.com Signed-off-by: Palmer Dabbelt --- .../selftests/riscv/abi/pointer_masking.c | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/riscv/abi/pointer_masking.c b/tools/testing/selftests/riscv/abi/pointer_masking.c index dee41b7ee3e3..059d2e87eb1f 100644 --- a/tools/testing/selftests/riscv/abi/pointer_masking.c +++ b/tools/testing/selftests/riscv/abi/pointer_masking.c @@ -185,8 +185,20 @@ static void test_fork_exec(void) } } +static bool pwrite_wrapper(int fd, void *buf, size_t count, const char *msg) +{ + int ret = pwrite(fd, buf, count, 0); + + if (ret != count) { + ksft_perror(msg); + return false; + } + return true; +} + static void test_tagged_addr_abi_sysctl(void) { + char *err_pwrite_msg = "failed to write to /proc/sys/abi/tagged_addr_disabled\n"; char value; int fd; @@ -200,14 +212,18 @@ static void test_tagged_addr_abi_sysctl(void) } value = '1'; - pwrite(fd, &value, 1, 0); - ksft_test_result(set_tagged_addr_ctrl(min_pmlen, true) == -EINVAL, - "sysctl disabled\n"); + if (!pwrite_wrapper(fd, &value, 1, "write '1'")) + ksft_test_result_fail(err_pwrite_msg); + else + ksft_test_result(set_tagged_addr_ctrl(min_pmlen, true) == -EINVAL, + "sysctl disabled\n"); value = '0'; - pwrite(fd, &value, 1, 0); - ksft_test_result(set_tagged_addr_ctrl(min_pmlen, true) == 0, - "sysctl enabled\n"); + if (!pwrite_wrapper(fd, &value, 1, "write '0'")) + ksft_test_result_fail(err_pwrite_msg); + else + ksft_test_result(set_tagged_addr_ctrl(min_pmlen, true) == 0, + "sysctl enabled\n"); 
set_tagged_addr_ctrl(0, false); From 23579010cf0a12476e96a5f1acdf78a9c5843657 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 17 Dec 2024 20:58:13 +0100 Subject: [PATCH 269/807] bpf: Fix bpf_get_smp_processor_id() on !CONFIG_SMP On x86-64 calling bpf_get_smp_processor_id() in a kernel with CONFIG_SMP disabled can trigger the following bug, as pcpu_hot is unavailable: [ 8.471774] BUG: unable to handle page fault for address: 00000000936a290c [ 8.471849] #PF: supervisor read access in kernel mode [ 8.471881] #PF: error_code(0x0000) - not-present page Fix by inlining a return 0 in the !CONFIG_SMP case. Fixes: 1ae6921009e5 ("bpf: inline bpf_get_smp_processor_id() helper") Signed-off-by: Andrea Righi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241217195813.622568-1-arighi@nvidia.com --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f7f892a52a37..77f56674aaa9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21281,11 +21281,15 @@ patch_map_ops_generic: * changed in some incompatible and hard to support * way, it's fine to back out this inlining logic */ +#ifdef CONFIG_SMP insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); cnt = 3; - +#else + insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + cnt = 1; +#endif new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; From b9b8301d369b4c876de5255dbf067b19ba88ac71 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Dec 2024 08:37:03 +0000 Subject: [PATCH 270/807] net: netdevsim: fix nsim_pp_hold_write() nsim_pp_hold_write() has two problems: 1) It may return with rtnl held, as found by syzbot. 2) Its return value does not propagate an error if any. 
Fixes: 1580cbcbfe77 ("net: netdevsim: add some fake page pool use") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241216083703.1859921-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index 0be47fed4efc..e068a9761c09 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -635,10 +635,10 @@ nsim_pp_hold_write(struct file *file, const char __user *data, page_pool_put_full_page(ns->page->pp, ns->page, false); ns->page = NULL; } - rtnl_unlock(); exit: - return count; + rtnl_unlock(); + return ret; } static const struct file_operations nsim_pp_hold_fops = { From 954a2b40719a21e763a1bba2f0da92347e058fce Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 16 Dec 2024 20:04:32 +0900 Subject: [PATCH 271/807] rtnetlink: Try the outer netns attribute in rtnl_get_peer_net(). Xiao Liang reported that the cited commit changed netns handling in newlink() of netkit, veth, and vxcan. Before the patch, if we don't find a netns attribute in the peer device attributes, we tried to find another netns attribute in the outer netlink attributes by passing it to rtnl_link_get_net(). Let's restore the original behaviour. 
Fixes: 48327566769a ("rtnetlink: fix double call of rtnl_link_get_net_ifla()") Reported-by: Xiao Liang Closes: https://lore.kernel.org/netdev/CABAhCORBVVU8P6AHcEkENMj+gD2d3ce9t=A_o48E0yOQp8_wUQ@mail.gmail.com/#t Signed-off-by: Kuniyuki Iwashima Tested-by: Xiao Liang Link: https://patch.msgid.link/20241216110432.51488-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ebcfc2debf1a..d9f959c619d9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3819,6 +3819,7 @@ out_unregister: } static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, + struct nlattr *tbp[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -3826,7 +3827,7 @@ static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, int err; if (!data || !data[ops->peer_type]) - return NULL; + return rtnl_link_get_net_ifla(tbp); err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) @@ -3971,7 +3972,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } if (ops->peer_type) { - peer_net = rtnl_get_peer_net(ops, data, extack); + peer_net = rtnl_get_peer_net(ops, tb, data, extack); if (IS_ERR(peer_net)) { ret = PTR_ERR(peer_net); goto put_ops; From fca2977629f49dee437e217c3fc423b6e0cad98c Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Mon, 7 Oct 2024 10:23:58 +0200 Subject: [PATCH 272/807] can: m_can: set init flag earlier in probe While an m_can controller usually already has the init flag from a hardware reset, no such reset happens on the integrated m_can_pci of the Intel Elkhart Lake. If the CAN controller is found in an active state, m_can_dev_setup() would fail because m_can_niso_supported() calls m_can_cccr_update_bits(), which refuses to modify any other configuration bits when CCCR_INIT is not set. 
To avoid this issue, set CCCR_INIT before attempting to modify any other configuration flags. Fixes: cd5a46ce6fa6 ("can: m_can: don't enable transceiver when probing") Signed-off-by: Matthias Schiffer Reviewed-by: Markus Schneider-Pargmann Link: https://patch.msgid.link/e247f331cb72829fcbdfda74f31a59cbad1a6006.1728288535.git.matthias.schiffer@ew.tq-group.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 533bcb77c9f9..67c404fbe166 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1695,6 +1695,14 @@ static int m_can_dev_setup(struct m_can_classdev *cdev) return -EINVAL; } + /* Write the INIT bit, in case no hardware reset has happened before + * the probe (for example, it was observed that the Intel Elkhart Lake + * SoCs do not properly reset the CAN controllers on reboot) + */ + err = m_can_cccr_update_bits(cdev, CCCR_INIT, CCCR_INIT); + if (err) + return err; + if (!cdev->is_peripheral) netif_napi_add(dev, &cdev->napi, m_can_poll); @@ -1746,11 +1754,7 @@ static int m_can_dev_setup(struct m_can_classdev *cdev) return -EINVAL; } - /* Forcing standby mode should be redundant, as the chip should be in - * standby after a reset. Write the INIT bit anyways, should the chip - * be configured by previous stage. - */ - return m_can_cccr_update_bits(cdev, CCCR_INIT, CCCR_INIT); + return 0; } static void m_can_stop(struct net_device *dev) From 743375f8deee360b0e902074bab99b0c9368d42f Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Mon, 7 Oct 2024 10:23:59 +0200 Subject: [PATCH 273/807] can: m_can: fix missed interrupts with m_can_pci The interrupt line of PCI devices is interpreted as edge-triggered, however the interrupt signal of the m_can controller integrated in Intel Elkhart Lake CPUs appears to be generated level-triggered. 
Consider the following sequence of events: - IR register is read, interrupt X is set - A new interrupt Y is triggered in the m_can controller - IR register is written to acknowledge interrupt X. Y remains set in IR As there is no point in this sequence at which IR has no interrupt flag set, the m_can interrupt line will never become deasserted, and no edge will ever be observed to trigger another run of the ISR. This was observed to cause the TX queue of the EHL m_can to get stuck under high load, because frames were queued to the hardware in m_can_start_xmit(), but m_can_finish_tx() was never run to account for their successful transmission. On an Elkhart Lake based board with the two CAN interfaces connected to each other, the following script can reproduce the issue: ip link set can0 up type can bitrate 1000000 ip link set can1 up type can bitrate 1000000 cangen can0 -g 2 -I 000 -L 8 & cangen can0 -g 2 -I 001 -L 8 & cangen can0 -g 2 -I 002 -L 8 & cangen can0 -g 2 -I 003 -L 8 & cangen can0 -g 2 -I 004 -L 8 & cangen can0 -g 2 -I 005 -L 8 & cangen can0 -g 2 -I 006 -L 8 & cangen can0 -g 2 -I 007 -L 8 & cangen can1 -g 2 -I 100 -L 8 & cangen can1 -g 2 -I 101 -L 8 & cangen can1 -g 2 -I 102 -L 8 & cangen can1 -g 2 -I 103 -L 8 & cangen can1 -g 2 -I 104 -L 8 & cangen can1 -g 2 -I 105 -L 8 & cangen can1 -g 2 -I 106 -L 8 & cangen can1 -g 2 -I 107 -L 8 & stress-ng --matrix 0 & To fix the issue, repeatedly read and acknowledge interrupts at the start of the ISR until no interrupt flags are set, so the next incoming interrupt will also result in an edge on the interrupt line. While we have received a report that even with this patch, the TX queue can become stuck under certain (currently unknown) circumstances on the Elkhart Lake, this patch completely fixes the issue with the above reproducer, and it is unclear whether the remaining issue has a similar cause at all. 
Fixes: cab7ffc0324f ("can: m_can: add PCI glue driver for Intel Elkhart Lake") Signed-off-by: Matthias Schiffer Reviewed-by: Markus Schneider-Pargmann Link: https://patch.msgid.link/fdf0439c51bcb3a46c21e9fb21c7f1d06363be84.1728288535.git.matthias.schiffer@ew.tq-group.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 22 +++++++++++++++++----- drivers/net/can/m_can/m_can.h | 1 + drivers/net/can/m_can/m_can_pci.c | 1 + 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 67c404fbe166..97cd8bbf2e32 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1220,20 +1220,32 @@ static void m_can_coalescing_update(struct m_can_classdev *cdev, u32 ir) static int m_can_interrupt_handler(struct m_can_classdev *cdev) { struct net_device *dev = cdev->net; - u32 ir; + u32 ir = 0, ir_read; int ret; if (pm_runtime_suspended(cdev->dev)) return IRQ_NONE; - ir = m_can_read(cdev, M_CAN_IR); + /* The m_can controller signals its interrupt status as a level, but + * depending in the integration the CPU may interpret the signal as + * edge-triggered (for example with m_can_pci). For these + * edge-triggered integrations, we must observe that IR is 0 at least + * once to be sure that the next interrupt will generate an edge. 
+ */ + while ((ir_read = m_can_read(cdev, M_CAN_IR)) != 0) { + ir |= ir_read; + + /* ACK all irqs */ + m_can_write(cdev, M_CAN_IR, ir); + + if (!cdev->irq_edge_triggered) + break; + } + m_can_coalescing_update(cdev, ir); if (!ir) return IRQ_NONE; - /* ACK all irqs */ - m_can_write(cdev, M_CAN_IR, ir); - if (cdev->ops->clear_interrupts) cdev->ops->clear_interrupts(cdev); diff --git a/drivers/net/can/m_can/m_can.h b/drivers/net/can/m_can/m_can.h index 92b2bd8628e6..ef39e8e527ab 100644 --- a/drivers/net/can/m_can/m_can.h +++ b/drivers/net/can/m_can/m_can.h @@ -99,6 +99,7 @@ struct m_can_classdev { int pm_clock_support; int pm_wake_source; int is_peripheral; + bool irq_edge_triggered; // Cached M_CAN_IE register content u32 active_interrupts; diff --git a/drivers/net/can/m_can/m_can_pci.c b/drivers/net/can/m_can/m_can_pci.c index d72fe771dfc7..9ad7419f88f8 100644 --- a/drivers/net/can/m_can/m_can_pci.c +++ b/drivers/net/can/m_can/m_can_pci.c @@ -127,6 +127,7 @@ static int m_can_pci_probe(struct pci_dev *pci, const struct pci_device_id *id) mcan_class->pm_clock_support = 1; mcan_class->pm_wake_source = 0; mcan_class->can.clock.freq = id->driver_data; + mcan_class->irq_edge_triggered = true; mcan_class->ops = &m_can_pci_ops; pci_set_drvdata(pci, mcan_class); From 05aa156e156ef3168e7ab8a68721945196495c17 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Fri, 13 Dec 2024 21:17:58 -0800 Subject: [PATCH 274/807] powerpc/pseries/vas: Add close() callback in vas_vm_ops struct The mapping VMA address is saved in VAS window struct when the paste address is mapped. This VMA address is used during migration to unmap the paste address if the window is active. The paste address mapping will be removed when the window is closed or with the munmap(). But the VMA address in the VAS window is not updated with munmap() which is causing invalid access during migration. 
The KASAN report shows: [16386.254991] BUG: KASAN: slab-use-after-free in reconfig_close_windows+0x1a0/0x4e8 [16386.255043] Read of size 8 at addr c00000014a819670 by task drmgr/696928 [16386.255096] CPU: 29 UID: 0 PID: 696928 Comm: drmgr Kdump: loaded Tainted: G B 6.11.0-rc5-nxgzip #2 [16386.255128] Tainted: [B]=BAD_PAGE [16386.255148] Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.00 (NH1110_016) hv:phyp pSeries [16386.255181] Call Trace: [16386.255202] [c00000016b297660] [c0000000018ad0ac] dump_stack_lvl+0x84/0xe8 (unreliable) [16386.255246] [c00000016b297690] [c0000000006e8a90] print_report+0x19c/0x764 [16386.255285] [c00000016b297760] [c0000000006e9490] kasan_report+0x128/0x1f8 [16386.255309] [c00000016b297880] [c0000000006eb5c8] __asan_load8+0xac/0xe0 [16386.255326] [c00000016b2978a0] [c00000000013f898] reconfig_close_windows+0x1a0/0x4e8 [16386.255343] [c00000016b297990] [c000000000140e58] vas_migration_handler+0x3a4/0x3fc [16386.255368] [c00000016b297a90] [c000000000128848] pseries_migrate_partition+0x4c/0x4c4 ... [16386.256136] Allocated by task 696554 on cpu 31 at 16377.277618s: [16386.256149] kasan_save_stack+0x34/0x68 [16386.256163] kasan_save_track+0x34/0x80 [16386.256175] kasan_save_alloc_info+0x58/0x74 [16386.256196] __kasan_slab_alloc+0xb8/0xdc [16386.256209] kmem_cache_alloc_noprof+0x200/0x3d0 [16386.256225] vm_area_alloc+0x44/0x150 [16386.256245] mmap_region+0x214/0x10c4 [16386.256265] do_mmap+0x5fc/0x750 [16386.256277] vm_mmap_pgoff+0x14c/0x24c [16386.256292] ksys_mmap_pgoff+0x20c/0x348 [16386.256303] sys_mmap+0xd0/0x160 ... 
[16386.256350] Freed by task 0 on cpu 31 at 16386.204848s: [16386.256363] kasan_save_stack+0x34/0x68 [16386.256374] kasan_save_track+0x34/0x80 [16386.256384] kasan_save_free_info+0x64/0x10c [16386.256396] __kasan_slab_free+0x120/0x204 [16386.256415] kmem_cache_free+0x128/0x450 [16386.256428] vm_area_free_rcu_cb+0xa8/0xd8 [16386.256441] rcu_do_batch+0x2c8/0xcf0 [16386.256458] rcu_core+0x378/0x3c4 [16386.256473] handle_softirqs+0x20c/0x60c [16386.256495] do_softirq_own_stack+0x6c/0x88 [16386.256509] do_softirq_own_stack+0x58/0x88 [16386.256521] __irq_exit_rcu+0x1a4/0x20c [16386.256533] irq_exit+0x20/0x38 [16386.256544] interrupt_async_exit_prepare.constprop.0+0x18/0x2c ... [16386.256717] Last potentially related work creation: [16386.256729] kasan_save_stack+0x34/0x68 [16386.256741] __kasan_record_aux_stack+0xcc/0x12c [16386.256753] __call_rcu_common.constprop.0+0x94/0xd04 [16386.256766] vm_area_free+0x28/0x3c [16386.256778] remove_vma+0xf4/0x114 [16386.256797] do_vmi_align_munmap.constprop.0+0x684/0x870 [16386.256811] __vm_munmap+0xe0/0x1f8 [16386.256821] sys_munmap+0x54/0x6c [16386.256830] system_call_exception+0x1a0/0x4a0 [16386.256841] system_call_vectored_common+0x15c/0x2ec [16386.256868] The buggy address belongs to the object at c00000014a819670 which belongs to the cache vm_area_struct of size 168 [16386.256887] The buggy address is located 0 bytes inside of freed 168-byte region [c00000014a819670, c00000014a819718) [16386.256915] The buggy address belongs to the physical page: [16386.256928] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x14a81 [16386.256950] memcg:c0000000ba430001 [16386.256961] anon flags: 0x43ffff800000000(node=4|zone=0|lastcpupid=0x7ffff) [16386.256975] page_type: 0xfdffffff(slab) [16386.256990] raw: 043ffff800000000 c00000000501c080 0000000000000000 5deadbee00000001 [16386.257003] raw: 0000000000000000 00000000011a011a 00000001fdffffff c0000000ba430001 [16386.257018] page dumped because: kasan: bad access detected 
This patch adds close() callback in vas_vm_ops vm_operations_struct which will be executed during munmap() before freeing VMA. The VMA address in the VAS window is set to NULL after holding the window mmap_mutex. Fixes: 37e6764895ef ("powerpc/pseries/vas: Add VAS migration handler") Signed-off-by: Haren Myneni Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20241214051758.997759-1-haren@linux.ibm.com --- arch/powerpc/platforms/book3s/vas-api.c | 36 +++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index f381b177ea06..0b6365d85d11 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -464,7 +464,43 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +/* + * During mmap() paste address, mapping VMA is saved in VAS window + * struct which is used to unmap during migration if the window is + * still open. But the user space can remove this mapping with + * munmap() before closing the window and the VMA address will + * be invalid. Set VAS window VMA to NULL in this function which + * is called before VMA free. + */ +static void vas_mmap_close(struct vm_area_struct *vma) +{ + struct file *fp = vma->vm_file; + struct coproc_instance *cp_inst = fp->private_data; + struct vas_window *txwin; + + /* Should not happen */ + if (!cp_inst || !cp_inst->txwin) { + pr_err("No attached VAS window for the paste address mmap\n"); + return; + } + + txwin = cp_inst->txwin; + /* + * task_ref.vma is set in coproc_mmap() during mmap paste + * address. So it has to be the same VMA that is getting freed. 
+ */ + if (WARN_ON(txwin->task_ref.vma != vma)) { + pr_err("Invalid paste address mmaping\n"); + return; + } + + mutex_lock(&txwin->task_ref.mmap_mutex); + txwin->task_ref.vma = NULL; + mutex_unlock(&txwin->task_ref.mmap_mutex); +} + static const struct vm_operations_struct vas_vm_ops = { + .close = vas_mmap_close, .fault = vas_mmap_fault, }; From edc19bd0e571c732cd01c8da62f904e6d2a29a48 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Tue, 17 Dec 2024 16:00:21 +0100 Subject: [PATCH 275/807] pwm: stm32: Fix complementary output in round_waveform_tohw() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the timer supports complementary output, the CCxNE bit must be set additionally to the CCxE bit. So to not overwrite the latter use |= instead of = to set the former. Fixes: deaba9cff809 ("pwm: stm32: Implementation of the waveform callbacks") Signed-off-by: Fabrice Gasnier Link: https://lore.kernel.org/r/20241217150021.2030213-1-fabrice.gasnier@foss.st.com [ukleinek: Slightly improve commit log] Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index b889e64522c3..17e591f61efb 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -84,7 +84,7 @@ static int stm32_pwm_round_waveform_tohw(struct pwm_chip *chip, wfhw->ccer = TIM_CCER_CCxE(ch + 1); if (priv->have_complementary_output) - wfhw->ccer = TIM_CCER_CCxNE(ch + 1); + wfhw->ccer |= TIM_CCER_CCxNE(ch + 1); rate = clk_get_rate(priv->clk); From 4feaedf7d243f1a9af36dfb2711a5641fe3559dc Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 16 Dec 2024 22:26:44 +0100 Subject: [PATCH 276/807] thermal/thresholds: Fix boundaries and detection routine The current implementation does not work if the thermal zone is interrupt driven only. 
The boundaries are not correctly checked and computed as it happens only when the temperature is increasing or decreasing. The problem arises because the routine to detect when we cross a threshold is correlated with the computation of the boundaries. We assume we have to recompute the boundaries when a threshold is crossed but actually we should do that even if it is not the case. Mixing the boundaries computation and the threshold detection for the sake of optimizing the routine is much more complex than it intuitively appears and is prone to errors. This fix separates the boundaries computation and the threshold crossing detection into different routines. The result is code that is much simpler to understand, thus easier to maintain. The drawback is we browse the thresholds list several times but we can consider that as negligible because that happens when the temperature is updated. There are certainly some areas to improve in the temperature update routine but it would not be adequate as this change aims to fix the thresholds for v6.13. Fixes: 445936f9e258 ("thermal: core: Add user thresholds support") Tested-by: Daniel Lezcano # rock5b, Lenovo x13s Signed-off-by: Daniel Lezcano Link: https://patch.msgid.link/20241216212644.1145122-1-daniel.lezcano@linaro.org Signed-off-by: Rafael J. 
Wysocki --- drivers/thermal/thermal_thresholds.c | 76 +++++++++++++++------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/drivers/thermal/thermal_thresholds.c b/drivers/thermal/thermal_thresholds.c index d9b2a0bb44fc..38f5fd0e8930 100644 --- a/drivers/thermal/thermal_thresholds.c +++ b/drivers/thermal/thermal_thresholds.c @@ -69,40 +69,18 @@ static struct user_threshold *__thermal_thresholds_find(const struct list_head * return NULL; } -static bool __thermal_threshold_is_crossed(struct user_threshold *threshold, int temperature, - int last_temperature, int direction, - int *low, int *high) -{ - - if (temperature >= threshold->temperature) { - if (threshold->temperature > *low && - THERMAL_THRESHOLD_WAY_DOWN & threshold->direction) - *low = threshold->temperature; - - if (last_temperature < threshold->temperature && - threshold->direction & direction) - return true; - } else { - if (threshold->temperature < *high && THERMAL_THRESHOLD_WAY_UP - & threshold->direction) - *high = threshold->temperature; - - if (last_temperature >= threshold->temperature && - threshold->direction & direction) - return true; - } - - return false; -} - static bool thermal_thresholds_handle_raising(struct list_head *thresholds, int temperature, - int last_temperature, int *low, int *high) + int last_temperature) { struct user_threshold *t; list_for_each_entry(t, thresholds, list_node) { - if (__thermal_threshold_is_crossed(t, temperature, last_temperature, - THERMAL_THRESHOLD_WAY_UP, low, high)) + + if (!(t->direction & THERMAL_THRESHOLD_WAY_UP)) + continue; + + if (temperature >= t->temperature && + last_temperature < t->temperature) return true; } @@ -110,19 +88,43 @@ static bool thermal_thresholds_handle_raising(struct list_head *thresholds, int } static bool thermal_thresholds_handle_dropping(struct list_head *thresholds, int temperature, - int last_temperature, int *low, int *high) + int last_temperature) { struct user_threshold *t; 
list_for_each_entry_reverse(t, thresholds, list_node) { - if (__thermal_threshold_is_crossed(t, temperature, last_temperature, - THERMAL_THRESHOLD_WAY_DOWN, low, high)) + + if (!(t->direction & THERMAL_THRESHOLD_WAY_DOWN)) + continue; + + if (temperature <= t->temperature && + last_temperature > t->temperature) return true; } return false; } +static void thermal_threshold_find_boundaries(struct list_head *thresholds, int temperature, + int *low, int *high) +{ + struct user_threshold *t; + + list_for_each_entry(t, thresholds, list_node) { + if (temperature < t->temperature && + (t->direction & THERMAL_THRESHOLD_WAY_UP) && + *high > t->temperature) + *high = t->temperature; + } + + list_for_each_entry_reverse(t, thresholds, list_node) { + if (temperature > t->temperature && + (t->direction & THERMAL_THRESHOLD_WAY_DOWN) && + *low < t->temperature) + *low = t->temperature; + } +} + void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *high) { struct list_head *thresholds = &tz->user_thresholds; @@ -132,6 +134,8 @@ void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *hi lockdep_assert_held(&tz->lock); + thermal_threshold_find_boundaries(thresholds, temperature, low, high); + /* * We need a second update in order to detect a threshold being crossed */ @@ -151,12 +155,12 @@ void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *hi * - decreased : thresholds are crossed the way down */ if (temperature > last_temperature) { - if (thermal_thresholds_handle_raising(thresholds, temperature, - last_temperature, low, high)) + if (thermal_thresholds_handle_raising(thresholds, + temperature, last_temperature)) thermal_notify_threshold_up(tz); } else { - if (thermal_thresholds_handle_dropping(thresholds, temperature, - last_temperature, low, high)) + if (thermal_thresholds_handle_dropping(thresholds, + temperature, last_temperature)) thermal_notify_threshold_down(tz); } } From 
c9e3ebdc52ebe028f238c9df5162ae92483bedd5 Mon Sep 17 00:00:00 2001 From: Shuming Fan Date: Wed, 18 Dec 2024 17:13:07 +0800 Subject: [PATCH 277/807] ASoC: rt722: add delay time to wait for the calibration procedure The calibration procedure needs some time to finish. This patch adds the delay time to ensure the calibration procedure is completed correctly. Signed-off-by: Shuming Fan Link: https://patch.msgid.link/20241218091307.96656-1-shumingf@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt722-sdca.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/rt722-sdca.c b/sound/soc/codecs/rt722-sdca.c index 908846e994df..e17a142d03b9 100644 --- a/sound/soc/codecs/rt722-sdca.c +++ b/sound/soc/codecs/rt722-sdca.c @@ -1468,13 +1468,18 @@ static void rt722_sdca_jack_preset(struct rt722_sdca_priv *rt722) 0x008d); /* check HP calibration FSM status */ for (loop_check = 0; loop_check < chk_cnt; loop_check++) { + usleep_range(10000, 11000); ret = rt722_sdca_index_read(rt722, RT722_VENDOR_CALI, RT722_DAC_DC_CALI_CTL3, &calib_status); - if (ret < 0 || loop_check == chk_cnt) + if (ret < 0) dev_dbg(&rt722->slave->dev, "calibration failed!, ret=%d\n", ret); if ((calib_status & 0x0040) == 0x0) break; } + + if (loop_check == chk_cnt) + dev_dbg(&rt722->slave->dev, "%s, calibration time-out!\n", __func__); + /* Set ADC09 power entity floating control */ rt722_sdca_index_write(rt722, RT722_VENDOR_HDA_CTL, RT722_ADC0A_08_PDE_FLOAT_CTL, 0x2a12); From 26fff8a4432ffd03409346b7dae1e1a2c5318b7c Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 17 Dec 2024 18:02:11 -0800 Subject: [PATCH 278/807] block/bdev: use helper for max block size check We already have a helper for checking the limits on the block size both low and high, just use that. No functional changes. 
Reviewed-by: John Garry Signed-off-by: Luis Chamberlain Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241218020212.3657139-2-mcgrof@kernel.org Signed-off-by: Jens Axboe --- block/bdev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 738e3c8457e7..9d73a8fbf7f9 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -155,8 +155,7 @@ int set_blocksize(struct file *file, int size) struct inode *inode = file->f_mapping->host; struct block_device *bdev = I_BDEV(inode); - /* Size must be a power of two, and between 512 and PAGE_SIZE */ - if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) + if (blk_validate_block_size(size)) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ From 51588b1b77b65cd0fb3440f78f37bef7178a2715 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 17 Dec 2024 18:02:12 -0800 Subject: [PATCH 279/807] nvme: use blk_validate_block_size() for max LBA check The block layer already has support to validates proper block sizes with blk_validate_block_size(), we can leverage that as well. No functional changes. Signed-off-by: Luis Chamberlain Reviewed-by: John Garry Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241218020212.3657139-3-mcgrof@kernel.org Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d169a30eb935..a970168a3014 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2034,7 +2034,7 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, * or smaller than a sector size yet, so catch this early and don't * allow block I/O. 
*/ - if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) { + if (blk_validate_block_size(bs)) { bs = (1 << 9); valid = false; } From 224749be6c23efe7fb8a030854f4fc5d1dd813b3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 18 Dec 2024 18:16:14 +0800 Subject: [PATCH 280/807] block: Revert "block: Fix potential deadlock while freezing queue and acquiring sysfs_lock" This reverts commit be26ba96421ab0a8fa2055ccf7db7832a13c44d2. Commit be26ba96421a ("block: Fix potential deadlock while freezing queue and acquiring sysfs_loc") actually reverts commit 22465bbac53c ("blk-mq: move cpuhp callback registering out of q->sysfs_lock"), and causes the original resctrl lockdep warning. So revert it and we need to fix the issue in another way. Cc: Nilay Shroff Fixes: be26ba96421a ("block: Fix potential deadlock while freezing queue and acquiring sysfs_loc") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241218101617.3275704-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 16 ++++++++++------ block/blk-mq.c | 29 +++++++++++------------------ block/blk-sysfs.c | 4 ++-- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index cd5ea6eaa76b..156e9bb07abf 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -275,13 +275,15 @@ void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned long i; - lockdep_assert_held(&q->sysfs_dir_lock); - + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) - return; + goto unlock; queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); + +unlock: + mutex_unlock(&q->sysfs_dir_lock); } int blk_mq_sysfs_register_hctxs(struct request_queue *q) @@ -290,10 +292,9 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) unsigned long i; int ret = 0; - lockdep_assert_held(&q->sysfs_dir_lock); - + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) - return ret; + goto unlock; 
queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); @@ -301,5 +302,8 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) break; } +unlock: + mutex_unlock(&q->sysfs_dir_lock); + return ret; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 6b6111513986..92e8ddf34575 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4453,8 +4453,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, unsigned long i, j; /* protect against switching io scheduler */ - lockdep_assert_held(&q->sysfs_lock); - + mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); @@ -4487,6 +4486,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); + mutex_unlock(&q->sysfs_lock); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); @@ -4518,14 +4518,10 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, xa_init(&q->hctx_table); - mutex_lock(&q->sysfs_lock); - blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; - mutex_unlock(&q->sysfs_lock); - INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? 
set->timeout : 30 * HZ); @@ -4544,7 +4540,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return 0; err_hctxs: - mutex_unlock(&q->sysfs_lock); blk_mq_release(q); err_exit: q->mq_ops = NULL; @@ -4925,12 +4920,12 @@ static bool blk_mq_elv_switch_none(struct list_head *head, return false; /* q->elevator needs protection from ->sysfs_lock */ - lockdep_assert_held(&q->sysfs_lock); + mutex_lock(&q->sysfs_lock); /* the check has to be done with holding sysfs_lock */ if (!q->elevator) { kfree(qe); - goto out; + goto unlock; } INIT_LIST_HEAD(&qe->node); @@ -4940,7 +4935,9 @@ static bool blk_mq_elv_switch_none(struct list_head *head, __elevator_get(qe->type); list_add(&qe->node, head); elevator_disable(q); -out: +unlock: + mutex_unlock(&q->sysfs_lock); + return true; } @@ -4969,9 +4966,11 @@ static void blk_mq_elv_switch_back(struct list_head *head, list_del(&qe->node); kfree(qe); + mutex_lock(&q->sysfs_lock); elevator_switch(q, t); /* drop the reference acquired in blk_mq_elv_switch_none */ elevator_put(t); + mutex_unlock(&q->sysfs_lock); } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, @@ -4991,11 +4990,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; - list_for_each_entry(q, &set->tag_list, tag_set_list) { - mutex_lock(&q->sysfs_dir_lock); - mutex_lock(&q->sysfs_lock); + list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); - } /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. 
We will switch back once we are done @@ -5051,11 +5047,8 @@ switch_back: list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_elv_switch_back(&head, q); - list_for_each_entry(q, &set->tag_list, tag_set_list) { + list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_unfreeze_queue(q); - mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); - } /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 64f70c713d2f..767598e719ab 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -706,11 +706,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, if (entry->load_module) entry->load_module(disk, page, length); - mutex_lock(&q->sysfs_lock); blk_mq_freeze_queue(q); + mutex_lock(&q->sysfs_lock); res = entry->store(disk, page, length); - blk_mq_unfreeze_queue(q); mutex_unlock(&q->sysfs_lock); + blk_mq_unfreeze_queue(q); return res; } From 85672ca9ceeaa1dcf2777a7048af5f4aee3fd02b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 18 Dec 2024 18:16:15 +0800 Subject: [PATCH 281/807] block: avoid to reuse `hctx` not removed from cpuhp callback list If the 'hctx' isn't removed from cpuhp callback list, we can't reuse it, otherwise use-after-free may be triggered. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412172217.b906db7c-lkp@intel.com Tested-by: kernel test robot Fixes: 22465bbac53c ("blk-mq: move cpuhp callback registering out of q->sysfs_lock") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241218101617.3275704-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 92e8ddf34575..8ac19d4ae3c0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4412,6 +4412,15 @@ struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); +/* + * Only hctx removed from cpuhp list can be reused + */ +static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) +{ + return hlist_unhashed(&hctx->cpuhp_online) && + hlist_unhashed(&hctx->cpuhp_dead); +} + static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) @@ -4421,7 +4430,7 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( /* reuse dead hctx first */ spin_lock(&q->unused_hctx_lock); list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { - if (tmp->numa_node == node) { + if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { hctx = tmp; break; } From 7f9a1eed1ad8b274ed9163a02cef891a90427237 Mon Sep 17 00:00:00 2001 From: Jon Lin Date: Wed, 18 Dec 2024 23:47:41 +0800 Subject: [PATCH 282/807] spi: rockchip-sfc: Fix error in remove progress Fix error in remove progress: [ 43.026148] Call trace: [ 43.026370] klist_next+0x1c/0x1d4 [ 43.026671] device_for_each_child+0x48/0xac [ 43.027049] spi_unregister_controller+0x30/0x130 [ 43.027469] rockchip_sfc_remove+0x48/0x80 [spi_rockchip_sfc] Signed-off-by: Jon Lin Link: https://patch.msgid.link/20241218154741.901591-1-jon.lin@rock-chips.com Signed-off-by: Mark Brown --- drivers/spi/spi-rockchip-sfc.c | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-rockchip-sfc.c b/drivers/spi/spi-rockchip-sfc.c index 69d0f2175568..70bbb459caa4 100644 --- a/drivers/spi/spi-rockchip-sfc.c +++ b/drivers/spi/spi-rockchip-sfc.c @@ -182,6 +182,7 @@ struct rockchip_sfc { bool use_dma; u32 max_iosize; u16 version; + struct spi_controller *host; }; static int rockchip_sfc_reset(struct rockchip_sfc *sfc) @@ -574,6 +575,7 @@ static int rockchip_sfc_probe(struct platform_device *pdev) sfc = spi_controller_get_devdata(host); sfc->dev = dev; + sfc->host = host; sfc->regbase = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(sfc->regbase)) @@ -651,8 +653,8 @@ err_hclk: static void rockchip_sfc_remove(struct platform_device *pdev) { - struct spi_controller *host = platform_get_drvdata(pdev); struct rockchip_sfc *sfc = platform_get_drvdata(pdev); + struct spi_controller *host = sfc->host; spi_unregister_controller(host); From cc0c53f4fac562efb3aca2bc493515e77642ae33 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Jun 2024 14:12:45 -0700 Subject: [PATCH 283/807] wifi: iwlwifi: mvm: Fix __counted_by usage in cfg80211_wowlan_nd_* Both struct cfg80211_wowlan_nd_match and struct cfg80211_wowlan_nd_info pre-allocate space for channels and matches, but then may end up using fewer that the full allocation. Shrink the associated counter (n_channels and n_matches) after counting the results. This avoids compile-time (and run-time) warnings from __counted_by. (The counter member needs to be updated _before_ accessing the array index.) 
Seen with coming GCC 15: drivers/net/wireless/intel/iwlwifi/mvm/d3.c: In function 'iwl_mvm_query_set_freqs': drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2877:66: warning: operation on 'match->n_channels' may be undefined [-Wsequence-point] 2877 | match->channels[match->n_channels++] = | ~~~~~~~~~~~~~~~~~^~ drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2885:66: warning: operation on 'match->n_channels' may be undefined [-Wsequence-point] 2885 | match->channels[match->n_channels++] = | ~~~~~~~~~~~~~~~~~^~ drivers/net/wireless/intel/iwlwifi/mvm/d3.c: In function 'iwl_mvm_query_netdetect_reasons': drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2982:58: warning: operation on 'net_detect->n_matches' may be undefined [-Wsequence-point] 2982 | net_detect->matches[net_detect->n_matches++] = match; | ~~~~~~~~~~~~~~~~~~~~~^~ Cc: stable@vger.kernel.org Fixes: aa4ec06c455d ("wifi: cfg80211: use __counted_by where appropriate") Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20240619211233.work.355-kees@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index f85c01e04ebf..7d973546c9fb 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -2954,6 +2954,7 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, int idx) { int i; + int n_channels = 0; if (fw_has_api(&mvm->fw->ucode_capa, IWL_UCODE_TLV_API_SCAN_OFFLOAD_CHANS)) { @@ -2962,7 +2963,7 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, for (i = 0; i < SCAN_OFFLOAD_MATCHING_CHANNELS_LEN * 8; i++) if (matches[idx].matching_channels[i / 8] & (BIT(i % 8))) - match->channels[match->n_channels++] = + match->channels[n_channels++] = mvm->nd_channels[i]->center_freq; } else { struct iwl_scan_offload_profile_match_v1 
*matches = @@ -2970,9 +2971,11 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, for (i = 0; i < SCAN_OFFLOAD_MATCHING_CHANNELS_LEN_V1 * 8; i++) if (matches[idx].matching_channels[i / 8] & (BIT(i % 8))) - match->channels[match->n_channels++] = + match->channels[n_channels++] = mvm->nd_channels[i]->center_freq; } + /* We may have ended up with fewer channels than we allocated. */ + match->n_channels = n_channels; } /** @@ -3053,6 +3056,8 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, GFP_KERNEL); if (!net_detect || !n_matches) goto out_report_nd; + net_detect->n_matches = n_matches; + n_matches = 0; for_each_set_bit(i, &matched_profiles, mvm->n_nd_match_sets) { struct cfg80211_wowlan_nd_match *match; @@ -3066,8 +3071,9 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, GFP_KERNEL); if (!match) goto out_report_nd; + match->n_channels = n_channels; - net_detect->matches[net_detect->n_matches++] = match; + net_detect->matches[n_matches++] = match; /* We inverted the order of the SSIDs in the scan * request, so invert the index here. @@ -3082,6 +3088,8 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, iwl_mvm_query_set_freqs(mvm, d3_data->nd_results, match, i); } + /* We may have fewer matches than we allocated. */ + net_detect->n_matches = n_matches; out_report_nd: wakeup.net_detect = net_detect; From 349f0086ba8b2a169877d21ff15a4d9da3a60054 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 18 Dec 2024 09:02:28 +0100 Subject: [PATCH 284/807] x86/static-call: fix 32-bit build In 32-bit x86 builds CONFIG_STATIC_CALL_INLINE isn't set, leading to static_call_initialized not being available. Define it as "0" in that case. 
Reported-by: Stephen Rothwell Fixes: 0ef8047b737d ("x86/static-call: provide a way to do very early static-call updates") Signed-off-by: Juergen Gross Acked-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- include/linux/static_call.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 785980af8972..78a77a4ae0ea 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,7 +138,6 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include -extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ @@ -161,6 +160,8 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool #ifdef CONFIG_HAVE_STATIC_CALL_INLINE +extern int static_call_initialized; + extern int __init static_call_init(void); extern void static_call_force_reinit(void); @@ -226,6 +227,8 @@ extern long __static_call_return0(void); #elif defined(CONFIG_HAVE_STATIC_CALL) +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } #define DEFINE_STATIC_CALL(name, _func) \ @@ -282,6 +285,8 @@ extern long __static_call_return0(void); #else /* Generic implementation */ +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } static inline long __static_call_return0(void) From 536ae08d7b6ae16872f0b3c2679e656a7fc9d5e2 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 11 Dec 2024 09:56:01 -0600 Subject: [PATCH 285/807] drm/amd: Require CONFIG_HOTPLUG_PCI_PCIE for BOCO If the kernel hasn't been compiled with PCIe hotplug support this can lead to problems with dGPUs that use BOCO because they effectively drop off the bus. To prevent issues, disable BOCO support when compiled without PCIe hotplug. 
Reported-by: Gabriel Marcano Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/1707#note_2696862 Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20241211155601.3585256-1-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 1ad5bdc28bafa66db0f041cc6cdd278a80426aae) --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d272d95dd5b2..cd4fac120834 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -417,6 +417,9 @@ bool amdgpu_device_supports_boco(struct drm_device *dev) { struct amdgpu_device *adev = drm_to_adev(dev); + if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) + return false; + if (adev->has_pr3 || ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) return true; From a93b1020eb9386d7da11608477121b10079c076a Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Fri, 6 Dec 2024 13:17:45 +0100 Subject: [PATCH 286/807] drm/amdgpu: don't access invalid sched MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 2320c9e6a768 ("drm/sched: memset() 'job' in drm_sched_job_init()") accessing job->base.sched can produce unexpected results as the initialisation of (*job)->base.sched done in amdgpu_job_alloc is overwritten by the memset. This commit fixes an issue when a CS would fail validation and would be rejected after job->num_ibs is incremented. In this case, amdgpu_ib_free(ring->adev, ...) will be called, which would crash the machine because the ring value is bogus. To fix this, pass a NULL pointer to amdgpu_ib_free(): we can do this because the device is actually not used in this function. The next commit will remove the ring argument completely. 
Fixes: 2320c9e6a768 ("drm/sched: memset() 'job' in drm_sched_job_init()") Signed-off-by: Pierre-Eric Pelloux-Prayer Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 2ae520cb12831d264ceb97c61f72c59d33c0dbd7) --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index b9d08bc96581..a21c510c408e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -255,7 +255,6 @@ void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds, void amdgpu_job_free_resources(struct amdgpu_job *job) { - struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched); struct dma_fence *f; unsigned i; @@ -268,7 +267,7 @@ void amdgpu_job_free_resources(struct amdgpu_job *job) f = NULL; for (i = 0; i < job->num_ibs; ++i) - amdgpu_ib_free(ring->adev, &job->ibs[i], f); + amdgpu_ib_free(NULL, &job->ibs[i], f); } static void amdgpu_job_free_cb(struct drm_sched_job *s_job) From 146b6057e1fd28fb1a38d300bf76a38dfba7f9fb Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 17 Dec 2024 13:55:48 +0100 Subject: [PATCH 287/807] wifi: cw1200: Fix potential NULL dereference A recent refactoring was identified by smatch to cause another potential NULL dereference: drivers/net/wireless/st/cw1200/cw1200_spi.c:440 cw1200_spi_disconnect() error: we previously assumed 'self' could be null (see line 433) Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202411271742.Xa7CNVh1-lkp@intel.com/ Fixes: 2719a9e7156c ("wifi: cw1200: Convert to GPIO descriptors") Signed-off-by: Linus Walleij Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241217-cw1200-fix-v1-1-911e6b5823ec@linaro.org --- drivers/net/wireless/st/cw1200/cw1200_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/drivers/net/wireless/st/cw1200/cw1200_spi.c b/drivers/net/wireless/st/cw1200/cw1200_spi.c index 862964a8cc87..52386dfb5f4a 100644 --- a/drivers/net/wireless/st/cw1200/cw1200_spi.c +++ b/drivers/net/wireless/st/cw1200/cw1200_spi.c @@ -442,8 +442,8 @@ static void cw1200_spi_disconnect(struct spi_device *func) cw1200_core_release(self->core); self->core = NULL; } + cw1200_spi_off(self, dev_get_platdata(&func->dev)); } - cw1200_spi_off(self, dev_get_platdata(&func->dev)); } static int __maybe_unused cw1200_spi_suspend(struct device *dev) From 458600da793da12e0f3724ecbea34a80703f4d5b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 16:47:48 -0500 Subject: [PATCH 288/807] drm/amdgpu/nbio7.7: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 22b9555bc90df22b585bdd1f161b61584b13af51) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c index 1ac730328516..3fb6d2aa7e3b 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c @@ -247,7 +247,7 @@ static void nbio_v7_7_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF0_PCIE_MST_CTRL_3, data); - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(7, 7, 0): data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4) & ~BIT(23); WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4, data); From 8c1ecc7197a88c6ae62de56e1c0887f220712a32 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:00:07 -0500 Subject: [PATCH 289/807] drm/amdgpu/nbio7.11: fix IP version check Use the helper function rather than reading it directly. 
Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 2c8eeaaa0fe5841ccf07a0eb51b1426f34ef39f7) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c index 814ab59fdd4a..41421da63a08 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c @@ -275,7 +275,7 @@ static void nbio_v7_11_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_PCIE_MST_CTRL_3, data); - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(7, 11, 0): case IP_VERSION(7, 11, 1): case IP_VERSION(7, 11, 2): From 6ebc5b92190e01dd48313b68cbf752c9adcfefa8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:03:20 -0500 Subject: [PATCH 290/807] drm/amdgpu/mmhub4.1: fix IP version check Use the helper function rather than reading it directly. 
Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 63bfd24088b42c6f55c2096bfc41b50213d419b2) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c index 0fbc3be81f14..f2ab5001b492 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c @@ -108,7 +108,7 @@ mmhub_v4_1_0_print_l2_protection_fault_status(struct amdgpu_device *adev, dev_err(adev->dev, "MMVM_L2_PROTECTION_FAULT_STATUS_LO32:0x%08X\n", status); - switch (adev->ip_versions[MMHUB_HWIP][0]) { + switch (amdgpu_ip_version(adev, MMHUB_HWIP, 0)) { case IP_VERSION(4, 1, 0): mmhub_cid = mmhub_client_ids_v4_1_0[cid][rw]; break; From 41be00f839e9ee7753892a73a36ce4c14c6f5cbf Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:04:58 -0500 Subject: [PATCH 291/807] drm/amdgpu/gfx12: fix IP version check Use the helper function rather than reading it directly. 
Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit f1fd1d0f40272948aa6ab82a3a82ecbbc76dff53) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index fe7c48f2fb2a..da327ab48a57 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -4123,7 +4123,7 @@ static int gfx_v12_0_set_clockgating_state(void *handle, if (amdgpu_sriov_vf(adev)) return 0; - switch (adev->ip_versions[GC_HWIP][0]) { + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): gfx_v12_0_update_gfx_clock_gating(adev, From 9e752ee26c1031312a01d2afc281f5f6fdfca176 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:06:26 -0500 Subject: [PATCH 292/807] drm/amdgpu/smu14.0.2: fix IP version check Use the helper function rather than reading it directly. 
Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 8f2cd1067afe68372a1723e05e19b68ed187676a) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 6a565ce74d5b..5cad09c5f2ff 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -2096,7 +2096,7 @@ static int smu_v14_0_2_enable_gfx_features(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; - if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(14, 0, 2)) + if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 2)) return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_EnableAllSmuFeatures, FEATURE_PWR_GFX, NULL); else From 8d1a13816e59254bd3b18f5ae0895230922bd120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 12 Dec 2024 16:29:18 +0100 Subject: [PATCH 293/807] drm/amdgpu: fix amdgpu_coredump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VM pointer might already be outdated when that function is called. Use the PASID to gather the information instead.
Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 57f812d171af4ba233d3ed7c94dfa5b8e92dcc04) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index 946c48829f19..824f9da5b6ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -343,11 +343,10 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, coredump->skip_vram_check = skip_vram_check; coredump->reset_vram_lost = vram_lost; - if (job && job->vm) { - struct amdgpu_vm *vm = job->vm; + if (job && job->pasid) { struct amdgpu_task_info *ti; - ti = amdgpu_vm_get_task_info_vm(vm); + ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); if (ti) { coredump->reset_task_info = *ti; amdgpu_vm_put_task_info(ti); From 85230ee36d88e7a09fb062d43203035659dd10a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Tue, 17 Dec 2024 18:22:56 +0100 Subject: [PATCH 294/807] drm/amdgpu: Handle NULL bo->tbo.resource (again) in amdgpu_vm_bo_update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Third time's the charm, I hope? 
Fixes: d3116756a710 ("drm/ttm: rename bo->mem and make it a pointer") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3837 Reviewed-by: Christian König Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher (cherry picked from commit 695c2c745e5dff201b75da8a1d237ce403600d04) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ddd7f05e4db9..c9c48b782ec1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1266,10 +1266,9 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, * next command submission. */ if (amdgpu_vm_is_bo_always_valid(vm, bo)) { - uint32_t mem_type = bo->tbo.resource->mem_type; - - if (!(bo->preferred_domains & - amdgpu_mem_type_to_domain(mem_type))) + if (bo->tbo.resource && + !(bo->preferred_domains & + amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type))) amdgpu_vm_bo_evicted(&bo_va->base); else amdgpu_vm_bo_idle(&bo_va->base); From c58a812c8e49ad688f94f4b050ad5c5b388fc5d2 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Wed, 18 Dec 2024 21:36:55 +0800 Subject: [PATCH 295/807] ring-buffer: Fix overflow in __rb_map_vma An overflow occurred when performing the following calculation: nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; Add a check before the calculation to avoid this problem. 
syzbot reported this as a slab-out-of-bounds in __rb_map_vma: BUG: KASAN: slab-out-of-bounds in __rb_map_vma+0x9ab/0xae0 kernel/trace/ring_buffer.c:7058 Read of size 8 at addr ffff8880767dd2b8 by task syz-executor187/5836 CPU: 0 UID: 0 PID: 5836 Comm: syz-executor187 Not tainted 6.13.0-rc2-syzkaller-00159-gf932fb9b4074 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xc3/0x620 mm/kasan/report.c:489 kasan_report+0xd9/0x110 mm/kasan/report.c:602 __rb_map_vma+0x9ab/0xae0 kernel/trace/ring_buffer.c:7058 ring_buffer_map+0x56e/0x9b0 kernel/trace/ring_buffer.c:7138 tracing_buffers_mmap+0xa6/0x120 kernel/trace/trace.c:8482 call_mmap include/linux/fs.h:2183 [inline] mmap_file mm/internal.h:124 [inline] __mmap_new_file_vma mm/vma.c:2291 [inline] __mmap_new_vma mm/vma.c:2355 [inline] __mmap_region+0x1786/0x2670 mm/vma.c:2456 mmap_region+0x127/0x320 mm/mmap.c:1348 do_mmap+0xc00/0xfc0 mm/mmap.c:496 vm_mmap_pgoff+0x1ba/0x360 mm/util.c:580 ksys_mmap_pgoff+0x32c/0x5c0 mm/mmap.c:542 __do_sys_mmap arch/x86/kernel/sys_x86_64.c:89 [inline] __se_sys_mmap arch/x86/kernel/sys_x86_64.c:82 [inline] __x64_sys_mmap+0x125/0x190 arch/x86/kernel/sys_x86_64.c:82 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f The reproducer for this bug is: ------------------------8<------------------------- #include #include #include #include #include int main(int argc, char **argv) { int page_size = getpagesize(); int fd; void *meta; system("echo 1 > /sys/kernel/tracing/buffer_size_kb"); fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw", O_RDONLY); meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, page_size * 5); } ------------------------>8------------------------- Cc: 
stable@vger.kernel.org Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions") Link: https://lore.kernel.org/tencent_06924B6674ED771167C23CC336C097223609@qq.com Reported-by: syzbot+345e4443a21200874b18@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=345e4443a21200874b18 Signed-off-by: Edward Adam Davis Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7e257e855dd1..60210fb5b211 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7019,7 +7019,11 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, lockdep_assert_held(&cpu_buffer->mapping_lock); nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ - nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */ + nr_pages = ((nr_subbufs + 1) << subbuf_order); /* + meta-page */ + if (nr_pages <= pgoff) + return -EINVAL; + + nr_pages -= pgoff; nr_vma_pages = vma_pages(vma); if (!nr_vma_pages || nr_vma_pages > nr_pages) From 8cd63406d08110c8098e1efda8aef7ddab4db348 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Dec 2024 14:15:07 -0500 Subject: [PATCH 296/807] trace/ring-buffer: Do not use TP_printk() formatting for boot mapped buffers The TP_printk() of a TRACE_EVENT() is a generic printf format that any developer can create for their event. It may include pointers to strings and such. A boot mapped buffer may contain data from a previous kernel where the strings addresses are different. One solution is to copy the event content and update the pointers by the recorded delta, but a simpler solution (for now) is to just use the print_fields() function to print these events. The print_fields() function just iterates the fields and prints them according to what type they are, and ignores the TP_printk() format from the event itself. 
To understand the difference, when printing via TP_printk() the output looks like this: 4582.696626: kmem_cache_alloc: call_site=getname_flags+0x47/0x1f0 ptr=00000000e70e10e0 bytes_req=4096 bytes_alloc=4096 gfp_flags=GFP_KERNEL node=-1 accounted=false 4582.696629: kmem_cache_alloc: call_site=alloc_empty_file+0x6b/0x110 ptr=0000000095808002 bytes_req=360 bytes_alloc=384 gfp_flags=GFP_KERNEL node=-1 accounted=false 4582.696630: kmem_cache_alloc: call_site=security_file_alloc+0x24/0x100 ptr=00000000576339c3 bytes_req=16 bytes_alloc=16 gfp_flags=GFP_KERNEL|__GFP_ZERO node=-1 accounted=false 4582.696653: kmem_cache_free: call_site=do_sys_openat2+0xa7/0xd0 ptr=00000000e70e10e0 name=names_cache But when printing via print_fields() (echo 1 > /sys/kernel/tracing/options/fields) the same event output looks like this: 4582.696626: kmem_cache_alloc: call_site=0xffffffff92d10d97 (-1831793257) ptr=0xffff9e0e8571e000 (-107689771147264) bytes_req=0x1000 (4096) bytes_alloc=0x1000 (4096) gfp_flags=0xcc0 (3264) node=0xffffffff (-1) accounted=(0) 4582.696629: kmem_cache_alloc: call_site=0xffffffff92d0250b (-1831852789) ptr=0xffff9e0e8577f800 (-107689770747904) bytes_req=0x168 (360) bytes_alloc=0x180 (384) gfp_flags=0xcc0 (3264) node=0xffffffff (-1) accounted=(0) 4582.696630: kmem_cache_alloc: call_site=0xffffffff92efca74 (-1829778828) ptr=0xffff9e0e8d35d3b0 (-107689640864848) bytes_req=0x10 (16) bytes_alloc=0x10 (16) gfp_flags=0xdc0 (3520) node=0xffffffff (-1) accounted=(0) 4582.696653: kmem_cache_free: call_site=0xffffffff92cfbea7 (-1831879001) ptr=0xffff9e0e8571e000 (-107689771147264) name=names_cache Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Linus Torvalds Link: https://lore.kernel.org/20241218141507.28389a1d@gandalf.local.home Fixes: 07714b4bb3f98 ("tracing: Handle old buffer mappings for event strings and functions") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git 
a/kernel/trace/trace.c b/kernel/trace/trace.c index be62f0ea1814..6581cb2bc67f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4353,6 +4353,15 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (event) { if (tr->trace_flags & TRACE_ITER_FIELDS) return print_event_fields(iter, event); + /* + * For TRACE_EVENT() events, the print_fmt is not + * safe to use if the array has delta offsets + * Force printing via the fields. + */ + if ((tr->text_delta || tr->data_delta) && + event->type > __TRACE_LAST_TYPE) + return print_event_fields(iter, event); + return event->funcs->trace(iter, sym_flags, event); } From 0674188f2f4d38d74aa863f17373d76256f2ed09 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 17 Dec 2024 15:37:04 +0800 Subject: [PATCH 297/807] ACPI: EC: Enable EC support on LoongArch by default Commit a6021aa24f6417416d933 ("ACPI: EC: make EC support compile-time conditional") only enabled ACPI_EC on X86 by default, but the embedded controller is also widely used on LoongArch laptops so we also enable ACPI_EC for LoongArch. The laptop driver cannot work without EC, so also update the dependency of LOONGSON_LAPTOP to let it depend on ACPI_EC. Fixes: a6021aa24f6417416d933 ("ACPI: EC: make EC support compile-time conditional") Reported-by: Xiaotian Wu Tested-by: Binbin Zhou Signed-off-by: Huacai Chen Link: https://patch.msgid.link/20241217073704.3339587-1-chenhuacai@loongson.cn [ rjw: Added Fixes: ] Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/Kconfig | 4 ++-- drivers/platform/loongarch/Kconfig | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index d65cd08ba8e1..d81b55f5068c 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -135,10 +135,10 @@ config ACPI_REV_OVERRIDE_POSSIBLE config ACPI_EC bool "Embedded Controller" depends on HAS_IOPORT - default X86 + default X86 || LOONGARCH help This driver handles communication with the microcontroller - on many x86 laptops and other machines. + on many x86/LoongArch laptops and other machines. config ACPI_EC_DEBUGFS tristate "EC read/write access through /sys/kernel/debug/ec" diff --git a/drivers/platform/loongarch/Kconfig b/drivers/platform/loongarch/Kconfig index 5633e4d73991..447528797d07 100644 --- a/drivers/platform/loongarch/Kconfig +++ b/drivers/platform/loongarch/Kconfig @@ -18,7 +18,7 @@ if LOONGARCH_PLATFORM_DEVICES config LOONGSON_LAPTOP tristate "Generic Loongson-3 Laptop Driver" - depends on ACPI + depends on ACPI_EC depends on BACKLIGHT_CLASS_DEVICE depends on INPUT depends on MACH_LOONGSON64 From a7f9d98eb1202132014ba760c26ad8608ffc9caf Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 10 Dec 2024 20:44:14 -0600 Subject: [PATCH 298/807] drm/amd: Update strapping for NBIO 2.5.0 This helps to avoid a spurious PME event on hotplug to Azalia. 
Cc: Vijendar Mukunda Reported-and-tested-by: ionut_n2001@yahoo.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=215884 Tested-by: Gabriel Marcano Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20241211024414.7840-1-mario.limonciello@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 3f6f237b9dd189e1fb85b8a3f7c97a8f27c1e49a) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c index b1b57dcc5a73..49e953f86ced 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c @@ -271,8 +271,19 @@ const struct nbio_hdp_flush_reg nbio_v7_0_hdp_flush_reg = { .ref_and_mask_sdma1 = GPU_HDP_FLUSH_DONE__SDMA1_MASK, }; +#define regRCC_DEV0_EPF6_STRAP4 0xd304 +#define regRCC_DEV0_EPF6_STRAP4_BASE_IDX 5 + static void nbio_v7_0_init_registers(struct amdgpu_device *adev) { + uint32_t data; + + switch (adev->ip_versions[NBIO_HWIP][0]) { + case IP_VERSION(2, 5, 0): + data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23); + WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data); + break; + } } #define MMIO_REG_HOLE_OFFSET (0x80000 - PAGE_SIZE) From 3abb660f9e18925468685591a3702bda05faba4f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 16:49:20 -0500 Subject: [PATCH 299/807] drm/amdgpu/nbio7.0: fix IP version check Use the helper function rather than reading it directly. 
Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 0ec43fbece784215d3c4469973e4556d70bce915) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c index 49e953f86ced..d1032e9992b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c @@ -278,7 +278,7 @@ static void nbio_v7_0_init_registers(struct amdgpu_device *adev) { uint32_t data; - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(2, 5, 0): data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23); WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data); From 985bb51f17abbe83c697a5ac0aa40fad5f4e00f4 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 28 Nov 2024 15:44:06 +0000 Subject: [PATCH 300/807] KVM: arm64: Always check the state from hyp_ack_unshare() There are multiple pKVM memory transitions where the state of a page is not cross-checked from the completer's PoV for performance reasons. For example, if a page is PKVM_PAGE_OWNED from the initiator's PoV, we should be guaranteed by construction that it is PKVM_NOPAGE for everybody else, hence allowing us to save a page-table lookup. When it was introduced, hyp_ack_unshare() followed that logic and bailed out without checking the PKVM_PAGE_SHARED_BORROWED state in the hypervisor's stage-1. This was correct as we could safely assume that all host-initiated shares were directed at the hypervisor at the time. But with the introduction of other types of shares (e.g. for FF-A or non-protected guests), it is now very much required to cross check this state to prevent the host from running __pkvm_host_unshare_hyp() on a page shared with TZ or a non-protected guest. 
Thankfully, if an attacker were to try this, the hyp_unmap() call from hyp_complete_unshare() would fail, hence triggering a WARN() from __do_unshare() with the host lock held, which is fatal. But this is fragile at best, and can hardly be considered a security measure. Let's just do the right thing and always check the state from hyp_ack_unshare(). Signed-off-by: Quentin Perret Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241128154406.602875-1-qperret@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index caba3e4bd09e..e75374d682f4 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -783,9 +783,6 @@ static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr)) return -EBUSY; - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; - return __hyp_check_page_state_range(addr, size, PKVM_PAGE_SHARED_BORROWED); } From e22c369520d0a2a191820cc308f81a860b1b8d47 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 17 Dec 2024 09:55:13 -0800 Subject: [PATCH 301/807] KVM: arm64: Add unified helper for reprogramming counters by mask Having separate helpers for enabling/disabling counters provides the wrong abstraction, as the state of each counter needs to be evaluated independently and, in some cases, use a different global enable bit. Collapse the enable/disable accessors into a single, common helper that reconfigures every counter set in @mask, leaving the complexity of determining if an event is actually enabled in kvm_pmu_counter_is_enabled().
Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241217175513.3658056-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 68 ++++++++++++++------------------------- arch/arm64/kvm/sys_regs.c | 10 +++--- include/kvm/arm_pmu.h | 6 ++-- 3 files changed, 30 insertions(+), 54 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 456102bc0b55..6b3ec956a6e2 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -24,6 +24,7 @@ static DEFINE_MUTEX(arm_pmus_lock); static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc); static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); +static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc); static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc) { @@ -327,48 +328,25 @@ u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); } -/** - * kvm_pmu_enable_counter_mask - enable selected PMU counters - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENSET register - * - * Call perf_event_enable to start counting the perf event - */ -void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +static void kvm_pmc_enable_perf_event(struct kvm_pmc *pmc) { - int i; - if (!kvm_vcpu_has_pmu(vcpu)) + if (!pmc->perf_event) { + kvm_pmu_create_perf_event(pmc); return; - - if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) || !val) - return; - - for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { - struct kvm_pmc *pmc; - - if (!(val & BIT(i))) - continue; - - pmc = kvm_vcpu_idx_to_pmc(vcpu, i); - - if (!pmc->perf_event) { - kvm_pmu_create_perf_event(pmc); - } else { - perf_event_enable(pmc->perf_event); - if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) - kvm_debug("fail to enable perf event\n"); - } } + + perf_event_enable(pmc->perf_event); + if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) + kvm_debug("fail to enable perf event\n"); } -/** - 
* kvm_pmu_disable_counter_mask - disable selected PMU counters - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENCLR register - * - * Call perf_event_disable to stop counting the perf event - */ -void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +static void kvm_pmc_disable_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) + perf_event_disable(pmc->perf_event); +} + +void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) { int i; @@ -376,16 +354,18 @@ void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) return; for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { - struct kvm_pmc *pmc; + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); if (!(val & BIT(i))) continue; - pmc = kvm_vcpu_idx_to_pmc(vcpu, i); - - if (pmc->perf_event) - perf_event_disable(pmc->perf_event); + if (kvm_pmu_counter_is_enabled(pmc)) + kvm_pmc_enable_perf_event(pmc); + else + kvm_pmc_disable_perf_event(pmc); } + + kvm_vcpu_pmu_restore_guest(vcpu); } /* @@ -630,10 +610,10 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P); if (val & ARMV8_PMU_PMCR_E) { - kvm_pmu_enable_counter_mask(vcpu, + kvm_pmu_reprogram_counter_mask(vcpu, __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); } else { - kvm_pmu_disable_counter_mask(vcpu, + kvm_pmu_reprogram_counter_mask(vcpu, __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e2a5c2918d9e..6ef8641d9833 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1208,16 +1208,14 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, mask = kvm_pmu_accessible_counter_mask(vcpu); if (p->is_write) { val = p->regval & mask; - if (r->Op2 & 0x1) { + if (r->Op2 & 0x1) /* accessing PMCNTENSET_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; - kvm_pmu_enable_counter_mask(vcpu, val); - kvm_vcpu_pmu_restore_guest(vcpu); - } 
else { + else /* accessing PMCNTENCLR_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; - kvm_pmu_disable_counter_mask(vcpu, val); - } + + kvm_pmu_reprogram_counter_mask(vcpu, val); } else { p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); } diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index e61dd7dd2286..147bd3ee4f7b 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -53,8 +53,7 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1); void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu); -void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val); -void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val); +void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu); bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu); @@ -127,8 +126,7 @@ static inline u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) static inline void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {} -static inline void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} -static inline void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} +static inline void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {} static inline bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) From adf8623b3f51e9c7eb18a7bb0381093f31053e38 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 17 Dec 2024 09:55:32 -0800 Subject: [PATCH 302/807] KVM: arm64: Use KVM_REQ_RELOAD_PMU to handle PMCR_EL0.E change Nested virt introduces yet another set of 'global' 
knobs for controlling event counters that are reserved for EL2 (i.e. >= HPMN). Get ready to share some plumbing with the NV controls by offloading counter reprogramming to KVM_REQ_RELOAD_PMU. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241217175532.3658134-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 6b3ec956a6e2..c6423782a8aa 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -606,17 +606,13 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) val &= ~ARMV8_PMU_PMCR_LP; + /* Request a reload of the PMU to enable/disable affected counters */ + if ((__vcpu_sys_reg(vcpu, PMCR_EL0) ^ val) & ARMV8_PMU_PMCR_E) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + /* The reset bits don't indicate any state, and shouldn't be saved. 
*/ __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P); - if (val & ARMV8_PMU_PMCR_E) { - kvm_pmu_reprogram_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); - } else { - kvm_pmu_reprogram_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); - } - if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); @@ -626,7 +622,6 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } - kvm_vcpu_pmu_restore_guest(vcpu); } static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) @@ -890,11 +885,11 @@ void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { u64 mask = kvm_pmu_implemented_counter_mask(vcpu); - kvm_pmu_handle_pmcr(vcpu, kvm_vcpu_read_pmcr(vcpu)); - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= mask; __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= mask; __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= mask; + + kvm_pmu_reprogram_counter_mask(vcpu, mask); } int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) From d3ba35b69eaed060bbc92a99bf027627bad170eb Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 17 Dec 2024 09:55:50 -0800 Subject: [PATCH 303/807] KVM: arm64: nv: Reload PMU events upon MDCR_EL2.HPME change MDCR_EL2.HPME is the 'global' enable bit for event counters reserved for EL2. Give the PMU a kick when it's changed to ensure events are reprogrammed before returning to the guest. 
Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241217175550.3658212-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 6ef8641d9833..634ff18a59a1 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2448,6 +2448,26 @@ static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu, return __el2_visibility(vcpu, rd, s1pie_visibility); } +static bool access_mdcr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 old = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!access_rw(vcpu, p, r)) + return false; + + /* + * Request a reload of the PMU to enable/disable the counters affected + * by HPME. + */ + if ((old ^ __vcpu_sys_reg(vcpu, MDCR_EL2)) & MDCR_EL2_HPME) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + + return true; +} + + /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -2981,7 +3001,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), EL2_REG_VNCR(HCR_EL2, reset_hcr, 0), - EL2_REG(MDCR_EL2, access_rw, reset_val, 0), + EL2_REG(MDCR_EL2, access_mdcr, reset_val, 0), EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), EL2_REG_VNCR(HSTR_EL2, reset_val, 0), EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0), From e96d8b80afd3f63ffad58c0fdd5e0c380c4c404e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 17 Dec 2024 09:56:11 -0800 Subject: [PATCH 304/807] KVM: arm64: Only apply PMCR_EL0.P to the guest range of counters An important distinction from other registers affected by HPMN is that PMCR_EL0 only affects the guest range of counters, regardless of the EL from which it is accessed. 
Ensure that PMCR_EL0.P is always applied to 'guest' counters by manually computing the mask rather than deriving it from the current context. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241217175611.3658290-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index c6423782a8aa..6c5950b9ceac 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -617,8 +617,14 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { - unsigned long mask = kvm_pmu_accessible_counter_mask(vcpu); - mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); + /* + * Unlike other PMU sysregs, the controls in PMCR_EL0 always apply + * to the 'guest' range of counters and never the 'hyp' range. + */ + unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu) & + ~kvm_pmu_hyp_counter_mask(vcpu) & + ~BIT(ARMV8_PMU_CYCLE_IDX); + for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } From cf2c97423a4f89c8b798294d3f34ecfe7e7035c3 Mon Sep 17 00:00:00 2001 From: David Laight Date: Sat, 14 Dec 2024 17:30:53 +0000 Subject: [PATCH 305/807] ipvs: Fix clamp() of ip_vs_conn_tab on small memory systems The 'max_avail' value is calculated from the system memory size using order_base_2(). order_base_2(x) is defined as '(x) ? fn(x) : 0'. The compiler generates two copies of the code that follows and then expands clamp(max, min, PAGE_SHIFT - 12) (11 on 32bit). This triggers a compile-time assert since min is 5. In reality a system would have to have less than 512MB memory for the bounds passed to clamp to be reversed. Swap the order of the arguments to clamp() to avoid the warning. Replace the clamp_val() on the line below with clamp(). clamp_val() is just 'an accident waiting to happen' and not needed here. 
Detected by compile time checks added to clamp(), specifically: minmax.h: use BUILD_BUG_ON_MSG() for the lo < hi test in clamp() Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/all/CA+G9fYsT34UkGFKxus63H6UVpYi5GRZkezT9MRLfAbM3f6ke0g@mail.gmail.com/ Fixes: 4f325e26277b ("ipvs: dynamically limit the connection hash table") Tested-by: Bartosz Golaszewski Reviewed-by: Bartosz Golaszewski Signed-off-by: David Laight Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 98d7dbe3d787..c0289f83f96d 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1495,8 +1495,8 @@ int __init ip_vs_conn_init(void) max_avail -= 2; /* ~4 in hash row */ max_avail -= 1; /* IPVS up to 1/2 of mem */ max_avail -= order_base_2(sizeof(struct ip_vs_conn)); - max = clamp(max, min, max_avail); - ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max); + max = clamp(max_avail, min, max); + ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; From 70b6f46a4ed8bd56c85ffff22df91e20e8c85e33 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 17 Dec 2024 20:56:55 +0100 Subject: [PATCH 306/807] netfilter: ipset: Fix for recursive locking warning With CONFIG_PROVE_LOCKING, when creating a set of type bitmap:ip, adding it to a set of type list:set and populating it from iptables SET target triggers a kernel warning: | WARNING: possible recursive locking detected | 6.12.0-rc7-01692-g5e9a28f41134-dirty #594 Not tainted | -------------------------------------------- | ping/4018 is trying to acquire lock: | ffff8881094a6848 (&set->lock){+.-.}-{2:2}, at: ip_set_add+0x28c/0x360 [ip_set] | | but task is already holding lock: | ffff88811034c048 
(&set->lock){+.-.}-{2:2}, at: ip_set_add+0x28c/0x360 [ip_set] This is a false alarm: ipset does not allow nested list:set type, so the loop in list_set_kadd() can never encounter the outer set itself. No other set type supports embedded sets, so this is the only case to consider. To avoid the false report, create a distinct lock class for list:set type ipset locks. Fixes: f830837f0eed ("netfilter: ipset: list:set set type support") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_list_set.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index bfae7066936b..db794fe1300e 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -611,6 +611,8 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) return true; } +static struct lock_class_key list_set_lockdep_key; + static int list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) @@ -627,6 +629,7 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; + lockdep_set_class(&set->lock, &list_set_lockdep_key); set->variant = &set_variant; set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), __alignof__(struct set_elem)); From b3ded6072c5600704cfa3ce3a8dc8718d34bda66 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 16 Nov 2024 21:36:47 +0100 Subject: [PATCH 307/807] power: supply: bq24190: Fix BQ24296 Vbus regulator support There are 2 issues with bq24296_set_otg_vbus(): 1. When writing the OTG_CONFIG bit it uses POC_CHG_CONFIG_SHIFT which should be POC_OTG_CONFIG_SHIFT. 2. When turning the regulator off it never turns charging back on. Note this must be done through bq24190_charger_set_charge_type(), to ensure that the charge_type property value of none/trickle/fast is honored. 
Resolve both issues to fix BQ24296 Vbus regulator support not working. Fixes: b150a703b56f ("power: supply: bq24190_charger: Add support for BQ24296") Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20241116203648.169100-2-hdegoede@redhat.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq24190_charger.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c index 2b393eb5c282..c47f32f152e6 100644 --- a/drivers/power/supply/bq24190_charger.c +++ b/drivers/power/supply/bq24190_charger.c @@ -567,6 +567,7 @@ static int bq24190_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) static int bq24296_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) { + union power_supply_propval val = { .intval = bdi->charge_type }; int ret; ret = pm_runtime_resume_and_get(bdi->dev); @@ -587,13 +588,18 @@ static int bq24296_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) ret = bq24190_write_mask(bdi, BQ24190_REG_POC, BQ24296_REG_POC_OTG_CONFIG_MASK, - BQ24296_REG_POC_CHG_CONFIG_SHIFT, + BQ24296_REG_POC_OTG_CONFIG_SHIFT, BQ24296_REG_POC_OTG_CONFIG_OTG); - } else + } else { ret = bq24190_write_mask(bdi, BQ24190_REG_POC, BQ24296_REG_POC_OTG_CONFIG_MASK, - BQ24296_REG_POC_CHG_CONFIG_SHIFT, + BQ24296_REG_POC_OTG_CONFIG_SHIFT, BQ24296_REG_POC_OTG_CONFIG_DISABLE); + if (ret < 0) + goto out; + + ret = bq24190_charger_set_charge_type(bdi, &val); + } out: pm_runtime_mark_last_busy(bdi->dev); From cff865c700711ecc3824b2dfe181637f3ed23c80 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Dec 2024 09:10:34 +0100 Subject: [PATCH 308/807] net: phy: avoid undefined behavior in *_led_polarity_set() gcc runs into undefined behavior at the end of the three led_polarity_set() callback functions if it were called with a zero 'modes' argument and it just ends the function there without returning from it. 
This gets flagged by 'objtool' as a function that continues on to the next one: drivers/net/phy/aquantia/aquantia_leds.o: warning: objtool: aqr_phy_led_polarity_set+0xf: can't find jump dest instruction at .text+0x5d9 drivers/net/phy/intel-xway.o: warning: objtool: xway_gphy_led_polarity_set() falls through to next function xway_gphy_config_init() drivers/net/phy/mxl-gpy.o: warning: objtool: gpy_led_polarity_set() falls through to next function gpy_led_hw_control_get() There is no point to micro-optimize the behavior here to save a single-digit number of bytes in the kernel, so just change this to a "return -EINVAL" as we do when any unexpected bits are set. Fixes: 1758af47b98c ("net: phy: intel-xway: add support for PHY LEDs") Fixes: 9d55e68b19f2 ("net: phy: aquantia: correctly describe LED polarity override") Fixes: eb89c79c1b8f ("net: phy: mxl-gpy: correctly describe LED polarity") Signed-off-by: Arnd Bergmann Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241217081056.238792-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/phy/aquantia/aquantia_leds.c | 2 +- drivers/net/phy/intel-xway.c | 2 +- drivers/net/phy/mxl-gpy.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/aquantia/aquantia_leds.c b/drivers/net/phy/aquantia/aquantia_leds.c index 00ad2313fed3..951f46104eff 100644 --- a/drivers/net/phy/aquantia/aquantia_leds.c +++ b/drivers/net/phy/aquantia/aquantia_leds.c @@ -156,5 +156,5 @@ int aqr_phy_led_polarity_set(struct phy_device *phydev, int index, unsigned long if (force_active_high || force_active_low) return aqr_phy_led_active_low_set(phydev, index, force_active_low); - unreachable(); + return -EINVAL; } diff --git a/drivers/net/phy/intel-xway.c b/drivers/net/phy/intel-xway.c index b672c55a7a4e..e6ed2413e514 100644 --- a/drivers/net/phy/intel-xway.c +++ b/drivers/net/phy/intel-xway.c @@ -529,7 +529,7 @@ static int xway_gphy_led_polarity_set(struct phy_device *phydev, int index, if 
(force_active_high) return phy_clear_bits(phydev, XWAY_MDIO_LED, XWAY_GPHY_LED_INV(index)); - unreachable(); + return -EINVAL; } static struct phy_driver xway_gphy[] = { diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c index db3c1f72b407..a8ccf257c109 100644 --- a/drivers/net/phy/mxl-gpy.c +++ b/drivers/net/phy/mxl-gpy.c @@ -1014,7 +1014,7 @@ static int gpy_led_polarity_set(struct phy_device *phydev, int index, if (force_active_high) return phy_clear_bits(phydev, PHY_LED, PHY_LED_POLARITY(index)); - unreachable(); + return -EINVAL; } static struct phy_driver gpy_drivers[] = { From 12d908116f7efd34f255a482b9afc729d7a5fb78 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 18 Dec 2024 17:56:25 +0100 Subject: [PATCH 309/807] io_uring: Fix registered ring file refcount leak Currently, io_uring_unreg_ringfd() (which cleans up registered rings) is only called on exit, but __io_uring_free (which frees the tctx in which the registered ring pointers are stored) is also called on execve (via begin_new_exec -> io_uring_task_cancel -> __io_uring_cancel -> io_uring_cancel_generic -> __io_uring_free). This means: A process going through execve while having registered rings will leak references to the rings' `struct file`. Fix it by zapping registered rings on execve(). This is implemented by moving the io_uring_unreg_ringfd() from io_uring_files_cancel() into its callee __io_uring_cancel(), which is called from io_uring_task_cancel() on execve. This could probably be exploited *on 32-bit kernels* by leaking 2^32 references to the same ring, because the file refcount is stored in a pointer-sized field and get_file() doesn't have protection against refcount overflow, just a WARN_ONCE(); but on 64-bit it should have no impact beyond a memory leak. 
Cc: stable@vger.kernel.org Fixes: e7a6c00dc77a ("io_uring: add support for registering ring file descriptors") Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20241218-uring-reg-ring-cleanup-v1-1-8f63e999045b@google.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 4 +--- io_uring/io_uring.c | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index e123d5e17b52..85fe4e6b275c 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,10 +15,8 @@ bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { - if (current->io_uring) { - io_uring_unreg_ringfd(); + if (current->io_uring) __io_uring_cancel(false); - } } static inline void io_uring_task_cancel(void) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 605625e932eb..432b95ca9c85 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3214,6 +3214,7 @@ end_wait: void __io_uring_cancel(bool cancel_all) { + io_uring_unreg_ringfd(); io_uring_cancel_generic(cancel_all, NULL); } From 5c964c8a97c12145104f5d2782aa1ffccf3a93dd Mon Sep 17 00:00:00 2001 From: Martin Hou Date: Mon, 16 Dec 2024 11:06:18 +0800 Subject: [PATCH 310/807] net: usb: qmi_wwan: add Quectel RG255C Add support for Quectel RG255C which is based on Qualcomm SDX35 chip. The composition is DM / NMEA / AT / QMI. T: Bus=01 Lev=01 Prnt=01 Port=04 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=0316 Rev= 5.15 S: Manufacturer=Quectel S: Product=RG255C-CN S: SerialNumber=c68192c1 C:* #Ifs= 4 Cfg#= 1 Atr=a0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=00 Prot=00 Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=86(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Martin Hou Link: https://patch.msgid.link/tencent_17DDD787B48E8A5AB8379ED69E23A0CD9309@qq.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 9fe7f704a2f7..e9208a8d2bfa 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1429,6 +1429,7 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_SET_DTR(0x2c7c, 0x0195, 4)}, /* Quectel EG95 */ {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)}, /* Quectel BG96 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x030e, 4)}, /* Quectel EM05GV2 */ + {QMI_QUIRK_SET_DTR(0x2c7c, 0x0316, 3)}, /* Quectel RG255C */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0104, 4)}, /* Fibocom NL678 series */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0112, 0)}, /* Fibocom FG132 */ {QMI_FIXED_INTF(0x0489, 0xe0b4, 0)}, /* Foxconn T77W968 LTE */ From dbf8be8218e7ff2ee2bbeebc91bf0e0c58a8c60b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 8 Nov 2024 13:57:06 +0000 Subject: [PATCH 311/807] docs/mm: add VMA locks documentation Locking around VMAs is complicated and confusing. While we have a number of disparate comments scattered around the place, we seem to be reaching a level of complexity that justifies a serious effort at clearly documenting how locks are expected to be used when it comes to interacting with mm_struct and vm_area_struct objects. 
This is especially pertinent as regards the efforts to find sensible abstractions for these fundamental objects in kernel rust code whose compiler strictly requires some means of expressing these rules (and through this expression, self-document these requirements as well as enforce them). The document limits scope to mmap and VMA locks and those that are immediately adjacent and relevant to them - so additionally covers page table locking as this is so very closely tied to VMA operations (and relies upon us handling these correctly). The document tries to cover some of the nastier and more confusing edge cases and concerns especially around lock ordering and page table teardown. The document is split between generally useful information for users of mm interfaces, and separately a section intended for mm kernel developers providing a discussion around internal implementation details. [lorenzo.stoakes@oracle.com: v3] Link: https://lkml.kernel.org/r/20241114205402.859737-1-lorenzo.stoakes@oracle.com [lorenzo.stoakes@oracle.com: docs/mm: minor corrections] Link: https://lkml.kernel.org/r/d3de735a-25ae-4eb2-866c-a9624fe6f795@lucifer.local [jannh@google.com: docs/mm: add more warnings around page table access] Link: https://lkml.kernel.org/r/20241118-vma-docs-addition1-onv3-v2-1-c9d5395b72ee@google.com Link: https://lkml.kernel.org/r/20241108135708.48567-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Qi Zheng Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Bagas Sanjaya Reviewed-by: Jann Horn Cc: Alice Ryhl Cc: Boqun Feng Cc: Hillf Danton Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Matthew Wilcox Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/process_addrs.rst | 850 +++++++++++++++++++++++++++++ 1 file changed, 850 insertions(+) diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst index e8618fbc62c9..1d416658d7f5 100644 --- a/Documentation/mm/process_addrs.rst +++ b/Documentation/mm/process_addrs.rst @@ -3,3 +3,853 @@ ================= Process Addresses ================= + +.. toctree:: + :maxdepth: 3 + + +Userland memory ranges are tracked by the kernel via Virtual Memory Areas or +'VMA's of type :c:struct:`!struct vm_area_struct`. + +Each VMA describes a virtually contiguous memory range with identical +attributes, each described by a :c:struct:`!struct vm_area_struct` +object. Userland access outside of VMAs is invalid except in the case where an +adjacent stack VMA could be extended to contain the accessed address. + +All VMAs are contained within one and only one virtual address space, described +by a :c:struct:`!struct mm_struct` object which is referenced by all tasks (that is, +threads) which share the virtual address space. We refer to this as the +:c:struct:`!mm`. + +Each mm object contains a maple tree data structure which describes all VMAs +within the virtual address space. + +.. note:: An exception to this is the 'gate' VMA which is provided by + architectures which use :c:struct:`!vsyscall` and is a global static + object which does not belong to any specific mm. + +------- +Locking +------- + +The kernel is designed to be highly scalable against concurrent read operations +on VMA **metadata** so a complicated set of locks are required to ensure memory +corruption does not occur. + +.. note:: Locking VMAs for their metadata does not have any impact on the memory + they describe nor the page tables that map them. 
+ +Terminology +----------- + +* **mmap locks** - Each MM has a read/write semaphore :c:member:`!mmap_lock` + which locks at a process address space granularity which can be acquired via + :c:func:`!mmap_read_lock`, :c:func:`!mmap_write_lock` and variants. +* **VMA locks** - The VMA lock is at VMA granularity (of course) which behaves + as a read/write semaphore in practice. A VMA read lock is obtained via + :c:func:`!lock_vma_under_rcu` (and unlocked via :c:func:`!vma_end_read`) and a + write lock via :c:func:`!vma_start_write` (all VMA write locks are unlocked + automatically when the mmap write lock is released). To take a VMA write lock + you **must** have already acquired an :c:func:`!mmap_write_lock`. +* **rmap locks** - When trying to access VMAs through the reverse mapping via a + :c:struct:`!struct address_space` or :c:struct:`!struct anon_vma` object + (reachable from a folio via :c:member:`!folio->mapping`), VMAs must be stabilised via + :c:func:`!anon_vma_[try]lock_read` or :c:func:`!anon_vma_[try]lock_write` for + anonymous memory and :c:func:`!i_mmap_[try]lock_read` or + :c:func:`!i_mmap_[try]lock_write` for file-backed memory. We refer to these + locks as the reverse mapping locks, or 'rmap locks' for brevity. + +We discuss page table locks separately in the dedicated section below. + +The first thing **any** of these locks achieve is to **stabilise** the VMA +within the MM tree. That is, guaranteeing that the VMA object will not be +deleted from under you nor modified (except for some specific fields +described below). + +Stabilising a VMA also keeps the address space described by it around.
+ +Lock usage +---------- + +If you want to **read** VMA metadata fields or just keep the VMA stable, you +must do one of the following: + +* Obtain an mmap read lock at the MM granularity via :c:func:`!mmap_read_lock` (or a + suitable variant), unlocking it with a matching :c:func:`!mmap_read_unlock` when + you're done with the VMA, *or* +* Try to obtain a VMA read lock via :c:func:`!lock_vma_under_rcu`. This tries to + acquire the lock atomically so might fail, in which case fall-back logic is + required to instead obtain an mmap read lock if this returns :c:macro:`!NULL`, + *or* +* Acquire an rmap lock before traversing the locked interval tree (whether + anonymous or file-backed) to obtain the required VMA. + +If you want to **write** VMA metadata fields, then things vary depending on the +field (we explore each VMA field in detail below). For the majority you must: + +* Obtain an mmap write lock at the MM granularity via :c:func:`!mmap_write_lock` (or a + suitable variant), unlocking it with a matching :c:func:`!mmap_write_unlock` when + you're done with the VMA, *and* +* Obtain a VMA write lock via :c:func:`!vma_start_write` for each VMA you wish to + modify, which will be released automatically when :c:func:`!mmap_write_unlock` is + called. +* If you want to be able to write to **any** field, you must also hide the VMA + from the reverse mapping by obtaining an **rmap write lock**. + +VMA locks are special in that you must obtain an mmap **write** lock **first** +in order to obtain a VMA **write** lock. A VMA **read** lock however can be +obtained without any other lock (:c:func:`!lock_vma_under_rcu` will acquire then +release an RCU lock to lookup the VMA for you). + +This constrains the impact of writers on readers, as a writer can interact with +one VMA while a reader interacts with another simultaneously. + +.. 
note:: The primary users of VMA read locks are page fault handlers, which + means that without a VMA write lock, page faults will run concurrent with + whatever you are doing. + +Examining all valid lock states: + +.. table:: + + ========= ======== ========= ======= ===== =========== ========== + mmap lock VMA lock rmap lock Stable? Read? Write most? Write all? + ========= ======== ========= ======= ===== =========== ========== + \- \- \- N N N N + \- R \- Y Y N N + \- \- R/W Y Y N N + R/W \-/R \-/R/W Y Y N N + W W \-/R Y Y Y N + W W W Y Y Y Y + ========= ======== ========= ======= ===== =========== ========== + +.. warning:: While it's possible to obtain a VMA lock while holding an mmap read lock, + attempting to do the reverse is invalid as it can result in deadlock - if + another task already holds an mmap write lock and attempts to acquire a VMA + write lock that will deadlock on the VMA read lock. + +All of these locks behave as read/write semaphores in practice, so you can +obtain either a read or a write lock for each of these. + +.. note:: Generally speaking, a read/write semaphore is a class of lock which + permits concurrent readers. However a write lock can only be obtained + once all readers have left the critical region (and pending readers + made to wait). + + This renders read locks on a read/write semaphore concurrent with other + readers and write locks exclusive against all others holding the semaphore. + +VMA fields +^^^^^^^^^^ + +We can subdivide :c:struct:`!struct vm_area_struct` fields by their purpose, which makes it +easier to explore their locking characteristics: + +.. note:: We exclude VMA lock-specific fields here to avoid confusion, as these + are in effect an internal implementation detail. + +.. 
table:: Virtual layout fields + + ===================== ======================================== =========== + Field Description Write lock + ===================== ======================================== =========== + :c:member:`!vm_start` Inclusive start virtual address of range mmap write, + VMA describes. VMA write, + rmap write. + :c:member:`!vm_end` Exclusive end virtual address of range mmap write, + VMA describes. VMA write, + rmap write. + :c:member:`!vm_pgoff` Describes the page offset into the file, mmap write, + the original page offset within the VMA write, + virtual address space (prior to any rmap write. + :c:func:`!mremap`), or PFN if a PFN map + and the architecture does not support + :c:macro:`!CONFIG_ARCH_HAS_PTE_SPECIAL`. + ===================== ======================================== =========== + +These fields describe the size, start and end of the VMA, and as such cannot be +modified without first being hidden from the reverse mapping since these fields +are used to locate VMAs within the reverse mapping interval trees. + +.. table:: Core fields + + ============================ ======================================== ========================= + Field Description Write lock + ============================ ======================================== ========================= + :c:member:`!vm_mm` Containing mm_struct. None - written once on + initial map. + :c:member:`!vm_page_prot` Architecture-specific page table mmap write, VMA write. + protection bits determined from VMA + flags. + :c:member:`!vm_flags` Read-only access to VMA flags describing N/A + attributes of the VMA, in union with + private writable + :c:member:`!__vm_flags`. + :c:member:`!__vm_flags` Private, writable access to VMA flags mmap write, VMA write. + field, updated by + :c:func:`!vm_flags_*` functions. + :c:member:`!vm_file` If the VMA is file-backed, points to a None - written once on + struct file object describing the initial map.
+ underlying file, if anonymous then + :c:macro:`!NULL`. + :c:member:`!vm_ops` If the VMA is file-backed, then either None - Written once on + the driver or file-system provides a initial map by + :c:struct:`!struct vm_operations_struct` :c:func:`!f_ops->mmap()`. + object describing callbacks to be + invoked on VMA lifetime events. + :c:member:`!vm_private_data` A :c:member:`!void *` field for Handled by driver. + driver-specific metadata. + ============================ ======================================== ========================= + +These are the core fields which describe the MM the VMA belongs to and its attributes. + +.. table:: Config-specific fields + + ================================= ===================== ======================================== =============== + Field Configuration option Description Write lock + ================================= ===================== ======================================== =============== + :c:member:`!anon_name` CONFIG_ANON_VMA_NAME A field for storing a mmap write, + :c:struct:`!struct anon_vma_name` VMA write. + object providing a name for anonymous + mappings, or :c:macro:`!NULL` if none + is set or the VMA is file-backed. The + underlying object is reference counted + and can be shared across multiple VMAs + for scalability. + :c:member:`!swap_readahead_info` CONFIG_SWAP Metadata used by the swap mechanism mmap read, + to perform readahead. This field is swap-specific + accessed atomically. lock. + :c:member:`!vm_policy` CONFIG_NUMA :c:type:`!mempolicy` object which mmap write, + describes the NUMA behaviour of the VMA write. + VMA. The underlying object is reference + counted. + :c:member:`!numab_state` CONFIG_NUMA_BALANCING :c:type:`!vma_numab_state` object which mmap read, + describes the current state of numab-specific + NUMA balancing in relation to this VMA. lock. + Updated under mmap read lock by + :c:func:`!task_numa_work`. 
+ :c:member:`!vm_userfaultfd_ctx` CONFIG_USERFAULTFD Userfaultfd context wrapper object of mmap write, + type :c:type:`!vm_userfaultfd_ctx`, VMA write. + either of zero size if userfaultfd is + disabled, or containing a pointer + to an underlying + :c:type:`!userfaultfd_ctx` object which + describes userfaultfd metadata. + ================================= ===================== ======================================== =============== + +These fields are present or not depending on whether the relevant kernel +configuration option is set. + +.. table:: Reverse mapping fields + + =================================== ========================================= ============================ + Field Description Write lock + =================================== ========================================= ============================ + :c:member:`!shared.rb` A red/black tree node used, if the mmap write, VMA write, + mapping is file-backed, to place the VMA i_mmap write. + in the + :c:member:`!struct address_space->i_mmap` + red/black interval tree. + :c:member:`!shared.rb_subtree_last` Metadata used for management of the mmap write, VMA write, + interval tree if the VMA is file-backed. i_mmap write. + :c:member:`!anon_vma_chain` List of pointers to both forked/CoW’d mmap read, anon_vma write. + :c:type:`!anon_vma` objects and + :c:member:`!vma->anon_vma` if it is + non-:c:macro:`!NULL`. + :c:member:`!anon_vma` :c:type:`!anon_vma` object used by When :c:macro:`NULL` and + anonymous folios mapped exclusively to setting non-:c:macro:`NULL`: + this VMA. Initially set by mmap read, page_table_lock. + :c:func:`!anon_vma_prepare` serialised + by the :c:macro:`!page_table_lock`. This When non-:c:macro:`NULL` and + is set as soon as any page is faulted in. setting :c:macro:`NULL`: + mmap write, VMA write, + anon_vma write. 
+ =================================== ========================================= ============================ + +These fields are used to both place the VMA within the reverse mapping, and for +anonymous mappings, to be able to access both related :c:struct:`!struct anon_vma` objects +and the :c:struct:`!struct anon_vma` in which folios mapped exclusively to this VMA should +reside. + +.. note:: If a file-backed mapping is mapped with :c:macro:`!MAP_PRIVATE` set + then it can be in both the :c:type:`!anon_vma` and :c:type:`!i_mmap` + trees at the same time, so all of these fields might be utilised at + once. + +Page tables +----------- + +We won't speak exhaustively on the subject but broadly speaking, page tables map +virtual addresses to physical ones through a series of page tables, each of +which contain entries with physical addresses for the next page table level +(along with flags), and at the leaf level the physical addresses of the +underlying physical data pages or a special entry such as a swap entry, +migration entry or other special marker. Offsets into these pages are provided +by the virtual address itself. + +In Linux these are divided into five levels - PGD, P4D, PUD, PMD and PTE. Huge +pages might eliminate one or two of these levels, but when this is the case we +typically refer to the leaf level as the PTE level regardless. + +.. note:: In instances where the architecture supports fewer page tables than + five the kernel cleverly 'folds' page table levels, that is stubbing + out functions related to the skipped levels. This allows us to + conceptually act as if there were always five levels, even if the + compiler might, in practice, eliminate any code relating to missing + ones. + +There are four key operations typically performed on page tables: + +1. **Traversing** page tables - Simply reading page tables in order to traverse + them. 
This only requires that the VMA is kept stable, so a lock which + establishes this suffices for traversal (there are also lockless variants + which eliminate even this requirement, such as :c:func:`!gup_fast`). +2. **Installing** page table mappings - Whether creating a new mapping or + modifying an existing one in such a way as to change its identity. This + requires that the VMA is kept stable via an mmap or VMA lock (explicitly not + rmap locks). +3. **Zapping/unmapping** page table entries - This is what the kernel calls + clearing page table mappings at the leaf level only, whilst leaving all page + tables in place. This is a very common operation in the kernel performed on + file truncation, the :c:macro:`!MADV_DONTNEED` operation via + :c:func:`!madvise`, and others. This is performed by a number of functions + including :c:func:`!unmap_mapping_range` and :c:func:`!unmap_mapping_pages`. + The VMA need only be kept stable for this operation. +4. **Freeing** page tables - When finally the kernel removes page tables from a + userland process (typically via :c:func:`!free_pgtables`) extreme care must + be taken to ensure this is done safely, as this logic finally frees all page + tables in the specified range, ignoring existing leaf entries (it assumes the + caller has both zapped the range and prevented any further faults or + modifications within it). + +.. note:: Modifying mappings for reclaim or migration is performed under rmap + lock as it, like zapping, does not fundamentally modify the identity + of what is being mapped. + +**Traversing** and **zapping** ranges can be performed holding any one of the +locks described in the terminology section above - that is the mmap lock, the +VMA lock or either of the reverse mapping locks. 
+ +That is - as long as you keep the relevant VMA **stable** - you are good to go +ahead and perform these operations on page tables (though internally, kernel +operations that perform writes also acquire internal page table locks to +serialise - see the page table implementation detail section for more details). + +When **installing** page table entries, the mmap or VMA lock must be held to +keep the VMA stable. We explore why this is in the page table locking details +section below. + +.. warning:: Page tables are normally only traversed in regions covered by VMAs. + If you want to traverse page tables in areas that might not be + covered by VMAs, heavier locking is required. + See :c:func:`!walk_page_range_novma` for details. + +**Freeing** page tables is an entirely internal memory management operation and +has special requirements (see the page freeing section below for more details). + +.. warning:: When **freeing** page tables, it must not be possible for VMAs + containing the ranges those page tables map to be accessible via + the reverse mapping. + + The :c:func:`!free_pgtables` function removes the relevant VMAs + from the reverse mappings, but no other VMAs can be permitted to be + accessible and span the specified range. + +Lock ordering +------------- + +As we have multiple locks across the kernel which may or may not be taken at the +same time as explicit mm or VMA locks, we have to be wary of lock inversion, and +the **order** in which locks are acquired and released becomes very important. + +.. note:: Lock inversion occurs when two threads need to acquire multiple locks, + but in doing so inadvertently cause a mutual deadlock. + + For example, consider thread 1 which holds lock A and tries to acquire lock B, + while thread 2 holds lock B and tries to acquire lock A. + + Both threads are now deadlocked on each other. 
However, had they attempted to + acquire locks in the same order, one would have waited for the other to + complete its work and no deadlock would have occurred. + +The opening comment in :c:macro:`!mm/rmap.c` describes in detail the required +ordering of locks within memory management code: + +.. code-block:: + + inode->i_rwsem (while writing or truncating, not reading or faulting) + mm->mmap_lock + mapping->invalidate_lock (in filemap_fault) + folio_lock + hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) + vma_start_write + mapping->i_mmap_rwsem + anon_vma->rwsem + mm->page_table_lock or pte_lock + swap_lock (in swap_duplicate, swap_info_get) + mmlist_lock (in mmput, drain_mmlist and others) + mapping->private_lock (in block_dirty_folio) + i_pages lock (widely used) + lruvec->lru_lock (in folio_lruvec_lock_irq) + inode->i_lock (in set_page_dirty's __mark_inode_dirty) + bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) + sb_lock (within inode_lock in fs/fs-writeback.c) + i_pages lock (widely used, in set_page_dirty, + in arch-dependent flush_dcache_mmap_lock, + within bdi.wb->list_lock in __sync_single_inode) + +There is also a file-system specific lock ordering comment located at the top of +:c:macro:`!mm/filemap.c`: + +.. 
code-block:: + + ->i_mmap_rwsem (truncate_pagecache) + ->private_lock (__free_pte->block_dirty_folio) + ->swap_lock (exclusive_swap_page, others) + ->i_pages lock + + ->i_rwsem + ->invalidate_lock (acquired by fs in truncate path) + ->i_mmap_rwsem (truncate->unmap_mapping_range) + + ->mmap_lock + ->i_mmap_rwsem + ->page_table_lock or pte_lock (various, mainly in memory.c) + ->i_pages lock (arch-dependent flush_dcache_mmap_lock) + + ->mmap_lock + ->invalidate_lock (filemap_fault) + ->lock_page (filemap_fault, access_process_vm) + + ->i_rwsem (generic_perform_write) + ->mmap_lock (fault_in_readable->do_page_fault) + + bdi->wb.list_lock + sb_lock (fs/fs-writeback.c) + ->i_pages lock (__sync_single_inode) + + ->i_mmap_rwsem + ->anon_vma.lock (vma_merge) + + ->anon_vma.lock + ->page_table_lock or pte_lock (anon_vma_prepare and various) + + ->page_table_lock or pte_lock + ->swap_lock (try_to_unmap_one) + ->private_lock (try_to_unmap_one) + ->i_pages lock (try_to_unmap_one) + ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) + ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) + ->private_lock (folio_remove_rmap_pte->set_page_dirty) + ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) + bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) + ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) + bdi.wb->list_lock (zap_pte_range->set_page_dirty) + ->inode->i_lock (zap_pte_range->set_page_dirty) + ->private_lock (zap_pte_range->block_dirty_folio) + +Please check the current state of these comments which may have changed since +the time of writing of this document. + +------------------------------ +Locking Implementation Details +------------------------------ + +.. warning:: Locking rules for PTE-level page tables are very different from + locking rules for page tables at other levels. 
+ +Page table locking details +-------------------------- + +In addition to the locks described in the terminology section above, we have +additional locks dedicated to page tables: + +* **Higher level page table locks** - Higher level page tables, that is PGD, P4D + and PUD each make use of the process address space granularity + :c:member:`!mm->page_table_lock` lock when modified. + +* **Fine-grained page table locks** - PMDs and PTEs each have fine-grained locks + either kept within the folios describing the page tables or allocated + separately and pointed at by the folios if :c:macro:`!ALLOC_SPLIT_PTLOCKS` is + set. The PMD spin lock is obtained via :c:func:`!pmd_lock`, however PTEs are + mapped into higher memory (if a 32-bit system) and carefully locked via + :c:func:`!pte_offset_map_lock`. + +These locks represent the minimum required to interact with each page table +level, but there are further requirements. + +Importantly, note that on a **traversal** of page tables, sometimes no such +locks are taken. However, at the PTE level, at least concurrent page table +deletion must be prevented (using RCU) and the page table must be mapped into +high memory, see below. + +Whether care is taken on reading the page table entries depends on the +architecture, see the section on atomicity below. + +Locking rules +^^^^^^^^^^^^^ + +We establish basic locking rules when interacting with page tables: + +* When changing a page table entry the page table lock for that page table + **must** be held, except if you can safely assume nobody can access the page + tables concurrently (such as on invocation of :c:func:`!free_pgtables`). +* Reads from and writes to page table entries must be *appropriately* + atomic. See the section on atomicity below for details. +* Populating previously empty entries requires that the mmap or VMA locks are + held (read or write), doing so with only rmap locks would be dangerous (see + the warning below).
+* As mentioned previously, zapping can be performed while simply keeping the VMA + stable, that is holding any one of the mmap, VMA or rmap locks. + +.. warning:: Populating previously empty entries is dangerous as, when unmapping + VMAs, :c:func:`!vms_clear_ptes` has a window of time between + zapping (via :c:func:`!unmap_vmas`) and freeing page tables (via + :c:func:`!free_pgtables`), where the VMA is still visible in the + rmap tree. :c:func:`!free_pgtables` assumes that the zap has + already been performed and removes PTEs unconditionally (along with + all other page tables in the freed range), so installing new PTE + entries could leak memory and also cause other unexpected and + dangerous behaviour. + +There are additional rules applicable when moving page tables, which we discuss +in the section on this topic below. + +PTE-level page tables are different from page tables at other levels, and there +are extra requirements for accessing them: + +* On 32-bit architectures, they may be in high memory (meaning they need to be + mapped into kernel memory to be accessible). +* When empty, they can be unlinked and RCU-freed while holding an mmap lock or + rmap lock for reading in combination with the PTE and PMD page table locks. + In particular, this happens in :c:func:`!retract_page_tables` when handling + :c:macro:`!MADV_COLLAPSE`. + So accessing PTE-level page tables requires at least holding an RCU read lock; + but that only suffices for readers that can tolerate racing with concurrent + page table updates such that an empty PTE is observed (in a page table that + has actually already been detached and marked for RCU freeing) while another + new page table has been installed in the same location and filled with + entries. Writers normally need to take the PTE lock and revalidate that the + PMD entry still refers to the same PTE-level page table. 
+ +To access PTE-level page tables, a helper like :c:func:`!pte_offset_map_lock` or +:c:func:`!pte_offset_map` can be used depending on stability requirements. +These map the page table into kernel memory if required, take the RCU lock, and +depending on variant, may also look up or acquire the PTE lock. +See the comment on :c:func:`!__pte_offset_map_lock`. + +Atomicity +^^^^^^^^^ + +Regardless of page table locks, the MMU hardware concurrently updates accessed +and dirty bits (perhaps more, depending on architecture). Additionally, page +table traversal operations in parallel (though holding the VMA stable) and +functionality like GUP-fast locklessly traverses (that is reads) page tables, +without even keeping the VMA stable at all. + +When performing a page table traversal and keeping the VMA stable, whether a +read must be performed once and only once or not depends on the architecture +(for instance x86-64 does not require any special precautions). + +If a write is being performed, or if a read informs whether a write takes place +(on an installation of a page table entry say, for instance in +:c:func:`!__pud_install`), special care must always be taken. In these cases we +can never assume that page table locks give us entirely exclusive access, and +must retrieve page table entries once and only once. + +If we are reading page table entries, then we need only ensure that the compiler +does not rearrange our loads. This is achieved via :c:func:`!pXXp_get` +functions - :c:func:`!pgdp_get`, :c:func:`!p4dp_get`, :c:func:`!pudp_get`, +:c:func:`!pmdp_get`, and :c:func:`!ptep_get`. + +Each of these uses :c:func:`!READ_ONCE` to guarantee that the compiler reads +the page table entry only once. + +However, if we wish to manipulate an existing page table entry and care about +the previously stored data, we must go further and use a hardware atomic +operation as, for example, in :c:func:`!ptep_get_and_clear`.
+ +Equally, operations that do not rely on the VMA being held stable, such as +GUP-fast (see :c:func:`!gup_fast` and its various page table level handlers like +:c:func:`!gup_fast_pte_range`), must very carefully interact with page table +entries, using functions such as :c:func:`!ptep_get_lockless` and equivalent for +higher level page table levels. + +Writes to page table entries must also be appropriately atomic, as established +by :c:func:`!set_pXX` functions - :c:func:`!set_pgd`, :c:func:`!set_p4d`, +:c:func:`!set_pud`, :c:func:`!set_pmd`, and :c:func:`!set_pte`. + +Equally functions which clear page table entries must be appropriately atomic, +as in :c:func:`!pXX_clear` functions - :c:func:`!pgd_clear`, +:c:func:`!p4d_clear`, :c:func:`!pud_clear`, :c:func:`!pmd_clear`, and +:c:func:`!pte_clear`. + +Page table installation +^^^^^^^^^^^^^^^^^^^^^^^ + +Page table installation is performed with the VMA held stable explicitly by an +mmap or VMA lock in read or write mode (see the warning in the locking rules +section for details as to why). + +When allocating a P4D, PUD or PMD and setting the relevant entry in the above +PGD, P4D or PUD, the :c:member:`!mm->page_table_lock` must be held. This is +acquired in :c:func:`!__p4d_alloc`, :c:func:`!__pud_alloc` and +:c:func:`!__pmd_alloc` respectively. + +.. note:: :c:func:`!__pmd_alloc` actually invokes :c:func:`!pud_lock` and + :c:func:`!pud_lockptr` in turn, however at the time of writing it ultimately + references the :c:member:`!mm->page_table_lock`. + +Allocating a PTE will either use the :c:member:`!mm->page_table_lock` or, if +:c:macro:`!USE_SPLIT_PMD_PTLOCKS` is defined, a lock embedded in the PMD +physical page metadata in the form of a :c:struct:`!struct ptdesc`, acquired by +:c:func:`!pmd_ptdesc` called from :c:func:`!pmd_lock` and ultimately +:c:func:`!__pte_alloc`. 
+ +Finally, modifying the contents of the PTE requires special treatment, as the +PTE page table lock must be acquired whenever we want stable and exclusive +access to entries contained within a PTE, especially when we wish to modify +them. + +This is performed via :c:func:`!pte_offset_map_lock` which carefully checks to +ensure that the PTE hasn't changed from under us, ultimately invoking +:c:func:`!pte_lockptr` to obtain a spin lock at PTE granularity contained within +the :c:struct:`!struct ptdesc` associated with the physical PTE page. The lock +must be released via :c:func:`!pte_unmap_unlock`. + +.. note:: There are some variants on this, such as + :c:func:`!pte_offset_map_rw_nolock` when we know we hold the PTE stable but + for brevity we do not explore this. See the comment for + :c:func:`!__pte_offset_map_lock` for more details. + +When modifying data in ranges we typically only wish to allocate higher page +tables as necessary, using these locks to avoid races or overwriting anything, +and set/clear data at the PTE level as required (for instance when page faulting +or zapping). + +A typical pattern taken when traversing page table entries to install a new +mapping is to optimistically determine whether the page table entry in the table +above is empty, if so, only then acquiring the page table lock and checking +again to see if it was allocated underneath us. + +This allows for a traversal with page table locks only being taken when +required. An example of this is :c:func:`!__pud_alloc`. + +At the leaf page table, that is the PTE, we can't entirely rely on this pattern +as we have separate PMD and PTE locks and a THP collapse for instance might have +eliminated the PMD entry as well as the PTE from under us. + +This is why :c:func:`!__pte_offset_map_lock` locklessly retrieves the PMD entry +for the PTE, carefully checking it is as expected, before acquiring the +PTE-specific lock, and then *again* checking that the PMD entry is as expected. 
+ +If a THP collapse (or similar) were to occur then the lock on both pages would +be acquired, so we can ensure this is prevented while the PTE lock is held. + +Installing entries this way ensures mutual exclusion on write. + +Page table freeing +^^^^^^^^^^^^^^^^^^ + +Tearing down page tables themselves is something that requires significant +care. There must be no way that page tables designated for removal can be +traversed or referenced by concurrent tasks. + +It is insufficient to simply hold an mmap write lock and VMA lock (which will +prevent racing faults, and rmap operations), as a file-backed mapping can be +truncated under the :c:struct:`!struct address_space->i_mmap_rwsem` alone. + +As a result, no VMA which can be accessed via the reverse mapping (either +through the :c:struct:`!struct anon_vma->rb_root` or the :c:member:`!struct +address_space->i_mmap` interval trees) can have its page tables torn down. + +The operation is typically performed via :c:func:`!free_pgtables`, which assumes +either the mmap write lock has been taken (as specified by its +:c:member:`!mm_wr_locked` parameter), or that the VMA is already unreachable. + +It carefully removes the VMA from all reverse mappings, however it's important +that no new ones overlap these or any route remain to permit access to addresses +within the range whose page tables are being torn down. + +Additionally, it assumes that a zap has already been performed and steps have +been taken to ensure that no further page table entries can be installed between +the zap and the invocation of :c:func:`!free_pgtables`. + +Since it is assumed that all such steps have been taken, page table entries are +cleared without page table locks (in the :c:func:`!pgd_clear`, :c:func:`!p4d_clear`, +:c:func:`!pud_clear`, and :c:func:`!pmd_clear` functions). + +..
note:: It is possible for leaf page tables to be torn down independent of + the page tables above it as is done by + :c:func:`!retract_page_tables`, which is performed under the i_mmap + read lock, PMD, and PTE page table locks, without this level of care. + +Page table moving +^^^^^^^^^^^^^^^^^ + +Some functions manipulate page table levels above PMD (that is PUD, P4D and PGD +page tables). Most notable of these is :c:func:`!mremap`, which is capable of +moving higher level page tables. + +In these instances, it is required that **all** locks are taken, that is +the mmap lock, the VMA lock and the relevant rmap locks. + +You can observe this in the :c:func:`!mremap` implementation in the functions +:c:func:`!take_rmap_locks` and :c:func:`!drop_rmap_locks` which perform the rmap +side of lock acquisition, invoked ultimately by :c:func:`!move_page_tables`. + +VMA lock internals +------------------ + +Overview +^^^^^^^^ + +VMA read locking is entirely optimistic - if the lock is contended or a competing +write has started, then we do not obtain a read lock. + +A VMA **read** lock is obtained by :c:func:`!lock_vma_under_rcu`, which first +calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU +critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`, +before releasing the RCU lock via :c:func:`!rcu_read_unlock`. + +VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for +their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it +via :c:func:`!vma_end_read`. + +VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a +VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always +acquired. An mmap write lock **must** be held for the duration of the VMA write +lock, releasing or downgrading the mmap write lock also releases the VMA write +lock so there is no :c:func:`!vma_end_write` function. 
+ +Note that a semaphore write lock is not held across a VMA lock. Rather, a +sequence number is used for serialisation, and the write semaphore is only +acquired at the point of write lock to update this. + +This ensures the semantics we require - VMA write locks provide exclusive write +access to the VMA. + +Implementation details +^^^^^^^^^^^^^^^^^^^^^^ + +The VMA lock mechanism is designed to be a lightweight means of avoiding the use +of the heavily contended mmap lock. It is implemented using a combination of a +read/write semaphore and sequence numbers belonging to the containing +:c:struct:`!struct mm_struct` and the VMA. + +Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic +operation, i.e. it tries to acquire a read lock but returns false if it is +unable to do so. At the end of the read operation, :c:func:`!vma_end_read` is +called to release the VMA read lock. + +Invoking :c:func:`!vma_start_read` requires that :c:func:`!rcu_read_lock` has +been called first, establishing that we are in an RCU critical section upon VMA +read lock acquisition. Once acquired, the RCU lock can be released as it is only +required for lookup. This is abstracted by :c:func:`!lock_vma_under_rcu` which +is the interface a user should use. + +Writing requires the mmap to be write-locked and the VMA lock to be acquired via +:c:func:`!vma_start_write`, however the write lock is released by the termination or +downgrade of the mmap write lock so no :c:func:`!vma_end_write` is required. + +All this is achieved by the use of per-mm and per-VMA sequence counts, which are +used in order to reduce complexity, especially for operations which write-lock +multiple VMAs at once. + +If the mm sequence count, :c:member:`!mm->mm_lock_seq` is equal to the VMA +sequence count :c:member:`!vma->vm_lock_seq` then the VMA is write-locked. If +they differ, then it is not. 
+ +Each time the mmap write lock is released in :c:func:`!mmap_write_unlock` or +:c:func:`!mmap_write_downgrade`, :c:func:`!vma_end_write_all` is invoked which +also increments :c:member:`!mm->mm_lock_seq` via +:c:func:`!mm_lock_seqcount_end`. + +This way, we ensure that, regardless of the VMA's sequence number, a write lock +is never incorrectly indicated and that when we release an mmap write lock we +efficiently release **all** VMA write locks contained within the mmap at the +same time. + +Since the mmap write lock is exclusive against others who hold it, the automatic +release of any VMA locks on its release makes sense, as you would never want to +keep VMAs locked across entirely separate write operations. It also maintains +correct lock ordering. + +Each time a VMA read lock is acquired, we acquire a read lock on the +:c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that +the sequence count of the VMA does not match that of the mm. + +If it does, the read lock fails. If it does not, we hold the lock, excluding +writers, but permitting other readers, who will also obtain this lock under RCU. + +Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu` +are also RCU safe, so the whole read lock operation is guaranteed to function +correctly. + +On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock` +read/write semaphore, before setting the VMA's sequence number under this lock, +also simultaneously holding the mmap write lock. + +This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep +until these are finished and mutual exclusion is achieved. + +After setting the VMA's sequence number, the lock is released, avoiding +complexity with a long-term held write lock. 
+ +This clever combination of a read/write semaphore and sequence count allows for +fast RCU-based per-VMA lock acquisition (especially on page fault, though +utilised elsewhere) with minimal complexity around lock ordering. + +mmap write lock downgrading +--------------------------- + +When an mmap write lock is held one has exclusive access to resources within the +mmap (with the usual caveats about requiring VMA write locks to avoid races with +tasks holding VMA read locks). + +It is then possible to **downgrade** from a write lock to a read lock via +:c:func:`!mmap_write_downgrade` which, similar to :c:func:`!mmap_write_unlock`, +implicitly terminates all VMA write locks via :c:func:`!vma_end_write_all`, but +importantly does not relinquish the mmap lock while downgrading, therefore +keeping the locked virtual address space stable. + +An interesting consequence of this is that downgraded locks are exclusive +against any other task possessing a downgraded lock (since a racing task would +have to acquire a write lock first to downgrade it, and the downgraded lock +prevents a new write lock from being obtained until the original lock is +released). + +For clarity, we map read (R)/downgraded write (D)/write (W) locks against one +another showing which locks exclude the others: + +.. list-table:: Lock exclusivity + :widths: 5 5 5 5 + :header-rows: 1 + :stub-columns: 1 + + * - + - R + - D + - W + * - R + - N + - N + - Y + * - D + - N + - Y + - Y + * - W + - Y + - Y + - Y + +Here a Y indicates the locks in the matching row/column are mutually exclusive, +and N indicates that they are not. + +Stack expansion +--------------- + +Stack expansion throws up additional complexities in that we cannot permit there +to be racing page faults, as a result we invoke :c:func:`!vma_start_write` to +prevent this in :c:func:`!expand_downwards` or :c:func:`!expand_upwards`. From 6a75f19af16ff482cfd6085c77123aa0f464f8dd Mon Sep 17 00:00:00 2001 From: "Isaac J. 
Manjarres" Date: Thu, 5 Dec 2024 11:29:41 -0800 Subject: [PATCH 312/807] selftests/memfd: run sysctl tests when PID namespace support is enabled The sysctl tests for vm.memfd_noexec rely on the kernel to support PID namespaces (i.e. the kernel is built with CONFIG_PID_NS=y). If the kernel the test runs on does not support PID namespaces, the first sysctl test will fail when attempting to spawn a new thread in a new PID namespace, abort the test, preventing the remaining tests from being run. This is not desirable, as not all kernels need PID namespaces, but can still use the other features provided by memfd. Therefore, only run the sysctl tests if the kernel supports PID namespaces. Otherwise, skip those tests and emit an informative message to let the user know why the sysctl tests are not being run. Link: https://lkml.kernel.org/r/20241205192943.3228757-1-isaacmanjarres@google.com Fixes: 11f75a01448f ("selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC") Signed-off-by: Isaac J. Manjarres Reviewed-by: Jeff Xu Cc: Suren Baghdasaryan Cc: Kalesh Singh Cc: [6.6+] Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 95af2d78fd31..0a0b55516028 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1557,6 +1558,11 @@ static void test_share_fork(char *banner, char *b_suffix) close(fd); } +static bool pid_ns_supported(void) +{ + return access("/proc/self/ns/pid", F_OK) == 0; +} + int main(int argc, char **argv) { pid_t pid; @@ -1591,8 +1597,12 @@ int main(int argc, char **argv) test_seal_grow(); test_seal_resize(); - test_sysctl_simple(); - test_sysctl_nested(); + if (pid_ns_supported()) { + test_sysctl_simple(); + test_sysctl_nested(); + } 
else { + printf("PID namespaces are not supported; skipping sysctl tests\n"); + } test_share_dup("SHARE-DUP", ""); test_share_mmap("SHARE-MMAP", ""); From da5bd7fa789ae212ac18ebc3ac52b7f2ce1781da Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 5 Dec 2024 20:42:01 +0800 Subject: [PATCH 313/807] mailmap: add entry for Ying Huang Map my old company email to my personal email. Link: https://lkml.kernel.org/r/20241205124201.529308-1-huang.ying.caritas@gmail.com Signed-off-by: "Huang, Ying" Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 5ff0e5d681e7..7efe43237ca8 100644 --- a/.mailmap +++ b/.mailmap @@ -735,6 +735,7 @@ Wolfram Sang Wolfram Sang Yakir Yang Yanteng Si +Ying Huang Yusuke Goda Zack Rusin Zhu Yanjun From 1a72d2ebeec51f10e5b0f0609c6754e92b11ee9d Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 5 Dec 2024 18:48:32 +0800 Subject: [PATCH 314/807] ocfs2: revert "ocfs2: fix the la space leak when unmounting an ocfs2 volume" Patch series "Revert ocfs2 commit dfe6c5692fb5 and provide a new fix". SUSE QA team detected a mistake in my commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume"). I am very sorry for my error. (If my eyes are correct) From the mailling list mails, this patch shouldn't be applied to 4.19 5.4 5.10 5.15 6.1 6.6, and these branches should perform a revert operation. Reason for revert: In commit dfe6c5692fb5, I mistakenly wrote: "This bug has existed since the initial OCFS2 code.". The statement is wrong. The correct introduction commit is 30dd3478c3cd. IOW, if the branch doesn't include 30dd3478c3cd, dfe6c5692fb5 should also not be included. This reverts commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume"). In commit dfe6c5692fb5, the commit log "This bug has existed since the initial OCFS2 code." is wrong. The correct introduction commit is 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()"). 
The influence of commit dfe6c5692fb5 is that it provides a correct fix for the latest kernel. however, it shouldn't be pushed to stable branches. Let's use this commit to revert all branches that include dfe6c5692fb5 and use a new fix method to fix commit 30dd3478c3cd. Link: https://lkml.kernel.org/r/20241205104835.18223-1-heming.zhao@suse.com Link: https://lkml.kernel.org/r/20241205104835.18223-2-heming.zhao@suse.com Fixes: dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume") Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/localalloc.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 8ac42ea81a17..5df34561c551 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -1002,25 +1002,6 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, start = bit_off + 1; } - /* clear the contiguous bits until the end boundary */ - if (count) { - blkno = la_start_blk + - ocfs2_clusters_to_blocks(osb->sb, - start - count); - - trace_ocfs2_sync_local_to_main_free( - count, start - count, - (unsigned long long)la_start_blk, - (unsigned long long)blkno); - - status = ocfs2_release_clusters(handle, - main_bm_inode, - main_bm_bh, blkno, - count); - if (status < 0) - mlog_errno(status); - } - bail: if (status) mlog_errno(status); From 7782e3b3b004e8cb94a88621a22cc3c2f33e5b90 Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 5 Dec 2024 18:48:33 +0800 Subject: [PATCH 315/807] ocfs2: fix the space leak in LA when releasing LA Commit 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()") introduced an issue, the ocfs2_sync_local_to_main() ignores the last contiguous free bits, which causes an OCFS2 volume to lose the last free clusters of LA window during the release routine. 
Please note, because commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume") was reverted, this commit is a replacement fix for commit dfe6c5692fb5. Link: https://lkml.kernel.org/r/20241205104835.18223-3-heming.zhao@suse.com Fixes: 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()") Signed-off-by: Heming Zhao Suggested-by: Joseph Qi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/localalloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 5df34561c551..d1aa04a5af1b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -971,9 +971,9 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, start = count = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) < - left) { - if (bit_off == start) { + while (1) { + bit_off = ocfs2_find_next_zero_bit(bitmap, left, start); + if ((bit_off < left) && (bit_off == start)) { count++; start++; continue; @@ -998,6 +998,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, } } + if (bit_off >= left) + break; count = 1; start = bit_off + 1; } From dad2dc9c92e0f93f33cebcb0595b8daa3d57473f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 4 Dec 2024 22:50:06 -0800 Subject: [PATCH 316/807] mm: shmem: fix ShmemHugePages at swapout /proc/meminfo ShmemHugePages has been showing overlarge amounts (more than Shmem) after swapping out THPs: we forgot to update NR_SHMEM_THPS. Add shmem_update_stats(), to avoid repetition, and risk of making that mistake again: the call from shmem_delete_from_page_cache() is the bugfix; the call from shmem_replace_folio() is reassuring, but not really a bugfix (replace corrects misplaced swapin readahead, but huge swapin readahead would be a mistake). 
Link: https://lkml.kernel.org/r/5ba477c8-a569-70b5-923e-09ab221af45b@google.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Hugh Dickins Reviewed-by: Shakeel Butt Reviewed-by: Yosry Ahmed Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ccb9629a0f70..f6fb053ac50d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -787,6 +787,14 @@ static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static void shmem_update_stats(struct folio *folio, int nr_pages) +{ + if (folio_test_pmd_mappable(folio)) + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); +} + /* * Somewhat like filemap_add_folio, but error if expected item has gone. 
*/ @@ -821,10 +829,7 @@ static int shmem_add_to_page_cache(struct folio *folio, xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); + shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); @@ -852,8 +857,7 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); folio_put_refs(folio, nr); BUG_ON(error); @@ -1969,10 +1973,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } if (!error) { mem_cgroup_replace_folio(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages); - __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages); - __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); } xa_unlock_irq(&swap_mapping->i_pages); From 8aca2bc96c833ba695ede7a45ad7784c836a262e Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 28 Oct 2024 22:56:55 +0800 Subject: [PATCH 317/807] mm: use aligned address in clear_gigantic_page() In current kernel, hugetlb_no_page() calls folio_zero_user() with the fault address. Where the fault address may be not aligned with the huge page size. Then, folio_zero_user() may call clear_gigantic_page() with the address, while clear_gigantic_page() requires the address to be huge page size aligned. So, this may cause memory corruption or information leak, addtional, use more obvious naming 'addr_hint' instead of 'addr' for clear_gigantic_page(). 
Link: https://lkml.kernel.org/r/20241028145656.932941-1-wangkefeng.wang@huawei.com Fixes: 78fefd04c123 ("mm: memory: convert clear_huge_page() to folio_zero_user()") Signed-off-by: Kefeng Wang Reviewed-by: "Huang, Ying" Reviewed-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- mm/memory.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 90f883d6b8fd..fc1ae5132127 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -825,7 +825,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, error = PTR_ERR(folio); goto out; } - folio_zero_user(folio, ALIGN_DOWN(addr, hpage_size)); + folio_zero_user(folio, addr); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { diff --git a/mm/memory.c b/mm/memory.c index 75c2dfd04f72..84864387f965 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6815,9 +6815,10 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct folio *folio, unsigned long addr, +static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, unsigned int nr_pages) { + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); int i; might_sleep(); From f5d09de9f1bf9674c6418ff10d0a40cfe29268e1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 28 Oct 2024 22:56:56 +0800 Subject: [PATCH 318/807] mm: use aligned address in copy_user_gigantic_page() In current kernel, hugetlb_wp() calls copy_user_large_folio() with the fault address. Where the fault address may be not aligned with the huge page size. Then, copy_user_large_folio() may call copy_user_gigantic_page() with the address, while copy_user_gigantic_page() requires the address to be huge page size aligned. 
So, this may cause memory corruption or information leak. Additionally, use more obvious naming 'addr_hint' instead of 'addr' for copy_user_gigantic_page(). Link: https://lkml.kernel.org/r/20241028145656.932941-2-wangkefeng.wang@huawei.com Fixes: 530dd9926dc1 ("mm: memory: improve copy_user_large_folio()") Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Cc: Huang Ying Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 5 ++--- mm/memory.c | 5 +++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ea2ed8e301ef..cec4b121193f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5340,7 +5340,7 @@ again: break; } ret = copy_user_large_folio(new_folio, pte_folio, - ALIGN_DOWN(addr, sz), dst_vma); + addr, dst_vma); folio_put(pte_folio); if (ret) { folio_put(new_folio); @@ -6643,8 +6643,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, *foliop = NULL; goto out; } - ret = copy_user_large_folio(folio, *foliop, - ALIGN_DOWN(dst_addr, size), dst_vma); + ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); folio_put(*foliop); *foliop = NULL; if (ret) { diff --git a/mm/memory.c b/mm/memory.c index 84864387f965..209885a4134f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6852,13 +6852,14 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint) } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, - unsigned long addr, + unsigned long addr_hint, struct vm_area_struct *vma, unsigned int nr_pages) { - int i; + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst)); struct page *dst_page; struct page *src_page; + int i; for (i = 0; i < nr_pages; i++) { dst_page = folio_page(dst, i); From 42c4e4b20d9c4651903c4afc53a4ff18b7451b3e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 6 Dec 2024 21:52:29 +0000 Subject: [PATCH 319/807] mm: correctly reference merged VMA On second merge attempt on mmap() we incorrectly discard the possibly
merged VMA, resulting in a possible use-after-free (and most certainly a reference to the wrong VMA) in this instance in the subsequent __mmap_complete() invocation. Correct this mistake by reassigning vma correctly if a merge succeeds in this case. Link: https://lkml.kernel.org/r/20241206215229.244413-1-lorenzo.stoakes@oracle.com Fixes: 5ac87a885aec ("mm: defer second attempt at merge on mmap()") Signed-off-by: Lorenzo Stoakes Suggested-by: Jann Horn Reported-by: syzbot+91cf8da9401355f946c3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67536a25.050a0220.a30f1.0149.GAE@google.com/ Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vma.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/vma.c b/mm/vma.c index 8e31b7e25aeb..bb2119e5a0d0 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2460,10 +2460,13 @@ unsigned long __mmap_region(struct file *file, unsigned long addr, /* If flags changed, we might be able to merge, so try again. */ if (map.retry_merge) { + struct vm_area_struct *merged; VMG_MMAP_STATE(vmg, &map, vma); vma_iter_config(map.vmi, map.addr, map.end); - vma_merge_existing_range(&vmg); + merged = vma_merge_existing_range(&vmg); + if (merged) + vma = merged; } __mmap_complete(&map, vma); From 31c5629920b82ddf66059f20f79be2bc00c4197b Mon Sep 17 00:00:00 2001 From: Petr Malat Date: Tue, 10 Dec 2024 01:06:04 +0100 Subject: [PATCH 320/807] mm: add RCU annotation to pte_offset_map(_lock) RCU lock is taken by ___pte_offset_map() unless it returns NULL. Add this information to its inline callers to avoid sparse warning about context imbalance in pte_unmap(). 
Link: https://lkml.kernel.org/r/20241210000604.700710-1-oss@malat.biz Signed-off-by: Petr Malat Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- mm/pgtable-generic.c | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c..3a6ee6a05aa0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3010,7 +3010,15 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, + pmd_t *pmdvalp) +{ + pte_t *pte; + + __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); + return pte; +} static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); @@ -3023,7 +3031,8 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; - __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)); + __cond_lock(RCU, __cond_lock(*ptlp, + pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); return pte; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5297dcc38c37..5a882f2b10f9 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -279,7 +279,7 @@ static unsigned long pmdp_get_lockless_start(void) { return 0; } static void pmdp_get_lockless_end(unsigned long irqflags) { } #endif -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { unsigned long irqflags; pmd_t pmdval; From 5c0541e11c16bd2f162e23a22d07c09d58017e5a Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:25 -0500 Subject: [PATCH 321/807] mm: introduce cpu_icache_is_aliasing() across all architectures In commit eacd0e950dc2 ("ARC: [mm] Lazy 
D-cache flush (non aliasing VIPT)"), arc adds the need to flush dcache to make icache see the code page change. This also requires special handling for clear_user_(high)page(). Introduce cpu_icache_is_aliasing() to make MM code query special clear_user_(high)page() easier. This will be used by the following commit. Link: https://lkml.kernel.org/r/20241209182326.2955963-1-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Suggested-by: Mathieu Desnoyers Reviewed-by: Mathieu Desnoyers Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: Geert Uytterhoeven Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- arch/arc/Kconfig | 1 + arch/arc/include/asm/cachetype.h | 8 ++++++++ include/linux/cacheinfo.h | 6 ++++++ 3 files changed, 15 insertions(+) create mode 100644 arch/arc/include/asm/cachetype.h diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index ea5a1dcb133b..4f2eeda907ec 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,6 +6,7 @@ config ARC def_bool y select ARC_TIMERS + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT diff --git a/arch/arc/include/asm/cachetype.h b/arch/arc/include/asm/cachetype.h new file mode 100644 index 000000000000..acd3b6cb4bf5 --- /dev/null +++ b/arch/arc/include/asm/cachetype.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ARC_CACHETYPE_H +#define __ASM_ARC_CACHETYPE_H + +#define cpu_dcache_is_aliasing() false +#define cpu_icache_is_aliasing() true + +#endif diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 108060612bb8..7ad736538649 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -155,8 +155,14 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) #ifndef 
CONFIG_ARCH_HAS_CPU_CACHE_ALIASING #define cpu_dcache_is_aliasing() false +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() #else #include + +#ifndef cpu_icache_is_aliasing +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() +#endif + #endif #endif /* _LINUX_CACHEINFO_H */ From c51a4f11e6d8246590b5e64908c1ed84b33e8ba2 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:26 -0500 Subject: [PATCH 322/807] mm: use clear_user_(high)page() for arch with special user folio handling Some architectures have special handling after clearing user folios: architectures, which set cpu_dcache_is_aliasing() to true, require flushing dcache; arc, which sets cpu_icache_is_aliasing() to true, changes folio->flags to make icache coherent to dcache. So __GFP_ZERO using only clear_page() is not enough to zero user folios and clear_user_(high)page() must be used. Otherwise, user data will be corrupted. Fix it by always clearing user folios with clear_user_(high)page() when cpu_dcache_is_aliasing() is true or cpu_icache_is_aliasing() is true. Rename alloc_zeroed() to user_alloc_needs_zeroing() and invert the logic to clarify its intent.
Link: https://lkml.kernel.org/r/20241209182326.2955963-2-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/linux-mm/CAMuHMdV1hRp_NtR5YnJo=HsfgKQeH91J537Gh4gKk3PFZhSkbA@mail.gmail.com/ Tested-by: Geert Uytterhoeven Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- include/linux/highmem.h | 8 +++++++- include/linux/mm.h | 18 ++++++++++++++++++ mm/huge_memory.c | 9 +++++---- mm/internal.h | 6 ------ mm/memory.c | 10 +++++----- 5 files changed, 35 insertions(+), 16 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6e452bd8e7e3..5c6bea81a90e 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -224,7 +224,13 @@ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr); + struct folio *folio; + + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); + if (folio && user_alloc_needs_zeroing()) + clear_user_highpage(&folio->page, vaddr); + + return folio; } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a6ee6a05aa0..338a76ce9083 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -31,6 +31,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -4184,6 +4185,23 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif +/* + * user_alloc_needs_zeroing checks if a user folio from page allocator needs to + * be zeroed or not. 
+ */ +static inline bool user_alloc_needs_zeroing(void) +{ + /* + * for user folios, arch with cache aliasing requires cache flush and + * arc changes folio->flags to make icache coherent with dcache, so + * always return true to make caller use + * clear_user_page()/clear_user_highpage(). + */ + return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || + !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc); +} + int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ee335d96fc39..9bb351caa619 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1176,11 +1176,12 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, folio_throttle_swaprate(folio, gfp); /* - * When a folio is not zeroed during allocation (__GFP_ZERO not used), - * folio_zero_user() is used to make sure that the page corresponding - * to the faulting address will be hot in the cache after zeroing. + * When a folio is not zeroed during allocation (__GFP_ZERO not used) + * or user folios require special handling, folio_zero_user() is used to + * make sure that the page corresponding to the faulting address will be + * hot in the cache after zeroing.
*/ - if (!alloc_zeroed()) + if (user_alloc_needs_zeroing()) folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that diff --git a/mm/internal.h b/mm/internal.h index cb8d8e8e3ffa..3bd08bafad04 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1285,12 +1285,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr, void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); -static inline bool alloc_zeroed(void) -{ - return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, - &init_on_alloc); -} - /* * Parses a string with mem suffixes into its order. Useful to parse kernel * parameters. diff --git a/mm/memory.c b/mm/memory.c index 209885a4134f..398c031be9ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4733,12 +4733,12 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation - * (__GFP_ZERO not used), folio_zero_user() is used - * to make sure that the page corresponding to the - * faulting address will be hot in the cache after - * zeroing. + * (__GFP_ZERO not used) or user folios require special + * handling, folio_zero_user() is used to make sure + * that the page corresponding to the faulting address + * will be hot in the cache after zeroing. */ - if (!alloc_zeroed()) + if (user_alloc_needs_zeroing()) folio_zero_user(folio, vmf->address); return folio; } From be48c412f6ebf38849213c19547bc6d5b692b5e5 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 10 Dec 2024 00:57:15 +0800 Subject: [PATCH 323/807] zram: refuse to use zero sized block device as backing device Patch series "zram: fix backing device setup issue", v2. This series fixes two bugs of backing device setting: - ZRAM should reject using a zero sized (or the uninitialized ZRAM device itself) as the backing device. - Fix backing device leaking when removing a uninitialized ZRAM device. 
This patch (of 2): Setting a zero sized block device as backing device is pointless, and one can easily create a recursive loop by setting the uninitialized ZRAM device itself as its own backing device by (zram0 is uninitialized): echo /dev/zram0 > /sys/block/zram0/backing_dev It's definitely a wrong config, and the module will pin itself, kernel should refuse doing so in the first place. By refusing to use zero sized device we avoided misuse cases including this one above. Link: https://lkml.kernel.org/r/20241209165717.94215-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20241209165717.94215-2-ryncsn@gmail.com Fixes: 013bf95a83ec ("zram: add interface to specif backing device") Signed-off-by: Kairui Song Reported-by: Desheng Wu Reviewed-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3dee026988dc..e86cc3d2f4d2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -614,6 +614,12 @@ static ssize_t backing_dev_store(struct device *dev, } nr_pages = i_size_read(inode) >> PAGE_SHIFT; + /* Refuse to use zero sized device (also prevents self reference) */ + if (!nr_pages) { + err = -EINVAL; + goto out; + } + bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long); bitmap = kvzalloc(bitmap_sz, GFP_KERNEL); if (!bitmap) { From 74363ec674cb172d8856de25776c8f3103f05e2f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 10 Dec 2024 00:57:16 +0800 Subject: [PATCH 324/807] zram: fix uninitialized ZRAM not releasing backing device Setting backing device is done before ZRAM initialization. If we set the backing device, then remove the ZRAM module without initializing the device, the backing device reference will be leaked and the device will be held forever. Fix this by always resetting the ZRAM fully on rmmod or reset store.
Link: https://lkml.kernel.org/r/20241209165717.94215-3-ryncsn@gmail.com Fixes: 013bf95a83ec ("zram: add interface to specif backing device") Signed-off-by: Kairui Song Reported-by: Desheng Wu Suggested-by: Sergey Senozhatsky Reviewed-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e86cc3d2f4d2..45df5eeabc5e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1444,12 +1444,16 @@ static void zram_meta_free(struct zram *zram, u64 disksize) size_t num_pages = disksize >> PAGE_SHIFT; size_t index; + if (!zram->table) + return; + /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) zram_free_page(zram, index); zs_destroy_pool(zram->mem_pool); vfree(zram->table); + zram->table = NULL; } static bool zram_meta_alloc(struct zram *zram, u64 disksize) @@ -2326,11 +2330,6 @@ static void zram_reset_device(struct zram *zram) zram->limit_pages = 0; - if (!init_done(zram)) { - up_write(&zram->init_lock); - return; - } - set_capacity_and_notify(zram->disk, 0); part_stat_set_all(zram->disk->part0, 0); From 901ce9705fbb9f330ff1f19600e5daf9770b0175 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Mon, 9 Dec 2024 15:56:52 +0900 Subject: [PATCH 325/807] nilfs2: prevent use of deleted inode syzbot reported a WARNING in nilfs_rmdir. [1] Because the inode bitmap is corrupted, an inode with an inode number that should exist as a ".nilfs" file was reassigned by nilfs_mkdir for "file0", causing an inode duplication during execution. And this causes an underflow of i_nlink in rmdir operations. The inode is used twice by the same task to unmount and remove directories ".nilfs" and "file0", it trigger warning in nilfs_rmdir. 
To avoid this issue, check i_nlink in nilfs_iget(); if it is 0, it means that this inode has been deleted, and iput is executed to reclaim it. [1] WARNING: CPU: 1 PID: 5824 at fs/inode.c:407 drop_nlink+0xc4/0x110 fs/inode.c:407 ... Call Trace: nilfs_rmdir+0x1b0/0x250 fs/nilfs2/namei.c:342 vfs_rmdir+0x3a3/0x510 fs/namei.c:4394 do_rmdir+0x3b5/0x580 fs/namei.c:4453 __do_sys_rmdir fs/namei.c:4472 [inline] __se_sys_rmdir fs/namei.c:4470 [inline] __x64_sys_rmdir+0x47/0x50 fs/namei.c:4470 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Link: https://lkml.kernel.org/r/20241209065759.6781-1-konishi.ryusuke@gmail.com Fixes: d25006523d0b ("nilfs2: pathname operations") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+9260555647a5132edd48@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9260555647a5132edd48 Tested-by: syzbot+9260555647a5132edd48@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/inode.c | 8 +++++++- fs/nilfs2/namei.c | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index cf9ba481ae37..b7d4105f37bf 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -544,8 +544,14 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, inode = nilfs_iget_locked(sb, root, ino); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + + if (!(inode->i_state & I_NEW)) { + if (!inode->i_nlink) { + iput(inode); + return ERR_PTR(-ESTALE); + } return inode; + } err = __nilfs_read_inode(sb, root, ino, inode); if (unlikely(err)) { diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 9b108052d9f7..1d836a5540f3 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -67,6 +67,11 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) inode = NULL; } else { inode =
nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino); + if (inode == ERR_PTR(-ESTALE)) { + nilfs_error(dir->i_sb, + "deleted inode referenced: %lu", ino); + return ERR_PTR(-EIO); + } } return d_splice_alias(inode, dentry); From 8ac662f5da19f5873fdd94c48a5cdb45b2e1b58f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 10 Dec 2024 17:24:12 +0000 Subject: [PATCH 326/807] fork: avoid inappropriate uprobe access to invalid mm If dup_mmap() encounters an issue, currently uprobe is able to access the relevant mm via the reverse mapping (in build_map_info()), and if we are very unlucky with a race window, observe invalid XA_ZERO_ENTRY state which we establish as part of the fork error path. This occurs because uprobe_write_opcode() invokes anon_vma_prepare() which in turn invokes find_mergeable_anon_vma() that uses a VMA iterator, invoking vma_iter_load() which uses the advanced maple tree API and thus is able to observe XA_ZERO_ENTRY entries added to dup_mmap() in commit d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()"). This change was made on the assumption that only process tear-down code would actually observe (and make use of) these values. However this very unlikely but still possible edge case with uprobes exists and unfortunately does make these observable. The uprobe operation prevents races against the dup_mmap() operation via the dup_mmap_sem semaphore, which is acquired via uprobe_start_dup_mmap() and dropped via uprobe_end_dup_mmap(), and held across register_for_each_vma() prior to invoking build_map_info() which does the reverse mapping lookup. Currently these are acquired and dropped within dup_mmap(), which exposes the race window prior to error handling in the invoking dup_mm() which tears down the mm. We can avoid all this by just moving the invocation of uprobe_start_dup_mmap() and uprobe_end_dup_mmap() up a level to dup_mm() and only release this lock once the dup_mmap() operation succeeds or clean up is done. 
This means that the uprobe code can never observe an incompletely constructed mm and resolves the issue in this case. Link: https://lkml.kernel.org/r/20241210172412.52995-1-lorenzo.stoakes@oracle.com Fixes: d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") Signed-off-by: Lorenzo Stoakes Reported-by: syzbot+2d788f4f7cb660dac4b7@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6756d273.050a0220.2477f.003d.GAE@google.com/ Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peng Zhang Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: David Hildenbrand Signed-off-by: Andrew Morton --- kernel/fork.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 1450b461d196..9b301180fd41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -639,11 +639,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, 0); - uprobe_start_dup_mmap(); - if (mmap_write_lock_killable(oldmm)) { - retval = -EINTR; - goto fail_uprobe_end; - } + if (mmap_write_lock_killable(oldmm)) + return -EINTR; flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* @@ -782,8 +779,6 @@ out: dup_userfaultfd_complete(&uf); else dup_userfaultfd_fail(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: @@ -1692,9 +1687,11 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; + uprobe_start_dup_mmap(); err = dup_mmap(mm, oldmm); if (err) goto free_pt; + uprobe_end_dup_mmap(); mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; @@ -1709,6 +1706,8 @@ free_pt: mm->binfmt = NULL; mm_init_owner(mm, NULL); mmput(mm); + if (err) + uprobe_end_dup_mmap(); fail_nomem: return NULL; From 
faeec8e23c10bd30e8aa759a2eb3018dae00f924 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 10 Dec 2024 10:34:37 +0100 Subject: [PATCH 327/807] mm/page_alloc: don't call pfn_to_page() on possibly non-existent PFN in split_large_buddy() In split_large_buddy(), we might call pfn_to_page() on a PFN that might not exist. In corner cases, such as when freeing the highest pageblock in the last memory section, this could result with CONFIG_SPARSEMEM && !CONFIG_SPARSEMEM_EXTREME in __pfn_to_section() returning NULL and __section_mem_map_addr() dereferencing that NULL pointer. Let's fix it, and avoid doing a pfn_to_page() call for the first iteration, where we already have the page. So far this was found by code inspection, but let's just CC stable as the fix is easy. Link: https://lkml.kernel.org/r/20241210093437.174413-1-david@redhat.com Fixes: fd919a85cd55 ("mm: page_isolation: prepare for hygienic freelists") Signed-off-by: David Hildenbrand Reported-by: Vlastimil Babka Closes: https://lkml.kernel.org/r/e1a898ba-a717-4d20-9144-29df1a6c8813@suse.cz Reviewed-by: Vlastimil Babka Reviewed-by: Zi Yan Acked-by: Johannes Weiner Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1cb4b8c8886d..cae7b93864c2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1238,13 +1238,15 @@ static void split_large_buddy(struct zone *zone, struct page *page, if (order > pageblock_order) order = pageblock_order; - while (pfn != end) { + do { int mt = get_pfnblock_migratetype(page, pfn); __free_one_page(page, pfn, zone, order, mt, fpi); pfn += 1 << order; + if (pfn == end) + break; page = pfn_to_page(pfn); - } + } while (1); } static void free_one_page(struct zone *zone, struct page *page, From a2e740e216f5bf49ccb83b6d490c72a340558a43 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Dec 2024 20:25:37 +0000 Subject: [PATCH 328/807]
vmalloc: fix accounting with i915 If the caller of vmap() specifies VM_MAP_PUT_PAGES (currently only the i915 driver), we will decrement nr_vmalloc_pages and MEMCG_VMALLOC in vfree(). These counters are incremented by vmalloc() but not by vmap() so this will cause an underflow. Check the VM_MAP_PUT_PAGES flag before decrementing either counter. Link: https://lkml.kernel.org/r/20241211202538.168311-1-willy@infradead.org Fixes: b944afc9d64d ("mm: add a VM_MAP_PUT_PAGES flag for vmap") Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Balbir Singh Acked-by: Michal Hocko Cc: Christoph Hellwig Cc: Muchun Song Cc: Roman Gushchin Cc: "Uladzislau Rezki (Sony)" Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f009b21705c1..5c88d0e90c20 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3374,7 +3374,8 @@ void vfree(const void *addr) struct page *page = vm->pages[i]; BUG_ON(!page); - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + if (!(vm->flags & VM_MAP_PUT_PAGES)) + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations @@ -3382,7 +3383,8 @@ void vfree(const void *addr) __free_page(page); cond_resched(); } - atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); + if (!(vm->flags & VM_MAP_PUT_PAGES)) + atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); kvfree(vm->pages); kfree(vm); } From 6309b8ce98e9a18390b9fd8f03fc412f3c17aee9 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 13 Dec 2024 01:43:28 +0900 Subject: [PATCH 329/807] nilfs2: fix buffer head leaks in calls to truncate_inode_pages() When block_invalidatepage was converted to block_invalidate_folio, the fallback to block_invalidatepage in folio_invalidate() if the address_space_operations method invalidatepage (currently invalidate_folio) was not set, was 
removed. Unfortunately, some pseudo-inodes in nilfs2 use empty_aops set by inode_init_always_gfp() as is, or explicitly set it to address_space_operations. Therefore, with this change, block_invalidatepage() is no longer called from folio_invalidate(), and as a result, the buffer_head structures attached to these pages/folios are no longer freed via try_to_free_buffers(). Thus, these buffer heads are now leaked by truncate_inode_pages(), which cleans up the page cache from inode evict(), etc. Three types of caches use empty_aops: gc inode caches and the DAT shadow inode used by GC, and b-tree node caches. Of these, b-tree node caches explicitly call invalidate_mapping_pages() during cleanup, which involves calling try_to_free_buffers(), so the leak was not visible during normal operation but worsened when GC was performed. Fix this issue by using address_space_operations with invalidate_folio set to block_invalidate_folio instead of empty_aops, which will ensure the same behavior as before. 
Link: https://lkml.kernel.org/r/20241212164556.21338-1-konishi.ryusuke@gmail.com Fixes: 7ba13abbd31e ("fs: Turn block_invalidatepage into block_invalidate_folio") Signed-off-by: Ryusuke Konishi Cc: [5.18+] Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 1 + fs/nilfs2/gcinode.c | 2 +- fs/nilfs2/inode.c | 5 +++++ fs/nilfs2/nilfs.h | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 501ad7be5174..54a3fa0cf67e 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -35,6 +35,7 @@ void nilfs_init_btnc_inode(struct inode *btnc_inode) ii->i_flags = 0; memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap)); mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS); + btnc_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; } void nilfs_btnode_cache_clear(struct address_space *btnc) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index ace22253fed0..2dbb15767df1 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -163,7 +163,7 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - inode->i_mapping->a_ops = &empty_aops; + inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b7d4105f37bf..23f3a75edd50 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -276,6 +276,10 @@ const struct address_space_operations nilfs_aops = { .is_partially_uptodate = block_is_partially_uptodate, }; +const struct address_space_operations nilfs_buffer_cache_aops = { + .invalidate_folio = block_invalidate_folio, +}; + static int nilfs_insert_inode_locked(struct inode *inode, struct nilfs_root *root, unsigned long ino) @@ -681,6 +685,7 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode) NILFS_I(s_inode)->i_flags = 0; memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap)); mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS); + 
s_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; err = nilfs_attach_btree_node_cache(s_inode); if (unlikely(err)) { diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 45d03826eaf1..dff241c53fc5 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -401,6 +401,7 @@ extern const struct file_operations nilfs_dir_operations; extern const struct inode_operations nilfs_file_inode_operations; extern const struct file_operations nilfs_file_operations; extern const struct address_space_operations nilfs_aops; +extern const struct address_space_operations nilfs_buffer_cache_aops; extern const struct inode_operations nilfs_dir_inode_operations; extern const struct inode_operations nilfs_special_inode_operations; extern const struct inode_operations nilfs_symlink_inode_operations; From 42b2eb69835b0fda797f70eb5b4fc213dbe3a7ea Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 12 Dec 2024 18:33:51 +0000 Subject: [PATCH 330/807] mm: convert partially_mapped set/clear operations to be atomic Other page flags in the 2nd page, like PG_hwpoison and PG_anon_exclusive can get modified concurrently. Changes to other page flags might be lost if they are happening at the same time as non-atomic partially_mapped operations. Hence, make partially_mapped operations atomic. 
Link: https://lkml.kernel.org/r/20241212183351.1345389-1-usamaarif642@gmail.com Fixes: 8422acdc97ed ("mm: introduce a pageflag for partially mapped folios") Reported-by: David Hildenbrand Link: https://lore.kernel.org/all/e53b04ad-1827-43a2-a1ab-864c7efecf6e@redhat.com/ Signed-off-by: Usama Arif Acked-by: David Hildenbrand Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Barry Song Cc: Domenico Cerasuolo Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Nico Pache Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 ++---------- mm/huge_memory.c | 8 ++++---- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cf46ac720802..691506bdf2c5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -862,18 +862,10 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) -FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -/* - * PG_partially_mapped is protected by deferred_split split_queue_lock, - * so its safe to use non-atomic set/clear. 
- */ -__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) -FOLIO_TEST_FLAG_FALSE(partially_mapped) -__FOLIO_SET_FLAG_NOOP(partially_mapped) -__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) +FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9bb351caa619..df0c4988dd88 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3577,7 +3577,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } @@ -3689,7 +3689,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } @@ -3733,7 +3733,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { - __folio_set_partially_mapped(folio); + folio_set_partially_mapped(folio); if (folio_test_pmd_mappable(folio)) count_vm_event(THP_DEFERRED_SPLIT_PAGE); count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); @@ -3826,7 +3826,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } else { /* We lost race with folio_put() */ if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), 
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } From 30c2de0a267c04046d89e678cc0067a9cfb455df Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 12 Dec 2024 13:31:26 -0800 Subject: [PATCH 331/807] mm/vmstat: fix a W=1 clang compiler warning Fix the following clang compiler warning that is reported if the kernel is built with W=1: ./include/linux/vmstat.h:518:36: error: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Werror,-Wenum-enum-conversion] 518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" | ~~~~~~~~~~~ ^ ~~~ Link: https://lkml.kernel.org/r/20241212213126.1269116-1-bvanassche@acm.org Fixes: 9d7ea9a297e6 ("mm/vmstat: add helpers to get vmstat item names for each enum type") Signed-off-by: Bart Van Assche Cc: Konstantin Khlebnikov Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d2761bf8ff32..9f3a04345b86 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -515,7 +515,7 @@ static inline const char *node_stat_name(enum node_stat_item item) static inline const char *lru_list_name(enum lru_list lru) { - return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" + return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) From 640a603943a7659340c10044c0a1c98ae4e13189 Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> Date: Fri, 13 Dec 2024 09:33:32 +0800 Subject: [PATCH 332/807] mm/codetag: clear tags before swap When CONFIG_MEM_ALLOC_PROFILING_DEBUG is set, kernel WARN would be triggered when calling __alloc_tag_ref_set() during swap: alloc_tag was not cleared (got tag for mm/filemap.c:1951) WARNING: CPU: 0 PID: 816 at ./include/linux/alloc_tag.h... Clear code tags before swap can fix the warning. 
This patch also fixes a potential invalid address dereference in alloc_tag_add_check() when CONFIG_MEM_ALLOC_PROFILING_DEBUG is set and ref->ct is CODETAG_EMPTY, which is defined as ((void *)1). Link: https://lkml.kernel.org/r/20241213013332.89910-1-00107082@163.com Fixes: 51f43d5d82ed ("mm/codetag: swap tags when migrate pages") Signed-off-by: David Wang <00107082@163.com> Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412112227.df61ebb-lkp@intel.com Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 2 +- lib/alloc_tag.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 7c0786bdf9af..cba024bf2db3 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -135,7 +135,7 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) { - WARN_ONCE(ref && ref->ct, + WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref), "alloc_tag was not cleared (got tag for %s:%u)\n", ref->ct->filename, ref->ct->lineno); diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 35f7560a309a..3a0413462e9f 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -209,6 +209,13 @@ void pgalloc_tag_swap(struct folio *new, struct folio *old) return; } + /* + * Clear tag references to avoid debug warning when using + * __alloc_tag_ref_set() with non-empty reference. 
+ */ + set_codetag_empty(&ref_old); + set_codetag_empty(&ref_new); + /* swap tags */ __alloc_tag_ref_set(&ref_old, tag_new); update_page_tag_ref(handle_old, &ref_old); From e269b5d2916d7a696c2d2ed370cea95d95a0675a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 29 Nov 2024 16:14:22 -0800 Subject: [PATCH 333/807] alloc_tag: fix module allocation tags populated area calculation vm_module_tags_populate() calculation of the populated area assumes that area starts at a page boundary and therefore when new pages are allocated, the end of the area is page-aligned as well. If the start of the area is not page-aligned then allocating a page and incrementing the end of the area by PAGE_SIZE leads to an area at the end but within the area boundary which is not populated. Accessing this area will lead to a kernel panic. Fix the calculation by down-aligning the start of the area and using that as the location allocated pages are mapped to. [gehao@kylinos.cn: fix vm_module_tags_populate's KASAN poisoning logic] Link: https://lkml.kernel.org/r/20241205170528.81000-1-hao.ge@linux.dev [gehao@kylinos.cn: fix panic when CONFIG_KASAN enabled and CONFIG_KASAN_VMALLOC not enabled] Link: https://lkml.kernel.org/r/20241212072126.134572-1-hao.ge@linux.dev Link: https://lkml.kernel.org/r/20241130001423.1114965-1-surenb@google.com Fixes: 0f9b685626da ("alloc_tag: populate memory for module tags as needed") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202411132111.6a221562-lkp@intel.com Acked-by: Yu Zhao Tested-by: Adrian Huang Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Sourav Panda Cc: Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 3a0413462e9f..7dcebf118a3e 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ 
-408,28 +408,52 @@ repeat: static int vm_module_tags_populate(void) { - unsigned long phys_size = vm_module_tags->nr_pages << PAGE_SHIFT; + unsigned long phys_end = ALIGN_DOWN(module_tags.start_addr, PAGE_SIZE) + + (vm_module_tags->nr_pages << PAGE_SHIFT); + unsigned long new_end = module_tags.start_addr + module_tags.size; - if (phys_size < module_tags.size) { + if (phys_end < new_end) { struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages; - unsigned long addr = module_tags.start_addr + phys_size; + unsigned long old_shadow_end = ALIGN(phys_end, MODULE_ALIGN); + unsigned long new_shadow_end = ALIGN(new_end, MODULE_ALIGN); unsigned long more_pages; unsigned long nr; - more_pages = ALIGN(module_tags.size - phys_size, PAGE_SIZE) >> PAGE_SHIFT; + more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT; nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN, NUMA_NO_NODE, more_pages, next_page); if (nr < more_pages || - vmap_pages_range(addr, addr + (nr << PAGE_SHIFT), PAGE_KERNEL, + vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL, next_page, PAGE_SHIFT) < 0) { /* Clean up and error out */ for (int i = 0; i < nr; i++) __free_page(next_page[i]); return -ENOMEM; } + vm_module_tags->nr_pages += nr; + + /* + * Kasan allocates 1 byte of shadow for every 8 bytes of data. + * When kasan_alloc_module_shadow allocates shadow memory, + * its unit of allocation is a page. + * Therefore, here we need to align to MODULE_ALIGN. + */ + if (old_shadow_end < new_shadow_end) + kasan_alloc_module_shadow((void *)old_shadow_end, + new_shadow_end - old_shadow_end, + GFP_KERNEL); } + /* + * Mark the pages as accessible, now that they are mapped. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). 
+ */ + kasan_unpoison_vmalloc((void *)module_tags.start_addr, + new_end - module_tags.start_addr, + KASAN_VMALLOC_PROT_NORMAL); + return 0; } From 60da7445a142bd15e67f3cda915497781c3f781f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 29 Nov 2024 16:14:23 -0800 Subject: [PATCH 334/807] alloc_tag: fix set_codetag_empty() when !CONFIG_MEM_ALLOC_PROFILING_DEBUG It was recently noticed that set_codetag_empty() might be used not only to mark NULL alloctag references as empty to avoid warnings but also to reset valid tags (in clear_page_tag_ref()). Since set_codetag_empty() is defined as NOOP for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n, such use of set_codetag_empty() leads to subtle bugs. Fix set_codetag_empty() for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n to reset the tag reference. Link: https://lkml.kernel.org/r/20241130001423.1114965-2-surenb@google.com Fixes: a8fc28dad6d5 ("alloc_tag: introduce clear_page_tag_ref() helper function") Signed-off-by: Suren Baghdasaryan Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20241124074318.399027-1-00107082@163.com/ Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Sourav Panda Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index cba024bf2db3..0bbbe537c5f9 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -63,7 +63,12 @@ static inline void set_codetag_empty(union codetag_ref *ref) #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } -static inline void set_codetag_empty(union codetag_ref *ref) {} + +static inline void set_codetag_empty(union codetag_ref *ref) +{ + if (ref) + ref->ct = NULL; +} #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ From d3ac65d274b3a93cf9cf9559fd1473ab65e00e10 Mon Sep 
17 00:00:00 2001 From: Leo Stone Date: Sun, 15 Dec 2024 20:27:51 -0800 Subject: [PATCH 335/807] mm: huge_memory: handle strsep not finding delimiter split_huge_pages_write() does not handle the case where strsep finds no delimiter in the given string and sets the input buffer to NULL, which allows this reproducer to trigger a protection fault. Link: https://lkml.kernel.org/r/20241216042752.257090-2-leocstone@gmail.com Signed-off-by: Leo Stone Reported-by: syzbot+8a3da2f1bbf59227c289@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8a3da2f1bbf59227c289 Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index df0c4988dd88..e53d83b3e5cf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4169,7 +4169,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, size_t input_len = strlen(input_buf); tok = strsep(&buf, ","); - if (tok) { + if (tok && buf) { strscpy(file_path, tok); } else { ret = -EINVAL; From a17975992cc11588767175247ccaae1213a8b582 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 17 Dec 2024 22:16:51 +0100 Subject: [PATCH 336/807] selftests: openvswitch: fix tcpdump execution Fix the way tcpdump is executed by: - Using the right variable for the namespace. Currently the use of the empty "ns" makes the command fail. - Waiting until it starts to capture to ensure the interesting traffic is caught on slow systems. - Using line-buffered output to ensure logs are available when the test is paused with "-p". Otherwise the last chunk of data might only be written when tcpdump is killed. 
Fixes: 74cc26f416b9 ("selftests: openvswitch: add interface support") Signed-off-by: Adrian Moreno Acked-by: Eelco Chaudron Link: https://patch.msgid.link/20241217211652.483016-1-amorenoz@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/openvswitch/openvswitch.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index cc0bfae2bafa..960e1ab4dd04 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -171,8 +171,10 @@ ovs_add_netns_and_veths () { ovs_add_if "$1" "$2" "$4" -u || return 1 fi - [ $TRACING -eq 1 ] && ovs_netns_spawn_daemon "$1" "$ns" \ - tcpdump -i any -s 65535 + if [ $TRACING -eq 1 ]; then + ovs_netns_spawn_daemon "$1" "$3" tcpdump -l -i any -s 6553 + ovs_wait grep -q "listening on any" ${ovs_dir}/stderr + fi return 0 } From 16f027cd40eeedd2325f7e720689462ca8d9d13e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 16 Dec 2024 15:50:59 +0200 Subject: [PATCH 337/807] net: dsa: restore dsa_software_vlan_untag() ability to operate on VLAN-untagged traffic Robert Hodaszi reports that locally terminated traffic towards VLAN-unaware bridge ports is broken with ocelot-8021q. He is describing the same symptoms as for commit 1f9fc48fd302 ("net: dsa: sja1105: fix reception from VLAN-unaware bridges"). For context, the set merged as "VLAN fixes for Ocelot driver": https://lore.kernel.org/netdev/20240815000707.2006121-1-vladimir.oltean@nxp.com/ was developed in a slightly different form earlier this year, in January. Initially, the switch was unconditionally configured to set OCELOT_ES0_TAG when using ocelot-8021q, regardless of port operating mode. This led to the situation where VLAN-unaware bridge ports would always push their PVID - see ocelot_vlan_unaware_pvid() - a negligible value anyway - into RX packets. 
To strip this in software, we would have needed DSA to know what private VID the switch chose for VLAN-unaware bridge ports, and pushed into the packets. This was implemented downstream, and a remnant of it remains in the form of a comment mentioning ds->ops->get_private_vid(), as something which would maybe need to be considered in the future. However, for upstream, it was deemed inappropriate, because it would mean introducing yet another behavior for stripping VLAN tags from VLAN-unaware bridge ports, when one already existed (ds->untag_bridge_pvid). The latter has been marked as obsolete along with an explanation why it is logically broken, but still, it would have been confusing. So, for upstream, felix_update_tag_8021q_rx_rule() was developed, which essentially changed the state of affairs from "Felix with ocelot-8021q delivers all packets as VLAN-tagged towards the CPU" into "Felix with ocelot-8021q delivers all packets from VLAN-aware bridge ports towards the CPU". This was done on the premise that in VLAN-unaware mode, there's nothing useful in the VLAN tags, and we can avoid introducing ds->ops->get_private_vid() in the DSA receive path if we configure the switch to not push those VLAN tags into packets in the first place. Unfortunately, and this is when the trainwreck started, the selftests developed initially and posted with the series were not re-ran. dsa_software_vlan_untag() was initially written given the assumption that users of this feature would send _all_ traffic as VLAN-tagged. It was only partially adapted to the new scheme, by removing ds->ops->get_private_vid(), which also used to be necessary in standalone ports mode. Where the trainwreck became even worse is that I had a second opportunity to think about this, when the dsa_software_vlan_untag() logic change initially broke sja1105, in commit 1f9fc48fd302 ("net: dsa: sja1105: fix reception from VLAN-unaware bridges"). 
I did not connect the dots that it also breaks ocelot-8021q, for pretty much the same reason that not all received packets will be VLAN-tagged. To be compatible with the optimized Felix control path which runs felix_update_tag_8021q_rx_rule() to only push VLAN tags when useful (in VLAN-aware mode), we need to restore the old dsa_software_vlan_untag() logic. The blamed commit introduced the assumption that dsa_software_vlan_untag() will see only VLAN-tagged packets, assumption which is false. What corrupts RX traffic is the fact that we call skb_vlan_untag() on packets which are not VLAN-tagged in the first place. Fixes: 93e4649efa96 ("net: dsa: provide a software untagging function on RX for VLAN-aware bridges") Reported-by: Robert Hodaszi Closes: https://lore.kernel.org/netdev/20241215163334.615427-1-robert.hodaszi@digi.com/ Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241216135059.1258266-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/tag.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/dsa/tag.h b/net/dsa/tag.h index d5707870906b..5d80ddad4ff6 100644 --- a/net/dsa/tag.h +++ b/net/dsa/tag.h @@ -138,9 +138,10 @@ static inline void dsa_software_untag_vlan_unaware_bridge(struct sk_buff *skb, * dsa_software_vlan_untag: Software VLAN untagging in DSA receive path * @skb: Pointer to socket buffer (packet) * - * Receive path method for switches which cannot avoid tagging all packets - * towards the CPU port. Called when ds->untag_bridge_pvid (legacy) or - * ds->untag_vlan_aware_bridge_pvid is set to true. + * Receive path method for switches which send some packets as VLAN-tagged + * towards the CPU port (generally from VLAN-aware bridge ports) even when the + * packet was not tagged on the wire. Called when ds->untag_bridge_pvid + * (legacy) or ds->untag_vlan_aware_bridge_pvid is set to true. * * As a side effect of this method, any VLAN tag from the skb head is moved * to hwaccel. 
@@ -149,14 +150,19 @@ static inline struct sk_buff *dsa_software_vlan_untag(struct sk_buff *skb) { struct dsa_port *dp = dsa_user_to_port(skb->dev); struct net_device *br = dsa_port_bridge_dev_get(dp); - u16 vid; + u16 vid, proto; + int err; /* software untagging for standalone ports not yet necessary */ if (!br) return skb; + err = br_vlan_get_proto(br, &proto); + if (err) + return skb; + /* Move VLAN tag from data to hwaccel */ - if (!skb_vlan_tag_present(skb)) { + if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) { skb = skb_vlan_untag(skb); if (!skb) return NULL; From 5eb70dbebf32c2fd1f2814c654ae17fc47d6e859 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Dec 2024 18:25:08 -0800 Subject: [PATCH 338/807] netdev-genl: avoid empty messages in queue dump Empty netlink responses from do() are not correct (as opposed to dump() where not dumping anything is perfectly fine). We should return an error if the target object does not exist, in this case if the netdev is down it has no queues. 
Fixes: 6b6171db7fc8 ("netdev-genl: Add netlink framework functions for queue") Reported-by: syzbot+0a884bc2d304ce4af70f@syzkaller.appspotmail.com Reviewed-by: Eric Dumazet Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241218022508.815344-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1be8c7c21d19..2d3ae0cd3ad2 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -430,10 +430,10 @@ static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - int err = 0; + int err; if (!(netdev->flags & IFF_UP)) - return err; + return -ENOENT; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) From 5eecd85c77a254a43bde3212da8047b001745c9f Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 17 Dec 2024 12:37:39 +0100 Subject: [PATCH 339/807] psample: adjust size if rate_as_probability is set If PSAMPLE_ATTR_SAMPLE_PROBABILITY flag is to be sent, the available size for the packet data has to be adjusted accordingly. Also, check the error code returned by nla_put_flag. Fixes: 7b1b2b60c63f ("net: psample: allow using rate as probability") Signed-off-by: Adrian Moreno Reviewed-by: Aaron Conole Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241217113739.3929300-1-amorenoz@redhat.com Signed-off-by: Jakub Kicinski --- net/psample/psample.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/psample/psample.c b/net/psample/psample.c index a0ddae8a65f9..25f92ba0840c 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -393,7 +393,9 @@ void psample_sample_packet(struct psample_group *group, nla_total_size_64bit(sizeof(u64)) + /* timestamp */ nla_total_size(sizeof(u16)) + /* protocol */ (md->user_cookie_len ? 
- nla_total_size(md->user_cookie_len) : 0); /* user cookie */ + nla_total_size(md->user_cookie_len) : 0) + /* user cookie */ + (md->rate_as_probability ? + nla_total_size(0) : 0); /* rate as probability */ #ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); @@ -498,8 +500,9 @@ void psample_sample_packet(struct psample_group *group, md->user_cookie)) goto error; - if (md->rate_as_probability) - nla_put_flag(nl_skb, PSAMPLE_ATTR_SAMPLE_PROBABILITY); + if (md->rate_as_probability && + nla_put_flag(nl_skb, PSAMPLE_ATTR_SAMPLE_PROBABILITY)) + goto error; genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, From 51df947678360faf1967fe0bd1a40c681f634104 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 16 Dec 2024 21:23:24 -0800 Subject: [PATCH 340/807] octeontx2-pf: fix netdev memory leak in rvu_rep_create() When rvu_rep_devlink_port_register() fails, free_netdev(ndev) for this incomplete iteration before going to "exit:" label. Fixes: 9ed0343f561e ("octeontx2-pf: Add devlink port support") Reviewed-by: Przemek Kitszel Signed-off-by: Harshit Mogalapalli Link: https://patch.msgid.link/20241217052326.1086191-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/rep.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c index 232b10740c13..9e3fcbae5dee 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c @@ -680,8 +680,10 @@ int rvu_rep_create(struct otx2_nic *priv, struct netlink_ext_ack *extack) ndev->features |= ndev->hw_features; eth_hw_addr_random(ndev); err = rvu_rep_devlink_port_register(rep); - if (err) + if (err) { + free_netdev(ndev); goto exit; + } SET_NETDEV_DEVLINK_PORT(ndev, &rep->dl_port); err = register_netdev(ndev); From b95c8c33ae687fcd3007cefa93907a6bd270119b Mon Sep 17 
00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 16 Dec 2024 21:23:25 -0800 Subject: [PATCH 341/807] octeontx2-pf: fix error handling of devlink port in rvu_rep_create() Unregister the devlink port when register_netdev() fails. Fixes: 9ed0343f561e ("octeontx2-pf: Add devlink port support") Reviewed-by: Przemek Kitszel Signed-off-by: Harshit Mogalapalli Link: https://patch.msgid.link/20241217052326.1086191-2-harshit.m.mogalapalli@oracle.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/rep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c index 9e3fcbae5dee..04e08e06f30f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c @@ -690,6 +690,7 @@ int rvu_rep_create(struct otx2_nic *priv, struct netlink_ext_ack *extack) if (err) { NL_SET_ERR_MSG_MOD(extack, "PFVF representor registration failed"); + rvu_rep_devlink_port_unregister(rep); free_netdev(ndev); goto exit; } From 13a6691910cc23ea9ba4066e098603088673d5b0 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Tue, 10 Dec 2024 09:33:10 +0200 Subject: [PATCH 342/807] RDMA/nldev: Set error code in rdma_nl_notify_event In case of error set the error code before the goto. 
Fixes: 6ff57a2ea7c2 ("RDMA/nldev: Fix NULL pointer dereferences issue in rdma_nl_notify_event") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-rdma/a84a2fc3-33b6-46da-a1bd-3343fa07eaf9@stanley.mountain/ Signed-off-by: Chiara Meiohas Reviewed-by: Maher Sanalla Link: https://patch.msgid.link/13eb25961923f5de9eb9ecbbc94e26113d6049ef.1733815944.git.leonro@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index ff121e59b9c0..cb987ab0177c 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -2833,8 +2833,8 @@ int rdma_nl_notify_event(struct ib_device *device, u32 port_num, enum rdma_nl_notify_event_type type) { struct sk_buff *skb; + int ret = -EMSGSIZE; struct net *net; - int ret = 0; void *nlh; net = read_pnet(&device->coredev.rdma_net); From 16b87037b48889d21854c8e97aec8a1baf2642b3 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 12 Dec 2024 16:18:48 +0100 Subject: [PATCH 343/807] RDMA/siw: Remove direct link to net_device Do not manage a per device direct link to net_device. Rely on associated ib_devices net_device management, not doubling the effort locally. A badly managed local link to net_device was causing a 'KASAN: slab-use-after-free' exception during siw_query_port() call. 
Fixes: bdcf26bf9b3a ("rdma/siw: network and RDMA core interface") Reported-by: syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=4b87489410b4efd181bf Signed-off-by: Bernard Metzler Link: https://patch.msgid.link/20241212151848.564872-1-bmt@zurich.ibm.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/siw/siw.h | 7 +++--- drivers/infiniband/sw/siw/siw_cm.c | 27 ++++++++++++++++----- drivers/infiniband/sw/siw/siw_main.c | 15 +----------- drivers/infiniband/sw/siw/siw_verbs.c | 35 ++++++++++++++++++--------- 4 files changed, 49 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 86d4d6a2170e..ea5eee50dc39 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -46,6 +46,9 @@ */ #define SIW_IRQ_MAXBURST_SQ_ACTIVE 4 +/* There is always only a port 1 per siw device */ +#define SIW_PORT 1 + struct siw_dev_cap { int max_qp; int max_qp_wr; @@ -69,16 +72,12 @@ struct siw_pd { struct siw_device { struct ib_device base_dev; - struct net_device *netdev; struct siw_dev_cap attrs; u32 vendor_part_id; int numa_node; char raw_gid[ETH_ALEN]; - /* physical port state (only one port per device) */ - enum ib_port_state state; - spinlock_t lock; struct xarray qp_xa; diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 86323918a570..708b13993fdf 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1759,6 +1759,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) { struct socket *s; struct siw_cep *cep = NULL; + struct net_device *ndev = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; int rv = 0; @@ -1779,9 +1780,15 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); /* For wildcard addr, limit binding to 
current device only */ - if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) - s->sk->sk_bound_dev_if = sdev->netdev->ifindex; - + if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) { + ndev = ib_device_get_netdev(id->device, SIW_PORT); + if (ndev) { + s->sk->sk_bound_dev_if = ndev->ifindex; + } else { + rv = -ENODEV; + goto error; + } + } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in)); } else { @@ -1797,9 +1804,15 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) } /* For wildcard addr, limit binding to current device only */ - if (ipv6_addr_any(&laddr->sin6_addr)) - s->sk->sk_bound_dev_if = sdev->netdev->ifindex; - + if (ipv6_addr_any(&laddr->sin6_addr)) { + ndev = ib_device_get_netdev(id->device, SIW_PORT); + if (ndev) { + s->sk->sk_bound_dev_if = ndev->ifindex; + } else { + rv = -ENODEV; + goto error; + } + } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in6)); } @@ -1860,6 +1873,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) } list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); cep->state = SIW_EPSTATE_LISTENING; + dev_put(ndev); siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr); @@ -1879,6 +1893,7 @@ error: siw_cep_set_free_and_put(cep); } sock_release(s); + dev_put(ndev); return rv; } diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 17abef48abcd..14d3103aee6f 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -287,7 +287,6 @@ static struct siw_device *siw_device_create(struct net_device *netdev) return NULL; base_dev = &sdev->base_dev; - sdev->netdev = netdev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, @@ -381,12 +380,10 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, switch (event) { case NETDEV_UP: - sdev->state = IB_PORT_ACTIVE; siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; case NETDEV_DOWN: - sdev->state = 
IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); break; @@ -407,12 +404,8 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* - * Todo: Below netdev events are currently not handled. + * All other events are not handled */ - case NETDEV_CHANGEMTU: - case NETDEV_CHANGE: - break; - default: break; } @@ -442,12 +435,6 @@ static int siw_newlink(const char *basedev_name, struct net_device *netdev) sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); - - if (netif_running(netdev) && netif_carrier_ok(netdev)) - sdev->state = IB_PORT_ACTIVE; - else - sdev->state = IB_PORT_DOWN; - ib_mark_name_assigned_by_user(&sdev->base_dev); rv = siw_device_register(sdev, basedev_name); if (rv) diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 986666c19378..7ca0297d68a4 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -171,21 +171,29 @@ int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, int siw_query_port(struct ib_device *base_dev, u32 port, struct ib_port_attr *attr) { - struct siw_device *sdev = to_siw_dev(base_dev); + struct net_device *ndev; int rv; memset(attr, 0, sizeof(*attr)); rv = ib_get_eth_speed(base_dev, port, &attr->active_speed, &attr->active_width); + if (rv) + return rv; + + ndev = ib_device_get_netdev(base_dev, SIW_PORT); + if (!ndev) + return -ENODEV; + attr->gid_tbl_len = 1; attr->max_msg_sz = -1; - attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); - attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); - attr->phys_state = sdev->state == IB_PORT_ACTIVE ? + attr->max_mtu = ib_mtu_int_to_enum(ndev->max_mtu); + attr->active_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu)); + attr->phys_state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? 
IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED; + attr->state = attr->phys_state == IB_PORT_PHYS_STATE_LINK_UP ? + IB_PORT_ACTIVE : IB_PORT_DOWN; attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; - attr->state = sdev->state; /* * All zero * @@ -199,6 +207,7 @@ int siw_query_port(struct ib_device *base_dev, u32 port, * attr->subnet_timeout = 0; * attr->init_type_repy = 0; */ + dev_put(ndev); return rv; } @@ -505,21 +514,24 @@ int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct siw_qp *qp; - struct siw_device *sdev; + struct net_device *ndev; - if (base_qp && qp_attr && qp_init_attr) { + if (base_qp && qp_attr && qp_init_attr) qp = to_siw_qp(base_qp); - sdev = to_siw_dev(base_qp->device); - } else { + else return -EINVAL; - } + + ndev = ib_device_get_netdev(base_qp->device, SIW_PORT); + if (!ndev) + return -ENODEV; + qp_attr->qp_state = siw_qp_state_to_ib_qp_state[qp->attrs.state]; qp_attr->cap.max_inline_data = SIW_MAX_INLINE; qp_attr->cap.max_send_wr = qp->attrs.sq_size; qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; qp_attr->cap.max_recv_wr = qp->attrs.rq_size; qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; - qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + qp_attr->path_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu)); qp_attr->max_rd_atomic = qp->attrs.irq_size; qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; @@ -534,6 +546,7 @@ int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, qp_init_attr->cap = qp_attr->cap; + dev_put(ndev); return 0; } From 572af9f284669d31d9175122bbef9bc62cea8ded Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Wed, 18 Dec 2024 12:51:06 +0900 Subject: [PATCH 344/807] net: mdiobus: fix an OF node reference leak fwnode_find_mii_timestamper() calls of_parse_phandle_with_fixed_args() but does not decrement the refcount of the obtained OF node. 
Add an of_node_put() call before returning from the function. This bug was detected by an experimental static analysis tool that I am developing. Fixes: bc1bee3b87ee ("net: mdiobus: Introduce fwnode_mdiobus_register_phy()") Signed-off-by: Joe Hattori Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218035106.1436405-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Paolo Abeni --- drivers/net/mdio/fwnode_mdio.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/mdio/fwnode_mdio.c b/drivers/net/mdio/fwnode_mdio.c index b156493d7084..aea0f0357568 100644 --- a/drivers/net/mdio/fwnode_mdio.c +++ b/drivers/net/mdio/fwnode_mdio.c @@ -40,6 +40,7 @@ fwnode_find_pse_control(struct fwnode_handle *fwnode) static struct mii_timestamper * fwnode_find_mii_timestamper(struct fwnode_handle *fwnode) { + struct mii_timestamper *mii_ts; struct of_phandle_args arg; int err; @@ -53,10 +54,16 @@ fwnode_find_mii_timestamper(struct fwnode_handle *fwnode) else if (err) return ERR_PTR(err); - if (arg.args_count != 1) - return ERR_PTR(-EINVAL); + if (arg.args_count != 1) { + mii_ts = ERR_PTR(-EINVAL); + goto put_node; + } - return register_mii_timestamper(arg.np, arg.args[0]); + mii_ts = register_mii_timestamper(arg.np, arg.args[0]); + +put_node: + of_node_put(arg.np); + return mii_ts; } int fwnode_mdiobus_phy_device_register(struct mii_bus *mdio, From ce1219c3f76bb131d095e90521506d3c6ccfa086 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 18 Dec 2024 11:53:01 +0800 Subject: [PATCH 345/807] net: mctp: handle skb cleanup on sock_queue failures Currently, we don't use the return value from sock_queue_rcv_skb, which means we may leak skbs if a message is not successfully queued to a socket. Instead, ensure that we're freeing the skb where the sock hasn't otherwise taken ownership of the skb by adding checks on the sock_queue_rcv_skb() to invoke a kfree on failure. 
In doing so, rather than using the 'rc' value to trigger the kfree_skb(), use the skb pointer itself, which is more explicit. Also, add a kunit test for the sock delivery failure cases. Fixes: 4a992bbd3650 ("mctp: Implement message fragmentation & reassembly") Cc: stable@vger.kernel.org Signed-off-by: Jeremy Kerr Link: https://patch.msgid.link/20241218-mctp-next-v2-1-1c1729645eaa@codeconstruct.com.au Signed-off-by: Paolo Abeni --- net/mctp/route.c | 36 +++++++++++----- net/mctp/test/route-test.c | 86 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 10 deletions(-) diff --git a/net/mctp/route.c b/net/mctp/route.c index 597e9cf5aa64..3f2bd65ff5e3 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -374,8 +374,13 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) msk = NULL; rc = -EINVAL; - /* we may be receiving a locally-routed packet; drop source sk - * accounting + /* We may be receiving a locally-routed packet; drop source sk + * accounting. + * + * From here, we will either queue the skb - either to a frag_queue, or + * to a receiving socket. When that succeeds, we clear the skb pointer; + * a non-NULL skb on exit will be otherwise unowned, and hence + * kfree_skb()-ed. */ skb_orphan(skb); @@ -434,7 +439,9 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * pending key. */ if (flags & MCTP_HDR_FLAG_EOM) { - sock_queue_rcv_skb(&msk->sk, skb); + rc = sock_queue_rcv_skb(&msk->sk, skb); + if (!rc) + skb = NULL; if (key) { /* we've hit a pending reassembly; not much we * can do but drop it @@ -443,7 +450,6 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) MCTP_TRACE_KEY_REPLIED); key = NULL; } - rc = 0; goto out_unlock; } @@ -470,8 +476,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * this function. 
*/ rc = mctp_key_add(key, msk); - if (!rc) + if (!rc) { trace_mctp_key_acquire(key); + skb = NULL; + } /* we don't need to release key->lock on exit, so * clean up here and suppress the unlock via @@ -489,6 +497,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) key = NULL; } else { rc = mctp_frag_queue(key, skb); + if (!rc) + skb = NULL; } } @@ -503,12 +513,19 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) else rc = mctp_frag_queue(key, skb); + if (rc) + goto out_unlock; + + /* we've queued; the queue owns the skb now */ + skb = NULL; + /* end of message? deliver to socket, and we're done with * the reassembly/response key */ - if (!rc && flags & MCTP_HDR_FLAG_EOM) { - sock_queue_rcv_skb(key->sk, key->reasm_head); - key->reasm_head = NULL; + if (flags & MCTP_HDR_FLAG_EOM) { + rc = sock_queue_rcv_skb(key->sk, key->reasm_head); + if (!rc) + key->reasm_head = NULL; __mctp_key_done_in(key, net, f, MCTP_TRACE_KEY_REPLIED); key = NULL; } @@ -527,8 +544,7 @@ out_unlock: if (any_key) mctp_key_unref(any_key); out: - if (rc) - kfree_skb(skb); + kfree_skb(skb); return rc; } diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 8551dab1d1e6..17165b86ce22 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -837,6 +837,90 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test) mctp_test_route_input_multiple_nets_key_fini(test, &t2); } +/* Input route to socket, using a single-packet message, where sock delivery + * fails. Ensure we're handling the failure appropriately. + */ +static void mctp_test_route_input_sk_fail_single(struct kunit *test) +{ + const struct mctp_hdr hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO); + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct socket *sock; + struct sk_buff *skb; + int rc; + + __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + + /* No rcvbuf space, so delivery should fail. 
__sock_set_rcvbuf will + * clamp the minimum to SOCK_MIN_RCVBUF, so we open-code this. + */ + lock_sock(sock->sk); + WRITE_ONCE(sock->sk->sk_rcvbuf, 0); + release_sock(sock->sk); + + skb = mctp_test_create_skb(&hdr, 10); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + skb_get(skb); + + mctp_test_skb_set_dev(skb, dev); + + /* do route input, which should fail */ + rc = mctp_route_input(&rt->rt, skb); + KUNIT_EXPECT_NE(test, rc, 0); + + /* we should hold the only reference to skb */ + KUNIT_EXPECT_EQ(test, refcount_read(&skb->users), 1); + kfree_skb(skb); + + __mctp_route_test_fini(test, dev, rt, sock); +} + +/* Input route to socket, using a fragmented message, where sock delivery fails. + */ +static void mctp_test_route_input_sk_fail_frag(struct kunit *test) +{ + const struct mctp_hdr hdrs[2] = { RX_FRAG(FL_S, 0), RX_FRAG(FL_E, 1) }; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct sk_buff *skbs[2]; + struct socket *sock; + unsigned int i; + int rc; + + __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + + lock_sock(sock->sk); + WRITE_ONCE(sock->sk->sk_rcvbuf, 0); + release_sock(sock->sk); + + for (i = 0; i < ARRAY_SIZE(skbs); i++) { + skbs[i] = mctp_test_create_skb(&hdrs[i], 10); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skbs[i]); + skb_get(skbs[i]); + + mctp_test_skb_set_dev(skbs[i], dev); + } + + /* first route input should succeed, we're only queueing to the + * frag list + */ + rc = mctp_route_input(&rt->rt, skbs[0]); + KUNIT_EXPECT_EQ(test, rc, 0); + + /* final route input should fail to deliver to the socket */ + rc = mctp_route_input(&rt->rt, skbs[1]); + KUNIT_EXPECT_NE(test, rc, 0); + + /* we should hold the only reference to both skbs */ + KUNIT_EXPECT_EQ(test, refcount_read(&skbs[0]->users), 1); + kfree_skb(skbs[0]); + + KUNIT_EXPECT_EQ(test, refcount_read(&skbs[1]->users), 1); + kfree_skb(skbs[1]); + + __mctp_route_test_fini(test, dev, rt, sock); +} + #if IS_ENABLED(CONFIG_MCTP_FLOWS) static void mctp_test_flow_init(struct kunit 
*test, @@ -1053,6 +1137,8 @@ static struct kunit_case mctp_test_cases[] = { mctp_route_input_sk_reasm_gen_params), KUNIT_CASE_PARAM(mctp_test_route_input_sk_keys, mctp_route_input_sk_keys_gen_params), + KUNIT_CASE(mctp_test_route_input_sk_fail_single), + KUNIT_CASE(mctp_test_route_input_sk_fail_frag), KUNIT_CASE(mctp_test_route_input_multiple_nets_bind), KUNIT_CASE(mctp_test_route_input_multiple_nets_key), KUNIT_CASE(mctp_test_packet_flow), From 32c9c06adb5b157ef259233775a063a43746d699 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Thu, 19 Dec 2024 18:53:02 +0800 Subject: [PATCH 346/807] ASoC: mediatek: disable buffer pre-allocation On Chromebooks based on Mediatek MT8195 or MT8188, the audio frontend (AFE) is limited to accessing a very small window (1 MiB) of memory, which is described as a reserved memory region in the device tree. On these two platforms, the maximum buffer size is given as 512 KiB. The MediaTek common code uses the same value for preallocations. This means that only the first two PCM substreams get preallocations, and then the whole space is exhausted, barring any other substreams from working. Since the substreams used are not always the first two, this means audio won't work correctly. This is observed on the MT8188 Geralt Chromebooks, on which the "mediatek,dai-link" property was dropped when it was upstreamed. That property causes the driver to only register the PCM substreams listed in the property, and in the order given. Instead of trying to compute an optimal value and figuring out which streams are used, simply disable preallocation. The PCM buffers are managed by the core and are allocated and released on the fly. There should be no impact to any of the other MediaTek platforms. 
Signed-off-by: Chen-Yu Tsai Reviewed-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20241219105303.548437-1-wenst@chromium.org Signed-off-by: Mark Brown --- sound/soc/mediatek/common/mtk-afe-platform-driver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/mediatek/common/mtk-afe-platform-driver.c b/sound/soc/mediatek/common/mtk-afe-platform-driver.c index 9b72b2a7ae91..6b6330583941 100644 --- a/sound/soc/mediatek/common/mtk-afe-platform-driver.c +++ b/sound/soc/mediatek/common/mtk-afe-platform-driver.c @@ -120,8 +120,8 @@ int mtk_afe_pcm_new(struct snd_soc_component *component, struct mtk_base_afe *afe = snd_soc_component_get_drvdata(component); size = afe->mtk_afe_hardware->buffer_bytes_max; - snd_pcm_set_managed_buffer_all(pcm, SNDRV_DMA_TYPE_DEV, - afe->dev, size, size); + snd_pcm_set_managed_buffer_all(pcm, SNDRV_DMA_TYPE_DEV, afe->dev, 0, size); + return 0; } EXPORT_SYMBOL_GPL(mtk_afe_pcm_new); From 13221496065fa12fac4f8a8e725444679ffddb78 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Wed, 18 Dec 2024 20:54:53 +0100 Subject: [PATCH 347/807] regulator: rename regulator-uv-survival-time-ms according to DT binding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The regulator bindings don't document regulator-uv-survival-time-ms, but the more descriptive regulator-uv-less-critical-window-ms instead. Looking back at v3[1] and v4[2] of the series adding the support, the property was indeed renamed between these patch series, but unfortunately the rename only made it into the DT bindings with the driver code still using the old name. Let's therefore rename the property in the driver code to follow suit. This will break backwards compatibility, but there are no upstream device trees using the property and we never documented the old name of the property anyway. 
¯\_(ツ)_/¯" [1]: https://lore.kernel.org/all/20231025084614.3092295-7-o.rempel@pengutronix.de/ [2]: https://lore.kernel.org/all/20231026144824.4065145-5-o.rempel@pengutronix.de/ Signed-off-by: Ahmad Fatoum Link: https://patch.msgid.link/20241218-regulator-uv-survival-time-ms-rename-v1-1-6cac9c3c75da@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/of_regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 3d85762beda6..e5b4b93c07e3 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -175,7 +175,7 @@ static int of_get_regulation_constraints(struct device *dev, if (!ret) constraints->enable_time = pval; - ret = of_property_read_u32(np, "regulator-uv-survival-time-ms", &pval); + ret = of_property_read_u32(np, "regulator-uv-less-critical-window-ms", &pval); if (!ret) constraints->uv_less_critical_window_ms = pval; else From 40be32303ec829ea12f9883e499bfd3fe9e52baf Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 17 Dec 2024 15:56:45 +0530 Subject: [PATCH 348/807] RDMA/bnxt_re: Fix max_qp_wrs reported While creating qps, driver adds one extra entry to the sq size passed by the ULPs in order to avoid queue full condition. When ULPs create QPs with max_qp_wr reported, driver creates QP with 1 more than the max_wqes supported by HW. Create QP fails in this case. To avoid this error, reduce 1 entry in max_qp_wqes and report it to the stack.
Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Reviewed-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-2-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 7e20ae3d2c4f..73c9baaebb4e 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -129,7 +129,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->max_qp_init_rd_atom = sb->max_qp_init_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ? BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; - attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr); + attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1; /* * 128 WQEs needs to be reserved for the HW (8916). Prevent * reporting the max number From d5a38bf2f35979537c526acbc56bc435ed40685f Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 17 Dec 2024 15:56:46 +0530 Subject: [PATCH 349/807] RDMA/bnxt_re: Disable use of reserved wqes Disabling the reserved wqes logic for Gen P5/P7 devices because this workaround is required only for legacy devices. 
Fixes: ecb53febfcad ("RDMA/bnxt_en: Enable RDMA driver support for 57500 chip") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-3-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 73c9baaebb4e..776f8f1f1432 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -130,11 +130,13 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, sb->max_qp_init_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ? BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1; - /* - * 128 WQEs needs to be reserved for the HW (8916). Prevent - * reporting the max number - */ - attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; + if (!bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx)) { + /* + * 128 WQEs needs to be reserved for the HW (8916). Prevent + * reporting the max number on legacy devices + */ + attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; + } attr->max_qp_sges = cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE ? min_t(u32, sb->max_sge_var_wqe, BNXT_VAR_MAX_SGE) : 6; From d13be54dc18baee7a3e44349b80755a8c8205d3f Mon Sep 17 00:00:00 2001 From: Damodharam Ammepalli Date: Tue, 17 Dec 2024 15:56:47 +0530 Subject: [PATCH 350/807] RDMA/bnxt_re: Add send queue size check for variable wqe For the fixed WQE case, HW supports 0xFFFF WQEs. For variable Size WQEs, HW treats this number as the 16 bytes slots. The maximum supported WQEs needs to be adjusted based on the number of slots. Set a maximum WQE limit for variable WQE scenario. 
Fixes: de1d364c3815 ("RDMA/bnxt_re: Add support for Variable WQE in Genp7 adapters") Reviewed-by: Kalesh AP Signed-off-by: Damodharam Ammepalli Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-4-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 776f8f1f1432..9df3e3271577 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -138,6 +138,10 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; } + /* Adjust for max_qp_wqes for variable wqe */ + if (cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + attr->max_qp_wqes = BNXT_VAR_MAX_WQE - 1; + attr->max_qp_sges = cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE ? min_t(u32, sb->max_sge_var_wqe, BNXT_VAR_MAX_SGE) : 6; attr->max_cq = le32_to_cpu(sb->max_cq); From bb839f3ace0fee532a0487b692cc4d868fccb7cf Mon Sep 17 00:00:00 2001 From: Damodharam Ammepalli Date: Tue, 17 Dec 2024 15:56:48 +0530 Subject: [PATCH 351/807] RDMA/bnxt_re: Fix MSN table size for variable wqe mode For variable size wqe mode, the MSN table size should be half the size of the SQ depth. Fixing this to avoid wrap around problems in the retransmission path. 
Fixes: de1d364c3815 ("RDMA/bnxt_re: Add support for Variable WQE in Genp7 adapters") Reviewed-by: Kashyap Desai Reviewed-by: Kalesh AP Signed-off-by: Damodharam Ammepalli Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-5-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index d8a2a929bbe3..951ad90f5aa9 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1033,7 +1033,12 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) : 0; /* Update msn tbl size */ if (qp->is_host_msn_tbl && psn_sz) { - hwq_attr.aux_depth = roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); + if (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) + hwq_attr.aux_depth = + roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); + else + hwq_attr.aux_depth = + roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)) / 2; qp->msn_tbl_sz = hwq_attr.aux_depth; qp->msn = 0; } From 9272cba0ded71b5a2084da3004ec7806b8cb7fd2 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 17 Dec 2024 15:56:49 +0530 Subject: [PATCH 352/807] RDMA/bnxt_re: Fix the locking while accessing the QP table QP table handling is synchronized with destroy QP and Async event from the HW. The same needs to be synchronized during create_qp also. Use the same lock in create_qp also. 
Fixes: 76d3ddff7153 ("RDMA/bnxt_re: synchronize the qp-handle table array") Fixes: f218d67ef004 ("RDMA/bnxt_re: Allow posting when QPs are in error") Fixes: 84cf229f4001 ("RDMA/bnxt_re: Fix the qp table indexing") Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-6-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 951ad90f5aa9..5336f74297f8 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1181,9 +1181,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) rq->dbinfo.db = qp->dpi->dbr; rq->dbinfo.max_slot = bnxt_qplib_set_rq_max_slot(rq->wqe_size); } + spin_lock_bh(&rcfw->tbl_lock); tbl_indx = map_qp_id_to_tbl_indx(qp->id, rcfw); rcfw->qp_tbl[tbl_indx].qp_id = qp->id; rcfw->qp_tbl[tbl_indx].qp_handle = (void *)qp; + spin_unlock_bh(&rcfw->tbl_lock); return 0; fail: From 29d44cce324dab2bd86c447071a596262e7109b6 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 19 Dec 2024 19:15:06 +0800 Subject: [PATCH 353/807] selftests/bpf: Use asm constraint "m" for LoongArch Currently, LoongArch LLVM does not support the constraint "o" and there is no plan to support it; it only supports the similar constraint "m", so change the constraints from "nor" in the "else" case to arch-specific "nmr" to avoid the build error such as "unexpected asm memory constraint" for LoongArch.
Fixes: 630301b0d59d ("selftests/bpf: Add basic USDT selftests") Suggested-by: Weining Lu Suggested-by: Li Chen Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Reviewed-by: Huacai Chen Cc: stable@vger.kernel.org Link: https://llvm.org/docs/LangRef.html#supported-constraint-code-list Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp#L172 Link: https://lore.kernel.org/bpf/20241219111506.20643-1-yangtiezhu@loongson.cn --- tools/testing/selftests/bpf/sdt.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/sdt.h b/tools/testing/selftests/bpf/sdt.h index ca0162b4dc57..1fcfa5160231 100644 --- a/tools/testing/selftests/bpf/sdt.h +++ b/tools/testing/selftests/bpf/sdt.h @@ -102,6 +102,8 @@ # define STAP_SDT_ARG_CONSTRAINT nZr # elif defined __arm__ # define STAP_SDT_ARG_CONSTRAINT g +# elif defined __loongarch__ +# define STAP_SDT_ARG_CONSTRAINT nmr # else # define STAP_SDT_ARG_CONSTRAINT nor # endif From 4b2efb9db0c22a130bbd1275e489b42c02d08050 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 10 Dec 2024 14:09:37 +0100 Subject: [PATCH 354/807] accel/ivpu: Fix general protection fault in ivpu_bo_list() Check if ctx is not NULL before accessing its fields. 
Fixes: 37dee2a2f433 ("accel/ivpu: Improve buffer object debug logs") Cc: stable@vger.kernel.org # v6.8 Reviewed-by: Karol Wachowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-2-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_gem.c b/drivers/accel/ivpu/ivpu_gem.c index d8e97a760fbc..16178054e629 100644 --- a/drivers/accel/ivpu/ivpu_gem.c +++ b/drivers/accel/ivpu/ivpu_gem.c @@ -409,7 +409,7 @@ static void ivpu_bo_print_info(struct ivpu_bo *bo, struct drm_printer *p) mutex_lock(&bo->lock); drm_printf(p, "%-9p %-3u 0x%-12llx %-10lu 0x%-8x %-4u", - bo, bo->ctx->id, bo->vpu_addr, bo->base.base.size, + bo, bo->ctx ? bo->ctx->id : 0, bo->vpu_addr, bo->base.base.size, bo->flags, kref_read(&bo->base.base.refcount)); if (bo->base.pages) From 6c9ba75f147b24b5c59aac7356a38a0fef664afa Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 10 Dec 2024 14:09:38 +0100 Subject: [PATCH 355/807] accel/ivpu: Fix memory leak in ivpu_mmu_reserved_context_init() Add appropriate error handling to ensure all allocated resources are released upon encountering an error. 
Fixes: a74f4d991352 ("accel/ivpu: Defer MMU root page table allocation") Cc: Karol Wachowski Reviewed-by: Karol Wachowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-3-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_mmu_context.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_mmu_context.c b/drivers/accel/ivpu/ivpu_mmu_context.c index 891967a95bc3..0af614dfb6f9 100644 --- a/drivers/accel/ivpu/ivpu_mmu_context.c +++ b/drivers/accel/ivpu/ivpu_mmu_context.c @@ -612,18 +612,22 @@ int ivpu_mmu_reserved_context_init(struct ivpu_device *vdev) if (!ivpu_mmu_ensure_pgd(vdev, &vdev->rctx.pgtable)) { ivpu_err(vdev, "Failed to allocate root page table for reserved context\n"); ret = -ENOMEM; - goto unlock; + goto err_ctx_fini; } ret = ivpu_mmu_cd_set(vdev, vdev->rctx.id, &vdev->rctx.pgtable); if (ret) { ivpu_err(vdev, "Failed to set context descriptor for reserved context\n"); - goto unlock; + goto err_ctx_fini; } -unlock: mutex_unlock(&vdev->rctx.lock); return ret; + +err_ctx_fini: + mutex_unlock(&vdev->rctx.lock); + ivpu_mmu_context_fini(vdev, &vdev->rctx); + return ret; } void ivpu_mmu_reserved_context_fini(struct ivpu_device *vdev) From 0f6482caa6acdfdfc744db7430771fe7e6c4e787 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 10 Dec 2024 14:09:39 +0100 Subject: [PATCH 356/807] accel/ivpu: Fix WARN in ivpu_ipc_send_receive_internal() Move pm_runtime_set_active() to ivpu_pm_init() so when ivpu_ipc_send_receive_internal() is executed before ivpu_pm_enable() it already has correct runtime state, even if last resume was not successful. 
Fixes: 8ed520ff4682 ("accel/ivpu: Move set autosuspend delay to HW specific code") Cc: stable@vger.kernel.org # v6.7+ Reviewed-by: Karol Wachowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-4-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index dbc0711e28d1..949f4233946c 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -378,6 +378,7 @@ void ivpu_pm_init(struct ivpu_device *vdev) pm_runtime_use_autosuspend(dev); pm_runtime_set_autosuspend_delay(dev, delay); + pm_runtime_set_active(dev); ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay); } @@ -392,7 +393,6 @@ void ivpu_pm_enable(struct ivpu_device *vdev) { struct device *dev = vdev->drm.dev; - pm_runtime_set_active(dev); pm_runtime_allow(dev); pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); From 716f2bca1ce93bb95364f1fc0555c1650507b588 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 18 Dec 2024 18:57:24 +0100 Subject: [PATCH 357/807] selftests/bpf: Fix compilation error in get_uprobe_offset() In get_uprobe_offset(), the call to procmap_query() uses the constant PROCMAP_QUERY_VMA_EXECUTABLE, even if PROCMAP_QUERY is not defined. Define PROCMAP_QUERY_VMA_EXECUTABLE when PROCMAP_QUERY isn't.
Fixes: 4e9e07603ecd ("selftests/bpf: make use of PROCMAP_QUERY ioctl if available") Signed-off-by: Jerome Marchand Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20241218175724.578884-1-jmarchan@redhat.com --- tools/testing/selftests/bpf/trace_helpers.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 2d742fdac6b9..81943c6254e6 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -293,6 +293,10 @@ static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *st return 0; } #else +# ifndef PROCMAP_QUERY_VMA_EXECUTABLE +# define PROCMAP_QUERY_VMA_EXECUTABLE 0x04 +# endif + static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) { return -EOPNOTSUPP; From 8d90a86ed053226a297ce062f4d9f4f521e05c4c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 12 Dec 2024 20:19:48 -0800 Subject: [PATCH 358/807] mmc: sdhci-msm: fix crypto key eviction Commit c7eed31e235c ("mmc: sdhci-msm: Switch to the new ICE API") introduced an incorrect check of the algorithm ID into the key eviction path, and thus qcom_ice_evict_key() is no longer ever called. Fix it. 
Fixes: c7eed31e235c ("mmc: sdhci-msm: Switch to the new ICE API") Cc: stable@vger.kernel.org Cc: Abel Vesa Signed-off-by: Eric Biggers Message-ID: <20241213041958.202565-6-ebiggers@kernel.org> Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index e00208535bd1..319f0ebbe652 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -1867,20 +1867,20 @@ static int sdhci_msm_program_key(struct cqhci_host *cq_host, struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); union cqhci_crypto_cap_entry cap; + if (!(cfg->config_enable & CQHCI_CRYPTO_CONFIGURATION_ENABLE)) + return qcom_ice_evict_key(msm_host->ice, slot); + /* Only AES-256-XTS has been tested so far. */ cap = cq_host->crypto_cap_array[cfg->crypto_cap_idx]; if (cap.algorithm_id != CQHCI_CRYPTO_ALG_AES_XTS || cap.key_size != CQHCI_CRYPTO_KEY_SIZE_256) return -EINVAL; - if (cfg->config_enable & CQHCI_CRYPTO_CONFIGURATION_ENABLE) - return qcom_ice_program_key(msm_host->ice, - QCOM_ICE_CRYPTO_ALG_AES_XTS, - QCOM_ICE_CRYPTO_KEY_SIZE_256, - cfg->crypto_key, - cfg->data_unit_size, slot); - else - return qcom_ice_evict_key(msm_host->ice, slot); + return qcom_ice_program_key(msm_host->ice, + QCOM_ICE_CRYPTO_ALG_AES_XTS, + QCOM_ICE_CRYPTO_KEY_SIZE_256, + cfg->crypto_key, + cfg->data_unit_size, slot); } #else /* CONFIG_MMC_CRYPTO */ From 974e3fe0ac61de85015bbe5a4990cf4127b304b2 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 19 Dec 2024 12:53:01 +0100 Subject: [PATCH 359/807] fs: relax assertions on failure to encode file handles Encoding file handles is usually performed by a filesystem ->encode_fh() method that may fail for various reasons. The legacy users of exportfs_encode_fh(), namely, nfsd and name_to_handle_at(2) syscall are ready to cope with the possibility of failure to encode a file handle.
There are a few other users of exportfs_encode_{fh,fid}() that currently have a WARN_ON() assertion when ->encode_fh() fails. Relax those assertions because they are wrong. The second linked bug report states commit 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles") in v6.6 as the regressing commit, but this is not accurate. The aforementioned commit only increases the chances of the assertion and allows triggering the assertion with the reproducer using overlayfs, inotify and drop_caches. Triggering this assertion was always possible with other filesystems and other reasons of ->encode_fh() failures and more particularly, it was also possible with the exact same reproducer using overlayfs that is mounted with options index=on,nfs_export=on also on kernels < v6.6. Therefore, I am not listing the aforementioned commit as a Fixes commit. Backport hint: this patch will have a trivial conflict applying to v6.6.y, and other trivial conflicts applying to stable kernels < v6.6. Reported-by: syzbot+ec07f6f5ce62b858579f@syzkaller.appspotmail.com Tested-by: syzbot+ec07f6f5ce62b858579f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-unionfs/671fd40c.050a0220.4735a.024f.GAE@google.com/ Reported-by: Dmitry Safonov Closes: https://lore.kernel.org/linux-fsdevel/CAGrbwDTLt6drB9eaUagnQVgdPBmhLfqqxAf3F+Juqy_o6oP8uw@mail.gmail.com/ Cc: stable@vger.kernel.org Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20241219115301.465396-1-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/notify/fdinfo.c | 4 +--- fs/overlayfs/copy_up.c | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index dec553034027..e933f9c65d90 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -47,10 +47,8 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f->handle_bytes >> 2; ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size); - if ((ret == 
FILEID_INVALID) || (ret < 0)) { - WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); + if ((ret == FILEID_INVALID) || (ret < 0)) return; - } f->handle_type = ret; f->handle_bytes = size * sizeof(u32); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 3601ddfeddc2..56eee9f23ea9 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -442,9 +442,8 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, buflen = (dwords << 2); err = -EIO; - if (WARN_ON(fh_type < 0) || - WARN_ON(buflen > MAX_HANDLE_SZ) || - WARN_ON(fh_type == FILEID_INVALID)) + if (fh_type < 0 || fh_type == FILEID_INVALID || + WARN_ON(buflen > MAX_HANDLE_SZ)) goto out_err; fh->fb.version = OVL_FH_VERSION; From 469c0682e03d67d8dc970ecaa70c2d753057c7c0 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sun, 15 Dec 2024 12:01:59 +0900 Subject: [PATCH 360/807] pmdomain: imx: gpcv2: fix an OF node reference leak in imx_gpcv2_probe() imx_gpcv2_probe() leaks an OF node reference obtained by of_get_child_by_name(). Fix it by declaring the device node with the __free(device_node) cleanup construct. This bug was found by an experimental static analysis tool that I am developing. 
Fixes: 03aa12629fc4 ("soc: imx: Add GPCv2 power gating driver") Signed-off-by: Joe Hattori Cc: stable@vger.kernel.org Message-ID: <20241215030159.1526624-1-joe@pf.is.s.u-tokyo.ac.jp> Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/gpcv2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pmdomain/imx/gpcv2.c b/drivers/pmdomain/imx/gpcv2.c index e67ecf99ef84..9bdb80fd7210 100644 --- a/drivers/pmdomain/imx/gpcv2.c +++ b/drivers/pmdomain/imx/gpcv2.c @@ -1458,12 +1458,12 @@ static int imx_gpcv2_probe(struct platform_device *pdev) .max_register = SZ_4K, }; struct device *dev = &pdev->dev; - struct device_node *pgc_np; + struct device_node *pgc_np __free(device_node) = + of_get_child_by_name(dev->of_node, "pgc"); struct regmap *regmap; void __iomem *base; int ret; - pgc_np = of_get_child_by_name(dev->of_node, "pgc"); if (!pgc_np) { dev_err(dev, "No power domains specified in DT\n"); return -EINVAL; From f64f610ec6ab59dd0391b03842cea3a4cd8ee34f Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Wed, 18 Dec 2024 19:44:33 +0100 Subject: [PATCH 361/807] pmdomain: core: add dummy release function to genpd device The genpd device, which is really only used as a handle to lookup OPP, but not even registered to the device core otherwise and thus lifetime linked to the genpd struct it is contained in, is missing a release function. After b8f7bbd1f4ec ("pmdomain: core: Add missing put_device()") the device will be cleaned up going through the driver core device_release() function, which will warn when no release callback is present for the device. Add a dummy release function to shut up the warning. 
Signed-off-by: Lucas Stach Tested-by: Luca Ceresoli Fixes: b8f7bbd1f4ec ("pmdomain: core: Add missing put_device()") Cc: stable@vger.kernel.org Message-ID: <20241218184433.1930532-1-l.stach@pengutronix.de> Signed-off-by: Ulf Hansson --- drivers/pmdomain/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index bb11f467dc78..20a9efebbcb7 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -2142,6 +2142,11 @@ static int genpd_set_default_power_state(struct generic_pm_domain *genpd) return 0; } +static void genpd_provider_release(struct device *dev) +{ + /* nothing to be done here */ +} + static int genpd_alloc_data(struct generic_pm_domain *genpd) { struct genpd_governor_data *gd = NULL; @@ -2173,6 +2178,7 @@ static int genpd_alloc_data(struct generic_pm_domain *genpd) genpd->gd = gd; device_initialize(&genpd->dev); + genpd->dev.release = genpd_provider_release; if (!genpd_is_dev_name_fw(genpd)) { dev_set_name(&genpd->dev, "%s", genpd->name); From 1b684ca15f9d78f45de3cdba7e19611387e16aa7 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Tue, 17 Dec 2024 10:49:15 +0700 Subject: [PATCH 362/807] drm/sched: Fix drm_sched_fini() docu generation Commit baf4afc5831438 ("drm/sched: Improve teardown documentation") added a list of drm_sched_fini()'s problems. The list triggers htmldocs warning (but renders correctly in htmldocs output): Documentation/gpu/drm-mm:571: ./drivers/gpu/drm/scheduler/sched_main.c:1359: ERROR: Unexpected indentation. Separate the list from the preceding paragraph by a blank line to fix the warning. While at it, also end the aforementioned paragraph by a colon. 
Fixes: baf4afc58314 ("drm/sched: Improve teardown documentation") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/r/20241108175655.6d3fcfb7@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya [phasta: Adjust commit message] Signed-off-by: Philipp Stanner Link: https://patchwork.freedesktop.org/patch/msgid/20241217034915.62594-1-bagasdotme@gmail.com --- drivers/gpu/drm/scheduler/sched_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 7ce25281c74c..57da84908752 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -1355,7 +1355,8 @@ EXPORT_SYMBOL(drm_sched_init); * drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job() * will not be called for all jobs still in drm_gpu_scheduler.pending_list. * There is no solution for this currently. Thus, it is up to the driver to make - * sure that + * sure that: + * * a) drm_sched_fini() is only called after for all submitted jobs * drm_sched_backend_ops.free_job() has been called or that * b) the jobs for which drm_sched_backend_ops.free_job() has not been called From a769bee5f9fbca47efd4fa6bc3d726d370cedebe Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Tue, 17 Dec 2024 00:09:36 +0530 Subject: [PATCH 363/807] smb: use macros instead of constants for leasekey size and default cifsattrs value Replace default hardcoded value for cifsAttrs with ATTR_ARCHIVE macro Use SMB2_LEASE_KEY_SIZE macro for leasekey size in smb2_lease_break Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 2 +- fs/smb/client/smb2pdu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 9d96b833015c..b800c9f585d8 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -398,7 +398,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode = alloc_inode_sb(sb, 
cifs_inode_cachep, GFP_KERNEL); if (!cifs_inode) return NULL; - cifs_inode->cifsAttrs = 0x20; /* default */ + cifs_inode->cifsAttrs = ATTR_ARCHIVE; /* default */ cifs_inode->time = 0; /* * Until the file is open and we have gotten oplock info back from the diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 010eae9d6c47..c945b94318f8 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -6204,7 +6204,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, req->StructureSize = cpu_to_le16(36); total_len += 12; - memcpy(req->LeaseKey, lease_key, 16); + memcpy(req->LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); req->LeaseState = lease_state; flags |= CIFS_NO_RSP_BUF; From ee1c8e6b2931811a906b8c78006bfe0a3386fa60 Mon Sep 17 00:00:00 2001 From: Dragan Simic Date: Tue, 17 Dec 2024 10:25:10 +0100 Subject: [PATCH 364/807] smb: client: Deduplicate "select NETFS_SUPPORT" in Kconfig Repeating automatically selected options in Kconfig files is redundant, so let's delete repeated "select NETFS_SUPPORT" that was added accidentally. Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Signed-off-by: Dragan Simic Signed-off-by: Steve French --- fs/smb/client/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index 2aff6d1395ce..9f05f94e265a 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -2,7 +2,6 @@ config CIFS tristate "SMB3 and CIFS support (advanced network filesystem)" depends on INET - select NETFS_SUPPORT select NLS select NLS_UCS2_UTILS select CRYPTO From e9f2517a3e18a54a3943c098d2226b245d488801 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Tue, 10 Dec 2024 18:15:12 -0300 Subject: [PATCH 365/807] smb: client: fix TCP timers deadlock after rmmod Commit ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.") fixed a netns UAF by manually enabling socket refcounting (sk->sk_net_refcnt=1 and sock_inuse_add(net, 1)).
The reason the patch worked for that bug was because we now hold references to the netns (get_net_track() gets a ref internally) and they're properly released (internally, on __sk_destruct()), but only because sk->sk_net_refcnt was set. Problem: (this happens regardless of CONFIG_NET_NS_REFCNT_TRACKER and regardless if init_net or other) Setting sk->sk_net_refcnt=1 *manually* and *after* socket creation is not only out of cifs scope, but also technically wrong -- it's set conditionally based on user (=1) vs kernel (=0) sockets. And net/ implementations seem to base their user vs kernel space operations on it. e.g. upon TCP socket close, the TCP timers are not cleared because sk->sk_net_refcnt=1: (cf. commit 151c9c724d05 ("tcp: properly terminate timers for kernel sockets")) net/ipv4/tcp.c: void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); if (!sk->sk_net_refcnt) inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } Which will throw a lockdep warning and then, as expected, deadlock on tcp_write_timer(). A way to reproduce this is by running the reproducer from ef7134c7fc48 and then 'rmmod cifs'. A few seconds later, the deadlock/lockdep warning shows up. Fix: We shouldn't mess with socket internals ourselves, so do not set sk_net_refcnt manually. Also change __sock_create() to sock_create_kern() for explicitness. As for non-init_net network namespaces, we deal with it the best way we can -- hold an extra netns reference for server->ssocket and drop it when it's released. This ensures that the netns still exists whenever we need to create/destroy server->ssocket, but is not directly tied to it. 
Fixes: ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.") Cc: stable@vger.kernel.org Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/smb/client/connect.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 2372538a1211..ddcc9e514a0e 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -987,9 +987,13 @@ clean_demultiplex_info(struct TCP_Server_Info *server) msleep(125); if (cifs_rdma_enabled(server)) smbd_destroy(server); + if (server->ssocket) { sock_release(server->ssocket); server->ssocket = NULL; + + /* Release netns reference for the socket. */ + put_net(cifs_net_ns(server)); } if (!list_empty(&server->pending_mid_q)) { @@ -1037,6 +1041,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server) */ } + /* Release netns reference for this server. */ put_net(cifs_net_ns(server)); kfree(server->leaf_fullpath); kfree(server); @@ -1713,6 +1718,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; + + /* Grab netns reference for this server. */ cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); @@ -1844,6 +1851,7 @@ smbd_connected: out_err_crypto_release: cifs_crypto_secmech_release(tcp_ses); + /* Release netns reference for this server. 
*/ put_net(cifs_net_ns(tcp_ses)); out_err: @@ -1852,8 +1860,10 @@ out_err: cifs_put_tcp_session(tcp_ses->primary_server, false); kfree(tcp_ses->hostname); kfree(tcp_ses->leaf_fullpath); - if (tcp_ses->ssocket) + if (tcp_ses->ssocket) { sock_release(tcp_ses->ssocket); + put_net(cifs_net_ns(tcp_ses)); + } kfree(tcp_ses); } return ERR_PTR(rc); @@ -3131,20 +3141,20 @@ generic_ip_connect(struct TCP_Server_Info *server) socket = server->ssocket; } else { struct net *net = cifs_net_ns(server); - struct sock *sk; - rc = __sock_create(net, sfamily, SOCK_STREAM, - IPPROTO_TCP, &server->ssocket, 1); + rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } - sk = server->ssocket->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + /* + * Grab netns reference for the socket. + * + * It'll be released here, on error, or in clean_demultiplex_info() upon server + * teardown. + */ + get_net(net); /* BB other socket options to set KEEPALIVE, NODELAY? 
*/ cifs_dbg(FYI, "Socket created\n"); @@ -3158,8 +3168,10 @@ generic_ip_connect(struct TCP_Server_Info *server) } rc = bind_socket(server); - if (rc < 0) + if (rc < 0) { + put_net(cifs_net_ns(server)); return rc; + } /* * Eventually check for other socket options to change from @@ -3196,6 +3208,7 @@ generic_ip_connect(struct TCP_Server_Info *server) if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); + put_net(cifs_net_ns(server)); sock_release(socket); server->ssocket = NULL; return rc; @@ -3204,6 +3217,9 @@ generic_ip_connect(struct TCP_Server_Info *server) if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); + if (rc < 0) + put_net(cifs_net_ns(server)); + return rc; } From de35994ecd2dd6148ab5a6c5050a1670a04dec77 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 19 Dec 2024 09:30:30 +0000 Subject: [PATCH 366/807] workqueue: Do not warn when cancelling WQ_MEM_RECLAIM work from !WQ_MEM_RECLAIM worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 746ae46c1113 ("drm/sched: Mark scheduler work queues with WQ_MEM_RECLAIM") amdgpu started seeing the following warning: [ ] workqueue: WQ_MEM_RECLAIM sdma0:drm_sched_run_job_work [gpu_sched] is flushing !WQ_MEM_RECLAIM events:amdgpu_device_delay_enable_gfx_off [amdgpu] ... [ ] Workqueue: sdma0 drm_sched_run_job_work [gpu_sched] ... [ ] Call Trace: [ ] ... [ ] ? check_flush_dependency+0xf5/0x110 ... [ ] cancel_delayed_work_sync+0x6e/0x80 [ ] amdgpu_gfx_off_ctrl+0xab/0x140 [amdgpu] [ ] amdgpu_ring_alloc+0x40/0x50 [amdgpu] [ ] amdgpu_ib_schedule+0xf4/0x810 [amdgpu] [ ] ? drm_sched_run_job_work+0x22c/0x430 [gpu_sched] [ ] amdgpu_job_run+0xaa/0x1f0 [amdgpu] [ ] drm_sched_run_job_work+0x257/0x430 [gpu_sched] [ ] process_one_work+0x217/0x720 ... 
[ ] The intent of the verification done in check_flush_dependency is to ensure forward progress during memory reclaim, by flagging cases when either a memory reclaim process, or a memory reclaim work item is flushed from a context not marked as memory reclaim safe. This is correct when flushing, but when called from the cancel(_delayed)_work_sync() paths it is a false positive because work is either already running, or will not be running at all. Therefore cancelling it is safe and we can relax the warning criteria by letting the helper know of the calling context. Signed-off-by: Tvrtko Ursulin Fixes: fca839c00a12 ("workqueue: warn if memory reclaim tries to flush !WQ_MEM_RECLAIM workqueue") References: 746ae46c1113 ("drm/sched: Mark scheduler work queues with WQ_MEM_RECLAIM") Cc: Tejun Heo Cc: Peter Zijlstra Cc: Lai Jiangshan Cc: Alex Deucher Cc: Christian König Cc: # v4.5+ Signed-off-by: Tejun Heo --- kernel/workqueue.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8b07576814a5..8336218ec4b8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3680,23 +3680,27 @@ void workqueue_softirq_dead(unsigned int cpu) * check_flush_dependency - check for flush dependency sanity * @target_wq: workqueue being flushed * @target_work: work item being flushed (NULL for workqueue flushes) + * @from_cancel: are we called from the work cancel path * * %current is trying to flush the whole @target_wq or @target_work on it. - * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not - * reclaiming memory or running on a workqueue which doesn't have - * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to - * a deadlock.
+ * If this is not the cancel path (which implies work being flushed is either + * already running, or will not be at all), check if @target_wq doesn't have + * %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running + * on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward- + * progress guarantee leading to a deadlock. */ static void check_flush_dependency(struct workqueue_struct *target_wq, - struct work_struct *target_work) + struct work_struct *target_work, + bool from_cancel) { - work_func_t target_func = target_work ? target_work->func : NULL; + work_func_t target_func; struct worker *worker; - if (target_wq->flags & WQ_MEM_RECLAIM) + if (from_cancel || target_wq->flags & WQ_MEM_RECLAIM) return; worker = current_wq_worker(); + target_func = target_work ? target_work->func : NULL; WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", @@ -3980,7 +3984,7 @@ void __flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } - check_flush_dependency(wq, NULL); + check_flush_dependency(wq, NULL, false); mutex_unlock(&wq->mutex); @@ -4155,7 +4159,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, } wq = pwq->wq; - check_flush_dependency(wq, work); + check_flush_dependency(wq, work, from_cancel); insert_wq_barrier(pwq, barr, work, worker); raw_spin_unlock_irq(&pool->lock); From c261e4f1dd29fabab54b325bc1da8769a3998be1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Dec 2024 09:32:26 -0700 Subject: [PATCH 367/807] io_uring/register: limit ring resizing to DEFER_TASKRUN With DEFER_TASKRUN, we know the ring can't be both waited upon and resized at the same time. This is important for CQ resizing. Allowing SQ ring resizing is more trivial, but isn't the interesting use case. Hence limit ring resizing in general to DEFER_TASKRUN only for now. 
This isn't a huge problem as CQ ring resizing is generally the most useful on networking type of workloads where it can be hard to size the ring appropriately upfront, and those should be using DEFER_TASKRUN for better performance. Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Signed-off-by: Jens Axboe --- io_uring/register.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/register.c b/io_uring/register.c index 1e99c783abdf..fdd44914c39c 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -414,6 +414,9 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && current != ctx->submitter_task) return -EEXIST; + /* limited to DEFER_TASKRUN for now */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; if (copy_from_user(&p, arg, sizeof(p))) return -EFAULT; if (p.flags & ~RESIZE_FLAGS) From 3202ca221578850f34e0fea39dc6cfa745ed7aac Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 17 Dec 2024 10:51:01 +0100 Subject: [PATCH 368/807] PCI: Honor Max Link Speed when determining supported speeds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Supported Link Speeds Vector in the Link Capabilities 2 Register indicates the *supported* link speeds. The Max Link Speed field in the Link Capabilities Register indicates the *maximum* of those speeds. pcie_get_supported_speeds() neglects to honor the Max Link Speed field and will thus incorrectly deem higher speeds as supported. Fix it. One user-visible issue addressed here is an incorrect value in the sysfs attribute "max_link_speed". But the main motivation is a boot hang reported by Niklas: Intel JHL7540 "Titan Ridge 2018" Thunderbolt controllers support 2.5-8 GT/s speeds, but indicate 2.5 GT/s as maximum. Ilpo recalls seeing this on more devices.
It can be explained by the controller's Downstream Ports supporting 8 GT/s if an Endpoint is attached, but limiting to 2.5 GT/s if the port interfaces to a PCIe Adapter, in accordance with USB4 v2 sec 11.2.1: "This section defines the functionality of an Internal PCIe Port that interfaces to a PCIe Adapter. [...] The Logical sub-block shall update the PCIe configuration registers with the following characteristics: [...] Max Link Speed field in the Link Capabilities Register set to 0001b (data rate of 2.5 GT/s only). Note: These settings do not represent actual throughput. Throughput is implementation specific and based on the USB4 Fabric performance." The present commit is not sufficient on its own to fix Niklas' boot hang, but it is a prerequisite: A subsequent commit will fix the boot hang by enabling bandwidth control only if more than one speed is supported. The GENMASK() macro used herein specifies 0 as lowest bit, even though the Supported Link Speeds Vector ends at bit 1. This is done on purpose to avoid a GENMASK(0, 1) macro if Max Link Speed is zero. That macro would be invalid as the lowest bit is greater than the highest bit. Ilpo has witnessed a zero Max Link Speed on Root Complex Integrated Endpoints in particular, so it does occur in practice. (The Link Capabilities Register is optional on RCiEPs per PCIe r6.2 sec 7.5.3.) 
Fixes: d2bd39c0456b ("PCI: Store all PCIe Supported Link Speeds") Closes: https://lore.kernel.org/r/70829798889c6d779ca0f6cd3260a765780d1369.camel@kernel.org Link: https://lore.kernel.org/r/fe03941e3e1cc42fb9bf4395e302bff53ee2198b.1734428762.git.lukas@wunner.de Reported-by: Niklas Schnelle Tested-by: Niklas Schnelle Signed-off-by: Lukas Wunner Signed-off-by: Krzysztof Wilczyński Reviewed-by: Jonathan Cameron Reviewed-by: Ilpo Järvinen --- drivers/pci/pci.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 0b29ec6e8e5e..661f98c6c63a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6232,12 +6232,14 @@ u8 pcie_get_supported_speeds(struct pci_dev *dev) pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2); speeds = lnkcap2 & PCI_EXP_LNKCAP2_SLS; + /* Ignore speeds higher than Max Link Speed */ + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); + speeds &= GENMASK(lnkcap & PCI_EXP_LNKCAP_SLS, 0); + /* PCIe r3.0-compliant */ if (speeds) return speeds; - pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); - /* Synthesize from the Max Link Speed field */ if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_5_0GB) speeds = PCI_EXP_LNKCAP2_SLS_5_0GB | PCI_EXP_LNKCAP2_SLS_2_5GB; From 774c71c52aa487001c7da9f93b10cedc9985c371 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 17 Dec 2024 10:51:02 +0100 Subject: [PATCH 369/807] PCI/bwctrl: Enable only if more than one speed is supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a PCIe port only supports a single speed, enabling bandwidth control is pointless: There's no need to monitor autonomous speed changes, nor can the speed be changed. Not enabling it saves a small amount of memory and compute resources, but also fixes a boot hang reported by Niklas: It occurs when enabling bandwidth control on Downstream Ports of Intel JHL7540 "Titan Ridge 2018" Thunderbolt controllers. 
The ports only support 2.5 GT/s in accordance with USB4 v2 sec 11.2.1, so the present commit works around the issue. PCIe r6.2 sec 8.2.1 prescribes that: "A device must support 2.5 GT/s and is not permitted to skip support for any data rates between 2.5 GT/s and the highest supported rate." Consequently, bandwidth control is currently only disabled if a port doesn't support higher speeds than 2.5 GT/s. However the Implementation Note in PCIe r6.2 sec 7.5.3.18 cautions: "It is strongly encouraged that software primarily utilize the Supported Link Speeds Vector instead of the Max Link Speed field, so that software can determine the exact set of supported speeds on current and future hardware. This can avoid software being confused if a future specification defines Links that do not require support for all slower speeds." In other words, future revisions of the PCIe Base Spec may allow gaps in the Supported Link Speeds Vector. To be future-proof, don't just check whether speeds above 2.5 GT/s are supported, but rather check whether *more than one* speed is supported. 
Fixes: 665745f27487 ("PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller") Closes: https://lore.kernel.org/r/db8e457fcd155436449b035e8791a8241b0df400.camel@kernel.org Link: https://lore.kernel.org/r/3564908a9c99fc0d2a292473af7a94ebfc8f5820.1734428762.git.lukas@wunner.de Reported-by: Niklas Schnelle Tested-by: Niklas Schnelle Signed-off-by: Lukas Wunner Signed-off-by: Krzysztof Wilczyński Reviewed-by: Jonathan Cameron Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen --- drivers/pci/pcie/portdrv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c index 5e10306b6308..02e73099bad0 100644 --- a/drivers/pci/pcie/portdrv.c +++ b/drivers/pci/pcie/portdrv.c @@ -265,12 +265,14 @@ static int get_port_device_capability(struct pci_dev *dev) (pcie_ports_dpc_native || (services & PCIE_PORT_SERVICE_AER))) services |= PCIE_PORT_SERVICE_DPC; + /* Enable bandwidth control if more than one speed is supported. */ if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM || pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) { u32 linkcap; pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &linkcap); - if (linkcap & PCI_EXP_LNKCAP_LBNC) + if (linkcap & PCI_EXP_LNKCAP_LBNC && + hweight8(dev->supported_speeds) > 1) services |= PCIE_PORT_SERVICE_BWCTRL; } From 92941c7f2c9529fac1b2670482d0ced3b46eac70 Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Thu, 19 Dec 2024 23:28:50 +0530 Subject: [PATCH 370/807] smb: fix bytes written value in /proc/fs/cifs/Stats With recent netfs apis changes, the bytes written value was not getting updated in /proc/fs/cifs/Stats. Fix this by updating tcon->bytes in write operations. 
Fixes: 3ee1a1fc3981 ("cifs: Cut over to using netfslib") Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index c945b94318f8..959359301250 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4840,6 +4840,8 @@ smb2_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; + cifs_stats_bytes_written(tcon, written); + if (written < wdata->subreq.len) wdata->result = -ENOSPC; else @@ -5156,6 +5158,7 @@ replay_again: cifs_dbg(VFS, "Send error in write = %d\n", rc); } else { *nbytes = le32_to_cpu(rsp->DataLength); + cifs_stats_bytes_written(io_parms->tcon, *nbytes); trace_smb3_write_done(0, 0, xid, req->PersistentFileId, io_parms->tcon->tid, From 2e5e1a7ea692dc2b9f1acf0ebeb75bc282733cac Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 10 Dec 2024 12:14:43 +0100 Subject: [PATCH 371/807] Revert "arm64: dts: qcom: x1e80100-crd: enable otg on usb ports" This reverts commit 2dd3250191bcfe93b0c9da46624af830310400a7. A recent change enabling OTG mode on the x1e80100 CRD breaks suspend. Specifically, the device hard resets during resume if suspended with all controllers in device mode (i.e. no USB device connected). The corresponding change on the T14s also led to SuperSpeed hotplugs not being detected. With retimer (and orientation detection) support not even merged yet, let's revert at least until we have stable host mode in mainline. 
Fixes: 2dd3250191bc ("arm64: dts: qcom: x1e80100-crd: enable otg on usb ports") Reported-by: Abel Vesa Cc: Jonathan Marek Signed-off-by: Johan Hovold Reviewed-by: Abel Vesa Link: https://lore.kernel.org/r/20241210111444.26240-3-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100-crd.dts | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts index 39f9d9cdc10d..d51a9bdcf67f 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts @@ -1197,6 +1197,10 @@ status = "okay"; }; +&usb_1_ss0_dwc3 { + dr_mode = "host"; +}; + &usb_1_ss0_dwc3_hs { remote-endpoint = <&pmic_glink_ss0_hs_in>; }; @@ -1225,6 +1229,10 @@ status = "okay"; }; +&usb_1_ss1_dwc3 { + dr_mode = "host"; +}; + &usb_1_ss1_dwc3_hs { remote-endpoint = <&pmic_glink_ss1_hs_in>; }; @@ -1253,6 +1261,10 @@ status = "okay"; }; +&usb_1_ss2_dwc3 { + dr_mode = "host"; +}; + &usb_1_ss2_dwc3_hs { remote-endpoint = <&pmic_glink_ss2_hs_in>; }; From 7db0ba3e6e6c215353c1e58b42dfd77c7ab89256 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 10 Dec 2024 12:14:44 +0100 Subject: [PATCH 372/807] Revert "arm64: dts: qcom: x1e80100: enable OTG on USB-C controllers" This reverts commit f042bc234c2e00764b8aa2c9e2f8177cdc63f664. A recent change enabling role switching for the x1e80100 USB-C controllers breaks UCSI and DisplayPort Alternate Mode when the controllers are in host mode: ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: PPM init failed, stop trying As enabling OTG mode currently breaks SuperSpeed hotplug and suspend, and with retimer (and orientation detection) support not even merged yet, let's revert at least until we have stable host mode in mainline. 
Fixes: f042bc234c2e ("arm64: dts: qcom: x1e80100: enable OTG on USB-C controllers") Reported-by: Dmitry Baryshkov Link: https://lore.kernel.org/all/hw2pdof4ajadjsjrb44f2q4cz4yh5qcqz5d3l7gjt2koycqs3k@xx5xvd26uyef Link: https://lore.kernel.org/lkml/Z1gbyXk-SktGjL6-@hovoldconsulting.com/ Cc: Jonathan Marek Signed-off-by: Johan Hovold Reviewed-by: Abel Vesa Link: https://lore.kernel.org/r/20241210111444.26240-4-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/x1e80100.dtsi | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index f1a1e63f8ebc..7e4f46ad8edd 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -4066,8 +4066,6 @@ dma-coherent; - usb-role-switch; - ports { #address-cells = <1>; #size-cells = <0>; @@ -4321,8 +4319,6 @@ dma-coherent; - usb-role-switch; - ports { #address-cells = <1>; #size-cells = <0>; @@ -4421,8 +4417,6 @@ dma-coherent; - usb-role-switch; - ports { #address-cells = <1>; #size-cells = <0>; From dbd2ca9367eb19bc5e269b8c58b0b1514ada9156 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 19 Dec 2024 19:52:58 +0000 Subject: [PATCH 373/807] io_uring: check if iowq is killed before queuing task work can be executed after the task has gone through io_uring termination, whether it's the final task_work run or the fallback path. In this case, task work will find ->io_wq being already killed and null'ed, which is a problem if it then tries to forward the request to io_queue_iowq(). Make io_queue_iowq() fail requests in this case. Note that it also checks PF_KTHREAD, because the user can first close a DEFER_TASKRUN ring and shortly after kill the task, in which case ->iowq check would race. 
Cc: stable@vger.kernel.org Fixes: 50c52250e2d74 ("block: implement async io_uring discard cmd") Fixes: 773af69121ecc ("io_uring: always reissue from task_work context") Reported-by: Will Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/63312b4a2c2bb67ad67b857d17a300e1d3b078e8.1734637909.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 432b95ca9c85..d3403c8216db 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -514,7 +514,11 @@ static void io_queue_iowq(struct io_kiocb *req) struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); - BUG_ON(!tctx->io_wq); + + if ((current->flags & PF_KTHREAD) || !tctx->io_wq) { + io_req_task_queue_fail(req, -ECANCELED); + return; + } /* init ->work of the whole link before punting */ io_prep_async_link(req); From 25c6a5ab151fb9c886552bf5aa7cbf2a5c6e96af Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 17 Dec 2024 14:35:00 +0800 Subject: [PATCH 374/807] net: phy: micrel: Dynamically control external clock of KSZ PHY On the i.MX6ULL-14x14-EVK board, enet1_ref and enet2_ref are used as the clock sources for two external KSZ PHYs. However, after closing the two FEC ports, the clk_enable_count of the enet1_ref and enet2_ref clocks is not 0. The root cause is that since the commit 985329462723 ("net: phy: micrel: use devm_clk_get_optional_enabled for the rmii-ref clock"), the external clock of KSZ PHY has been enabled when the PHY driver probes, and it can only be disabled when the PHY driver is removed. This causes the clock to continue working when the system is suspended or the network port is down. Although Heiko explained in the commit message that the patch was because some clock suppliers need to enable the clock to get the valid clock rate, it seems that the simple fix is to disable the clock after getting the clock rate to solve the current problem. 
This is indeed true, but we need to admit that Heiko's patch has been applied for more than a year, and we cannot guarantee whether there are platforms that only enable rmii-ref in the KSZ PHY driver during this period. If this is the case, disabling rmii-ref will cause RMII on these platforms to not work. Secondly, commit 99ac4cbcc2a5 ("net: phy: micrel: allow usage of generic ethernet-phy clock") just simply enables the generic clock permanently, which seems like the generic clock may only be enabled in the PHY driver. If we simply disable the generic clock, RMII may not work. If we keep it as it is, the platform using the generic clock will have the same problem as the i.MX6ULL platform. To solve this problem, the clock is enabled when phy_driver::resume() is called, and the clock is disabled when phy_driver::suspend() is called. Since phy_driver::resume() and phy_driver::suspend() are not called in pairs, an additional clk_enable flag is added. When phy_driver::suspend() is called, the clock is disabled only if clk_enable is true. Conversely, when phy_driver::resume() is called, the clock is enabled if clk_enable is false. The changes that introduced the problem were only a few lines, while the current fix is about a hundred lines, which seems out of proportion, but it is necessary because kszphy_probe() is used by multiple KSZ PHYs and we need to fix all of them. 
Fixes: 985329462723 ("net: phy: micrel: use devm_clk_get_optional_enabled for the rmii-ref clock") Fixes: 99ac4cbcc2a5 ("net: phy: micrel: allow usage of generic ethernet-phy clock") Signed-off-by: Wei Fang Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20241217063500.1424011-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 114 ++++++++++++++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 3ef508840674..eeb33eb181ac 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -432,10 +432,12 @@ struct kszphy_ptp_priv { struct kszphy_priv { struct kszphy_ptp_priv ptp_priv; const struct kszphy_type *type; + struct clk *clk; int led_mode; u16 vct_ctrl1000; bool rmii_ref_clk_sel; bool rmii_ref_clk_sel_val; + bool clk_enable; u64 stats[ARRAY_SIZE(kszphy_hw_stats)]; }; @@ -2050,6 +2052,46 @@ static void kszphy_get_stats(struct phy_device *phydev, data[i] = kszphy_get_stat(phydev, i); } +static void kszphy_enable_clk(struct phy_device *phydev) +{ + struct kszphy_priv *priv = phydev->priv; + + if (!priv->clk_enable && priv->clk) { + clk_prepare_enable(priv->clk); + priv->clk_enable = true; + } +} + +static void kszphy_disable_clk(struct phy_device *phydev) +{ + struct kszphy_priv *priv = phydev->priv; + + if (priv->clk_enable && priv->clk) { + clk_disable_unprepare(priv->clk); + priv->clk_enable = false; + } +} + +static int kszphy_generic_resume(struct phy_device *phydev) +{ + kszphy_enable_clk(phydev); + + return genphy_resume(phydev); +} + +static int kszphy_generic_suspend(struct phy_device *phydev) +{ + int ret; + + ret = genphy_suspend(phydev); + if (ret) + return ret; + + kszphy_disable_clk(phydev); + + return 0; +} + static int kszphy_suspend(struct phy_device *phydev) { /* Disable PHY Interrupts */ @@ -2059,7 +2101,7 @@ static int kszphy_suspend(struct phy_device *phydev) 
phydev->drv->config_intr(phydev); } - return genphy_suspend(phydev); + return kszphy_generic_suspend(phydev); } static void kszphy_parse_led_mode(struct phy_device *phydev) @@ -2090,7 +2132,9 @@ static int kszphy_resume(struct phy_device *phydev) { int ret; - genphy_resume(phydev); + ret = kszphy_generic_resume(phydev); + if (ret) + return ret; /* After switching from power-down to normal mode, an internal global * reset is automatically generated. Wait a minimum of 1 ms before @@ -2112,6 +2156,24 @@ static int kszphy_resume(struct phy_device *phydev) return 0; } +/* Because of errata DS80000700A, receiver error following software + * power down. Suspend and resume callbacks only disable and enable + * external rmii reference clock. + */ +static int ksz8041_resume(struct phy_device *phydev) +{ + kszphy_enable_clk(phydev); + + return 0; +} + +static int ksz8041_suspend(struct phy_device *phydev) +{ + kszphy_disable_clk(phydev); + + return 0; +} + static int ksz9477_resume(struct phy_device *phydev) { int ret; @@ -2159,7 +2221,10 @@ static int ksz8061_resume(struct phy_device *phydev) if (!(ret & BMCR_PDOWN)) return 0; - genphy_resume(phydev); + ret = kszphy_generic_resume(phydev); + if (ret) + return ret; + usleep_range(1000, 2000); /* Re-program the value after chip is reset. 
*/ @@ -2177,6 +2242,11 @@ static int ksz8061_resume(struct phy_device *phydev) return 0; } +static int ksz8061_suspend(struct phy_device *phydev) +{ + return kszphy_suspend(phydev); +} + static int kszphy_probe(struct phy_device *phydev) { const struct kszphy_type *type = phydev->drv->driver_data; @@ -2217,10 +2287,14 @@ static int kszphy_probe(struct phy_device *phydev) } else if (!clk) { /* unnamed clock from the generic ethernet-phy binding */ clk = devm_clk_get_optional_enabled(&phydev->mdio.dev, NULL); - if (IS_ERR(clk)) - return PTR_ERR(clk); } + if (IS_ERR(clk)) + return PTR_ERR(clk); + + clk_disable_unprepare(clk); + priv->clk = clk; + if (ksz8041_fiber_mode(phydev)) phydev->port = PORT_FIBRE; @@ -5290,6 +5364,21 @@ static int lan8841_probe(struct phy_device *phydev) return 0; } +static int lan8804_resume(struct phy_device *phydev) +{ + return kszphy_resume(phydev); +} + +static int lan8804_suspend(struct phy_device *phydev) +{ + return kszphy_generic_suspend(phydev); +} + +static int lan8841_resume(struct phy_device *phydev) +{ + return kszphy_generic_resume(phydev); +} + static int lan8841_suspend(struct phy_device *phydev) { struct kszphy_priv *priv = phydev->priv; @@ -5298,7 +5387,7 @@ static int lan8841_suspend(struct phy_device *phydev) if (ptp_priv->ptp_clock) ptp_cancel_worker_sync(ptp_priv->ptp_clock); - return genphy_suspend(phydev); + return kszphy_generic_suspend(phydev); } static struct phy_driver ksphy_driver[] = { @@ -5358,9 +5447,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - /* No suspend/resume callbacks because of errata DS80000700A, - * receiver error following software power down. 
- */ + .suspend = ksz8041_suspend, + .resume = ksz8041_resume, }, { .phy_id = PHY_ID_KSZ8041RNLI, .phy_id_mask = MICREL_PHY_ID_MASK, @@ -5436,7 +5524,7 @@ static struct phy_driver ksphy_driver[] = { .soft_reset = genphy_soft_reset, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, - .suspend = kszphy_suspend, + .suspend = ksz8061_suspend, .resume = ksz8061_resume, }, { .phy_id = PHY_ID_KSZ9021, @@ -5507,8 +5595,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, - .resume = kszphy_resume, + .suspend = lan8804_suspend, + .resume = lan8804_resume, .config_intr = lan8804_config_intr, .handle_interrupt = lan8804_handle_interrupt, }, { @@ -5526,7 +5614,7 @@ static struct phy_driver ksphy_driver[] = { .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, .suspend = lan8841_suspend, - .resume = genphy_resume, + .resume = lan8841_resume, .cable_test_start = lan8814_cable_test_start, .cable_test_get_status = ksz886x_cable_test_get_status, }, { From d81cadbe164265337f149cf31c9462d7217c1eed Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 4 Nov 2024 07:58:45 +0000 Subject: [PATCH 375/807] KVM: SVM: Disable AVIC on SNP-enabled system without HvInUseWrAllowed feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On SNP-enabled system, VMRUN marks AVIC Backing Page as in-use while the guest is running for both secure and non-secure guest. Any hypervisor write to the in-use vCPU's AVIC backing page (e.g. to inject an interrupt) will generate unexpected #PF in the host. 
Currently, attempt to run AVIC guest would result in the following error: BUG: unable to handle page fault for address: ff3a442e549cc270 #PF: supervisor write access in kernel mode #PF: error_code(0x80000003) - RMP violation PGD b6ee01067 P4D b6ee02067 PUD 10096d063 PMD 11c540063 PTE 80000001149cc163 SEV-SNP: PFN 0x1149cc unassigned, dumping non-zero entries in 2M PFN region: [0x114800 - 0x114a00] ... Newer AMD system is enhanced to allow hypervisor to modify the backing page for non-secure guest on SNP-enabled system. This enhancement is available when the CPUID Fn8000_001F_EAX bit 30 is set (HvInUseWrAllowed). This table describes AVIC support matrix w.r.t. SNP enablement: | Non-SNP system | SNP system ----------------------------------------------------- Non-SNP guest | AVIC Activate | AVIC Activate iff | | HvInuseWrAllowed=1 ----------------------------------------------------- SNP guest | N/A | Secure AVIC Therefore, check and disable AVIC in kvm_amd driver when the feature is not available on SNP-enabled system. See the AMD64 Architecture Programmer’s Manual (APM) Volume 2 for detail. 
(https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/40332.pdf) Fixes: 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241104075845.7583-1-suravee.suthikulpanit@amd.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/svm/avic.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 17b6590748c0..645aa360628d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -452,6 +452,7 @@ #define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */ #define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ +#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 4b74ea91f4e6..65fd245a9953 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1199,6 +1199,12 @@ bool avic_hardware_setup(void) return false; } + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) && + !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) { + pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n"); + return false; + } + if (boot_cpu_has(X86_FEATURE_AVIC)) { pr_info("AVIC enabled\n"); } else if (force_avic) { From 9b42d1e8e4fe9dc631162c04caa69b0d1860b0f0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:43:39 -0800 Subject: [PATCH 376/807] KVM: x86: Play nice with protected guests in complete_hypercall_exit() Use is_64_bit_hypercall() instead of is_64_bit_mode() to 
detect a 64-bit hypercall when completing said hypercall. For guests with protected state, e.g. SEV-ES and SEV-SNP, KVM must assume the hypercall was made in 64-bit mode as the vCPU state needed to detect 64-bit mode is unavailable. Hacking the sev_smoke_test selftest to generate a KVM_HC_MAP_GPA_RANGE hypercall via VMGEXIT trips the WARN: ------------[ cut here ]------------ WARNING: CPU: 273 PID: 326626 at arch/x86/kvm/x86.h:180 complete_hypercall_exit+0x44/0xe0 [kvm] Modules linked in: kvm_amd kvm ... [last unloaded: kvm] CPU: 273 UID: 0 PID: 326626 Comm: sev_smoke_test Not tainted 6.12.0-smp--392e932fa0f3-feat #470 Hardware name: Google Astoria/astoria, BIOS 0.20240617.0-0 06/17/2024 RIP: 0010:complete_hypercall_exit+0x44/0xe0 [kvm] Call Trace: kvm_arch_vcpu_ioctl_run+0x2400/0x2720 [kvm] kvm_vcpu_ioctl+0x54f/0x630 [kvm] __se_sys_ioctl+0x6b/0xc0 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e ---[ end trace 0000000000000000 ]--- Fixes: b5aead0064f3 ("KVM: x86: Assume a 64-bit hypercall for guests with protected state") Cc: stable@vger.kernel.org Cc: Tom Lendacky Reviewed-by: Xiaoyao Li Reviewed-by: Nikunj A Dadhania Reviewed-by: Tom Lendacky Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20241128004344.4072099-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2e713480933a..0b2fe4aa04a2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9976,7 +9976,7 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) { u64 ret = vcpu->run->hypercall.ret; - if (!is_64_bit_mode(vcpu)) + if (!is_64_bit_hypercall(vcpu)) ret = (u32)ret; kvm_rax_write(vcpu, ret); ++vcpu->stat.hypercalls; From 4d5163cba43fe96902165606fa54e1aecbbb32de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Dec 2024 09:29:52 -0800 Subject: [PATCH 377/807] KVM: SVM: Allow guest writes to set 
MSR_AMD64_DE_CFG bits Drop KVM's arbitrary behavior of making DE_CFG.LFENCE_SERIALIZE read-only for the guest, as rejecting writes can lead to guest crashes, e.g. Windows in particular doesn't gracefully handle unexpected #GPs on the WRMSR, and nothing in the AMD manuals suggests that LFENCE_SERIALIZE is read-only _if it exists_. KVM only allows LFENCE_SERIALIZE to be set, by the guest or host, if the underlying CPU has X86_FEATURE_LFENCE_RDTSC, i.e. if LFENCE is guaranteed to be serializing. So if the guest sets LFENCE_SERIALIZE, KVM will provide the desired/correct behavior without any additional action (the guest's value is never stuffed into hardware). And having LFENCE be serializing even when it's not _required_ to be is a-ok from a functional perspective. Fixes: 74a0e79df68a ("KVM: SVM: Disallow guest from changing userspace's MSR_AMD64_DE_CFG value") Fixes: d1d93fa90f1a ("KVM: SVM: Add MSR-based feature support for serializing LFENCE") Reported-by: Simon Pilkington Closes: https://lore.kernel.org/all/52914da7-a97b-45ad-86a0-affdf8266c61@mailbox.org Cc: Tom Lendacky Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20241211172952.1477605-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dd15cc635655..21dacd312779 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3201,15 +3201,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & ~supported_de_cfg) return 1; - /* - * Don't let the guest change the host-programmed value. The - * MSR is very model specific, i.e. contains multiple bits that - * are completely unknown to KVM, and the one bit known to KVM - * is simply a reflection of hardware capabilities. 
- */ - if (!msr->host_initiated && data != svm->msr_decfg) - return 1; - svm->msr_decfg = data; break; } From 386d69f9f29b0814881fa4f92ac7b8dfa9b4f44a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2024 13:36:11 -0800 Subject: [PATCH 378/807] KVM: x86/mmu: Treat TDP MMU faults as spurious if access is already allowed Treat slow-path TDP MMU faults as spurious if the access is allowed given the existing SPTE to fix a benign warning (other than the WARN itself) due to replacing a writable SPTE with a read-only SPTE, and to avoid the unnecessary LOCK CMPXCHG and subsequent TLB flush. If a read fault races with a write fault, fast GUP fails for any reason when trying to "promote" the read fault to a writable mapping, and KVM resolves the write fault first, then KVM will end up trying to install a read-only SPTE (for a !map_writable fault) overtop a writable SPTE. Note, it's not entirely clear why fast GUP fails, or if that's even how KVM ends up with a !map_writable fault with a writable SPTE. If something else is going awry, e.g. due to a bug in mmu_notifiers, then treating read faults as spurious in this scenario could effectively mask the underlying problem. However, retrying the faulting access instead of overwriting an existing SPTE is functionally correct and desirable irrespective of the WARN, and fast GUP _can_ legitimately fail with a writable VMA, e.g. if the Accessed bit in primary MMU's PTE is toggled and causes a PTE value mismatch. The WARN was also recently added, specifically to track down scenarios where KVM unnecessarily overwrites SPTEs, i.e. treating the fault as spurious doesn't regress KVM's bug-finding capabilities in any way. In short, letting the WARN linger because there's a tiny chance it's due to a bug elsewhere would be excessively paranoid. 
Fixes: 1a175082b190 ("KVM: x86/mmu: WARN and flush if resolving a TDP MMU fault clears MMU-writable") Reported-by: Lei Yang Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219588 Tested-by: Lei Yang Link: https://lore.kernel.org/r/20241218213611.3181643-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 12 ------------ arch/x86/kvm/mmu/spte.h | 17 +++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.c | 5 +++++ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 22e7ad235123..2401606db260 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3364,18 +3364,6 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, return true; } -static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) -{ - if (fault->exec) - return is_executable_pte(spte); - - if (fault->write) - return is_writable_pte(spte); - - /* Fault was on Read access */ - return spte & PT_PRESENT_MASK; -} - /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index f332b33bc817..af10bc0380a3 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -461,6 +461,23 @@ static inline bool is_mmu_writable_spte(u64 spte) return spte & shadow_mmu_writable_mask; } +/* + * Returns true if the access indicated by @fault is allowed by the existing + * SPTE protections. Note, the caller is responsible for checking that the + * SPTE is a shadow-present, leaf SPTE (either before or after). + */ +static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) +{ + if (fault->exec) + return is_executable_pte(spte); + + if (fault->write) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + /* * If the MMU-writable flag is cleared, i.e. 
the SPTE is write-protected for * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 4508d868f1cd..2f15e0e33903 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -985,6 +985,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (fault->prefetch && is_shadow_present_pte(iter->old_spte)) return RET_PF_SPURIOUS; + if (is_shadow_present_pte(iter->old_spte) && + is_access_allowed(fault, iter->old_spte) && + is_last_spte(iter->old_spte, iter->level)) + return RET_PF_SPURIOUS; + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else From 902806baf3c1e8383c1fe3ff0b6042b8cb5c2707 Mon Sep 17 00:00:00 2001 From: Stefan Ekenberg Date: Tue, 19 Nov 2024 08:40:29 +0100 Subject: [PATCH 379/807] drm/bridge: adv7511_audio: Update Audio InfoFrame properly AUDIO_UPDATE bit (Bit 5 of MAIN register 0x4A) needs to be set to 1 while updating Audio InfoFrame information and then set to 0 when done. Otherwise partially updated Audio InfoFrames could be sent out. Two cases where this rule was not followed are fixed: - In adv7511_hdmi_hw_params() make sure AUDIO_UPDATE bit is updated before/after setting ADV7511_REG_AUDIO_INFOFRAME. - In audio_startup() use the correct register for clearing AUDIO_UPDATE bit. The problem with corrupted audio infoframes was discovered by letting a HDMI logic analyser check the output of ADV7535. Note that this patch replaces writing REG_GC(1) with REG_INFOFRAME_UPDATE. Bit 5 of REG_GC(1) is positioned within field GC_PP[3:0] and that field doesn't control audio infoframe and is read-only. My conclusion therefore was that the author of this code meant to clear bit 5 of REG_INFOFRAME_UPDATE from the very beginning. 
Tested-by: Biju Das Fixes: 53c515befe28 ("drm/bridge: adv7511: Add Audio support") Signed-off-by: Stefan Ekenberg Reviewed-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20241119-adv7511-audio-info-frame-v4-1-4ae68e76c89c@axis.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/adv7511/adv7511_audio.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c b/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c index 61f4a38e7d2b..8f786592143b 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c @@ -153,7 +153,16 @@ static int adv7511_hdmi_hw_params(struct device *dev, void *data, ADV7511_AUDIO_CFG3_LEN_MASK, len); regmap_update_bits(adv7511->regmap, ADV7511_REG_I2C_FREQ_ID_CFG, ADV7511_I2C_FREQ_ID_CFG_RATE_MASK, rate << 4); - regmap_write(adv7511->regmap, 0x73, 0x1); + + /* send current Audio infoframe values while updating */ + regmap_update_bits(adv7511->regmap, ADV7511_REG_INFOFRAME_UPDATE, + BIT(5), BIT(5)); + + regmap_write(adv7511->regmap, ADV7511_REG_AUDIO_INFOFRAME(0), 0x1); + + /* use Audio infoframe updated info */ + regmap_update_bits(adv7511->regmap, ADV7511_REG_INFOFRAME_UPDATE, + BIT(5), 0); return 0; } @@ -184,8 +193,9 @@ static int audio_startup(struct device *dev, void *data) regmap_update_bits(adv7511->regmap, ADV7511_REG_GC(0), BIT(7) | BIT(6), BIT(7)); /* use Audio infoframe updated info */ - regmap_update_bits(adv7511->regmap, ADV7511_REG_GC(1), + regmap_update_bits(adv7511->regmap, ADV7511_REG_INFOFRAME_UPDATE, BIT(5), 0); + /* enable SPDIF receiver */ if (adv7511->audio_source == ADV7511_AUDIO_SOURCE_SPDIF) regmap_update_bits(adv7511->regmap, ADV7511_REG_AUDIO_CONFIG, From 81adbd3ff21c1182e06aa02c6be0bfd9ea02d8e8 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 19 Nov 2024 19:20:29 +0000 Subject: [PATCH 380/807] drm: adv7511: Fix use-after-free in adv7533_attach_dsi() 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The host_node pointer was assigned and freed in adv7533_parse_dt(), and later, adv7533_attach_dsi() uses the same. Fix this use-after-free issue by dropping of_node_put() in adv7533_parse_dt() and calling of_node_put() in error path of probe() and also in the remove(). Fixes: 1e4d58cd7f88 ("drm/bridge: adv7533: Create a MIPI DSI device") Cc: stable@vger.kernel.org Reviewed-by: Laurent Pinchart Signed-off-by: Biju Das Link: https://patchwork.freedesktop.org/patch/msgid/20241119192040.152657-2-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/adv7511/adv7511_drv.c | 10 ++++++++-- drivers/gpu/drm/bridge/adv7511/adv7533.c | 2 -- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c index eb5919b38263..a13b3d8ab6ac 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c @@ -1241,8 +1241,10 @@ static int adv7511_probe(struct i2c_client *i2c) return ret; ret = adv7511_init_regulators(adv7511); - if (ret) - return dev_err_probe(dev, ret, "failed to init regulators\n"); + if (ret) { + dev_err_probe(dev, ret, "failed to init regulators\n"); + goto err_of_node_put; + } /* * The power down GPIO is optional. 
If present, toggle it from active to @@ -1363,6 +1365,8 @@ err_i2c_unregister_edid: i2c_unregister_device(adv7511->i2c_edid); uninit_regulators: adv7511_uninit_regulators(adv7511); +err_of_node_put: + of_node_put(adv7511->host_node); return ret; } @@ -1371,6 +1375,8 @@ static void adv7511_remove(struct i2c_client *i2c) { struct adv7511 *adv7511 = i2c_get_clientdata(i2c); + of_node_put(adv7511->host_node); + adv7511_uninit_regulators(adv7511); drm_bridge_remove(&adv7511->bridge); diff --git a/drivers/gpu/drm/bridge/adv7511/adv7533.c b/drivers/gpu/drm/bridge/adv7511/adv7533.c index 4481489aaf5e..5f195e91b3e6 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7533.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7533.c @@ -181,8 +181,6 @@ int adv7533_parse_dt(struct device_node *np, struct adv7511 *adv) if (!adv->host_node) return -ENODEV; - of_node_put(adv->host_node); - adv->use_timing_gen = !of_property_read_bool(np, "adi,disable-timing-generator"); From ee8f9ed57a397605434caeef351bafa3ec4dfdd4 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 19 Nov 2024 19:20:30 +0000 Subject: [PATCH 381/807] dt-bindings: display: adi,adv7533: Drop single lane support As per [1] and [2], ADV7535/7533 supports only 2-, 3-, or 4-lane. Drop unsupported 1-lane from bindings. 
[1] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7535.pdf [2] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7533.pdf Fixes: 1e4d58cd7f88 ("drm/bridge: adv7533: Create a MIPI DSI device") Cc: stable@vger.kernel.org Acked-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: Biju Das Link: https://patchwork.freedesktop.org/patch/msgid/20241119192040.152657-3-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Baryshkov --- .../devicetree/bindings/display/bridge/adi,adv7533.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml b/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml index df20a3c9c744..ec89115c74e4 100644 --- a/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml +++ b/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml @@ -90,7 +90,7 @@ properties: adi,dsi-lanes: description: Number of DSI data lanes connected to the DSI host. $ref: /schemas/types.yaml#/definitions/uint32 - enum: [ 1, 2, 3, 4 ] + enum: [ 2, 3, 4 ] "#sound-dai-cells": const: 0 From 79d67c499c3f886202a40c5cb27e747e4fa4d738 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 19 Nov 2024 19:20:31 +0000 Subject: [PATCH 382/807] drm: adv7511: Drop dsi single lane support As per [1] and [2], ADV7535/7533 supports only 2-, 3-, or 4-lane. Drop unsupported 1-lane. 
[1] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7535.pdf [2] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7533.pdf Fixes: 1e4d58cd7f88 ("drm/bridge: adv7533: Create a MIPI DSI device") Reported-by: Hien Huynh Cc: stable@vger.kernel.org Reviewed-by: Laurent Pinchart Reviewed-by: Adam Ford Signed-off-by: Biju Das Link: https://patchwork.freedesktop.org/patch/msgid/20241119192040.152657-4-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/adv7511/adv7533.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7533.c b/drivers/gpu/drm/bridge/adv7511/adv7533.c index 5f195e91b3e6..122ad91e8a32 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7533.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7533.c @@ -172,7 +172,7 @@ int adv7533_parse_dt(struct device_node *np, struct adv7511 *adv) of_property_read_u32(np, "adi,dsi-lanes", &num_lanes); - if (num_lanes < 1 || num_lanes > 4) + if (num_lanes < 2 || num_lanes > 4) return -EINVAL; adv->num_dsi_lanes = num_lanes; From 262bfba8ab820641c8cfbbf03b86d6c00242c078 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 17 Dec 2024 18:02:23 -0800 Subject: [PATCH 383/807] net: dsa: microchip: Fix KSZ9477 set_ageing_time function The aging count is not a simple 11-bit value but comprises a 3-bit multiplier and an 8-bit second count. The code tries to use the original multiplier which is 4 as the second count is still 300 seconds by default. 
Fixes: 2c119d9982b1 ("net: dsa: microchip: add the support for set_ageing_time") Signed-off-by: Tristram Ha Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218020224.70590-2-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz9477.c | 47 +++++++++++++++++++------ drivers/net/dsa/microchip/ksz9477_reg.h | 4 +-- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index d16817e0476f..29fe79ea74cd 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -2,7 +2,7 @@ /* * Microchip KSZ9477 switch driver main logic * - * Copyright (C) 2017-2019 Microchip Technology Inc. + * Copyright (C) 2017-2024 Microchip Technology Inc. */ #include @@ -983,26 +983,51 @@ void ksz9477_get_caps(struct ksz_device *dev, int port, int ksz9477_set_ageing_time(struct ksz_device *dev, unsigned int msecs) { u32 secs = msecs / 1000; - u8 value; - u8 data; + u8 data, mult, value; + u32 max_val; int ret; - value = FIELD_GET(SW_AGE_PERIOD_7_0_M, secs); +#define MAX_TIMER_VAL ((1 << 8) - 1) - ret = ksz_write8(dev, REG_SW_LUE_CTRL_3, value); - if (ret < 0) - return ret; + /* The aging timer comprises a 3-bit multiplier and an 8-bit second + * value. Either of them cannot be zero. The maximum timer is then + * 7 * 255 = 1785 seconds. + */ + if (!secs) + secs = 1; - data = FIELD_GET(SW_AGE_PERIOD_10_8_M, secs); + /* Return error if too large. */ + else if (secs > 7 * MAX_TIMER_VAL) + return -EINVAL; ret = ksz_read8(dev, REG_SW_LUE_CTRL_0, &value); if (ret < 0) return ret; - value &= ~SW_AGE_CNT_M; - value |= FIELD_PREP(SW_AGE_CNT_M, data); + /* Check whether there is need to update the multiplier. */ + mult = FIELD_GET(SW_AGE_CNT_M, value); + max_val = MAX_TIMER_VAL; + if (mult > 0) { + /* Try to use the same multiplier already in the register as + * the hardware default uses multiplier 4 and 75 seconds for + * 300 seconds. 
+ */ + max_val = DIV_ROUND_UP(secs, mult); + if (max_val > MAX_TIMER_VAL || max_val * mult != secs) + max_val = MAX_TIMER_VAL; + } - return ksz_write8(dev, REG_SW_LUE_CTRL_0, value); + data = DIV_ROUND_UP(secs, max_val); + if (mult != data) { + value &= ~SW_AGE_CNT_M; + value |= FIELD_PREP(SW_AGE_CNT_M, data); + ret = ksz_write8(dev, REG_SW_LUE_CTRL_0, value); + if (ret < 0) + return ret; + } + + value = DIV_ROUND_UP(secs, data); + return ksz_write8(dev, REG_SW_LUE_CTRL_3, value); } void ksz9477_port_queue_split(struct ksz_device *dev, int port) diff --git a/drivers/net/dsa/microchip/ksz9477_reg.h b/drivers/net/dsa/microchip/ksz9477_reg.h index 04235c22bf40..ff579920078e 100644 --- a/drivers/net/dsa/microchip/ksz9477_reg.h +++ b/drivers/net/dsa/microchip/ksz9477_reg.h @@ -2,7 +2,7 @@ /* * Microchip KSZ9477 register definitions * - * Copyright (C) 2017-2018 Microchip Technology Inc. + * Copyright (C) 2017-2024 Microchip Technology Inc. */ #ifndef __KSZ9477_REGS_H @@ -165,8 +165,6 @@ #define SW_VLAN_ENABLE BIT(7) #define SW_DROP_INVALID_VID BIT(6) #define SW_AGE_CNT_M GENMASK(5, 3) -#define SW_AGE_CNT_S 3 -#define SW_AGE_PERIOD_10_8_M GENMASK(10, 8) #define SW_RESV_MCAST_ENABLE BIT(2) #define SW_HASH_OPTION_M 0x03 #define SW_HASH_OPTION_CRC 1 From bb9869043438af5b94230f94fb4c39206525d758 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 17 Dec 2024 18:02:24 -0800 Subject: [PATCH 384/807] net: dsa: microchip: Fix LAN937X set_ageing_time function The aging count is not a simple 20-bit value but comprises a 3-bit multiplier and a 20-bit second time. The code tries to use the original multiplier which is 4 as the second count is still 300 seconds by default. As the 20-bit number is now too large for practical use there is an option to interpret it as microseconds instead of seconds. 
Fixes: 2c119d9982b1 ("net: dsa: microchip: add the support for set_ageing_time") Signed-off-by: Tristram Ha Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218020224.70590-3-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/lan937x_main.c | 62 ++++++++++++++++++++++-- drivers/net/dsa/microchip/lan937x_reg.h | 9 ++-- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/microchip/lan937x_main.c b/drivers/net/dsa/microchip/lan937x_main.c index b7652efd632e..b1ae3b9de3d1 100644 --- a/drivers/net/dsa/microchip/lan937x_main.c +++ b/drivers/net/dsa/microchip/lan937x_main.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Microchip LAN937X switch driver main logic - * Copyright (C) 2019-2022 Microchip Technology Inc. + * Copyright (C) 2019-2024 Microchip Technology Inc. */ #include #include @@ -461,10 +461,66 @@ int lan937x_change_mtu(struct ksz_device *dev, int port, int new_mtu) int lan937x_set_ageing_time(struct ksz_device *dev, unsigned int msecs) { - u32 secs = msecs / 1000; - u32 value; + u8 data, mult, value8; + bool in_msec = false; + u32 max_val, value; + u32 secs = msecs; int ret; +#define MAX_TIMER_VAL ((1 << 20) - 1) + + /* The aging timer comprises a 3-bit multiplier and a 20-bit second + * value. Either of them cannot be zero. The maximum timer is then + * 7 * 1048575 = 7340025 seconds. As this value is too large for + * practical use it can be interpreted as microseconds, making the + * maximum timer 7340 seconds with finer control. This allows for + * maximum 122 minutes compared to 29 minutes in KSZ9477 switch. + */ + if (msecs % 1000) + in_msec = true; + else + secs /= 1000; + if (!secs) + secs = 1; + + /* Return error if too large. */ + else if (secs > 7 * MAX_TIMER_VAL) + return -EINVAL; + + /* Configure how to interpret the number value. */ + ret = ksz_rmw8(dev, REG_SW_LUE_CTRL_2, SW_AGE_CNT_IN_MICROSEC, + in_msec ? 
SW_AGE_CNT_IN_MICROSEC : 0); + if (ret < 0) + return ret; + + ret = ksz_read8(dev, REG_SW_LUE_CTRL_0, &value8); + if (ret < 0) + return ret; + + /* Check whether there is need to update the multiplier. */ + mult = FIELD_GET(SW_AGE_CNT_M, value8); + max_val = MAX_TIMER_VAL; + if (mult > 0) { + /* Try to use the same multiplier already in the register as + * the hardware default uses multiplier 4 and 75 seconds for + * 300 seconds. + */ + max_val = DIV_ROUND_UP(secs, mult); + if (max_val > MAX_TIMER_VAL || max_val * mult != secs) + max_val = MAX_TIMER_VAL; + } + + data = DIV_ROUND_UP(secs, max_val); + if (mult != data) { + value8 &= ~SW_AGE_CNT_M; + value8 |= FIELD_PREP(SW_AGE_CNT_M, data); + ret = ksz_write8(dev, REG_SW_LUE_CTRL_0, value8); + if (ret < 0) + return ret; + } + + secs = DIV_ROUND_UP(secs, data); + value = FIELD_GET(SW_AGE_PERIOD_7_0_M, secs); ret = ksz_write8(dev, REG_SW_AGE_PERIOD__1, value); diff --git a/drivers/net/dsa/microchip/lan937x_reg.h b/drivers/net/dsa/microchip/lan937x_reg.h index 4ec93e421da4..72042fd64e5b 100644 --- a/drivers/net/dsa/microchip/lan937x_reg.h +++ b/drivers/net/dsa/microchip/lan937x_reg.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Microchip LAN937X switch register definitions - * Copyright (C) 2019-2021 Microchip Technology Inc. + * Copyright (C) 2019-2024 Microchip Technology Inc. 
*/ #ifndef __LAN937X_REG_H #define __LAN937X_REG_H @@ -56,8 +56,7 @@ #define SW_VLAN_ENABLE BIT(7) #define SW_DROP_INVALID_VID BIT(6) -#define SW_AGE_CNT_M 0x7 -#define SW_AGE_CNT_S 3 +#define SW_AGE_CNT_M GENMASK(5, 3) #define SW_RESV_MCAST_ENABLE BIT(2) #define REG_SW_LUE_CTRL_1 0x0311 @@ -70,6 +69,10 @@ #define SW_FAST_AGING BIT(1) #define SW_LINK_AUTO_AGING BIT(0) +#define REG_SW_LUE_CTRL_2 0x0312 + +#define SW_AGE_CNT_IN_MICROSEC BIT(7) + #define REG_SW_AGE_PERIOD__1 0x0313 #define SW_AGE_PERIOD_7_0_M GENMASK(7, 0) From 1ae40d5231732275c620a1c58c83884a979b6eb1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Dec 2024 10:33:40 +0100 Subject: [PATCH 385/807] ALSA: compress_offload: import DMA_BUF namespace The compression offload code cannot be in a loadable module unless it imports that namespace: ERROR: modpost: module snd-compress uses symbol dma_buf_get from namespace DMA_BUF, but does not import it. ERROR: modpost: module snd-compress uses symbol dma_buf_put from namespace DMA_BUF, but does not import it. ERROR: modpost: module snd-compress uses symbol dma_buf_fd from namespace DMA_BUF, but does not import it. 
Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Signed-off-by: Arnd Bergmann Acked-by: Shengjiu Wang Acked-by: Vinod Koul Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241216093410.377112-1-arnd@kernel.org --- sound/core/compress_offload.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 86ed2fbee0c8..ec2485c00e49 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1247,6 +1247,7 @@ void snd_compr_task_finished(struct snd_compr_stream *stream, } EXPORT_SYMBOL_GPL(snd_compr_task_finished); +MODULE_IMPORT_NS("DMA_BUF"); #endif /* CONFIG_SND_COMPRESS_ACCEL */ static long snd_compr_ioctl(struct file *f, unsigned int cmd, unsigned long arg) From 6018f2fe1089b46c6c9eb136338eca7b16a92331 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Dec 2024 10:33:41 +0100 Subject: [PATCH 386/807] ALSA: compress_offload: avoid 64-bit get_user() On some architectures, get_user() cannot read a 64-bit user variable: arm-linux-gnueabi-ld: sound/core/compress_offload.o: in function `snd_compr_ioctl': compress_offload.c:(.text.snd_compr_ioctl+0x538): undefined reference to `__get_user_bad' Use an equivalent copy_from_user() instead. 
Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Signed-off-by: Arnd Bergmann Acked-by: Shengjiu Wang Acked-by: Vinod Koul Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241216093410.377112-2-arnd@kernel.org --- sound/core/compress_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index ec2485c00e49..1d6769a66810 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1180,9 +1180,9 @@ static int snd_compr_task_seq(struct snd_compr_stream *stream, unsigned long arg if (stream->runtime->state != SNDRV_PCM_STATE_SETUP) return -EPERM; - retval = get_user(seqno, (__u64 __user *)arg); - if (retval < 0) - return retval; + retval = copy_from_user(&seqno, (__u64 __user *)arg, sizeof(seqno)); + if (retval) + return -EFAULT; retval = 0; if (seqno == 0) { list_for_each_entry_reverse(task, &stream->runtime->tasks, list) From f25a51b47c61540585a9e8a4e16f91677ebcbbc4 Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Tue, 17 Dec 2024 11:07:07 +0100 Subject: [PATCH 387/807] ALSA: compress_offload: use safe list iteration in snd_compr_task_seq() The sequence function can call snd_compr_task_free_one(). Use list_for_each_entry_safe_reverse() to make sure that the used pointers are safe. 
Link: https://lore.kernel.org/linux-sound/f2769cff-6c7a-4092-a2d1-c33a5411a182@stanley.mountain/ Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Reported-by: Dan Carpenter Cc: Vinod Koul Signed-off-by: Jaroslav Kysela Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241217100707.732766-1-perex@perex.cz --- sound/core/compress_offload.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 1d6769a66810..bdb6e307e453 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1174,7 +1174,7 @@ typedef void (*snd_compr_seq_func_t)(struct snd_compr_stream *stream, static int snd_compr_task_seq(struct snd_compr_stream *stream, unsigned long arg, snd_compr_seq_func_t fcn) { - struct snd_compr_task_runtime *task; + struct snd_compr_task_runtime *task, *temp; __u64 seqno; int retval; @@ -1185,7 +1185,7 @@ static int snd_compr_task_seq(struct snd_compr_stream *stream, unsigned long arg return -EFAULT; retval = 0; if (seqno == 0) { - list_for_each_entry_reverse(task, &stream->runtime->tasks, list) + list_for_each_entry_safe_reverse(task, temp, &stream->runtime->tasks, list) fcn(stream, task); } else { task = snd_compr_find_task(stream, seqno); From 3d3f43fab4cfb9cf245e3dbffa1736ce925bb54a Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Tue, 17 Dec 2024 11:07:26 +0100 Subject: [PATCH 388/807] ALSA: compress_offload: improve file descriptors installation for dma-buf Avoid using a single dma_buf_fd() call for both directions. This code ensures that both file descriptors are allocated before fd_install(). 
Link: https://lore.kernel.org/linux-sound/6a923647-4495-4cff-a253-b73f48cfd0ea@stanley.mountain/ Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Reported-by: Dan Carpenter Cc: Vinod Koul Signed-off-by: Jaroslav Kysela Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241217100726.732863-1-perex@perex.cz --- sound/core/compress_offload.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index bdb6e307e453..edf5aadf38e5 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1025,7 +1025,7 @@ static u64 snd_compr_seqno_next(struct snd_compr_stream *stream) static int snd_compr_task_new(struct snd_compr_stream *stream, struct snd_compr_task *utask) { struct snd_compr_task_runtime *task; - int retval; + int retval, fd_i, fd_o; if (stream->runtime->total_tasks >= stream->runtime->fragments) return -EBUSY; @@ -1039,16 +1039,24 @@ static int snd_compr_task_new(struct snd_compr_stream *stream, struct snd_compr_ retval = stream->ops->task_create(stream, task); if (retval < 0) goto cleanup; - utask->input_fd = dma_buf_fd(task->input, O_WRONLY|O_CLOEXEC); - if (utask->input_fd < 0) { - retval = utask->input_fd; + /* similar functionality as in dma_buf_fd(), but ensure that both + file descriptors are allocated before fd_install() */ + if (!task->input || !task->input->file || !task->output || !task->output->file) { + retval = -EINVAL; goto cleanup; } - utask->output_fd = dma_buf_fd(task->output, O_RDONLY|O_CLOEXEC); - if (utask->output_fd < 0) { - retval = utask->output_fd; + fd_i = get_unused_fd_flags(O_WRONLY|O_CLOEXEC); + if (fd_i < 0) + goto cleanup; + fd_o = get_unused_fd_flags(O_RDONLY|O_CLOEXEC); + if (fd_o < 0) { + put_unused_fd(fd_i); goto cleanup; } + fd_install(fd_i, task->input->file); + fd_install(fd_o, task->output->file); + utask->input_fd = fd_i; + utask->output_fd = fd_o; /* keep dmabuf 
reference until freed with task free ioctl */ dma_buf_get(utask->input_fd); dma_buf_get(utask->output_fd); From fa0308134d26dbbeb209a1581eea46df663866b6 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Thu, 19 Dec 2024 23:33:45 +0300 Subject: [PATCH 389/807] ALSA: memalloc: prefer dma_mapping_error() over explicit address checking With CONFIG_DMA_API_DEBUG enabled, the following warning is observed: DMA-API: snd_hda_intel 0000:03:00.1: device driver failed to check map error[device address=0x00000000ffff0000] [size=20480 bytes] [mapped as single] WARNING: CPU: 28 PID: 2255 at kernel/dma/debug.c:1036 check_unmap+0x1408/0x2430 CPU: 28 UID: 42 PID: 2255 Comm: wireplumber Tainted: G W L 6.12.0-10-133577cad6bf48e5a7848c4338124081393bfe8a+ #759 debug_dma_unmap_page+0xe9/0xf0 snd_dma_wc_free+0x85/0x130 [snd_pcm] snd_pcm_lib_free_pages+0x1e3/0x440 [snd_pcm] snd_pcm_common_ioctl+0x1c9a/0x2960 [snd_pcm] snd_pcm_ioctl+0x6a/0xc0 [snd_pcm] ... Check for returned DMA addresses using specialized dma_mapping_error() helper which is generally recommended for this purpose by Documentation/core-api/dma-api.rst. 
Fixes: c880a5146642 ("ALSA: memalloc: Use proper DMA mapping API for x86 WC buffer allocations") Reported-by: Mikhail Gavrilov Closes: https://lore.kernel.org/r/CABXGCsNB3RsMGvCucOy3byTEOxoc-Ys+zB_HQ=Opb_GhX1ioDA@mail.gmail.com/ Tested-by: Mikhail Gavrilov Signed-off-by: Fedor Pchelkin Link: https://patch.msgid.link/20241219203345.195898-1-pchelkin@ispras.ru Signed-off-by: Takashi Iwai --- sound/core/memalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index 13b71069ae18..b3853583d2ae 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -505,7 +505,7 @@ static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) if (!p) return NULL; dmab->addr = dma_map_single(dmab->dev.dev, p, size, DMA_BIDIRECTIONAL); - if (dmab->addr == DMA_MAPPING_ERROR) { + if (dma_mapping_error(dmab->dev.dev, dmab->addr)) { do_free_pages(dmab->area, size, true); return NULL; } From 55853cb829dc707427c3519f6b8686682a204368 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 18 Dec 2024 10:59:31 +0800 Subject: [PATCH 390/807] selftests/alsa: Fix circular dependency involving global-timer The pattern rule `$(OUTPUT)/%: %.c` inadvertently included a circular dependency on the global-timer target due to its inclusion in $(TEST_GEN_PROGS_EXTENDED). This resulted in a circular dependency warning during the build process. To resolve this, the dependency on $(TEST_GEN_PROGS_EXTENDED) has been replaced with an explicit dependency on $(OUTPUT)/libatest.so. This change ensures that libatest.so is built before any other targets that require it, without creating a circular dependency. This fix addresses the following warning: make[4]: Entering directory 'tools/testing/selftests/alsa' make[4]: Circular default_modconfig/kselftest/alsa/global-timer <- default_modconfig/kselftest/alsa/global-timer dependency dropped. make[4]: Nothing to be done for 'all'. 
make[4]: Leaving directory 'tools/testing/selftests/alsa' Cc: Mark Brown Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Shuah Khan Signed-off-by: Li Zhijian Link: https://patch.msgid.link/20241218025931.914164-1-lizhijian@fujitsu.com Signed-off-by: Takashi Iwai --- tools/testing/selftests/alsa/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/alsa/Makefile b/tools/testing/selftests/alsa/Makefile index 944279160fed..8dab90ad22bb 100644 --- a/tools/testing/selftests/alsa/Makefile +++ b/tools/testing/selftests/alsa/Makefile @@ -27,5 +27,5 @@ include ../lib.mk $(OUTPUT)/libatest.so: conf.c alsa-local.h $(CC) $(CFLAGS) -shared -fPIC $< $(LDLIBS) -o $@ -$(OUTPUT)/%: %.c $(TEST_GEN_PROGS_EXTENDED) alsa-local.h +$(OUTPUT)/%: %.c $(OUTPUT)/libatest.so alsa-local.h $(CC) $(CFLAGS) $< $(LDLIBS) -latest -o $@ From 66a0a2b0473c39ae85c44628d14e4366fdc0aa0d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 20 Dec 2024 12:44:16 +0100 Subject: [PATCH 391/807] ALSA: sh: Fix wrong argument order for copy_from_iter() Fix a brown paper bag bug I introduced at converting to the standard iter helper; the arguments were wrongly passed and have to be swapped. 
Fixes: 9b5f8ee43e48 ("ALSA: sh: Use standard helper for buffer accesses") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412140019.jat5Dofr-lkp@intel.com/ Link: https://patch.msgid.link/20241220114417.5898-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/sh/sh_dac_audio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/sh/sh_dac_audio.c b/sound/sh/sh_dac_audio.c index a4d07438ad64..3f5422145c5e 100644 --- a/sound/sh/sh_dac_audio.c +++ b/sound/sh/sh_dac_audio.c @@ -163,7 +163,7 @@ static int snd_sh_dac_pcm_copy(struct snd_pcm_substream *substream, /* channel is not used (interleaved data) */ struct snd_sh_dac *chip = snd_pcm_substream_chip(substream); - if (copy_from_iter(chip->data_buffer + pos, src, count) != count) + if (copy_from_iter(chip->data_buffer + pos, count, src) != count) return -EFAULT; chip->buffer_end = chip->data_buffer + pos + count; From 6321f5fb70d502d95de8a212a7b484c297ec9644 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:11 -0800 Subject: [PATCH 392/807] gve: clean XDP queues in gve_tx_stop_ring_gqi When stopping XDP TX rings, the XDP clean function needs to be called to clean out the entire queue, similar to what happens in the normal TX queue case. Otherwise, the FIFO won't be cleared correctly, and xsk_tx_completed won't be reported. Fixes: 75eaae158b1b ("gve: Add XDP DROP and TX support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- drivers/net/ethernet/google/gve/gve_tx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index e7fb7d6d283d..83ad278ec91f 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -206,7 +206,10 @@ void gve_tx_stop_ring_gqi(struct gve_priv *priv, int idx) return; gve_remove_napi(priv, ntfy_idx); - gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); + if (tx->q_num < priv->tx_cfg.num_queues) + gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); + else + gve_clean_xdp_done(priv, tx, priv->tx_desc_cnt); netdev_tx_reset_queue(tx->netdev_txq); gve_tx_remove_from_block(priv, idx); } From ff7c2dea9dd1a436fc79d6273adffdcc4a7ffea3 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:12 -0800 Subject: [PATCH 393/807] gve: guard XDP xmit NDO on existence of xdp queues In GVE, dedicated XDP queues only exist when an XDP program is installed and the interface is up. As such, the NDO XDP XMIT callback should return early if either of these conditions is false. In the case of no loaded XDP program, priv->num_xdp_queues=0, which can cause a divide-by-zero error, and in the case of interface down, num_xdp_queues remains untouched to persist XDP queue count for the next interface up, but the TX pointer itself would be NULL. The XDP xmit callback also needs to synchronize with a device transitioning from open to close. This synchronization will happen via the GVE_PRIV_FLAGS_NAPI_ENABLED bit along with a synchronize_net() call, which waits for any RCU critical sections at call-time to complete. Fixes: 39a7f4aa3e4a ("gve: Add XDP REDIRECT support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- drivers/net/ethernet/google/gve/gve_main.c | 3 +++ drivers/net/ethernet/google/gve/gve_tx.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index e171ca248f9a..5d7b0cc59959 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1899,6 +1899,9 @@ static void gve_turndown(struct gve_priv *priv) gve_clear_napi_enabled(priv); gve_clear_report_stats(priv); + + /* Make sure that all traffic is finished processing. */ + synchronize_net(); } static void gve_turnup(struct gve_priv *priv) diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 83ad278ec91f..852f8c7e39d2 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -837,9 +837,12 @@ int gve_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, struct gve_tx_ring *tx; int i, err = 0, qid; - if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK) || !priv->xdp_prog) return -EINVAL; + if (!gve_get_napi_enabled(priv)) + return -ENETDOWN; + qid = gve_xdp_tx_queue_id(priv, smp_processor_id() % priv->num_xdp_queues); From 40338d7987d810fcaa95c500b1068a52b08eec9b Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:13 -0800 Subject: [PATCH 394/807] gve: guard XSK operations on the existence of queues This patch predicates the enabling and disabling of XSK pools on the existence of queues. As it stands, if the interface is down, disabling or enabling XSK pools would result in a crash, as the RX queue pointer would be NULL. XSK pool registration will occur as part of the next interface up. 
Similarly, xsk_wakeup needs to be guarded against queues disappearing while the function is executing, so a check against the GVE_PRIV_FLAGS_NAPI_ENABLED flag is added to synchronize with the disabling of the bit and the synchronize_net() in gve_turndown. Fixes: fd8e40321a12 ("gve: Add AF_XDP zero-copy support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Reviewed-by: Larysa Zaremba Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 5d7b0cc59959..e4e8ff4f9f80 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1623,8 +1623,8 @@ static int gve_xsk_pool_enable(struct net_device *dev, if (err) return err; - /* If XDP prog is not installed, return */ - if (!priv->xdp_prog) + /* If XDP prog is not installed or interface is down, return. */ + if (!priv->xdp_prog || !netif_running(dev)) return 0; rx = &priv->rx[qid]; @@ -1669,21 +1669,16 @@ static int gve_xsk_pool_disable(struct net_device *dev, if (qid >= priv->rx_cfg.num_queues) return -EINVAL; - /* If XDP prog is not installed, unmap DMA and return */ - if (!priv->xdp_prog) + /* If XDP prog is not installed or interface is down, unmap DMA and + * return. 
+ */ + if (!priv->xdp_prog || !netif_running(dev)) goto done; - tx_qid = gve_xdp_tx_queue_id(priv, qid); - if (!netif_running(dev)) { - priv->rx[qid].xsk_pool = NULL; - xdp_rxq_info_unreg(&priv->rx[qid].xsk_rxq); - priv->tx[tx_qid].xsk_pool = NULL; - goto done; - } - napi_rx = &priv->ntfy_blocks[priv->rx[qid].ntfy_id].napi; napi_disable(napi_rx); /* make sure current rx poll is done */ + tx_qid = gve_xdp_tx_queue_id(priv, qid); napi_tx = &priv->ntfy_blocks[priv->tx[tx_qid].ntfy_id].napi; napi_disable(napi_tx); /* make sure current tx poll is done */ @@ -1711,6 +1706,9 @@ static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) struct gve_priv *priv = netdev_priv(dev); int tx_queue_id = gve_xdp_tx_queue_id(priv, queue_id); + if (!gve_get_napi_enabled(priv)) + return -ENETDOWN; + if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog) return -EINVAL; From ba0925c34e0fa6fe02d3d642bc02ab099ab312c7 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:14 -0800 Subject: [PATCH 395/807] gve: process XSK TX descriptors as part of RX NAPI When busy polling is enabled, xsk_sendmsg for AF_XDP zero copy marks the NAPI ID corresponding to the memory pool allocated for the socket. In GVE, this NAPI ID will never correspond to a NAPI ID of one of the dedicated XDP TX queues registered with the umem because XDP TX is not set up to share a NAPI with a corresponding RX queue. This patch moves XSK TX descriptor processing from the TX NAPI to the RX NAPI, and the gve_xsk_wakeup callback is updated to use the RX NAPI instead of the TX NAPI, accordingly. The branch on if the wakeup is for TX is removed, as the NAPI poll should be invoked whether the wakeup is for TX or for RX. Fixes: fd8e40321a12 ("gve: Add AF_XDP zero-copy support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Praveen Kaligineedi Signed-off-by: Joshua Washington Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- drivers/net/ethernet/google/gve/gve.h | 1 + drivers/net/ethernet/google/gve/gve_main.c | 8 +++++ drivers/net/ethernet/google/gve/gve_tx.c | 36 +++++++++++++--------- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index dd92949bb214..8167cc5fb0df 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1140,6 +1140,7 @@ int gve_xdp_xmit_one(struct gve_priv *priv, struct gve_tx_ring *tx, void gve_xdp_tx_flush(struct gve_priv *priv, u32 xdp_qid); bool gve_tx_poll(struct gve_notify_block *block, int budget); bool gve_xdp_poll(struct gve_notify_block *block, int budget); +int gve_xsk_tx_poll(struct gve_notify_block *block, int budget); int gve_tx_alloc_rings_gqi(struct gve_priv *priv, struct gve_tx_alloc_rings_cfg *cfg); void gve_tx_free_rings_gqi(struct gve_priv *priv, diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index e4e8ff4f9f80..5cab7b88610f 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -333,6 +333,14 @@ int gve_napi_poll(struct napi_struct *napi, int budget) if (block->rx) { work_done = gve_rx_poll(block, budget); + + /* Poll XSK TX as part of RX NAPI. Setup re-poll based on max of + * TX and RX work done. 
+ */ + if (priv->xdp_prog) + work_done = max_t(int, work_done, + gve_xsk_tx_poll(block, budget)); + reschedule |= work_done == budget; } diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 852f8c7e39d2..4350ebd9c2bd 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -981,33 +981,41 @@ out: return sent; } +int gve_xsk_tx_poll(struct gve_notify_block *rx_block, int budget) +{ + struct gve_rx_ring *rx = rx_block->rx; + struct gve_priv *priv = rx->gve; + struct gve_tx_ring *tx; + int sent = 0; + + tx = &priv->tx[gve_xdp_tx_queue_id(priv, rx->q_num)]; + if (tx->xsk_pool) { + sent = gve_xsk_tx(priv, tx, budget); + + u64_stats_update_begin(&tx->statss); + tx->xdp_xsk_sent += sent; + u64_stats_update_end(&tx->statss); + if (xsk_uses_need_wakeup(tx->xsk_pool)) + xsk_set_tx_need_wakeup(tx->xsk_pool); + } + + return sent; +} + bool gve_xdp_poll(struct gve_notify_block *block, int budget) { struct gve_priv *priv = block->priv; struct gve_tx_ring *tx = block->tx; u32 nic_done; - bool repoll; u32 to_do; /* Find out how much work there is to be done */ nic_done = gve_tx_load_event_counter(priv, tx); to_do = min_t(u32, (nic_done - tx->done), budget); gve_clean_xdp_done(priv, tx, to_do); - repoll = nic_done != tx->done; - - if (tx->xsk_pool) { - int sent = gve_xsk_tx(priv, tx, budget); - - u64_stats_update_begin(&tx->statss); - tx->xdp_xsk_sent += sent; - u64_stats_update_end(&tx->statss); - repoll |= (sent == budget); - if (xsk_uses_need_wakeup(tx->xsk_pool)) - xsk_set_tx_need_wakeup(tx->xsk_pool); - } /* If we still have work we want to repoll */ - return repoll; + return nic_done != tx->done; } bool gve_tx_poll(struct gve_notify_block *block, int budget) From de63ac44a527b2c5067551dbd70d939fe151325a Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:15 -0800 Subject: [PATCH 396/807] gve: fix XDP allocation path in edge cases This patch fixes a number 
of consistency issues in the queue allocation path related to XDP. As it stands, the number of allocated XDP queues changes in three different scenarios. 1) Adding an XDP program while the interface is up via gve_add_xdp_queues 2) Removing an XDP program while the interface is up via gve_remove_xdp_queues 3) After queues have been allocated and the old queue memory has been removed in gve_queues_start. However, the requirement for the interface to be up for gve_(add|remove)_xdp_queues to be called, in conjunction with the fact that the number of queues stored in priv isn't updated until _after_ XDP queues have been allocated in the normal queue allocation path means that if an XDP program is added while the interface is down, XDP queues won't be added until the _second_ if_up, not the first. Given the expectation that the number of XDP queues is equal to the number of RX queues, scenario (3) has another problematic implication. When changing the number of queues while an XDP program is loaded, the number of XDP queues must be updated as well, as there is logic in the driver (gve_xdp_tx_queue_id()) which relies on every RX queue having a corresponding XDP TX queue. However, the number of XDP queues stored in priv would not be updated until _after_ a close/open leading to a mismatch in the number of XDP queues reported vs the number of XDP queues which actually exist after the queue count update completes. This patch remedies these issues by doing the following: 1) The allocation config getter function is set up to retrieve the _expected_ number of XDP queues to allocate instead of relying on the value stored in `priv` which is only updated once the queues have been allocated. 2) When adjusting queues, XDP queues are adjusted to match the number of RX queues when XDP is enabled. This only works in the case when queues are live, so part (1) of the fix must still be available in the case that queues are adjusted when there is an XDP program and the interface is down. 
Fixes: 5f08cd3d6423 ("gve: Alloc before freeing when adjusting queues") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 5cab7b88610f..09fb7f16f73e 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -930,11 +930,13 @@ static void gve_init_sync_stats(struct gve_priv *priv) static void gve_tx_get_curr_alloc_cfg(struct gve_priv *priv, struct gve_tx_alloc_rings_cfg *cfg) { + int num_xdp_queues = priv->xdp_prog ? priv->rx_cfg.num_queues : 0; + cfg->qcfg = &priv->tx_cfg; cfg->raw_addressing = !gve_is_qpl(priv); cfg->ring_size = priv->tx_desc_cnt; cfg->start_idx = 0; - cfg->num_rings = gve_num_tx_queues(priv); + cfg->num_rings = priv->tx_cfg.num_queues + num_xdp_queues; cfg->tx = priv->tx; } @@ -1843,6 +1845,7 @@ int gve_adjust_queues(struct gve_priv *priv, { struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0}; struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0}; + int num_xdp_queues; int err; gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg); @@ -1853,6 +1856,10 @@ int gve_adjust_queues(struct gve_priv *priv, rx_alloc_cfg.qcfg = &new_rx_config; tx_alloc_cfg.num_rings = new_tx_config.num_queues; + /* Add dedicated XDP TX queues if enabled. */ + num_xdp_queues = priv->xdp_prog ? 
new_rx_config.num_queues : 0; + tx_alloc_cfg.num_rings += num_xdp_queues; + if (netif_running(priv->dev)) { err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg); return err; From 926e862058978a8f81872845715d67ad21c30f65 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 14 Dec 2024 02:12:06 +0000 Subject: [PATCH 397/807] arm64/signal: Silence sparse warning storing GCSPR_EL0 We are seeing a sparse warning in gcs_restore_signal(): arch/arm64/kernel/signal.c:1054:9: sparse: sparse: cast removes address space '__user' of expression when storing the final GCSPR_EL0 value back into the register, caused by the fact that write_sysreg_s() casts the value it writes to a u64 which sparse sees as discarding the __userness of the pointer. Avoid this by treating the address as an integer, casting to a pointer only when using it to write to userspace. While we're at it also inline gcs_signal_cap_valid() into its one user and make equivalent updates to gcs_signal_entry(). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412082005.OBJ0BbWs-lkp@intel.com/ Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241214-arm64-gcs-signal-sparse-v3-1-5e8d18fffc0c@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/signal.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 37e24f1bd227..99ea26d400ff 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -36,15 +36,8 @@ #include #include -#ifdef CONFIG_ARM64_GCS #define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) -static bool gcs_signal_cap_valid(u64 addr, u64 val) -{ - return val == GCS_SIGNAL_CAP(addr); -} -#endif - /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. 
*/ @@ -1062,8 +1055,7 @@ static int restore_sigframe(struct pt_regs *regs, #ifdef CONFIG_ARM64_GCS static int gcs_restore_signal(void) { - unsigned long __user *gcspr_el0; - u64 cap; + u64 gcspr_el0, cap; int ret; if (!system_supports_gcs()) @@ -1072,7 +1064,7 @@ static int gcs_restore_signal(void) if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) return 0; - gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Ensure that any changes to the GCS done via GCS operations @@ -1087,22 +1079,23 @@ static int gcs_restore_signal(void) * then faults will be generated on GCS operations - the main * concern is to protect GCS pages. */ - ret = copy_from_user(&cap, gcspr_el0, sizeof(cap)); + ret = copy_from_user(&cap, (unsigned long __user *)gcspr_el0, + sizeof(cap)); if (ret) return -EFAULT; /* * Check that the cap is the actual GCS before replacing it. */ - if (!gcs_signal_cap_valid((u64)gcspr_el0, cap)) + if (cap != GCS_SIGNAL_CAP(gcspr_el0)) return -EINVAL; /* Invalidate the token to prevent reuse */ - put_user_gcs(0, (__user void*)gcspr_el0, &ret); + put_user_gcs(0, (unsigned long __user *)gcspr_el0, &ret); if (ret != 0) return -EFAULT; - write_sysreg_s(gcspr_el0 + 1, SYS_GCSPR_EL0); + write_sysreg_s(gcspr_el0 + 8, SYS_GCSPR_EL0); return 0; } @@ -1421,7 +1414,7 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) { - unsigned long __user *gcspr_el0; + u64 gcspr_el0; int ret = 0; if (!system_supports_gcs()) @@ -1434,18 +1427,20 @@ static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) * We are entering a signal handler, current register state is * active. */ - gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Push a cap and the GCS entry for the trampoline onto the GCS. 
*/ - put_user_gcs((unsigned long)sigtramp, gcspr_el0 - 2, &ret); - put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 1), gcspr_el0 - 1, &ret); + put_user_gcs((unsigned long)sigtramp, + (unsigned long __user *)(gcspr_el0 - 16), &ret); + put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 8), + (unsigned long __user *)(gcspr_el0 - 8), &ret); if (ret != 0) return ret; - gcspr_el0 -= 2; - write_sysreg_s((unsigned long)gcspr_el0, SYS_GCSPR_EL0); + gcspr_el0 -= 16; + write_sysreg_s(gcspr_el0, SYS_GCSPR_EL0); return 0; } From 7917f01a286ce01e9c085e24468421f596ee1a0c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 Dec 2024 15:28:18 +1100 Subject: [PATCH 398/807] nfsd: restore callback functionality for NFSv4.0 A recent patch inadvertently broke callbacks for NFSv4.0. In the 4.0 case we do not expect a session to be found but still need to call setup_callback_client() which will not try to dereference it. This patch moves the check for failure to find a session into the 4.1+ branch of setup_callback_client() Fixes: 1e02c641c3a4 ("NFSD: Prevent NULL dereference in nfsd4_process_cb_update()") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3877b53e429f..c083e539e898 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1100,7 +1100,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = clp->cl_cred.cr_flavor; clp->cl_cb_ident = conn->cb_ident; } else { - if (!conn->cb_xprt) + if (!conn->cb_xprt || !ses) return -EINVAL; clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; @@ -1522,8 +1522,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) ses = c->cn_session; } spin_unlock(&clp->cl_lock); - if (!c) - return; err = setup_callback_client(clp, &conn, ses); if (err) { From aa5d2ca7c179c40669edb5e96d931bf9828dea3d Mon Sep 17 00:00:00 2001 
From: Kan Liang Date: Mon, 16 Dec 2024 08:02:52 -0800 Subject: [PATCH 399/807] perf/x86/intel: Fix bitmask of OCR and FRONTEND events for LNC The released OCR and FRONTEND events utilized more bits on Lunar Lake p-core. The corresponding mask in the extra_regs has to be extended to unblock the extra bits. Add a dedicated intel_lnc_extra_regs. Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Reported-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20241216160252.430858-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2e1e26846050..99c590da0ae2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -429,6 +429,16 @@ static struct event_constraint intel_lnc_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct extra_reg intel_lnc_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0xfffffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0xfffffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE), + INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0xf, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), + EVENT_EXTRA_END +}; EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); @@ -6422,7 +6432,7 @@ static __always_inline void intel_pmu_init_lnc(struct pmu *pmu) intel_pmu_init_glc(pmu); hybrid(pmu, event_constraints) = intel_lnc_event_constraints; hybrid(pmu, pebs_constraints) = intel_lnc_pebs_event_constraints; - hybrid(pmu, extra_regs) = intel_rwc_extra_regs; + hybrid(pmu, 
extra_regs) = intel_lnc_extra_regs; } static __always_inline void intel_pmu_init_skt(struct pmu *pmu) From 4da38536e2190fb5bfabfcf5229f4d5398648295 Mon Sep 17 00:00:00 2001 From: Dave Penkler Date: Thu, 5 Dec 2024 10:34:42 +0100 Subject: [PATCH 400/807] staging: gpib: Fix erroneous removal of blank before newline The USB_GPIB_SET_LINES command string used to be: "\nIBDC \n" but when we were merging this code into the upstream kernel we deleted the space character before the newline to make checkpatch happy. That turned out to be a mistake. The "\nIBDC" part of the string is a command that we pass to the firmware and the next character is a variable u8 value. It gets set in set_control_line(). msg[leng - 2] = value ? (retval & ~line) : retval | line; where leng is the length of the command string. Imagine the parameter was supposed to be "8". With the pre-merge code the command string would be "\nIBDC8\n" With the post-merge code the command string became "\nIBD8\n" The firmware doesn't recognize "IBD8" as a valid command and rejects it. Putting a "." where the parameter is supposed to go fixes the driver and makes checkpatch happy. Same thing with the other define and the in-line assignment. 
Reported-by: Marcello Carla' Fixes: fce79512a96a ("staging: gpib: Add LPVO DIY USB GPIB driver") Co-developed-by: Marcello Carla' Signed-off-by: Marcello Carla' Signed-off-by: Dave Penkler Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20241205093442.5796-1-dpenkler@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c b/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c index 267651a15fa0..2de0b470c7bc 100644 --- a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c +++ b/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c @@ -99,8 +99,8 @@ module_param(debug, int, 0644); #define USB_GPIB_DEBUG_ON "\nIBDE\xAA\n" #define USB_GPIB_SET_LISTEN "\nIBDT0\n" #define USB_GPIB_SET_TALK "\nIBDT1\n" -#define USB_GPIB_SET_LINES "\nIBDC\n" -#define USB_GPIB_SET_DATA "\nIBDM\n" +#define USB_GPIB_SET_LINES "\nIBDC.\n" +#define USB_GPIB_SET_DATA "\nIBDM.\n" #define USB_GPIB_READ_LINES "\nIBD?C\n" #define USB_GPIB_READ_DATA "\nIBD?M\n" #define USB_GPIB_READ_BUS "\nIBD??\n" @@ -589,7 +589,7 @@ static int usb_gpib_command(gpib_board_t *board, size_t *bytes_written) { int i, retval; - char command[6] = "IBc\n"; + char command[6] = "IBc.\n"; DIA_LOG(1, "enter %p\n", board); From fd1885db8ecab1abc96dbb9df49b0d4b9eed1672 Mon Sep 17 00:00:00 2001 From: Dave Penkler Date: Sat, 7 Dec 2024 13:34:10 +0100 Subject: [PATCH 401/807] staging: gpib: Add lower bound check for secondary address Commit 9dde4559e939 ("staging: gpib: Add GPIB common core driver") from Sep 18, 2024 (linux-next), leads to the following Smatch static checker warning: drivers/staging/gpib/common/gpib_os.c:541 dvrsp() warn: no lower bound on 'sad' rl='s32min-30' The value -1 was introduced in user land to signify No secondary address to the driver so that a lower bound check could be added. This patch adds that check. 
Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-staging/4efd91f3-4259-4e95-a4e0-925853b98858@stanley.mountain/ Signed-off-by: Dave Penkler Link: https://lore.kernel.org/r/20241207123410.28759-1-dpenkler@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/common/gpib_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/gpib/common/gpib_os.c b/drivers/staging/gpib/common/gpib_os.c index 405237d8cb47..f25e7c458581 100644 --- a/drivers/staging/gpib/common/gpib_os.c +++ b/drivers/staging/gpib/common/gpib_os.c @@ -536,7 +536,7 @@ int dvrsp(gpib_board_t *board, unsigned int pad, int sad, return -1; } - if (pad > MAX_GPIB_PRIMARY_ADDRESS || sad > MAX_GPIB_SECONDARY_ADDRESS) { + if (pad > MAX_GPIB_PRIMARY_ADDRESS || sad > MAX_GPIB_SECONDARY_ADDRESS || sad < -1) { pr_err("gpib: bad address for serial poll"); return -1; } From 8c41fae53016c2c9796441148c08b754c4e7dfc8 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 6 Dec 2024 10:25:04 +0800 Subject: [PATCH 402/807] staging: gpib: Modify mismatched function name No functional modification involved. drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c:676: warning: expecting prototype for interface_clear(). Prototype was for usb_gpib_interface_clear() instead. drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c:654: warning: expecting prototype for go_to_standby(). Prototype was for usb_gpib_go_to_standby() instead. drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c:636: warning: expecting prototype for enable_eos(). Prototype was for usb_gpib_enable_eos() instead. drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c:618: warning: expecting prototype for disable_eos(). Prototype was for usb_gpib_disable_eos() instead. 
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=12253 Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20241206022504.69670-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c b/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c index 2de0b470c7bc..5388ae1afbc6 100644 --- a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c +++ b/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c @@ -608,7 +608,7 @@ static int usb_gpib_command(gpib_board_t *board, } /** - * disable_eos() - Disable END on eos byte (END on EOI only) + * usb_gpib_disable_eos() - Disable END on eos byte (END on EOI only) * * @board: the gpib_board data area for this gpib interface * @@ -624,7 +624,7 @@ static void usb_gpib_disable_eos(gpib_board_t *board) } /** - * enable_eos() - Enable END for reads when eos byte is received. + * usb_gpib_enable_eos() - Enable END for reads when eos byte is received. 
* * @board: the gpib_board data area for this gpib interface * @eos_byte: the 'eos' byte @@ -647,7 +647,7 @@ static int usb_gpib_enable_eos(gpib_board_t *board, } /** - * go_to_standby() - De-assert ATN + * usb_gpib_go_to_standby() - De-assert ATN * * @board: the gpib_board data area for this gpib interface */ @@ -664,7 +664,7 @@ static int usb_gpib_go_to_standby(gpib_board_t *board) } /** - * interface_clear() - Assert or de-assert IFC + * usb_gpib_interface_clear() - Assert or de-assert IFC * * @board: the gpib_board data area for this gpib interface * assert: 1: assert IFC; 0: de-assert IFC From d99d65aeddf437c052031043c96f94f93f0124d6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 12 Dec 2024 16:42:34 +0100 Subject: [PATCH 403/807] staging: gpib: make global 'usec_diff' functions static Trying to build both gpib_bitbang and lpvo_usb_gpib into the kernel reveals a function that should have been static and is also duplicated: x86_64-linux-ld: drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.o: in function `usec_diff': lpvo_usb_gpib.c:(.text+0x23c0): multiple definition of `usec_diff'; drivers/staging/gpib/gpio/gpib_bitbang.o:gpib_bitbang.c:(.text+0x2470): first defined here Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241212154245.1411411-1-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/gpio/gpib_bitbang.c | 2 +- drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/gpib/gpio/gpib_bitbang.c b/drivers/staging/gpib/gpio/gpib_bitbang.c index a2d562cbd65b..23550502e012 100644 --- a/drivers/staging/gpib/gpio/gpib_bitbang.c +++ b/drivers/staging/gpib/gpio/gpib_bitbang.c @@ -315,7 +315,7 @@ struct bb_priv { enum listener_function_state listener_state; }; -inline long usec_diff(struct timespec64 *a, struct timespec64 *b); +static inline long usec_diff(struct timespec64 *a, struct timespec64 *b); static void bb_buffer_print(unsigned char 
*buffer, size_t length, int cmd, int eoi); static void set_data_lines(u8 byte); static u8 get_data_lines(void); diff --git a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c b/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c index 5388ae1afbc6..200885e3ab82 100644 --- a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c +++ b/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c @@ -210,7 +210,7 @@ static int skel_do_release(gpib_board_t *); * (unix time in sec and NANOsec) */ -inline int usec_diff(struct timespec64 *a, struct timespec64 *b) +static inline int usec_diff(struct timespec64 *a, struct timespec64 *b) { return ((a->tv_sec - b->tv_sec) * 1000000 + (a->tv_nsec - b->tv_nsec) / 1000); From 79d2e1919a2728ef49d938eb20ebd5903c14dfb0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 12 Dec 2024 16:42:35 +0100 Subject: [PATCH 404/807] staging: gpib: fix Makefiles Having gpib drivers built-in rather than as loadable modules causes link failure because the drivers are never actually built: arm-linux-gnueabi-ld: drivers/staging/gpib/fmh_gpib/fmh_gpib.o: in function `fmh_gpib_t1_delay': fmh_gpib.c:(.text+0x3b0): undefined reference to `nec7210_t1_delay' arm-linux-gnueabi-ld: drivers/staging/gpib/fmh_gpib/fmh_gpib.o: in function `fmh_gpib_serial_poll_status': fmh_gpib.c:(.text+0x418): undefined reference to `nec7210_serial_poll_status' arm-linux-gnueabi-ld: drivers/staging/gpib/fmh_gpib/fmh_gpib.o: in function `fmh_gpib_secondary_address': fmh_gpib.c:(.text+0x57c): undefined reference to `nec7210_secondary_address' arm-linux-gnueabi-ld: drivers/staging/gpib/fmh_gpib/fmh_gpib.o: in function `fmh_gpib_primary_address': fmh_gpib.c:(.text+0x5ac): undefined reference to `nec7210_primary_address' Change this to use the correct Makefile syntax, setting either obj-m or obj-y. 
Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241212154245.1411411-2-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/agilent_82350b/Makefile | 2 +- drivers/staging/gpib/agilent_82357a/Makefile | 2 +- drivers/staging/gpib/cb7210/Makefile | 2 +- drivers/staging/gpib/cec/Makefile | 2 +- drivers/staging/gpib/common/Makefile | 2 +- drivers/staging/gpib/eastwood/Makefile | 2 +- drivers/staging/gpib/gpio/Makefile | 2 +- drivers/staging/gpib/hp_82335/Makefile | 2 +- drivers/staging/gpib/hp_82341/Makefile | 2 +- drivers/staging/gpib/ines/Makefile | 2 +- drivers/staging/gpib/lpvo_usb_gpib/Makefile | 2 +- drivers/staging/gpib/nec7210/Makefile | 2 +- drivers/staging/gpib/ni_usb/Makefile | 2 +- drivers/staging/gpib/pc2/Makefile | 2 +- drivers/staging/gpib/tms9914/Makefile | 2 +- drivers/staging/gpib/tnt4882/Makefile | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/staging/gpib/agilent_82350b/Makefile b/drivers/staging/gpib/agilent_82350b/Makefile index d9236c92e04b..f24e1e713a63 100644 --- a/drivers/staging/gpib/agilent_82350b/Makefile +++ b/drivers/staging/gpib/agilent_82350b/Makefile @@ -1,2 +1,2 @@ -obj-m += agilent_82350b.o +obj-$(CONFIG_GPIB_AGILENT_82350B) += agilent_82350b.o diff --git a/drivers/staging/gpib/agilent_82357a/Makefile b/drivers/staging/gpib/agilent_82357a/Makefile index 4a1d940fce2b..81a55c257a6e 100644 --- a/drivers/staging/gpib/agilent_82357a/Makefile +++ b/drivers/staging/gpib/agilent_82357a/Makefile @@ -1,4 +1,4 @@ -obj-m += agilent_82357a.o +obj-$(CONFIG_GPIB_AGILENT_82357A) += agilent_82357a.o diff --git a/drivers/staging/gpib/cb7210/Makefile b/drivers/staging/gpib/cb7210/Makefile index 22e0214fc17d..cda0725d6487 100644 --- a/drivers/staging/gpib/cb7210/Makefile +++ b/drivers/staging/gpib/cb7210/Makefile @@ -1,4 +1,4 @@ ccflags-$(CONFIG_GPIB_PCMCIA) := -DGPIB_PCMCIA -obj-m += cb7210.o +obj-$(CONFIG_GPIB_CB7210) += cb7210.o diff --git a/drivers/staging/gpib/cec/Makefile 
b/drivers/staging/gpib/cec/Makefile index f4638628ff29..b7141e23d4e0 100644 --- a/drivers/staging/gpib/cec/Makefile +++ b/drivers/staging/gpib/cec/Makefile @@ -1,3 +1,3 @@ -obj-m += cec_gpib.o +obj-$(CONFIG_GPIB_CEC_PCI) += cec_gpib.o diff --git a/drivers/staging/gpib/common/Makefile b/drivers/staging/gpib/common/Makefile index 0c4c77bea75b..460586edb574 100644 --- a/drivers/staging/gpib/common/Makefile +++ b/drivers/staging/gpib/common/Makefile @@ -1,5 +1,5 @@ -obj-m += gpib_common.o +obj-$(CONFIG_GPIB_COMMON) += gpib_common.o gpib_common-objs := gpib_os.o iblib.o diff --git a/drivers/staging/gpib/eastwood/Makefile b/drivers/staging/gpib/eastwood/Makefile index c74056f959d0..384825195f77 100644 --- a/drivers/staging/gpib/eastwood/Makefile +++ b/drivers/staging/gpib/eastwood/Makefile @@ -1,3 +1,3 @@ -obj-m += fluke_gpib.o +obj-$(CONFIG_GPIB_FLUKE) += fluke_gpib.o diff --git a/drivers/staging/gpib/gpio/Makefile b/drivers/staging/gpib/gpio/Makefile index a31ded6e5924..00ea52abdda7 100644 --- a/drivers/staging/gpib/gpio/Makefile +++ b/drivers/staging/gpib/gpio/Makefile @@ -1,4 +1,4 @@ -obj-m += gpib_bitbang.o +obj-$(CONFIG_GPIB_GPIO) += gpib_bitbang.o diff --git a/drivers/staging/gpib/hp_82335/Makefile b/drivers/staging/gpib/hp_82335/Makefile index 8b7a552e9355..305ce44ee48a 100644 --- a/drivers/staging/gpib/hp_82335/Makefile +++ b/drivers/staging/gpib/hp_82335/Makefile @@ -1,4 +1,4 @@ -obj-m += hp82335.o +obj-$(CONFIG_GPIB_HP82335) += hp82335.o diff --git a/drivers/staging/gpib/hp_82341/Makefile b/drivers/staging/gpib/hp_82341/Makefile index 1fe7db4f8ca4..21367310a17e 100644 --- a/drivers/staging/gpib/hp_82341/Makefile +++ b/drivers/staging/gpib/hp_82341/Makefile @@ -1,2 +1,2 @@ -obj-m += hp_82341.o +obj-$(CONFIG_GPIB_HP82341) += hp_82341.o diff --git a/drivers/staging/gpib/ines/Makefile b/drivers/staging/gpib/ines/Makefile index cdcaa59a4e39..6b6e480fd811 100644 --- a/drivers/staging/gpib/ines/Makefile +++ b/drivers/staging/gpib/ines/Makefile @@ -1,4 +1,4 @@ 
ccflags-$(CONFIG_GPIB_PCMCIA) := -DGPIB_PCMCIA -obj-m += ines_gpib.o +obj-$(CONFIG_GPIB_INES) += ines_gpib.o diff --git a/drivers/staging/gpib/lpvo_usb_gpib/Makefile b/drivers/staging/gpib/lpvo_usb_gpib/Makefile index 137511acce63..360553488e6d 100644 --- a/drivers/staging/gpib/lpvo_usb_gpib/Makefile +++ b/drivers/staging/gpib/lpvo_usb_gpib/Makefile @@ -1,3 +1,3 @@ -obj-m += lpvo_usb_gpib.o +obj-$(CONFIG_GPIB_LPVO) += lpvo_usb_gpib.o diff --git a/drivers/staging/gpib/nec7210/Makefile b/drivers/staging/gpib/nec7210/Makefile index 8d4d90f21109..64330f2e89d1 100644 --- a/drivers/staging/gpib/nec7210/Makefile +++ b/drivers/staging/gpib/nec7210/Makefile @@ -1,4 +1,4 @@ -obj-m += nec7210.o +obj-$(CONFIG_GPIB_NEC7210) += nec7210.o diff --git a/drivers/staging/gpib/ni_usb/Makefile b/drivers/staging/gpib/ni_usb/Makefile index e22b3b21a62c..469c5d16add3 100644 --- a/drivers/staging/gpib/ni_usb/Makefile +++ b/drivers/staging/gpib/ni_usb/Makefile @@ -1,4 +1,4 @@ -obj-m += ni_usb_gpib.o +obj-$(CONFIG_GPIB_NI_USB) += ni_usb_gpib.o diff --git a/drivers/staging/gpib/pc2/Makefile b/drivers/staging/gpib/pc2/Makefile index 8148425e0f87..481ee4296e1b 100644 --- a/drivers/staging/gpib/pc2/Makefile +++ b/drivers/staging/gpib/pc2/Makefile @@ -1,5 +1,5 @@ -obj-m += pc2_gpib.o +obj-$(CONFIG_GPIB_PC2) += pc2_gpib.o diff --git a/drivers/staging/gpib/tms9914/Makefile b/drivers/staging/gpib/tms9914/Makefile index 81b7e3cf104c..4705ab07f413 100644 --- a/drivers/staging/gpib/tms9914/Makefile +++ b/drivers/staging/gpib/tms9914/Makefile @@ -1,5 +1,5 @@ -obj-m += tms9914.o +obj-$(CONFIG_GPIB_TMS9914) += tms9914.o diff --git a/drivers/staging/gpib/tnt4882/Makefile b/drivers/staging/gpib/tnt4882/Makefile index f767c990db7a..04a4520ed3b7 100644 --- a/drivers/staging/gpib/tnt4882/Makefile +++ b/drivers/staging/gpib/tnt4882/Makefile @@ -1,5 +1,5 @@ ccflags-$(CONFIG_GPIB_PCMCIA) := -DGPIB_PCMCIA -obj-m += tnt4882.o +obj-$(CONFIG_GPIB_TNT4882) += tnt4882.o tnt4882-objs := tnt4882_gpib.o mite.o From 
003d2abde115102a3e62c1a96d2cb8c5345b2af2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Dec 2024 09:31:12 +0100 Subject: [PATCH 405/807] staging: gpib: add module author and description fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FMH driver is still missing both, so take them from the comment at the start of the file. Fixes: 8e4841a0888c ("staging: gpib: Add Frank Mori Hess FPGA PCI GPIB driver") Signed-off-by: Arnd Bergmann Reviewed-by: Dominik Karol Piątkowski Link: https://lore.kernel.org/r/20241213083119.2607901-1-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/fmh_gpib/fmh_gpib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/staging/gpib/fmh_gpib/fmh_gpib.c b/drivers/staging/gpib/fmh_gpib/fmh_gpib.c index 62791db1c34a..2ff9b5a434e5 100644 --- a/drivers/staging/gpib/fmh_gpib/fmh_gpib.c +++ b/drivers/staging/gpib/fmh_gpib/fmh_gpib.c @@ -24,6 +24,8 @@ #include MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("GPIB Driver for fmh_gpib_core"); +MODULE_AUTHOR("Frank Mori Hess "); static irqreturn_t fmh_gpib_interrupt(int irq, void *arg); static int fmh_gpib_attach_holdoff_all(gpib_board_t *board, const gpib_board_config_t *config); From edbb7200ca99b29b173ea4f3f473e4e8db595025 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Dec 2024 07:49:49 +0100 Subject: [PATCH 406/807] staging: gpib: fix pcmcia dependencies With CONFIG_PCMCIA=m, the gpib drivers that optionally support PCMCIA cannot be built-in. Add a Kconfig dependency to force these to be loadable modules as well, and change the GPIB_PCMCIA symbol to have the correct state for that. 
Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241213064959.1045243-1-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/Kconfig | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/staging/gpib/Kconfig b/drivers/staging/gpib/Kconfig index 259f3ff33646..275dbc32b28d 100644 --- a/drivers/staging/gpib/Kconfig +++ b/drivers/staging/gpib/Kconfig @@ -65,6 +65,7 @@ config GPIB_NI_PCI_ISA depends on ISA_BUS || PCI || PCMCIA depends on HAS_IOPORT depends on !X86_PAE + depends on PCMCIA || !PCMCIA select GPIB_COMMON select GPIB_NEC7210 help @@ -89,6 +90,7 @@ config GPIB_CB7210 depends on HAS_IOPORT depends on ISA_BUS || PCI || PCMCIA depends on !X86_PAE + depends on PCMCIA || !PCMCIA select GPIB_COMMON select GPIB_NEC7210 help @@ -177,6 +179,7 @@ config GPIB_HP82341 config GPIB_INES tristate "INES" depends on PCI || ISA_BUS || PCMCIA + depends on PCMCIA || !PCMCIA depends on HAS_IOPORT depends on !X86_PAE select GPIB_COMMON @@ -199,8 +202,8 @@ config GPIB_INES called cb7210. config GPIB_PCMCIA - bool "PCMCIA/Cardbus support for NI MC and Ines boards" - depends on PCCARD && (GPIB_NI_PCI_ISA || GPIB_CB7210 || GPIB_INES) + def_bool y + depends on PCMCIA && (GPIB_NI_PCI_ISA || GPIB_CB7210 || GPIB_INES) help Enable PCMCIA/CArdbus support for National Instruments, measurement computing boards and Ines boards. From fec866a00360a19a1f4c8e6bd123a4d1b2d5a8ee Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Dec 2024 07:49:50 +0100 Subject: [PATCH 407/807] staging: gpib: use ioport_map The tnt4882 backend has a rather elaborate way of abstracting the PIO and MMIO based hardware variants, duplicating the functionality of ioport_map() in a less portable way. Change it to use ioport_map() with ioread8()/iowrite8() to do this more easily. 
Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241213064959.1045243-2-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/Kconfig | 1 + drivers/staging/gpib/common/gpib_os.c | 50 --------------------- drivers/staging/gpib/include/gpibP.h | 12 +---- drivers/staging/gpib/tnt4882/tnt4882_gpib.c | 49 +++++++++----------- 4 files changed, 23 insertions(+), 89 deletions(-) diff --git a/drivers/staging/gpib/Kconfig b/drivers/staging/gpib/Kconfig index 275dbc32b28d..81510db3072e 100644 --- a/drivers/staging/gpib/Kconfig +++ b/drivers/staging/gpib/Kconfig @@ -66,6 +66,7 @@ config GPIB_NI_PCI_ISA depends on HAS_IOPORT depends on !X86_PAE depends on PCMCIA || !PCMCIA + depends on HAS_IOPORT_MAP select GPIB_COMMON select GPIB_NEC7210 help diff --git a/drivers/staging/gpib/common/gpib_os.c b/drivers/staging/gpib/common/gpib_os.c index f25e7c458581..0fb93a9f395f 100644 --- a/drivers/staging/gpib/common/gpib_os.c +++ b/drivers/staging/gpib/common/gpib_os.c @@ -116,56 +116,6 @@ int io_timed_out(gpib_board_t *board) return 0; } -void writeb_wrapper(unsigned int value, void *address) -{ - writeb(value, address); -}; -EXPORT_SYMBOL(writeb_wrapper); - -void writew_wrapper(unsigned int value, void *address) -{ - writew(value, address); -}; -EXPORT_SYMBOL(writew_wrapper); - -unsigned int readb_wrapper(void *address) -{ - return readb(address); -}; -EXPORT_SYMBOL(readb_wrapper); - -unsigned int readw_wrapper(void *address) -{ - return readw(address); -}; -EXPORT_SYMBOL(readw_wrapper); - -#ifdef CONFIG_HAS_IOPORT -void outb_wrapper(unsigned int value, void *address) -{ - outb(value, (unsigned long)(address)); -}; -EXPORT_SYMBOL(outb_wrapper); - -void outw_wrapper(unsigned int value, void *address) -{ - outw(value, (unsigned long)(address)); -}; -EXPORT_SYMBOL(outw_wrapper); - -unsigned int inb_wrapper(void *address) -{ - return inb((unsigned long)(address)); -}; -EXPORT_SYMBOL(inb_wrapper); - -unsigned int inw_wrapper(void *address) -{ - return 
inw((unsigned long)(address)); -}; -EXPORT_SYMBOL(inw_wrapper); -#endif - /* this is a function instead of a constant because of Suse * defining HZ to be a function call to get_hz() */ diff --git a/drivers/staging/gpib/include/gpibP.h b/drivers/staging/gpib/include/gpibP.h index 5fc42b645ab7..b97da577ba33 100644 --- a/drivers/staging/gpib/include/gpibP.h +++ b/drivers/staging/gpib/include/gpibP.h @@ -16,6 +16,7 @@ #include #include +#include void gpib_register_driver(gpib_interface_t *interface, struct module *mod); void gpib_unregister_driver(gpib_interface_t *interface); @@ -35,16 +36,5 @@ extern gpib_board_t board_array[GPIB_MAX_NUM_BOARDS]; extern struct list_head registered_drivers; -#include - -void writeb_wrapper(unsigned int value, void *address); -unsigned int readb_wrapper(void *address); -void outb_wrapper(unsigned int value, void *address); -unsigned int inb_wrapper(void *address); -void writew_wrapper(unsigned int value, void *address); -unsigned int readw_wrapper(void *address); -void outw_wrapper(unsigned int value, void *address); -unsigned int inw_wrapper(void *address); - #endif // _GPIB_P_H diff --git a/drivers/staging/gpib/tnt4882/tnt4882_gpib.c b/drivers/staging/gpib/tnt4882/tnt4882_gpib.c index e49a952fa0d8..408a123e9542 100644 --- a/drivers/staging/gpib/tnt4882/tnt4882_gpib.c +++ b/drivers/staging/gpib/tnt4882/tnt4882_gpib.c @@ -45,10 +45,6 @@ struct tnt4882_priv { unsigned short imr0_bits; unsigned short imr3_bits; unsigned short auxg_bits; // bits written to auxiliary register G - void (*io_writeb)(unsigned int value, void *address); - void (*io_writew)(unsigned int value, void *address); - unsigned int (*io_readb)(void *address); - unsigned int (*io_readw)(void *address); }; // interface functions @@ -104,17 +100,17 @@ static const int atgpib_iosize = 32; /* paged io */ static inline unsigned int tnt_paged_readb(struct tnt4882_priv *priv, unsigned long offset) { - priv->io_writeb(AUX_PAGEIN, priv->nec7210_priv.iobase + AUXMR * 
priv->nec7210_priv.offset); + iowrite8(AUX_PAGEIN, priv->nec7210_priv.iobase + AUXMR * priv->nec7210_priv.offset); udelay(1); - return priv->io_readb(priv->nec7210_priv.iobase + offset); + return ioread8(priv->nec7210_priv.iobase + offset); } static inline void tnt_paged_writeb(struct tnt4882_priv *priv, unsigned int value, unsigned long offset) { - priv->io_writeb(AUX_PAGEIN, priv->nec7210_priv.iobase + AUXMR * priv->nec7210_priv.offset); + iowrite8(AUX_PAGEIN, priv->nec7210_priv.iobase + AUXMR * priv->nec7210_priv.offset); udelay(1); - priv->io_writeb(value, priv->nec7210_priv.iobase + offset); + iowrite8(value, priv->nec7210_priv.iobase + offset); } /* readb/writeb wrappers */ @@ -134,7 +130,7 @@ static inline unsigned short tnt_readb(struct tnt4882_priv *priv, unsigned long switch (priv->nec7210_priv.type) { case TNT4882: case TNT5004: - retval = priv->io_readb(address); + retval = ioread8(address); break; case NAT4882: retval = tnt_paged_readb(priv, offset - tnt_pagein_offset); @@ -149,7 +145,7 @@ static inline unsigned short tnt_readb(struct tnt4882_priv *priv, unsigned long } break; default: - retval = priv->io_readb(address); + retval = ioread8(address); break; } spin_unlock_irqrestore(register_lock, flags); @@ -170,7 +166,7 @@ static inline void tnt_writeb(struct tnt4882_priv *priv, unsigned short value, u switch (priv->nec7210_priv.type) { case TNT4882: case TNT5004: - priv->io_writeb(value, address); + iowrite8(value, address); break; case NAT4882: tnt_paged_writeb(priv, value, offset - tnt_pagein_offset); @@ -183,7 +179,7 @@ static inline void tnt_writeb(struct tnt4882_priv *priv, unsigned short value, u } break; default: - priv->io_writeb(value, address); + iowrite8(value, address); break; } spin_unlock_irqrestore(register_lock, flags); @@ -288,7 +284,7 @@ static int drain_fifo_words(struct tnt4882_priv *tnt_priv, uint8_t *buffer, int while (fifo_word_available(tnt_priv) && count + 2 <= num_bytes) { short word; - word = 
tnt_priv->io_readw(nec_priv->iobase + FIFOB); + word = ioread16(nec_priv->iobase + FIFOB); buffer[count++] = word & 0xff; buffer[count++] = (word >> 8) & 0xff; } @@ -573,7 +569,7 @@ static int generic_write(gpib_board_t *board, uint8_t *buffer, size_t length, word = buffer[count++] & 0xff; if (count < length) word |= (buffer[count++] << 8) & 0xff00; - tnt_priv->io_writew(word, nec_priv->iobase + FIFOB); + iowrite16(word, nec_priv->iobase + FIFOB); } // avoid unnecessary HR_NFF interrupts // tnt_priv->imr3_bits |= HR_NFF; @@ -1269,10 +1265,6 @@ int ni_pci_attach(gpib_board_t *board, const gpib_board_config_t *config) if (tnt4882_allocate_private(board)) return -ENOMEM; tnt_priv = board->private_data; - tnt_priv->io_writeb = writeb_wrapper; - tnt_priv->io_readb = readb_wrapper; - tnt_priv->io_writew = writew_wrapper; - tnt_priv->io_readw = readw_wrapper; nec_priv = &tnt_priv->nec7210_priv; nec_priv->type = TNT4882; nec_priv->read_byte = nec7210_locking_iomem_read_byte; @@ -1408,10 +1400,6 @@ static int ni_isa_attach_common(gpib_board_t *board, const gpib_board_config_t * if (tnt4882_allocate_private(board)) return -ENOMEM; tnt_priv = board->private_data; - tnt_priv->io_writeb = outb_wrapper; - tnt_priv->io_readb = inb_wrapper; - tnt_priv->io_writew = outw_wrapper; - tnt_priv->io_readw = inw_wrapper; nec_priv = &tnt_priv->nec7210_priv; nec_priv->type = chipset; nec_priv->read_byte = nec7210_locking_ioport_read_byte; @@ -1438,7 +1426,9 @@ static int ni_isa_attach_common(gpib_board_t *board, const gpib_board_config_t * pr_err("tnt4882: failed to allocate ioports\n"); return -1; } - nec_priv->iobase = iobase; + nec_priv->iobase = ioport_map(iobase, atgpib_iosize); + if (!nec_priv->iobase) + return -1; // get irq if (request_irq(irq, tnt4882_interrupt, isr_flags, "atgpib", board)) { @@ -1478,6 +1468,8 @@ void ni_isa_detach(gpib_board_t *board) tnt4882_board_reset(tnt_priv, board); if (tnt_priv->irq) free_irq(tnt_priv->irq, board); + if (nec_priv->iobase) + 
ioport_unmap(nec_priv->iobase); if (nec_priv->iobase) release_region((unsigned long)(nec_priv->iobase), atgpib_iosize); if (tnt_priv->pnp_dev) @@ -1817,10 +1809,6 @@ int ni_pcmcia_attach(gpib_board_t *board, const gpib_board_config_t *config) if (tnt4882_allocate_private(board)) return -ENOMEM; tnt_priv = board->private_data; - tnt_priv->io_writeb = outb_wrapper; - tnt_priv->io_readb = inb_wrapper; - tnt_priv->io_writew = outw_wrapper; - tnt_priv->io_readw = inw_wrapper; nec_priv = &tnt_priv->nec7210_priv; nec_priv->type = TNT4882; nec_priv->read_byte = nec7210_locking_ioport_read_byte; @@ -1835,7 +1823,10 @@ int ni_pcmcia_attach(gpib_board_t *board, const gpib_board_config_t *config) return -EIO; } - nec_priv->iobase = (void *)(unsigned long)curr_dev->resource[0]->start; + nec_priv->iobase = ioport_map(curr_dev->resource[0]->start, + resource_size(curr_dev->resource[0])); + if (!nec_priv->iobase) + return -1; // get irq if (request_irq(curr_dev->irq, tnt4882_interrupt, isr_flags, "tnt4882", board)) { @@ -1860,6 +1851,8 @@ void ni_pcmcia_detach(gpib_board_t *board) nec_priv = &tnt_priv->nec7210_priv; if (tnt_priv->irq) free_irq(tnt_priv->irq, board); + if (nec_priv->iobase) + ioport_unmap(nec_priv->iobase); if (nec_priv->iobase) { tnt4882_board_reset(tnt_priv, board); release_region((unsigned long)nec_priv->iobase, pcmcia_gpib_iosize); From baf8855c916007a8a372576b65492316f43ed60b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Dec 2024 07:49:51 +0100 Subject: [PATCH 408/807] staging: gpib: fix address space mixup Throughout the gpib drivers, a 'void *' struct member is used in place of either port numbers or __iomem pointers, which leads to lots of extra type casts, sparse warnings and less portable code. Split the struct member in two separate ones with the correct types, so each driver can pick which one to use. 
Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/all/f10e976e-7a04-4454-b38d-39cd18f142da@roeck-us.net/ Link: https://lore.kernel.org/r/20241213064959.1045243-3-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- .../gpib/agilent_82350b/agilent_82350b.c | 4 +- drivers/staging/gpib/cb7210/cb7210.c | 12 ++--- drivers/staging/gpib/cb7210/cb7210.h | 4 +- drivers/staging/gpib/cec/cec_gpib.c | 4 +- drivers/staging/gpib/common/gpib_os.c | 2 +- drivers/staging/gpib/eastwood/fluke_gpib.c | 12 ++--- drivers/staging/gpib/eastwood/fluke_gpib.h | 4 +- drivers/staging/gpib/fmh_gpib/fmh_gpib.c | 25 +++++----- drivers/staging/gpib/fmh_gpib/fmh_gpib.h | 4 +- drivers/staging/gpib/hp_82335/hp82335.c | 21 +++++---- drivers/staging/gpib/hp_82341/hp_82341.c | 16 +++---- drivers/staging/gpib/include/gpib_types.h | 3 +- drivers/staging/gpib/include/nec7210.h | 5 +- drivers/staging/gpib/include/tms9914.h | 5 +- drivers/staging/gpib/ines/ines.h | 4 +- drivers/staging/gpib/ines/ines_gpib.c | 22 ++++----- .../gpib/lpvo_usb_gpib/lpvo_usb_gpib.c | 2 +- drivers/staging/gpib/nec7210/nec7210.c | 16 +++---- drivers/staging/gpib/pc2/pc2_gpib.c | 16 +++---- drivers/staging/gpib/tms9914/tms9914.c | 8 ++-- drivers/staging/gpib/tnt4882/mite.h | 4 +- drivers/staging/gpib/tnt4882/tnt4882_gpib.c | 46 +++++++++---------- 22 files changed, 123 insertions(+), 116 deletions(-) diff --git a/drivers/staging/gpib/agilent_82350b/agilent_82350b.c b/drivers/staging/gpib/agilent_82350b/agilent_82350b.c index 53006d0cc79c..8e2334fe5c9b 100644 --- a/drivers/staging/gpib/agilent_82350b/agilent_82350b.c +++ b/drivers/staging/gpib/agilent_82350b/agilent_82350b.c @@ -700,7 +700,7 @@ static int agilent_82350b_generic_attach(gpib_board_t *board, const gpib_board_c GPIB_82350A_REGION)); dev_dbg(board->gpib_dev, "%s: gpib base address remapped to 0x%p\n", driver_name, a_priv->gpib_base); - tms_priv->iobase = a_priv->gpib_base + TMS9914_BASE_REG; + tms_priv->mmiobase = a_priv->gpib_base + TMS9914_BASE_REG; 
a_priv->sram_base = ioremap(pci_resource_start(a_priv->pci_device, SRAM_82350A_REGION), pci_resource_len(a_priv->pci_device, @@ -724,7 +724,7 @@ static int agilent_82350b_generic_attach(gpib_board_t *board, const gpib_board_c pci_resource_len(a_priv->pci_device, GPIB_REGION)); dev_dbg(board->gpib_dev, "%s: gpib base address remapped to 0x%p\n", driver_name, a_priv->gpib_base); - tms_priv->iobase = a_priv->gpib_base + TMS9914_BASE_REG; + tms_priv->mmiobase = a_priv->gpib_base + TMS9914_BASE_REG; a_priv->sram_base = ioremap(pci_resource_start(a_priv->pci_device, SRAM_REGION), pci_resource_len(a_priv->pci_device, SRAM_REGION)); dev_dbg(board->gpib_dev, "%s: sram base address remapped to 0x%p\n", diff --git a/drivers/staging/gpib/cb7210/cb7210.c b/drivers/staging/gpib/cb7210/cb7210.c index 63df7f3eb3f3..59e41c97f518 100644 --- a/drivers/staging/gpib/cb7210/cb7210.c +++ b/drivers/staging/gpib/cb7210/cb7210.c @@ -971,12 +971,12 @@ int cb_pci_attach(gpib_board_t *board, const gpib_board_config_t *config) switch (cb_priv->pci_chip) { case PCI_CHIP_AMCC_S5933: cb_priv->amcc_iobase = pci_resource_start(cb_priv->pci_device, 0); - nec_priv->iobase = (void *)(pci_resource_start(cb_priv->pci_device, 1)); + nec_priv->iobase = pci_resource_start(cb_priv->pci_device, 1); cb_priv->fifo_iobase = pci_resource_start(cb_priv->pci_device, 2); break; case PCI_CHIP_QUANCOM: - nec_priv->iobase = (void *)(pci_resource_start(cb_priv->pci_device, 0)); - cb_priv->fifo_iobase = (unsigned long)nec_priv->iobase; + nec_priv->iobase = pci_resource_start(cb_priv->pci_device, 0); + cb_priv->fifo_iobase = nec_priv->iobase; break; default: pr_err("cb7210: bug! 
unhandled pci_chip=%i\n", cb_priv->pci_chip); @@ -1040,8 +1040,8 @@ int cb_isa_attach(gpib_board_t *board, const gpib_board_config_t *config) return retval; cb_priv = board->private_data; nec_priv = &cb_priv->nec7210_priv; - if (request_region((unsigned long)config->ibbase, cb7210_iosize, "cb7210") == 0) { - pr_err("gpib: ioports starting at 0x%p are already in use\n", config->ibbase); + if (request_region(config->ibbase, cb7210_iosize, "cb7210") == 0) { + pr_err("gpib: ioports starting at 0x%u are already in use\n", config->ibbase); return -EIO; } nec_priv->iobase = config->ibbase; @@ -1471,7 +1471,7 @@ int cb_pcmcia_attach(gpib_board_t *board, const gpib_board_config_t *config) (unsigned long)curr_dev->resource[0]->start); return -EIO; } - nec_priv->iobase = (void *)(unsigned long)curr_dev->resource[0]->start; + nec_priv->iobase = curr_dev->resource[0]->start; cb_priv->fifo_iobase = curr_dev->resource[0]->start; if (request_irq(curr_dev->irq, cb7210_interrupt, IRQF_SHARED, diff --git a/drivers/staging/gpib/cb7210/cb7210.h b/drivers/staging/gpib/cb7210/cb7210.h index 4ad976de2b68..c17cb22585f7 100644 --- a/drivers/staging/gpib/cb7210/cb7210.h +++ b/drivers/staging/gpib/cb7210/cb7210.h @@ -113,9 +113,9 @@ enum hs_regs { HS_STATUS = 0x8, /* HS_STATUS register */ }; -static inline unsigned long nec7210_iobase(const struct cb7210_priv *cb_priv) +static inline u32 nec7210_iobase(const struct cb7210_priv *cb_priv) { - return (unsigned long)(cb_priv->nec7210_priv.iobase); + return cb_priv->nec7210_priv.iobase; } static inline int cb7210_page_in_bits(unsigned int page) diff --git a/drivers/staging/gpib/cec/cec_gpib.c b/drivers/staging/gpib/cec/cec_gpib.c index 3dc933deb401..9c00a874468c 100644 --- a/drivers/staging/gpib/cec/cec_gpib.c +++ b/drivers/staging/gpib/cec/cec_gpib.c @@ -297,8 +297,8 @@ int cec_pci_attach(gpib_board_t *board, const gpib_board_config_t *config) cec_priv->plx_iobase = pci_resource_start(cec_priv->pci_device, 1); pr_info(" plx9050 base address 
0x%lx\n", cec_priv->plx_iobase); - nec_priv->iobase = (void *)(pci_resource_start(cec_priv->pci_device, 3)); - pr_info(" nec7210 base address 0x%p\n", nec_priv->iobase); + nec_priv->iobase = pci_resource_start(cec_priv->pci_device, 3); + pr_info(" nec7210 base address 0x%x\n", nec_priv->iobase); isr_flags |= IRQF_SHARED; if (request_irq(cec_priv->pci_device->irq, cec_interrupt, isr_flags, "pci-gpib", board)) { diff --git a/drivers/staging/gpib/common/gpib_os.c b/drivers/staging/gpib/common/gpib_os.c index 0fb93a9f395f..0962729d7dfe 100644 --- a/drivers/staging/gpib/common/gpib_os.c +++ b/drivers/staging/gpib/common/gpib_os.c @@ -1573,7 +1573,7 @@ static int iobase_ioctl(gpib_board_config_t *config, unsigned long arg) if (WARN_ON_ONCE(sizeof(void *) > sizeof(base_addr))) return -EFAULT; - config->ibbase = (void *)(unsigned long)(base_addr); + config->ibbase = base_addr; return 0; } diff --git a/drivers/staging/gpib/eastwood/fluke_gpib.c b/drivers/staging/gpib/eastwood/fluke_gpib.c index 3f938ab0c84d..5e59d38beb35 100644 --- a/drivers/staging/gpib/eastwood/fluke_gpib.c +++ b/drivers/staging/gpib/eastwood/fluke_gpib.c @@ -1011,12 +1011,12 @@ static int fluke_attach_impl(gpib_board_t *board, const gpib_board_config_t *con } e_priv->gpib_iomem_res = res; - nec_priv->iobase = ioremap(e_priv->gpib_iomem_res->start, + nec_priv->mmiobase = ioremap(e_priv->gpib_iomem_res->start, resource_size(e_priv->gpib_iomem_res)); - pr_info("gpib: iobase %lx remapped to %p, length=%d\n", - (unsigned long)e_priv->gpib_iomem_res->start, - nec_priv->iobase, (int)resource_size(e_priv->gpib_iomem_res)); - if (!nec_priv->iobase) { + pr_info("gpib: mmiobase %llx remapped to %p, length=%d\n", + (u64)e_priv->gpib_iomem_res->start, + nec_priv->mmiobase, (int)resource_size(e_priv->gpib_iomem_res)); + if (!nec_priv->mmiobase) { dev_err(&fluke_gpib_pdev->dev, "Could not map I/O memory\n"); return -ENOMEM; } @@ -1107,7 +1107,7 @@ void fluke_detach(gpib_board_t *board) gpib_free_pseudo_irq(board); 
nec_priv = &e_priv->nec7210_priv; - if (nec_priv->iobase) { + if (nec_priv->mmiobase) { fluke_paged_write_byte(e_priv, 0, ISR0_IMR0, ISR0_IMR0_PAGE); nec7210_board_reset(nec_priv, board); } diff --git a/drivers/staging/gpib/eastwood/fluke_gpib.h b/drivers/staging/gpib/eastwood/fluke_gpib.h index fcbd42f8f9af..4e2144d45270 100644 --- a/drivers/staging/gpib/eastwood/fluke_gpib.h +++ b/drivers/staging/gpib/eastwood/fluke_gpib.h @@ -72,7 +72,7 @@ static inline uint8_t fluke_read_byte_nolock(struct nec7210_priv *nec_priv, { u8 retval; - retval = readl(nec_priv->iobase + register_num * nec_priv->offset); + retval = readl(nec_priv->mmiobase + register_num * nec_priv->offset); return retval; } @@ -80,7 +80,7 @@ static inline uint8_t fluke_read_byte_nolock(struct nec7210_priv *nec_priv, static inline void fluke_write_byte_nolock(struct nec7210_priv *nec_priv, uint8_t data, int register_num) { - writel(data, nec_priv->iobase + register_num * nec_priv->offset); + writel(data, nec_priv->mmiobase + register_num * nec_priv->offset); } static inline uint8_t fluke_paged_read_byte(struct fluke_priv *e_priv, diff --git a/drivers/staging/gpib/fmh_gpib/fmh_gpib.c b/drivers/staging/gpib/fmh_gpib/fmh_gpib.c index 2ff9b5a434e5..0662b20a45e7 100644 --- a/drivers/staging/gpib/fmh_gpib/fmh_gpib.c +++ b/drivers/staging/gpib/fmh_gpib/fmh_gpib.c @@ -1421,15 +1421,14 @@ static int fmh_gpib_attach_impl(gpib_board_t *board, const gpib_board_config_t * } e_priv->gpib_iomem_res = res; - nec_priv->iobase = ioremap(e_priv->gpib_iomem_res->start, + nec_priv->mmiobase = ioremap(e_priv->gpib_iomem_res->start, resource_size(e_priv->gpib_iomem_res)); - if (!nec_priv->iobase) { + if (!nec_priv->mmiobase) { dev_err(board->dev, "Could not map I/O memory for gpib\n"); return -ENOMEM; } - dev_info(board->dev, "iobase 0x%lx remapped to %p, length=%ld\n", - (unsigned long)e_priv->gpib_iomem_res->start, - nec_priv->iobase, (unsigned long)resource_size(e_priv->gpib_iomem_res)); + dev_info(board->dev, "iobase %pr 
remapped to %p\n", + e_priv->gpib_iomem_res, nec_priv->mmiobase); res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dma_fifos"); if (!res) { @@ -1509,14 +1508,14 @@ void fmh_gpib_detach(gpib_board_t *board) free_irq(e_priv->irq, board); if (e_priv->fifo_base) fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG); - if (nec_priv->iobase) { + if (nec_priv->mmiobase) { write_byte(nec_priv, 0, ISR0_IMR0_REG); nec7210_board_reset(nec_priv, board); } if (e_priv->fifo_base) iounmap(e_priv->fifo_base); - if (nec_priv->iobase) - iounmap(nec_priv->iobase); + if (nec_priv->mmiobase) + iounmap(nec_priv->mmiobase); if (e_priv->dma_port_res) { release_mem_region(e_priv->dma_port_res->start, resource_size(e_priv->dma_port_res)); @@ -1566,12 +1565,12 @@ static int fmh_gpib_pci_attach_impl(gpib_board_t *board, const gpib_board_config e_priv->gpib_iomem_res = &pci_device->resource[gpib_control_status_pci_resource_index]; e_priv->dma_port_res = &pci_device->resource[gpib_fifo_pci_resource_index]; - nec_priv->iobase = ioremap(pci_resource_start(pci_device, + nec_priv->mmiobase = ioremap(pci_resource_start(pci_device, gpib_control_status_pci_resource_index), pci_resource_len(pci_device, gpib_control_status_pci_resource_index)); dev_info(board->dev, "base address for gpib control/status registers remapped to 0x%p\n", - nec_priv->iobase); + nec_priv->mmiobase); if (e_priv->dma_port_res->flags & IORESOURCE_MEM) { e_priv->fifo_base = ioremap(pci_resource_start(pci_device, @@ -1634,14 +1633,14 @@ void fmh_gpib_pci_detach(gpib_board_t *board) free_irq(e_priv->irq, board); if (e_priv->fifo_base) fifos_write(e_priv, 0, FIFO_CONTROL_STATUS_REG); - if (nec_priv->iobase) { + if (nec_priv->mmiobase) { write_byte(nec_priv, 0, ISR0_IMR0_REG); nec7210_board_reset(nec_priv, board); } if (e_priv->fifo_base) iounmap(e_priv->fifo_base); - if (nec_priv->iobase) - iounmap(nec_priv->iobase); + if (nec_priv->mmiobase) + iounmap(nec_priv->mmiobase); if (e_priv->dma_port_res || e_priv->gpib_iomem_res) 
pci_release_regions(to_pci_dev(board->dev)); if (board->dev) diff --git a/drivers/staging/gpib/fmh_gpib/fmh_gpib.h b/drivers/staging/gpib/fmh_gpib/fmh_gpib.h index 43bfc89d2a6f..60b1bd6d3c15 100644 --- a/drivers/staging/gpib/fmh_gpib/fmh_gpib.h +++ b/drivers/staging/gpib/fmh_gpib/fmh_gpib.h @@ -127,13 +127,13 @@ static const unsigned int fifo_max_burst_length_mask = 0x00ff; static inline uint8_t gpib_cs_read_byte(struct nec7210_priv *nec_priv, unsigned int register_num) { - return readb(nec_priv->iobase + register_num * nec_priv->offset); + return readb(nec_priv->mmiobase + register_num * nec_priv->offset); } static inline void gpib_cs_write_byte(struct nec7210_priv *nec_priv, uint8_t data, unsigned int register_num) { - writeb(data, nec_priv->iobase + register_num * nec_priv->offset); + writeb(data, nec_priv->mmiobase + register_num * nec_priv->offset); } static inline uint16_t fifos_read(struct fmh_priv *fmh_priv, int register_num) diff --git a/drivers/staging/gpib/hp_82335/hp82335.c b/drivers/staging/gpib/hp_82335/hp82335.c index 40afe42aea47..ea78143c7ab6 100644 --- a/drivers/staging/gpib/hp_82335/hp82335.c +++ b/drivers/staging/gpib/hp_82335/hp82335.c @@ -9,6 +9,7 @@ */ #include "hp82335.h" +#include #include #include #include @@ -233,7 +234,7 @@ static void hp82335_clear_interrupt(struct hp82335_priv *hp_priv) { struct tms9914_priv *tms_priv = &hp_priv->tms9914_priv; - writeb(0, tms_priv->iobase + HPREG_INTR_CLEAR); + writeb(0, tms_priv->mmiobase + HPREG_INTR_CLEAR); } int hp82335_attach(gpib_board_t *board, const gpib_board_config_t *config) @@ -241,7 +242,7 @@ int hp82335_attach(gpib_board_t *board, const gpib_board_config_t *config) struct hp82335_priv *hp_priv; struct tms9914_priv *tms_priv; int retval; - const unsigned long upper_iomem_base = (unsigned long)config->ibbase + hp82335_rom_size; + const unsigned long upper_iomem_base = config->ibbase + hp82335_rom_size; board->status = 0; @@ -253,7 +254,7 @@ int hp82335_attach(gpib_board_t *board, const 
gpib_board_config_t *config) tms_priv->write_byte = hp82335_write_byte; tms_priv->offset = 1; - switch ((unsigned long)(config->ibbase)) { + switch (config->ibbase) { case 0xc4000: case 0xc8000: case 0xcc000: @@ -271,7 +272,7 @@ int hp82335_attach(gpib_board_t *board, const gpib_board_config_t *config) case 0xfc000: break; default: - pr_err("hp82335: invalid base io address 0x%p\n", config->ibbase); + pr_err("hp82335: invalid base io address 0x%u\n", config->ibbase); return -EINVAL; } if (!request_mem_region(upper_iomem_base, hp82335_upper_iomem_size, "hp82335")) { @@ -280,9 +281,9 @@ int hp82335_attach(gpib_board_t *board, const gpib_board_config_t *config) return -EBUSY; } hp_priv->raw_iobase = upper_iomem_base; - tms_priv->iobase = ioremap(upper_iomem_base, hp82335_upper_iomem_size); + tms_priv->mmiobase = ioremap(upper_iomem_base, hp82335_upper_iomem_size); pr_info("hp82335: upper half of 82335 iomem region 0x%lx remapped to 0x%p\n", - hp_priv->raw_iobase, tms_priv->iobase); + hp_priv->raw_iobase, tms_priv->mmiobase); retval = request_irq(config->ibirq, hp82335_interrupt, 0, "hp82335", board); if (retval) { @@ -296,7 +297,7 @@ int hp82335_attach(gpib_board_t *board, const gpib_board_config_t *config) hp82335_clear_interrupt(hp_priv); - writeb(INTR_ENABLE, tms_priv->iobase + HPREG_CCR); + writeb(INTR_ENABLE, tms_priv->mmiobase + HPREG_CCR); tms9914_online(board, tms_priv); @@ -312,10 +313,10 @@ void hp82335_detach(gpib_board_t *board) tms_priv = &hp_priv->tms9914_priv; if (hp_priv->irq) free_irq(hp_priv->irq, board); - if (tms_priv->iobase) { - writeb(0, tms_priv->iobase + HPREG_CCR); + if (tms_priv->mmiobase) { + writeb(0, tms_priv->mmiobase + HPREG_CCR); tms9914_board_reset(tms_priv); - iounmap((void *)tms_priv->iobase); + iounmap(tms_priv->mmiobase); } if (hp_priv->raw_iobase) release_mem_region(hp_priv->raw_iobase, hp82335_upper_iomem_size); diff --git a/drivers/staging/gpib/hp_82341/hp_82341.c b/drivers/staging/gpib/hp_82341/hp_82341.c index 
8ad1c885a9fb..71d481e88bd9 100644 --- a/drivers/staging/gpib/hp_82341/hp_82341.c +++ b/drivers/staging/gpib/hp_82341/hp_82341.c @@ -473,12 +473,12 @@ void hp_82341_free_private(gpib_board_t *board) static uint8_t hp_82341_read_byte(struct tms9914_priv *priv, unsigned int register_num) { - return inb((unsigned long)(priv->iobase) + register_num); + return inb(priv->iobase + register_num); } static void hp_82341_write_byte(struct tms9914_priv *priv, uint8_t data, unsigned int register_num) { - outb(data, (unsigned long)(priv->iobase) + register_num); + outb(data, priv->iobase + register_num); } static int hp_82341_find_isapnp_board(struct pnp_dev **dev) @@ -682,8 +682,8 @@ int hp_82341_attach(gpib_board_t *board, const gpib_board_config_t *config) { struct hp_82341_priv *hp_priv; struct tms9914_priv *tms_priv; - unsigned long start_addr; - void *iobase; + u32 start_addr; + u32 iobase; int irq; int i; int retval; @@ -704,7 +704,7 @@ int hp_82341_attach(gpib_board_t *board, const gpib_board_config_t *config) if (retval < 0) return retval; hp_priv->pnp_dev = dev; - iobase = (void *)(pnp_port_start(dev, 0)); + iobase = pnp_port_start(dev, 0); irq = pnp_irq(dev, 0); hp_priv->hw_version = HW_VERSION_82341D; hp_priv->io_region_offset = 0x8; @@ -714,9 +714,9 @@ int hp_82341_attach(gpib_board_t *board, const gpib_board_config_t *config) hp_priv->hw_version = HW_VERSION_82341C; hp_priv->io_region_offset = 0x400; } - pr_info("hp_82341: base io 0x%p\n", iobase); + pr_info("hp_82341: base io 0x%u\n", iobase); for (i = 0; i < hp_82341_num_io_regions; ++i) { - start_addr = (unsigned long)(iobase) + i * hp_priv->io_region_offset; + start_addr = iobase + i * hp_priv->io_region_offset; if (!request_region(start_addr, hp_82341_region_iosize, "hp_82341")) { pr_err("hp_82341: failed to allocate io ports 0x%lx-0x%lx\n", start_addr, @@ -725,7 +725,7 @@ int hp_82341_attach(gpib_board_t *board, const gpib_board_config_t *config) } hp_priv->iobase[i] = start_addr; } - tms_priv->iobase = (void 
*)(hp_priv->iobase[2]); + tms_priv->iobase = hp_priv->iobase[2]; if (hp_priv->hw_version == HW_VERSION_82341D) { retval = isapnp_cfg_begin(hp_priv->pnp_dev->card->number, hp_priv->pnp_dev->number); diff --git a/drivers/staging/gpib/include/gpib_types.h b/drivers/staging/gpib/include/gpib_types.h index ee2643da6d71..b41781a55a60 100644 --- a/drivers/staging/gpib/include/gpib_types.h +++ b/drivers/staging/gpib/include/gpib_types.h @@ -31,7 +31,8 @@ typedef struct { void *init_data; int init_data_length; /* IO base address to use for non-pnp cards (set by core, driver should make local copy) */ - void *ibbase; + u32 ibbase; + void __iomem *mmibbase; /* IRQ to use for non-pnp cards (set by core, driver should make local copy) */ unsigned int ibirq; /* dma channel to use for non-pnp cards (set by core, driver should make local copy) */ diff --git a/drivers/staging/gpib/include/nec7210.h b/drivers/staging/gpib/include/nec7210.h index c00aba4ce846..ca998c4a84bf 100644 --- a/drivers/staging/gpib/include/nec7210.h +++ b/drivers/staging/gpib/include/nec7210.h @@ -18,7 +18,10 @@ /* struct used to provide variables local to a nec7210 chip */ struct nec7210_priv { - void *iobase; +#ifdef CONFIG_HAS_IOPORT + u32 iobase; +#endif + void __iomem *mmiobase; unsigned int offset; // offset between successive nec7210 io addresses unsigned int dma_channel; u8 *dma_buffer; diff --git a/drivers/staging/gpib/include/tms9914.h b/drivers/staging/gpib/include/tms9914.h index 456b488212d2..d8c8d1c9b131 100644 --- a/drivers/staging/gpib/include/tms9914.h +++ b/drivers/staging/gpib/include/tms9914.h @@ -20,7 +20,10 @@ enum tms9914_holdoff_mode { /* struct used to provide variables local to a tms9914 chip */ struct tms9914_priv { - void *iobase; +#ifdef CONFIG_HAS_IOPORT + u32 iobase; +#endif + void __iomem *mmiobase; unsigned int offset; // offset between successive tms9914 io addresses unsigned int dma_channel; // software copy of bits written to interrupt mask registers diff --git 
a/drivers/staging/gpib/ines/ines.h b/drivers/staging/gpib/ines/ines.h index 7e8302619998..eed038fd3f28 100644 --- a/drivers/staging/gpib/ines/ines.h +++ b/drivers/staging/gpib/ines/ines.h @@ -83,14 +83,14 @@ void ines_set_xfer_counter(struct ines_priv *priv, unsigned int count); /* inb/outb wrappers */ static inline unsigned int ines_inb(struct ines_priv *priv, unsigned int register_number) { - return inb((unsigned long)(priv->nec7210_priv.iobase) + + return inb(priv->nec7210_priv.iobase + register_number * priv->nec7210_priv.offset); } static inline void ines_outb(struct ines_priv *priv, unsigned int value, unsigned int register_number) { - outb(value, (unsigned long)(priv->nec7210_priv.iobase) + + outb(value, priv->nec7210_priv.iobase + register_number * priv->nec7210_priv.offset); } diff --git a/drivers/staging/gpib/ines/ines_gpib.c b/drivers/staging/gpib/ines/ines_gpib.c index 9d8387c3bf01..e18455ba842f 100644 --- a/drivers/staging/gpib/ines/ines_gpib.c +++ b/drivers/staging/gpib/ines/ines_gpib.c @@ -273,10 +273,10 @@ irqreturn_t ines_pci_interrupt(int irq, void *arg) struct nec7210_priv *nec_priv = &priv->nec7210_priv; if (priv->pci_chip_type == PCI_CHIP_QUANCOM) { - if ((inb((unsigned long)nec_priv->iobase + + if ((inb(nec_priv->iobase + QUANCOM_IRQ_CONTROL_STATUS_REG) & QUANCOM_IRQ_ASSERTED_BIT)) - outb(QUANCOM_IRQ_ENABLE_BIT, (unsigned long)(nec_priv->iobase) + + outb(QUANCOM_IRQ_ENABLE_BIT, nec_priv->iobase + QUANCOM_IRQ_CONTROL_STATUS_REG); } @@ -780,8 +780,8 @@ static int ines_common_pci_attach(gpib_board_t *board, const gpib_board_config_t if (pci_request_regions(ines_priv->pci_device, "ines-gpib")) return -1; - nec_priv->iobase = (void *)(pci_resource_start(ines_priv->pci_device, - found_id.gpib_region)); + nec_priv->iobase = pci_resource_start(ines_priv->pci_device, + found_id.gpib_region); ines_priv->pci_chip_type = found_id.pci_chip_type; nec_priv->offset = found_id.io_offset; @@ -840,7 +840,7 @@ static int ines_common_pci_attach(gpib_board_t 
*board, const gpib_board_config_t } break; case PCI_CHIP_QUANCOM: - outb(QUANCOM_IRQ_ENABLE_BIT, (unsigned long)(nec_priv->iobase) + + outb(QUANCOM_IRQ_ENABLE_BIT, nec_priv->iobase + QUANCOM_IRQ_CONTROL_STATUS_REG); break; case PCI_CHIP_QUICKLOGIC5030: @@ -899,8 +899,8 @@ int ines_isa_attach(gpib_board_t *board, const gpib_board_config_t *config) ines_priv = board->private_data; nec_priv = &ines_priv->nec7210_priv; - if (!request_region((unsigned long)config->ibbase, ines_isa_iosize, "ines_gpib")) { - pr_err("ines_gpib: ioports at 0x%p already in use\n", config->ibbase); + if (!request_region(config->ibbase, ines_isa_iosize, "ines_gpib")) { + pr_err("ines_gpib: ioports at 0x%x already in use\n", config->ibbase); return -1; } nec_priv->iobase = config->ibbase; @@ -931,7 +931,7 @@ void ines_pci_detach(gpib_board_t *board) break; case PCI_CHIP_QUANCOM: if (nec_priv->iobase) - outb(0, (unsigned long)(nec_priv->iobase) + + outb(0, nec_priv->iobase + QUANCOM_IRQ_CONTROL_STATUS_REG); break; default: @@ -960,7 +960,7 @@ void ines_isa_detach(gpib_board_t *board) free_irq(ines_priv->irq, board); if (nec_priv->iobase) { nec7210_board_reset(nec_priv, board); - release_region((unsigned long)(nec_priv->iobase), ines_isa_iosize); + release_region(nec_priv->iobase, ines_isa_iosize); } } ines_free_private(board); @@ -1355,7 +1355,7 @@ int ines_common_pcmcia_attach(gpib_board_t *board) return -1; } - nec_priv->iobase = (void *)(unsigned long)curr_dev->resource[0]->start; + nec_priv->iobase = curr_dev->resource[0]->start; nec7210_board_reset(nec_priv, board); @@ -1410,7 +1410,7 @@ void ines_pcmcia_detach(gpib_board_t *board) free_irq(ines_priv->irq, board); if (nec_priv->iobase) { nec7210_board_reset(nec_priv, board); - release_region((unsigned long)(nec_priv->iobase), ines_pcmcia_iosize); + release_region(nec_priv->iobase, ines_pcmcia_iosize); } } ines_free_private(board); diff --git a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c 
b/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c index 200885e3ab82..93a05c792816 100644 --- a/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c +++ b/drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c @@ -436,7 +436,7 @@ static void set_timeout(gpib_board_t *board) static int usb_gpib_attach(gpib_board_t *board, const gpib_board_config_t *config) { int retval, j; - int base = (long)config->ibbase; + u32 base = config->ibbase; char *device_path; int match; struct usb_device *udev; diff --git a/drivers/staging/gpib/nec7210/nec7210.c b/drivers/staging/gpib/nec7210/nec7210.c index 1d9951035497..c9a837fad96e 100644 --- a/drivers/staging/gpib/nec7210/nec7210.c +++ b/drivers/staging/gpib/nec7210/nec7210.c @@ -1035,7 +1035,7 @@ EXPORT_SYMBOL(nec7210_board_online); /* wrappers for io */ uint8_t nec7210_ioport_read_byte(struct nec7210_priv *priv, unsigned int register_num) { - return inb((unsigned long)(priv->iobase) + register_num * priv->offset); + return inb(priv->iobase + register_num * priv->offset); } EXPORT_SYMBOL(nec7210_ioport_read_byte); @@ -1047,7 +1047,7 @@ void nec7210_ioport_write_byte(struct nec7210_priv *priv, uint8_t data, unsigned */ nec7210_locking_ioport_write_byte(priv, data, register_num); else - outb(data, (unsigned long)(priv->iobase) + register_num * priv->offset); + outb(data, priv->iobase + register_num * priv->offset); } EXPORT_SYMBOL(nec7210_ioport_write_byte); @@ -1058,7 +1058,7 @@ uint8_t nec7210_locking_ioport_read_byte(struct nec7210_priv *priv, unsigned int unsigned long flags; spin_lock_irqsave(&priv->register_page_lock, flags); - retval = inb((unsigned long)(priv->iobase) + register_num * priv->offset); + retval = inb(priv->iobase + register_num * priv->offset); spin_unlock_irqrestore(&priv->register_page_lock, flags); return retval; } @@ -1072,7 +1072,7 @@ void nec7210_locking_ioport_write_byte(struct nec7210_priv *priv, uint8_t data, spin_lock_irqsave(&priv->register_page_lock, flags); if (register_num == AUXMR) udelay(1); - 
outb(data, (unsigned long)(priv->iobase) + register_num * priv->offset); + outb(data, priv->iobase + register_num * priv->offset); spin_unlock_irqrestore(&priv->register_page_lock, flags); } EXPORT_SYMBOL(nec7210_locking_ioport_write_byte); @@ -1080,7 +1080,7 @@ EXPORT_SYMBOL(nec7210_locking_ioport_write_byte); uint8_t nec7210_iomem_read_byte(struct nec7210_priv *priv, unsigned int register_num) { - return readb(priv->iobase + register_num * priv->offset); + return readb(priv->mmiobase + register_num * priv->offset); } EXPORT_SYMBOL(nec7210_iomem_read_byte); @@ -1092,7 +1092,7 @@ void nec7210_iomem_write_byte(struct nec7210_priv *priv, uint8_t data, unsigned */ nec7210_locking_iomem_write_byte(priv, data, register_num); else - writeb(data, priv->iobase + register_num * priv->offset); + writeb(data, priv->mmiobase + register_num * priv->offset); } EXPORT_SYMBOL(nec7210_iomem_write_byte); @@ -1102,7 +1102,7 @@ uint8_t nec7210_locking_iomem_read_byte(struct nec7210_priv *priv, unsigned int unsigned long flags; spin_lock_irqsave(&priv->register_page_lock, flags); - retval = readb(priv->iobase + register_num * priv->offset); + retval = readb(priv->mmiobase + register_num * priv->offset); spin_unlock_irqrestore(&priv->register_page_lock, flags); return retval; } @@ -1116,7 +1116,7 @@ void nec7210_locking_iomem_write_byte(struct nec7210_priv *priv, uint8_t data, spin_lock_irqsave(&priv->register_page_lock, flags); if (register_num == AUXMR) udelay(1); - writeb(data, priv->iobase + register_num * priv->offset); + writeb(data, priv->mmiobase + register_num * priv->offset); spin_unlock_irqrestore(&priv->register_page_lock, flags); } EXPORT_SYMBOL(nec7210_locking_iomem_write_byte); diff --git a/drivers/staging/gpib/pc2/pc2_gpib.c b/drivers/staging/gpib/pc2/pc2_gpib.c index 7b3b34f47341..d22af25125b1 100644 --- a/drivers/staging/gpib/pc2/pc2_gpib.c +++ b/drivers/staging/gpib/pc2/pc2_gpib.c @@ -426,7 +426,7 @@ int pc2_attach(gpib_board_t *board, const gpib_board_config_t 
*config) nec_priv = &pc2_priv->nec7210_priv; nec_priv->offset = pc2_reg_offset; - if (request_region((unsigned long)config->ibbase, pc2_iosize, "pc2") == 0) { + if (request_region(config->ibbase, pc2_iosize, "pc2") == 0) { pr_err("gpib: ioports are already in use\n"); return -1; } @@ -471,7 +471,7 @@ void pc2_detach(gpib_board_t *board) free_irq(pc2_priv->irq, board); if (nec_priv->iobase) { nec7210_board_reset(nec_priv, board); - release_region((unsigned long)(nec_priv->iobase), pc2_iosize); + release_region(nec_priv->iobase, pc2_iosize); } if (nec_priv->dma_buffer) { dma_free_coherent(board->dev, nec_priv->dma_buffer_length, @@ -498,14 +498,14 @@ static int pc2a_common_attach(gpib_board_t *board, const gpib_board_config_t *co nec_priv = &pc2_priv->nec7210_priv; nec_priv->offset = pc2a_reg_offset; - switch ((unsigned long)(config->ibbase)) { + switch (config->ibbase) { case 0x02e1: case 0x22e1: case 0x42e1: case 0x62e1: break; default: - pr_err("PCIIa base range invalid, must be one of 0x[0246]2e1, but is 0x%p\n", + pr_err("PCIIa base range invalid, must be one of 0x[0246]2e1, but is 0x%d\n", config->ibbase); return -1; } @@ -522,7 +522,7 @@ static int pc2a_common_attach(gpib_board_t *board, const gpib_board_config_t *co unsigned int err = 0; for (i = 0; i < num_registers; i++) { - if (check_region((unsigned long)config->ibbase + i * pc2a_reg_offset, 1)) + if (check_region(config->ibbase + i * pc2a_reg_offset, 1)) err++; } if (config->ibirq && check_region(pc2a_clear_intr_iobase + config->ibirq, 1)) @@ -533,11 +533,11 @@ static int pc2a_common_attach(gpib_board_t *board, const gpib_board_config_t *co } #endif for (i = 0; i < num_registers; i++) { - if (!request_region((unsigned long)config->ibbase + + if (!request_region(config->ibbase + i * pc2a_reg_offset, 1, "pc2a")) { pr_err("gpib: ioports are already in use"); for (j = 0; j < i; j++) - release_region((unsigned long)(config->ibbase) + + release_region(config->ibbase + j * pc2a_reg_offset, 1); return -1; } @@ 
-608,7 +608,7 @@ static void pc2a_common_detach(gpib_board_t *board, unsigned int num_registers) if (nec_priv->iobase) { nec7210_board_reset(nec_priv, board); for (i = 0; i < num_registers; i++) - release_region((unsigned long)nec_priv->iobase + + release_region(nec_priv->iobase + i * pc2a_reg_offset, 1); } if (pc2_priv->clear_intr_addr) diff --git a/drivers/staging/gpib/tms9914/tms9914.c b/drivers/staging/gpib/tms9914/tms9914.c index 152b243b845b..ec8e1d4d762f 100644 --- a/drivers/staging/gpib/tms9914/tms9914.c +++ b/drivers/staging/gpib/tms9914/tms9914.c @@ -866,14 +866,14 @@ EXPORT_SYMBOL_GPL(tms9914_online); // wrapper for inb uint8_t tms9914_ioport_read_byte(struct tms9914_priv *priv, unsigned int register_num) { - return inb((unsigned long)(priv->iobase) + register_num * priv->offset); + return inb(priv->iobase + register_num * priv->offset); } EXPORT_SYMBOL_GPL(tms9914_ioport_read_byte); // wrapper for outb void tms9914_ioport_write_byte(struct tms9914_priv *priv, uint8_t data, unsigned int register_num) { - outb(data, (unsigned long)(priv->iobase) + register_num * priv->offset); + outb(data, priv->iobase + register_num * priv->offset); if (register_num == AUXCR) udelay(1); } @@ -883,14 +883,14 @@ EXPORT_SYMBOL_GPL(tms9914_ioport_write_byte); // wrapper for readb uint8_t tms9914_iomem_read_byte(struct tms9914_priv *priv, unsigned int register_num) { - return readb(priv->iobase + register_num * priv->offset); + return readb(priv->mmiobase + register_num * priv->offset); } EXPORT_SYMBOL_GPL(tms9914_iomem_read_byte); // wrapper for writeb void tms9914_iomem_write_byte(struct tms9914_priv *priv, uint8_t data, unsigned int register_num) { - writeb(data, priv->iobase + register_num * priv->offset); + writeb(data, priv->mmiobase + register_num * priv->offset); if (register_num == AUXCR) udelay(1); } diff --git a/drivers/staging/gpib/tnt4882/mite.h b/drivers/staging/gpib/tnt4882/mite.h index 6454d069b8cc..7a475279b2fb 100644 --- a/drivers/staging/gpib/tnt4882/mite.h 
+++ b/drivers/staging/gpib/tnt4882/mite.h @@ -34,9 +34,9 @@ struct mite_struct { struct pci_dev *pcidev; unsigned long mite_phys_addr; - void *mite_io_addr; + void __iomem *mite_io_addr; unsigned long daq_phys_addr; - void *daq_io_addr; + void __iomem *daq_io_addr; int DMA_CheckNearEnd; diff --git a/drivers/staging/gpib/tnt4882/tnt4882_gpib.c b/drivers/staging/gpib/tnt4882/tnt4882_gpib.c index 408a123e9542..ce91c3eb768c 100644 --- a/drivers/staging/gpib/tnt4882/tnt4882_gpib.c +++ b/drivers/staging/gpib/tnt4882/tnt4882_gpib.c @@ -100,23 +100,23 @@ static const int atgpib_iosize = 32; /* paged io */ static inline unsigned int tnt_paged_readb(struct tnt4882_priv *priv, unsigned long offset) { - iowrite8(AUX_PAGEIN, priv->nec7210_priv.iobase + AUXMR * priv->nec7210_priv.offset); + iowrite8(AUX_PAGEIN, priv->nec7210_priv.mmiobase + AUXMR * priv->nec7210_priv.offset); udelay(1); - return ioread8(priv->nec7210_priv.iobase + offset); + return ioread8(priv->nec7210_priv.mmiobase + offset); } static inline void tnt_paged_writeb(struct tnt4882_priv *priv, unsigned int value, unsigned long offset) { - iowrite8(AUX_PAGEIN, priv->nec7210_priv.iobase + AUXMR * priv->nec7210_priv.offset); + iowrite8(AUX_PAGEIN, priv->nec7210_priv.mmiobase + AUXMR * priv->nec7210_priv.offset); udelay(1); - iowrite8(value, priv->nec7210_priv.iobase + offset); + iowrite8(value, priv->nec7210_priv.mmiobase + offset); } /* readb/writeb wrappers */ static inline unsigned short tnt_readb(struct tnt4882_priv *priv, unsigned long offset) { - void *address = priv->nec7210_priv.iobase + offset; + void *address = priv->nec7210_priv.mmiobase + offset; unsigned long flags; unsigned short retval; spinlock_t *register_lock = &priv->nec7210_priv.register_page_lock; @@ -154,7 +154,7 @@ static inline unsigned short tnt_readb(struct tnt4882_priv *priv, unsigned long static inline void tnt_writeb(struct tnt4882_priv *priv, unsigned short value, unsigned long offset) { - void *address = priv->nec7210_priv.iobase + 
offset; + void *address = priv->nec7210_priv.mmiobase + offset; unsigned long flags; spinlock_t *register_lock = &priv->nec7210_priv.register_page_lock; @@ -284,7 +284,7 @@ static int drain_fifo_words(struct tnt4882_priv *tnt_priv, uint8_t *buffer, int while (fifo_word_available(tnt_priv) && count + 2 <= num_bytes) { short word; - word = ioread16(nec_priv->iobase + FIFOB); + word = ioread16(nec_priv->mmiobase + FIFOB); buffer[count++] = word & 0xff; buffer[count++] = (word >> 8) & 0xff; } @@ -569,7 +569,7 @@ static int generic_write(gpib_board_t *board, uint8_t *buffer, size_t length, word = buffer[count++] & 0xff; if (count < length) word |= (buffer[count++] << 8) & 0xff00; - iowrite16(word, nec_priv->iobase + FIFOB); + iowrite16(word, nec_priv->mmiobase + FIFOB); } // avoid unnecessary HR_NFF interrupts // tnt_priv->imr3_bits |= HR_NFF; @@ -1316,7 +1316,7 @@ int ni_pci_attach(gpib_board_t *board, const gpib_board_config_t *config) return retval; } - nec_priv->iobase = tnt_priv->mite->daq_io_addr; + nec_priv->mmiobase = tnt_priv->mite->daq_io_addr; // get irq if (request_irq(mite_irq(tnt_priv->mite), tnt4882_interrupt, isr_flags, @@ -1351,7 +1351,7 @@ void ni_pci_detach(gpib_board_t *board) if (tnt_priv) { nec_priv = &tnt_priv->nec7210_priv; - if (nec_priv->iobase) + if (nec_priv->mmiobase) tnt4882_board_reset(tnt_priv, board); if (tnt_priv->irq) free_irq(tnt_priv->irq, board); @@ -1392,7 +1392,7 @@ static int ni_isa_attach_common(gpib_board_t *board, const gpib_board_config_t * struct tnt4882_priv *tnt_priv; struct nec7210_priv *nec_priv; int isr_flags = 0; - void *iobase; + u32 iobase; int irq; board->status = 0; @@ -1415,19 +1415,19 @@ static int ni_isa_attach_common(gpib_board_t *board, const gpib_board_config_t * if (retval < 0) return retval; tnt_priv->pnp_dev = dev; - iobase = (void *)(pnp_port_start(dev, 0)); + iobase = pnp_port_start(dev, 0); irq = pnp_irq(dev, 0); } else { iobase = config->ibbase; irq = config->ibirq; } // allocate ioports - if 
(!request_region((unsigned long)(iobase), atgpib_iosize, "atgpib")) { + if (!request_region(iobase, atgpib_iosize, "atgpib")) { pr_err("tnt4882: failed to allocate ioports\n"); return -1; } - nec_priv->iobase = ioport_map(iobase, atgpib_iosize); - if (!nec_priv->iobase) + nec_priv->mmiobase = ioport_map(iobase, atgpib_iosize); + if (!nec_priv->mmiobase) return -1; // get irq @@ -1468,10 +1468,10 @@ void ni_isa_detach(gpib_board_t *board) tnt4882_board_reset(tnt_priv, board); if (tnt_priv->irq) free_irq(tnt_priv->irq, board); + if (nec_priv->mmiobase) + ioport_unmap(nec_priv->mmiobase); if (nec_priv->iobase) - ioport_unmap(nec_priv->iobase); - if (nec_priv->iobase) - release_region((unsigned long)(nec_priv->iobase), atgpib_iosize); + release_region(nec_priv->iobase, atgpib_iosize); if (tnt_priv->pnp_dev) pnp_device_detach(tnt_priv->pnp_dev); } @@ -1823,9 +1823,9 @@ int ni_pcmcia_attach(gpib_board_t *board, const gpib_board_config_t *config) return -EIO; } - nec_priv->iobase = ioport_map(curr_dev->resource[0]->start, + nec_priv->mmiobase = ioport_map(curr_dev->resource[0]->start, resource_size(curr_dev->resource[0])); - if (!nec_priv->iobase) + if (!nec_priv->mmiobase) return -1; // get irq @@ -1851,11 +1851,11 @@ void ni_pcmcia_detach(gpib_board_t *board) nec_priv = &tnt_priv->nec7210_priv; if (tnt_priv->irq) free_irq(tnt_priv->irq, board); - if (nec_priv->iobase) - ioport_unmap(nec_priv->iobase); + if (nec_priv->mmiobase) + ioport_unmap(nec_priv->mmiobase); if (nec_priv->iobase) { tnt4882_board_reset(tnt_priv, board); - release_region((unsigned long)nec_priv->iobase, pcmcia_gpib_iosize); + release_region(nec_priv->iobase, pcmcia_gpib_iosize); } } tnt4882_free_private(board); From 669bf56cb2a197bca968ed6079226ee340606671 Mon Sep 17 00:00:00 2001 From: Bingwu Zhang Date: Sun, 8 Dec 2024 12:13:52 +0800 Subject: [PATCH 409/807] mailmap: update Bingwu Zhang's email address I used to contribute to the kernel as 'Bingwu Zhang ' and 'Zhang Bingwu '. 
Signed-off-by: Bingwu Zhang Link: https://lore.kernel.org/r/20241208041352.168131-2-xtex@envs.net Signed-off-by: Greg Kroah-Hartman --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 5ff0e5d681e7..0fa6d27adec2 100644 --- a/.mailmap +++ b/.mailmap @@ -121,6 +121,8 @@ Ben Widawsky Benjamin Poirier Benjamin Tissoires Benjamin Tissoires +Bingwu Zhang +Bingwu Zhang Bjorn Andersson Bjorn Andersson Bjorn Andersson From 54f89b3178d5448dd4457afbb98fc1ab99090a65 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 10 Dec 2024 01:20:38 +0000 Subject: [PATCH 410/807] tcp_bpf: Charge receive socket buffer in bpf_tcp_ingress() When bpf_tcp_ingress() is called, the skmsg is being redirected to the ingress of the destination socket. Therefore, we should charge its receive socket buffer, instead of sending socket buffer. Because sk_rmem_schedule() tests pfmemalloc of skb, we need to introduce a wrapper and call it for skmsg. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-2-zijianzhang@bytedance.com --- include/net/sock.h | 10 ++++++++-- net/ipv4/tcp_bpf.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 7464e9f9f47c..c383126f691d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1527,7 +1527,7 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size) } static inline bool -sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) { int delta; @@ -1535,7 +1535,13 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || - skb_pfmemalloc(skb); + pfmemalloc; +} + +static inline bool 
+sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +{ + return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } static inline int sk_unused_reserved_mem(const struct sock *sk) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 99cef92e6290..b21ea634909c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -49,7 +49,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, sge = sk_msg_elem(msg, i); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; - if (!sk_wmem_schedule(sk, size)) { + if (!__sk_rmem_schedule(sk, size, false)) { if (!copied) ret = -ENOMEM; break; From d888b7af7c149c115dd6ac772cc11c375da3e17c Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Tue, 10 Dec 2024 01:20:39 +0000 Subject: [PATCH 411/807] tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will be created out of the skb, and the rmem accounting of the sk_msg will be handled by the skb. For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result, except for the global memory limit, the rmem of sk_redir is nearly unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer. Since the functions sk_msg_recvmsg and __sk_psock_purge_ingress_msg are used in both of these paths, we use "msg->skb" to test whether the sk_msg is backed by an skb. If it's not, we do the memory accounting explicitly.
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Zijian Zhang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com --- include/linux/skmsg.h | 11 ++++++++--- net/core/skmsg.c | 6 +++++- net/ipv4/tcp_bpf.c | 4 +++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9b03e0746e7..2cbe0c22a32f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -317,17 +317,22 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void sk_psock_queue_msg(struct sk_psock *psock, +static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + bool ret; + spin_lock_bh(&psock->ingress_lock); - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); - else { + ret = true; + } else { sk_msg_free(psock->sk, msg); kfree(msg); + ret = false; } spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e90fbab703b2..8ad7e6755fd6 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -445,8 +445,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); + atomic_sub(copy, &sk->sk_rmem_alloc); + } msg_rx->sg.size -= copy; if (!sge->length) { @@ -772,6 +774,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } diff --git 
a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index b21ea634909c..392678ae80f4 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -56,6 +56,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, } sk_mem_charge(sk, size); + atomic_add(size, &sk->sk_rmem_alloc); sk_msg_xfer(tmp, msg, i, size); copied += size; if (sge->length) @@ -74,7 +75,8 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, if (!ret) { msg->sg.start = i; - sk_psock_queue_msg(psock, tmp); + if (!sk_psock_queue_msg(psock, tmp)) + atomic_sub(copied, &sk->sk_rmem_alloc); sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); From 724c6ce38bbaeb4b3f109b0e066d6c0ecd15446c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 19 Dec 2024 14:57:34 +0100 Subject: [PATCH 412/807] stddef: make __struct_group() UAPI C++-friendly For the most part of the C++ history, it couldn't have type declarations inside anonymous unions for different reasons. At the same time, __struct_group() relies on the latter, so when the @TAG argument is not empty, C++ code doesn't want to build (even under `extern "C"`): ../linux/include/uapi/linux/pkt_cls.h:25:24: error: 'struct tc_u32_sel::<unnamed union>::tc_u32_sel_hdr,' invalid; an anonymous union may only have public non-static data members [-fpermissive] The safest way to fix this without trying to switch standards (which is impossible in UAPI anyway) etc., is to disable tag declaration for that language. This won't break anything since for now it's not buildable at all. Use a separate definition for __struct_group() when __cplusplus is defined to mitigate the error, including the version from tools/. Fixes: 50d7bd38c3aa ("stddef: Introduce struct_group() helper macro") Reported-by: Christopher Ferris Closes: https://lore.kernel.org/linux-hardening/Z1HZpe3WE5As8UAz@google.com Suggested-by: Kees Cook # __struct_group_tag() Signed-off-by: Alexander Lobakin Reviewed-by: Gustavo A. R.
Silva Link: https://lore.kernel.org/r/20241219135734.2130002-1-aleksander.lobakin@intel.com Signed-off-by: Kees Cook --- include/uapi/linux/stddef.h | 13 ++++++++++--- tools/include/uapi/linux/stddef.h | 15 +++++++++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index 58154117d9b0..a6fce46aeb37 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -8,6 +8,13 @@ #define __always_inline inline #endif +/* Not all C++ standards support type declarations inside an anonymous union */ +#ifndef __cplusplus +#define __struct_group_tag(TAG) TAG +#else +#define __struct_group_tag(TAG) +#endif + /** * __struct_group() - Create a mirrored named and anonyomous struct * @@ -20,13 +27,13 @@ * and size: one anonymous and one named. The former's members can be used * normally without sub-struct naming, and the latter can be used to * reason about the start, end, and size of the group of struct members. - * The named struct can also be explicitly tagged for layer reuse, as well - * as both having struct attributes appended. + * The named struct can also be explicitly tagged for layer reuse (C only), + * as well as both having struct attributes appended. */ #define __struct_group(TAG, NAME, ATTRS, MEMBERS...) 
\ union { \ struct { MEMBERS } ATTRS; \ - struct TAG { MEMBERS } ATTRS NAME; \ + struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \ } ATTRS #ifdef __cplusplus diff --git a/tools/include/uapi/linux/stddef.h b/tools/include/uapi/linux/stddef.h index bb6ea517efb5..c53cde425406 100644 --- a/tools/include/uapi/linux/stddef.h +++ b/tools/include/uapi/linux/stddef.h @@ -8,6 +8,13 @@ #define __always_inline __inline__ #endif +/* Not all C++ standards support type declarations inside an anonymous union */ +#ifndef __cplusplus +#define __struct_group_tag(TAG) TAG +#else +#define __struct_group_tag(TAG) +#endif + /** * __struct_group() - Create a mirrored named and anonyomous struct * @@ -20,14 +27,14 @@ * and size: one anonymous and one named. The former's members can be used * normally without sub-struct naming, and the latter can be used to * reason about the start, end, and size of the group of struct members. - * The named struct can also be explicitly tagged for layer reuse, as well - * as both having struct attributes appended. + * The named struct can also be explicitly tagged for layer reuse (C only), + * as well as both having struct attributes appended. */ #define __struct_group(TAG, NAME, ATTRS, MEMBERS...) \ union { \ struct { MEMBERS } ATTRS; \ - struct TAG { MEMBERS } ATTRS NAME; \ - } + struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \ + } ATTRS /** * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union From 246068b86b1c36e4590388ab8f278e21f1997dc1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 19 Dec 2024 17:54:10 +0200 Subject: [PATCH 413/807] selftests: net: local_termination: require mausezahn Since the blamed commit, we require mausezahn because send_raw() uses it. Remove the "REQUIRE_MZ=no" line, which overwrites the default of requiring it. 
Fixes: 237979504264 ("selftests: net: local_termination: add PTP frames to the mix") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241219155410.1856868-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/local_termination.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index c35548767756..ecd34f364125 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -7,7 +7,6 @@ ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge test_vlan \ NUM_NETIFS=2 PING_COUNT=1 REQUIRE_MTOOLS=yes -REQUIRE_MZ=no source lib.sh From 4a25201aa46ce88e8e31f9ccdec0e4e3dd6bb736 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Dec 2024 19:28:32 -0800 Subject: [PATCH 414/807] netdev-genl: avoid empty messages in napi get Empty netlink responses from do() are not correct (as opposed to dump() where not dumping anything is perfectly fine). We should return an error if the target object does not exist, in this case if the netdev is down we "hide" the NAPI instances. 
Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241219032833.1165433-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 2d3ae0cd3ad2..b0772d135efb 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -246,8 +246,12 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rcu_read_unlock(); rtnl_unlock(); - if (err) + if (err) { goto err_free_msg; + } else if (!rsp->len) { + err = -ENOENT; + goto err_free_msg; + } return genlmsg_reply(rsp, info); From 30b981796b94b083da8fdded7cb74cb493608760 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Dec 2024 19:28:33 -0800 Subject: [PATCH 415/807] selftests: drv-net: test empty queue and NAPI responses in netlink Make sure the kernel doesn't respond to GETs for queues and NAPIs when the link is down - neither with valid data nor with an empty message; we want an ENOENT.
Link: https://patch.msgid.link/20241219032833.1165433-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/queues.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 9c5473abbd78..38303da957ee 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_run, ksft_exit, ksft_eq, KsftSkipEx -from lib.py import EthtoolFamily, NetdevFamily +from lib.py import ksft_disruptive, ksft_exit, ksft_run +from lib.py import ksft_eq, ksft_raises, KsftSkipEx +from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import NetDrvEnv -from lib.py import cmd +from lib.py import cmd, defer, ip +import errno import glob @@ -59,9 +61,27 @@ def addremove_queues(cfg, nl) -> None: ksft_eq(queues, expected) +@ksft_disruptive +def check_down(cfg, nl) -> None: + # Check the NAPI IDs before interface goes down and hides them + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + + ip(f"link set dev {cfg.dev['ifname']} down") + defer(ip, f"link set dev {cfg.dev['ifname']} up") + + with ksft_raises(NlError) as cm: + nl.queue_get({'ifindex': cfg.ifindex, 'id': 0, 'type': 'rx'}) + ksft_eq(cm.exception.nl_msg.error, -errno.ENOENT) + + if napis: + with ksft_raises(NlError) as cm: + nl.napi_get({'id': napis[0]['id']}) + ksft_eq(cm.exception.nl_msg.error, -errno.ENOENT) + + def main() -> None: with NetDrvEnv(__file__, queue_count=100) as cfg: - ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) + ksft_run([get_queues, addremove_queues, check_down], args=(cfg, NetdevFamily())) ksft_exit() From 973b710b8821c3401ad7a25360c89e94b26884ac Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:01 +0000 Subject: [PATCH 416/807] kheaders: 
Ignore silly-rename files Tell tar to ignore silly-rename files (".__afs*" and ".nfs*") when building the header archive. These occur when a file that is open is unlinked locally, but hasn't yet been closed. Such files are visible to the user via the getdents() syscall and so programs may want to do things with them. During the kernel build, such files may be made during the processing of header files and the cleanup may get deferred by fput() which may result in tar seeing these files when it reads the directory, but they may have disappeared by the time it tries to open them, causing tar to fail with an error. Further, we don't want to include them in the tarball if they still exist. With CONFIG_HEADERS_INSTALL=y, something like the following may be seen: find: './kernel/.tmp_cpio_dir/include/dt-bindings/reset/.__afs2080': No such file or directory tar: ./include/linux/greybus/.__afs3C95: File removed before we read it The find warning doesn't seem to cause a problem. Fix this by telling tar, when called from gen_kheaders.sh, to exclude such files. This only affects afs and nfs; cifs uses the Windows Hidden attribute to prevent the file from being seen. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-2-dhowells@redhat.com cc: Masahiro Yamada cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: linux-nfs@vger.kernel.org cc: linux-kernel@vger.kernel.org Signed-off-by: Christian Brauner --- kernel/gen_kheaders.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 383fd43ac612..7e1340da5aca 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -89,6 +89,7 @@ find $cpio_dir -type f -print0 | # Create archive and try to normalize metadata for reproducibility.
tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ + --exclude=".__afs*" --exclude=".nfs*" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null From c8b90d40d5bba8e6fba457b8a7c10d3c0d467e37 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:02 +0000 Subject: [PATCH 417/807] netfs: Fix non-contiguous donation between completed reads When a read subrequest finishes, if it doesn't have sufficient coverage to complete the folio(s) covering either side of it, it will donate the excess coverage to the adjacent subrequests on either side, offloading responsibility for unlocking the folio(s) covered to them. Now, preference is given to donating down to a lower file offset over donating up because that check is done first - but there's no check that the lower subreq is actually contiguous, and so we can end up donating incorrectly. The scenario seen[1] is that an 8MiB readahead request spanning four 2MiB folios is split into eight 1MiB subreqs (numbered 1 through 8). These terminate in the order 1,6,2,5,3,7,4,8. What happens is: - 1 donates to 2 - 6 donates to 5 - 2 completes, unlocking the first folio (with 1). - 5 completes, unlocking the third folio (with 6). - 3 donates to 4 - 7 donates to 4 incorrectly - 4 completes, unlocking the second folio (with 3), but can't use the excess from 7. - 8 donates to 4, also incorrectly. Fix this by preventing downward donation if the subreqs are not contiguous (in the example above, 7 donates to 4 across the gap left by 5 and 6). 
Reported-by: Shyam Prasad N Closes: https://lore.kernel.org/r/CANT5p=qBwjBm-D8soFVVtswGEfmMtQXVW83=TNfUtvyHeFQZBA@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/526707.1733224486@warthog.procyon.org.uk/ [1] Link: https://lore.kernel.org/r/20241213135013.2964079-3-dhowells@redhat.com cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 3cbb289535a8..b415e3972336 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -247,16 +247,17 @@ donation_changed: /* Deal with the trickiest case: that this subreq is in the middle of a * folio, not touching either edge, but finishes first. In such a - * case, we donate to the previous subreq, if there is one, so that the - * donation is only handled when that completes - and remove this - * subreq from the list. + * case, we donate to the previous subreq, if there is one and if it is + * contiguous, so that the donation is only handled when that completes + * - and remove this subreq from the list. * * If the previous subreq finished first, we will have acquired their * donation and should be able to unlock folios and/or donate nextwards. 
*/ if (!subreq->consumed && !prev_donated && - !list_is_first(&subreq->rreq_link, &rreq->subrequests)) { + !list_is_first(&subreq->rreq_link, &rreq->subrequests) && + subreq->start == prev->start + prev->len) { prev = list_prev_entry(subreq, rreq_link); WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len); subreq->start += subreq->len; From 105549d09a539a876b7c3330ab52d8aceedad358 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:03 +0000 Subject: [PATCH 418/807] netfs: Fix enomem handling in buffered reads If netfs_read_to_pagecache() gets an error from either ->prepare_read() or from netfs_prepare_read_iterator(), it needs to decrement ->nr_outstanding, cancel the subrequest and break out of the issuing loop. Currently, it only does this for two of the cases, but there are two more that aren't handled. Fix this by moving the handling to a common place and jumping to it from all four places. This is in preference to inserting a wrapper around netfs_prepare_read_iterator() as proposed by Dmitry Antipov[1]. 
Link: https://lore.kernel.org/r/20241202093943.227786-1-dmantipov@yandex.ru/ [1] Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Reported-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=404b4b745080b6210c6c Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-4-dhowells@redhat.com Tested-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com cc: Dmitry Antipov cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 7ac34550c403..4dc9b8286355 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -275,22 +275,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) netfs_stat(&netfs_n_rh_download); if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); - if (ret < 0) { - atomic_dec(&rreq->nr_outstanding); - netfs_put_subrequest(subreq, false, - netfs_sreq_trace_put_cancel); - break; - } + if (ret < 0) + goto prep_failed; trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); } slice = netfs_prepare_read_iterator(subreq); - if (slice < 0) { - atomic_dec(&rreq->nr_outstanding); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); - ret = slice; - break; - } + if (slice < 0) + goto prep_iter_failed; rreq->netfs_ops->issue_read(subreq); goto done; @@ -302,6 +294,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) trace_netfs_sreq(subreq, netfs_sreq_trace_submit); netfs_stat(&netfs_n_rh_zero); slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) + goto prep_iter_failed; __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_read_subreq_terminated(subreq, 0, false); goto done; @@ -310,6 +304,8 @@ static void netfs_read_to_pagecache(struct 
netfs_io_request *rreq) if (source == NETFS_READ_FROM_CACHE) { trace_netfs_sreq(subreq, netfs_sreq_trace_submit); slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) + goto prep_iter_failed; netfs_read_cache_to_pagecache(rreq, subreq); goto done; } @@ -318,6 +314,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) WARN_ON_ONCE(1); break; + prep_iter_failed: + ret = slice; + prep_failed: + subreq->error = ret; + atomic_dec(&rreq->nr_outstanding); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + break; + done: size -= slice; start += slice; From 86ad1a58f6a9453f49e06ef957a40a8dac00a13f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:04 +0000 Subject: [PATCH 419/807] nfs: Fix oops in nfs_netfs_init_request() when copying to cache When netfslib wants to copy some data that has just been read on behalf of nfs, it creates a new write request and calls nfs_netfs_init_request() to initialise it, but with a NULL file pointer. This causes nfs_file_open_context() to oops - however, we don't actually need the nfs context as we're only going to write to the cache. Fix this by just returning if we aren't given a file pointer and emit a warning if the request was for something other than copy-to-cache. Further, fix nfs_netfs_free_request() so that it doesn't try to free the context if the pointer is NULL. 
Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+9DyMbKLhyJb7aMLDTb=Fh0T8Teb9sjuf_pze+XWT1VaQ@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-5-dhowells@redhat.com cc: Trond Myklebust cc: Anna Schumaker cc: Dave Wysochanski cc: Jeff Layton cc: linux-nfs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/nfs/fscache.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 810269ee0a50..d49e4ce27999 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -263,6 +263,12 @@ int nfs_netfs_readahead(struct readahead_control *ractl) static atomic_t nfs_netfs_debug_id; static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file) { + if (!file) { + if (WARN_ON_ONCE(rreq->origin != NETFS_PGPRIV2_COPY_TO_CACHE)) + return -EIO; + return 0; + } + rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ @@ -274,7 +280,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi static void nfs_netfs_free_request(struct netfs_io_request *rreq) { - put_nfs_open_context(rreq->netfs_priv); + if (rreq->netfs_priv) + put_nfs_open_context(rreq->netfs_priv); } static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) From e5a8b6446c0d370716f193771ccacf3260a57534 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 13 Dec 2024 13:50:05 +0000 Subject: [PATCH 420/807] cachefiles: Parse the "secctx" immediately Instead of storing an opaque string, call security_secctx_to_secid() right in the "secctx" command handler and store only the numeric "secid". 
This eliminates an unnecessary string allocation and allows the daemon to receive errors when writing the "secctx" command instead of postponing the error to the "bind" command handler. For example, if the kernel was built without `CONFIG_SECURITY`, "bind" will return `EOPNOTSUPP`, but the daemon doesn't know why. With this patch, the "secctx" will instead return `EOPNOTSUPP` which is the right context for this error. This patch adds a boolean flag `have_secid` because I'm not sure if we can safely assume that zero is the special secid value for "not set". This appears to be true for SELinux, Smack and AppArmor, but since this attribute is not documented, I'm unable to derive a stable guarantee for that. Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241209141554.638708-1-max.kellermann@ionos.com/ Link: https://lore.kernel.org/r/20241213135013.2964079-6-dhowells@redhat.com Signed-off-by: Christian Brauner --- fs/cachefiles/daemon.c | 14 +++++++------- fs/cachefiles/internal.h | 3 ++- fs/cachefiles/security.c | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 89b11336a836..1806bff8e59b 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -576,7 +577,7 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) */ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) { - char *secctx; + int err; _enter(",%s", args); @@ -585,16 +586,16 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) return -EINVAL; } - if (cache->secctx) { + if (cache->have_secid) { pr_err("Second security context specified\n"); return -EINVAL; } - secctx = kstrdup(args, GFP_KERNEL); - if (!secctx) - return -ENOMEM; + err = security_secctx_to_secid(args, strlen(args), &cache->secid); + if (err) + return 
err; - cache->secctx = secctx; + cache->have_secid = true; return 0; } @@ -820,7 +821,6 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) put_cred(cache->cache_cred); kfree(cache->rootdirname); - kfree(cache->secctx); kfree(cache->tag); _leave(""); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 7b99bd98de75..38c236e38cef 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -122,7 +122,6 @@ struct cachefiles_cache { #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ #define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */ char *rootdirname; /* name of cache root directory */ - char *secctx; /* LSM security context */ char *tag; /* cache binding tag */ refcount_t unbind_pincount;/* refcount to do daemon unbind */ struct xarray reqs; /* xarray of pending on-demand requests */ @@ -130,6 +129,8 @@ struct cachefiles_cache { struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ u32 ondemand_id_next; u32 msg_id_next; + u32 secid; /* LSM security id */ + bool have_secid; /* whether "secid" was set */ }; static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index fe777164f1d8..fc6611886b3b 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -18,7 +18,7 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) struct cred *new; int ret; - _enter("{%s}", cache->secctx); + _enter("{%u}", cache->have_secid ? 
cache->secid : 0); new = prepare_kernel_cred(current); if (!new) { @@ -26,8 +26,8 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) goto error; } - if (cache->secctx) { - ret = set_security_override_from_ctx(new, cache->secctx); + if (cache->have_secid) { + ret = set_security_override(new, cache->secid); if (ret < 0) { put_cred(new); pr_err("Security denies permission to nominate security context: error %d\n", From f4d3cde410cc62b5483f59f0f3454a5c5203a2cb Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Fri, 13 Dec 2024 13:50:06 +0000 Subject: [PATCH 421/807] netfs: Remove redundant use of smp_rmb() The function netfs_unbuffered_write_iter_locked() in fs/netfs/direct_write.c contains an unnecessary smp_rmb() call after wait_on_bit(). Since wait_on_bit() already incorporates a memory barrier that ensures the flag update is visible before the function returns, the smp_rmb() provides no additional benefit and incurs unnecessary overhead. This patch removes the redundant barrier to simplify and optimize the code. 
Signed-off-by: Zilin Guan Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241207021952.2978530-1-zilin@seu.edu.cn/ Link: https://lore.kernel.org/r/20241213135013.2964079-7-dhowells@redhat.com Reviewed-by: Akira Yokosawa cc: Akira Yokosawa cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/direct_write.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 88f2adfab75e..173e8b5e6a93 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -104,7 +104,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - smp_rmb(); /* Read error/transferred after RIP flag */ ret = wreq->error; if (ret == 0) { ret = wreq->transferred; From aa3956418985bda1f68313eadde3267921847978 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:07 +0000 Subject: [PATCH 422/807] netfs: Fix missing barriers by using clear_and_wake_up_bit() Use clear_and_wake_up_bit() rather than something like: clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); as there needs to be a barrier inserted between which is present in clear_and_wake_up_bit(). 
Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-8-dhowells@redhat.com Reviewed-by: Akira Yokosawa cc: Zilin Guan cc: Akira Yokosawa cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 3 +-- fs/netfs/write_collect.c | 9 +++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index b415e3972336..46ce3b7adf07 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -379,8 +379,7 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq) task_io_account_read(rreq->transferred); trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_done); netfs_clear_subrequests(rreq, false); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 1d438be2e1b4..82290c92ba7a 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -501,8 +501,7 @@ reassess_streams: goto need_retry; if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { trace_netfs_rreq(wreq, netfs_rreq_trace_unpause); - clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags); - wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE); + clear_and_wake_up_bit(NETFS_RREQ_PAUSE, &wreq->flags); } if (notes & NEED_REASSESS) { @@ -605,8 +604,7 @@ void netfs_write_collection_worker(struct work_struct *work) _debug("finished"); trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); - wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags); if (wreq->iocb) { size_t 
written = min(wreq->transferred, wreq->len); @@ -714,8 +712,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); /* If we are at the head of the queue, wake up the collector, * transferring a ref to it if we were the ones to do so. From 4acb665cf4f3e5436844f17ece0a8a55ce688c7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:08 +0000 Subject: [PATCH 423/807] netfs: Work around recursion by abandoning retry if nothing read syzkaller reported recursion with a loop of three calls (netfs_rreq_assess, netfs_retry_reads and netfs_rreq_terminated) hitting the limit of the stack during an unbuffered or direct I/O read. There are a number of issues: (1) There is no limit on the number of retries. (2) A subrequest is supposed to be abandoned if it does not transfer anything (NETFS_SREQ_NO_PROGRESS), but that isn't checked under all circumstances. (3) The actual root cause, which is this: if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_terminated(rreq, ...); When we do a retry, we bump the rreq->nr_outstanding counter to prevent the final cleanup phase running before we've finished dispatching the retries. The problem is if we hit 0, we have to do the cleanup phase - but we're in the cleanup phase and end up repeating the retry cycle, hence the recursion. Work around the problem by limiting the number of retries. This is based on Lizhi Xu's patch[1], and makes the following changes: (1) Replace NETFS_SREQ_NO_PROGRESS with NETFS_SREQ_MADE_PROGRESS and make the filesystem set it if it managed to read or write at least one byte of data. Clear this bit before issuing a subrequest. (2) Add a ->retry_count member to the subrequest and increment it any time we do a retry. 
(3) Remove the NETFS_SREQ_RETRYING flag as it is superfluous with ->retry_count. If the latter is non-zero, we're doing a retry. (4) Abandon a subrequest if retry_count is non-zero and we made no progress. (5) Use ->retry_count in both the write-side and the read-side. [?] Question: Should I set a hard limit on retry_count in both read and write? Say it hits 50, we always abandon it. The problem is that these changes only mitigate the issue. As long as it made at least one byte of progress, the recursion is still an issue. This patch mitigates the problem, but does not fix the underlying cause. I have patches that will do that, but it's an intrusive fix that's currently pending for the next merge window. The oops generated by KASAN looks something like: BUG: TASK stack guard page was hit at ffffc9000482ff48 (stack is ffffc90004830000..ffffc90004838000) Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN NOPTI ... RIP: 0010:mark_lock+0x25/0xc60 kernel/locking/lockdep.c:4686 ... mark_usage kernel/locking/lockdep.c:4646 [inline] __lock_acquire+0x906/0x3ce0 kernel/locking/lockdep.c:5156 lock_acquire.part.0+0x11b/0x380 kernel/locking/lockdep.c:5825 local_lock_acquire include/linux/local_lock_internal.h:29 [inline] ___slab_alloc+0x123/0x1880 mm/slub.c:3695 __slab_alloc.constprop.0+0x56/0xb0 mm/slub.c:3908 __slab_alloc_node mm/slub.c:3961 [inline] slab_alloc_node mm/slub.c:4122 [inline] kmem_cache_alloc_noprof+0x2a7/0x2f0 mm/slub.c:4141 radix_tree_node_alloc.constprop.0+0x1e8/0x350 lib/radix-tree.c:253 idr_get_free+0x528/0xa40 lib/radix-tree.c:1506 idr_alloc_u32+0x191/0x2f0 lib/idr.c:46 idr_alloc+0xc1/0x130 lib/idr.c:87 p9_tag_alloc+0x394/0x870 net/9p/client.c:321 p9_client_prepare_req+0x19f/0x4d0 net/9p/client.c:644 p9_client_zc_rpc.constprop.0+0x105/0x880 net/9p/client.c:793 p9_client_read_once+0x443/0x820 net/9p/client.c:1570 p9_client_read+0x13f/0x1b0 net/9p/client.c:1534 v9fs_issue_read+0x115/0x310 fs/9p/vfs_addr.c:74 netfs_retry_read_subrequests
fs/netfs/read_retry.c:60 [inline] netfs_retry_reads+0x153a/0x1d00 fs/netfs/read_retry.c:232 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 ... netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_dispatch_unbuffered_reads fs/netfs/direct_read.c:103 [inline] netfs_unbuffered_read fs/netfs/direct_read.c:127 [inline] netfs_unbuffered_read_iter_locked+0x12f6/0x19b0 fs/netfs/direct_read.c:221 netfs_unbuffered_read_iter+0xc5/0x100 fs/netfs/direct_read.c:256 v9fs_file_read_iter+0xbf/0x100 fs/9p/vfs_file.c:361 do_iter_readv_writev+0x614/0x7f0 fs/read_write.c:832 vfs_readv+0x4cf/0x890 fs/read_write.c:1025 do_preadv fs/read_write.c:1142 [inline] __do_sys_preadv fs/read_write.c:1192 [inline] __se_sys_preadv fs/read_write.c:1187 [inline] __x64_sys_preadv+0x22d/0x310 fs/read_write.c:1187 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Closes: https://syzkaller.appspot.com/bug?extid=1fc6f64c40a9d143cfb6 Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241108034020.3695718-1-lizhi.xu@windriver.com/ [1] Link: https://lore.kernel.org/r/20241213135013.2964079-9-dhowells@redhat.com Tested-by: 
syzbot+885c03ad650731743489@syzkaller.appspotmail.com Suggested-by: Lizhi Xu cc: Dominique Martinet cc: Jeff Layton cc: v9fs@lists.linux.dev cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Reported-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/9p/vfs_addr.c | 6 +++++- fs/afs/write.c | 5 ++++- fs/netfs/read_collect.c | 15 +++++++++------ fs/netfs/read_retry.c | 6 ++++-- fs/netfs/write_collect.c | 5 ++--- fs/netfs/write_issue.c | 2 ++ fs/smb/client/cifssmb.c | 13 +++++++++---- fs/smb/client/smb2pdu.c | 9 ++++++--- include/linux/netfs.h | 6 +++--- 9 files changed, 44 insertions(+), 23 deletions(-) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 819c75233235..3bc9ce6c575e 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -57,6 +57,8 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq) int err, len; len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); + if (len > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); netfs_write_subrequest_terminated(subreq, len ?: err, false); } @@ -80,8 +82,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) if (pos + total >= i_size_read(rreq->inode)) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - if (!err) + if (!err) { subreq->transferred += total; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + } netfs_read_subreq_terminated(subreq, err, false); } diff --git a/fs/afs/write.c b/fs/afs/write.c index 34107b55f834..ccb6aa8027c5 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -122,7 +122,7 @@ static void afs_issue_write_worker(struct work_struct *work) if (subreq->debug_index == 3) return netfs_write_subrequest_terminated(subreq, -ENOANO, false); - if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { + if (!subreq->retry_count) { set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); } @@ -149,6 +149,9 @@ static void 
afs_issue_write_worker(struct work_struct *work) afs_wait_for_operation(op); ret = afs_put_operation(op); switch (ret) { + case 0: + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + break; case -EACCES: case -EPERM: case -ENOKEY: diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 46ce3b7adf07..47ed3a5044e2 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -438,7 +438,7 @@ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } } EXPORT_SYMBOL(netfs_read_subreq_progress); @@ -497,7 +497,7 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } rreq->transferred += subreq->transferred; } @@ -511,10 +511,13 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, } else { trace_netfs_sreq(subreq, netfs_sreq_trace_short); if (subreq->transferred > subreq->consumed) { - __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); - } else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + /* If we didn't read new data, abandon retry. 
*/ + if (subreq->retry_count && + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); + } + } else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); } else { diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0350592ea804..0e72e9226fc8 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -56,6 +56,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; netfs_reset_iter(subreq); netfs_reissue_read(rreq, subreq); } @@ -137,7 +139,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) stream0->sreq_max_len = subreq->len; __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; spin_lock_bh(&rreq->lock); list_add_tail(&subreq->rreq_link, &rreq->subrequests); @@ -213,7 +216,6 @@ abandon: subreq->error = -ENOMEM; __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_RETRYING, &subreq->flags); } spin_lock_bh(&rreq->lock); list_splice_tail_init(&queue, &rreq->subrequests); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 82290c92ba7a..ca3a11ed9b54 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -179,7 +179,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, struct iov_iter source = subreq->io_iter; iov_iter_revert(&source, subreq->len - source.count); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_get_subrequest(subreq, 
netfs_sreq_trace_get_resubmit); netfs_reissue_write(stream, subreq, &source); } @@ -234,7 +233,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, /* Renegotiate max_len (wsize) */ trace_netfs_sreq(subreq, netfs_sreq_trace_retry); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + subreq->retry_count++; stream->prepare_write(subreq); part = min(len, stream->sreq_max_len); @@ -279,7 +278,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, subreq->start = start; subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); subreq->stream_nr = to->stream_nr; - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + subreq->retry_count = 1; trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index bf6d507578e5..ff0e82505a0b 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -244,6 +244,8 @@ void netfs_reissue_write(struct netfs_io_stream *stream, iov_iter_advance(source, size); iov_iter_truncate(&subreq->io_iter, size); + subreq->retry_count++; + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); netfs_do_issue_write(stream, subreq); } diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index bd42a419458e..6cb1e81993f8 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1319,14 +1319,16 @@ cifs_readv_callback(struct mid_q_entry *mid) } if (rdata->result == -ENODATA) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && rdata->subreq.start + trans == ictx->remote_i_size) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + } else 
if (rdata->got_bytes > 0) { + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } } @@ -1670,10 +1672,13 @@ cifs_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; - if (written < wdata->subreq.len) + if (written < wdata->subreq.len) { result = -ENOSPC; - else + } else { result = written; + if (written > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 010eae9d6c47..458b53d1f9cb 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4615,6 +4615,7 @@ smb2_readv_callback(struct mid_q_entry *mid) __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, server->credits, server->in_flight, @@ -4840,10 +4841,12 @@ smb2_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; - if (written < wdata->subreq.len) + if (written < wdata->subreq.len) { wdata->result = -ENOSPC; - else + } else if (written > 0) { wdata->subreq.len = written; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: @@ -5012,7 +5015,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) } #endif - if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags)) + if (wdata->subreq.retry_count > 0) smb2_set_replay(server, &rqst); cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5eaceef41e6c..4083d77e3f39 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -185,6 +185,7 @@ struct netfs_io_subrequest { short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs 
in io_iter */ + u8 retry_count; /* The number of retries (0 on initial pass) */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ unsigned char curr_folioq_slot; /* Folio currently being read */ @@ -194,14 +195,13 @@ struct netfs_io_subrequest { #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ -#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ +#define NETFS_SREQ_MADE_PROGRESS 4 /* Set if we transferred at least some data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ #define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ -#define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ -#define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ +#define NETFS_SREQ_FAILED 10 /* Set if the subreq failed unretryably */ }; enum netfs_io_origin { From 38cf8e945721ffe708fa675507465da7f4f2a9f7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:09 +0000 Subject: [PATCH 424/807] netfs: Fix ceph copy to cache on write-begin At the end of netfs_unlock_read_folio(), in which folios are marked appropriately for copying to the cache (either by being marked dirty and having their private data set or by having PG_private_2 set) and then unlocked, the folio_queue struct has the entry pointing to the folio cleared.
This presents a problem for netfs_pgpriv2_write_to_the_cache(), which is used to write folios marked with PG_private_2 to the cache as it expects to be able to trawl the folio_queue list thereafter to find the relevant folios, leading to a hang. Fix this by not clearing the folio_queue entry if we're going to do the deprecated copy-to-cache. The clearance will be done instead as the folios are written to the cache. This can be reproduced by starting cachefiles, mounting a ceph filesystem with "-o fsc" and writing to it. Fixes: 796a4049640b ("netfs: In readahead, put the folio refs as soon extracted") Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+_4m80thNy5_fvROoxBm689YtA0dZ-=gcmkzwYSY4syqw@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-10-dhowells@redhat.com Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") cc: Jeff Layton cc: Ilya Dryomov cc: Xiubo Li cc: netfs@lists.linux.dev cc: ceph-devel@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 47ed3a5044e2..e8624f5c7fcc 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -62,10 +62,14 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, } else { trace_netfs_folio(folio, netfs_folio_trace_read_done); } + + folioq_clear(folioq, slot); } else { // TODO: Use of PG_private_2 is deprecated. 
if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot); + else + folioq_clear(folioq, slot); } if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { @@ -77,8 +81,6 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, folio_unlock(folio); } } - - folioq_clear(folioq, slot); } /* From d0327c824338cdccad058723a31d038ecd553409 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:10 +0000 Subject: [PATCH 425/807] netfs: Fix the (non-)cancellation of copy when cache is temporarily disabled When the caching for a cookie is temporarily disabled (e.g. due to a DIO write on that file), future copying to the cache for that file is disabled until all fds open on that file are closed. However, if netfslib is using the deprecated PG_private_2 method (such as is currently used by ceph), and decides it wants to copy to the cache, netfs_advance_write() will just bail at the first check seeing that the cache stream is unavailable, and indicate that it dealt with all the content. This means that we have no subrequests to provide notifications to drive the state machine or even to pin the request and the request just gets discarded, leaving the folios with PG_private_2 set. Fix this by jumping directly to cancel the request if the cache is not available. That way, we don't remove mark3 from the folio_queue list and netfs_pgpriv2_cancel() will clean up the folios. This was found by running the generic/013 xfstest against ceph with an active cache and the "-o fsc" option passed to ceph. 
That would usually hang Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+_4m80thNy5_fvROoxBm689YtA0dZ-=gcmkzwYSY4syqw@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-11-dhowells@redhat.com cc: Jeff Layton cc: Ilya Dryomov cc: Xiubo Li cc: netfs@lists.linux.dev cc: ceph-devel@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_pgpriv2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index ba5af89d37fa..54d5004fec18 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -170,6 +170,10 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq) trace_netfs_write(wreq, netfs_write_trace_copy_to_cache); netfs_stat(&netfs_n_wh_copy_to_cache); + if (!wreq->io_streams[1].avail) { + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + goto couldnt_start; + } for (;;) { error = netfs_pgpriv2_copy_folio(wreq, folio); From d4e338de17cb6532bf805fae00db8b41e914009b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:34:45 +0000 Subject: [PATCH 426/807] netfs: Fix is-caching check in read-retry netfs: Fix is-caching check in read-retry The read-retry code checks the NETFS_RREQ_COPY_TO_CACHE flag to determine if there might be failed reads from the cache that need turning into reads from the server, with the intention of skipping the complicated part if it can. The code that set the flag, however, got lost during the read-side rewrite. Fix the check to see if the cache_resources are valid instead. The flag can then be removed. 
Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://lore.kernel.org/r/3752048.1734381285@warthog.procyon.org.uk cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_retry.c | 2 +- include/linux/netfs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0e72e9226fc8..21b4a54e545e 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -49,7 +49,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) * up to the first permanently failed one. */ if (!rreq->netfs_ops->prepare_read && - !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) { + !rreq->cache_resources.ops) { struct netfs_io_subrequest *subreq; list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 4083d77e3f39..ecdd5ced16a8 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -269,7 +269,6 @@ struct netfs_io_request { size_t prev_donated; /* Fallback for subreq->prev_donated */ refcount_t ref; unsigned long flags; -#define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ From 8600058ba28a7b07660ddcd150372d72fb3bc895 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Fri, 20 Dec 2024 15:06:47 -0600 Subject: [PATCH 427/807] of: Add coreboot firmware to excluded default cells list Google Juniper and other Chromebook platforms have a very old bootloader which populates /firmware node without proper address/size-cells leading to warnings: Missing '#address-cells' in /firmware WARNING: CPU: 0 PID: 1 at drivers/of/base.c:106 of_bus_n_addr_cells+0x90/0xf0 Modules linked in: CPU: 0 
UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0 #1 933ab9971ff4d5dc58cb378a96f64c7f72e3454d Hardware name: Google juniper sku16 board (DT) ... Missing '#size-cells' in /firmware WARNING: CPU: 0 PID: 1 at drivers/of/base.c:133 of_bus_n_size_cells+0x90/0xf0 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Tainted: G W 6.12.0 #1 933ab9971ff4d5dc58cb378a96f64c7f72e3454d Tainted: [W]=WARN Hardware name: Google juniper sku16 board (DT) These platforms won't receive updated bootloader/firmware, so add an exclusion for platforms with a "coreboot" compatible node. While this is wider than necessary, that's the easiest fix and it doesn't matter if we miss checking other platforms using coreboot. We may revisit this later and address with a fixup to the DT itself. Reported-by: Sasha Levin Closes: https://lore.kernel.org/all/Z0NUdoG17EwuCigT@sashalap/ Cc: AngeloGioacchino Del Regno Cc: Matthias Brugger Cc: Chen-Yu Tsai Cc: Krzysztof Kozlowski Signed-off-by: Rob Herring (Arm) --- drivers/of/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index 44b1c8bf9cc0..e6ef31c4940f 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -88,7 +88,8 @@ static bool __of_node_is_type(const struct device_node *np, const char *type) } #define EXCLUDED_DEFAULT_CELLS_PLATFORMS ( \ - IS_ENABLED(CONFIG_SPARC) \ + IS_ENABLED(CONFIG_SPARC) || \ + of_find_compatible_node(NULL, NULL, "coreboot") \ ) int of_bus_n_addr_cells(struct device_node *np) From fdf478d236dcf0f1f68534df5d456ced625195bd Mon Sep 17 00:00:00 2001 From: Levi Zim Date: Sat, 30 Nov 2024 21:38:22 +0800 Subject: [PATCH 428/807] skmsg: Return copied bytes in sk_msg_memcopy_from_iter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously sk_msg_memcopy_from_iter returns the copied bytes from the last copy_from_iter{,_nocache} call upon success.
This commit changes it to return the total number of copied bytes on success. Signed-off-by: Levi Zim Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241130-tcp-bpf-sendmsg-v1-1-bae583d014f3@outlook.com --- net/core/skmsg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8ad7e6755fd6..61f3f3d4e528 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -369,8 +369,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; + u32 copy, buf_size, copied = 0; struct scatterlist *sge; - u32 copy, buf_size; void *to; do { @@ -397,6 +397,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, goto out; } bytes -= copy; + copied += copy; if (!bytes) break; msg->sg.copybreak = 0; @@ -404,7 +405,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } while (i != msg->sg.end); out: msg->sg.curr = i; - return ret; + return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); From 5153a75ef34b3f7478ca918044d0f05eed8fb3f9 Mon Sep 17 00:00:00 2001 From: Levi Zim Date: Sat, 30 Nov 2024 21:38:23 +0800 Subject: [PATCH 429/807] tcp_bpf: Fix copied value in tcp_bpf_sendmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpf kselftest sockhash::test_txmsg_cork_hangs in test_sockmap.c triggers a kernel NULL pointer dereference: BUG: kernel NULL pointer dereference, address: 0000000000000008 ? __die_body+0x6e/0xb0 ? __die+0x8b/0xa0 ? page_fault_oops+0x358/0x3c0 ? local_clock+0x19/0x30 ? lock_release+0x11b/0x440 ? kernelmode_fixup_or_oops+0x54/0x60 ? __bad_area_nosemaphore+0x4f/0x210 ? mmap_read_unlock+0x13/0x30 ? bad_area_nosemaphore+0x16/0x20 ? do_user_addr_fault+0x6fd/0x740 ? prb_read_valid+0x1d/0x30 ? exc_page_fault+0x55/0xd0 ? asm_exc_page_fault+0x2b/0x30 ? 
splice_to_socket+0x52e/0x630 ? shmem_file_splice_read+0x2b1/0x310 direct_splice_actor+0x47/0x70 splice_direct_to_actor+0x133/0x300 ? do_splice_direct+0x90/0x90 do_splice_direct+0x64/0x90 ? __ia32_sys_tee+0x30/0x30 do_sendfile+0x214/0x300 __se_sys_sendfile64+0x8e/0xb0 __x64_sys_sendfile64+0x25/0x30 x64_sys_call+0xb82/0x2840 do_syscall_64+0x75/0x110 entry_SYSCALL_64_after_hwframe+0x4b/0x53 This is caused by tcp_bpf_sendmsg() returning a larger value(12289) than size (8192), which causes the while loop in splice_to_socket() to release an uninitialized pipe buf. The underlying cause is that this code assumes sk_msg_memcopy_from_iter() will copy all bytes upon success but it actually might only copy part of it. This commit changes it to use the real copied bytes. Signed-off-by: Levi Zim Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241130-tcp-bpf-sendmsg-v1-2-bae583d014f3@outlook.com --- net/ipv4/tcp_bpf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 392678ae80f4..47f65b1b70ca 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -495,7 +495,7 @@ more_data: static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; - int copied = 0, err = 0; + int copied = 0, err = 0, ret = 0; struct sk_psock *psock; long timeo; int flags; @@ -538,14 +538,14 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) copy = msg_tx->sg.size - osize; } - err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, + ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, copy); - if (err < 0) { + if (ret < 0) { sk_msg_trim(sk, msg_tx, osize); goto out_err; } - copied += copy; + copied += ret; if (psock->cork_bytes) { if (size > psock->cork_bytes) psock->cork_bytes = 0; From 9ecc4d858b92c1bb0673ad9c327298e600c55659 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: 
Thu, 12 Dec 2024 19:40:54 -0800 Subject: [PATCH 430/807] bpf: Check negative offsets in __bpf_skb_min_len() skb_network_offset() and skb_transport_offset() can be negative when they are called after we pull the transport header, for example, when we use eBPF sockmap at the point of ->sk_data_ready(). __bpf_skb_min_len() uses an unsigned int to get these offsets; this leads to a very large number, which then causes bpf_skb_change_tail() to fail unexpectedly. Fix this by using a signed int to get these offsets and ensuring the minimum is at least zero. Fixes: 5293efe62df8 ("bpf: add bpf_skb_change_tail helper") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-2-xiyou.wangcong@gmail.com --- net/core/filter.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 21131ec25f24..834614071727 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3734,13 +3734,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = { static u32 __bpf_skb_min_len(const struct sk_buff *skb) { - u32 min_len = skb_network_offset(skb); + int offset = skb_network_offset(skb); + u32 min_len = 0; - if (skb_transport_header_was_set(skb)) - min_len = skb_transport_offset(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) - min_len = skb_checksum_start_offset(skb) + - skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + if (skb_transport_header_was_set(skb)) { + offset = skb_transport_offset(skb); + if (offset > 0) + min_len = offset; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) { + offset = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + } return min_len; } From 9ee0c7b8654346d60c823babe4b3747357a30477 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:55 -0800 Subject: [PATCH 431/807] selftests/bpf: Add a BPF selftest
for bpf_skb_change_tail() As requested by Daniel, we need to add a selftest to cover bpf_skb_change_tail() cases in skb_verdict. Here we test trimming, growing and error cases, and validate its expected return values and the expected sizes of the payload. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-3-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 51 +++++++++++++++++++ .../bpf/progs/test_sockmap_change_tail.c | 40 +++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 248754296d97..884ad87783d5 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -12,6 +12,7 @@ #include "test_sockmap_progs_query.skel.h" #include "test_sockmap_pass_prog.skel.h" #include "test_sockmap_drop_prog.skel.h" +#include "test_sockmap_change_tail.skel.h" #include "bpf_iter_sockmap.skel.h" #include "sockmap_helpers.h" @@ -643,6 +644,54 @@ out: test_sockmap_drop_prog__destroy(drop); } +static void test_sockmap_skb_verdict_change_tail(void) +{ + struct test_sockmap_change_tail *skel; + int err, map, verdict; + int c1, p1, sent, recvd; + int zero = 0; + char buf[2]; + + skel = test_sockmap_change_tail__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); + if (!ASSERT_OK(err, "create_pair()")) + goto out; + err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); + if (!ASSERT_OK(err, 
"bpf_map_update_elem(c1)")) + goto out_close; + sent = xsend(p1, "Tr", 2, 0); + ASSERT_EQ(sent, 2, "xsend(p1)"); + recvd = recv(c1, buf, 2, 0); + ASSERT_EQ(recvd, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + sent = xsend(p1, "G", 1, 0); + ASSERT_EQ(sent, 1, "xsend(p1)"); + recvd = recv(c1, buf, 2, 0); + ASSERT_EQ(recvd, 2, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + sent = xsend(p1, "E", 1, 0); + ASSERT_EQ(sent, 1, "xsend(p1)"); + recvd = recv(c1, buf, 1, 0); + ASSERT_EQ(recvd, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + +out_close: + close(c1); + close(p1); +out: + test_sockmap_change_tail__destroy(skel); +} + static void test_sockmap_skb_verdict_peek_helper(int map) { int err, c1, p1, zero = 0, sent, recvd, avail; @@ -1058,6 +1107,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(true); if (test__start_subtest("sockmap skb_verdict fionread on drop")) test_sockmap_skb_verdict_fionread(false); + if (test__start_subtest("sockmap skb_verdict change tail")) + test_sockmap_skb_verdict_change_tail(); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); if (test__start_subtest("sockmap skb_verdict msg_f_peek with link")) diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c b/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c new file mode 100644 index 000000000000..2796dd8545eb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 ByteDance */ +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sock_map_rx SEC(".maps"); + +long change_tail_ret = 1; + +SEC("sk_skb") +int prog_skb_verdict(struct __sk_buff *skb) +{ + char *data, *data_end; + + bpf_skb_pull_data(skb, 1); + data = 
(char *)(unsigned long)skb->data; + data_end = (char *)(unsigned long)skb->data_end; + + if (data + 1 > data_end) + return SK_PASS; + + if (data[0] == 'T') { /* Trim the packet */ + change_tail_ret = bpf_skb_change_tail(skb, skb->len - 1, 0); + return SK_PASS; + } else if (data[0] == 'G') { /* Grow the packet */ + change_tail_ret = bpf_skb_change_tail(skb, skb->len + 1, 0); + return SK_PASS; + } else if (data[0] == 'E') { /* Error */ + change_tail_ret = bpf_skb_change_tail(skb, 65535, 0); + return SK_PASS; + } + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; From 472759c9f5377912c7483cca5da847888a27cecc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:56 -0800 Subject: [PATCH 432/807] selftests/bpf: Introduce socket_helpers.h for TC tests Pull socket helpers out of sockmap_helpers.h so that they can be reused for TC tests as well. This prepares for the next patch. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-4-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/socket_helpers.h | 394 ++++++++++++++++++ .../bpf/prog_tests/sockmap_helpers.h | 385 +---------------- 2 files changed, 395 insertions(+), 384 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/socket_helpers.h diff --git a/tools/testing/selftests/bpf/prog_tests/socket_helpers.h b/tools/testing/selftests/bpf/prog_tests/socket_helpers.h new file mode 100644 index 000000000000..1bdfb79ef009 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/socket_helpers.h @@ -0,0 +1,394 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SOCKET_HELPERS__ +#define __SOCKET_HELPERS__ + +#include + +/* include/linux/net.h */ +#define SOCK_TYPE_MASK 0xf + +#define IO_TIMEOUT_SEC 30 +#define MAX_STRERR_LEN 256 + +/* workaround for older vm_sockets.h */ +#ifndef VMADDR_CID_LOCAL +#define VMADDR_CID_LOCAL 1 +#endif + +/* include/linux/cleanup.h */ +#define 
__get_and_null(p, nullvalue) \ + ({ \ + __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = nullvalue; \ + __val; \ + }) + +#define take_fd(fd) __get_and_null(fd, -EBADF) + +/* Wrappers that fail the test on error and report it. */ + +#define _FAIL(errnum, fmt...) \ + ({ \ + error_at_line(0, (errnum), __func__, __LINE__, fmt); \ + CHECK_FAIL(true); \ + }) +#define FAIL(fmt...) _FAIL(0, fmt) +#define FAIL_ERRNO(fmt...) _FAIL(errno, fmt) +#define FAIL_LIBBPF(err, msg) \ + ({ \ + char __buf[MAX_STRERR_LEN]; \ + libbpf_strerror((err), __buf, sizeof(__buf)); \ + FAIL("%s: %s", (msg), __buf); \ + }) + + +#define xaccept_nonblock(fd, addr, len) \ + ({ \ + int __ret = \ + accept_timeout((fd), (addr), (len), IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("accept"); \ + __ret; \ + }) + +#define xbind(fd, addr, len) \ + ({ \ + int __ret = bind((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("bind"); \ + __ret; \ + }) + +#define xclose(fd) \ + ({ \ + int __ret = close((fd)); \ + if (__ret == -1) \ + FAIL_ERRNO("close"); \ + __ret; \ + }) + +#define xconnect(fd, addr, len) \ + ({ \ + int __ret = connect((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("connect"); \ + __ret; \ + }) + +#define xgetsockname(fd, addr, len) \ + ({ \ + int __ret = getsockname((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockname"); \ + __ret; \ + }) + +#define xgetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = getsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockopt(" #name ")"); \ + __ret; \ + }) + +#define xlisten(fd, backlog) \ + ({ \ + int __ret = listen((fd), (backlog)); \ + if (__ret == -1) \ + FAIL_ERRNO("listen"); \ + __ret; \ + }) + +#define xsetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = setsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("setsockopt(" #name ")"); \ + __ret; \ + }) + +#define xsend(fd, buf, len, flags) \ + ({ \ + 
ssize_t __ret = send((fd), (buf), (len), (flags)); \ + if (__ret == -1) \ + FAIL_ERRNO("send"); \ + __ret; \ + }) + +#define xrecv_nonblock(fd, buf, len, flags) \ + ({ \ + ssize_t __ret = recv_timeout((fd), (buf), (len), (flags), \ + IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("recv"); \ + __ret; \ + }) + +#define xsocket(family, sotype, flags) \ + ({ \ + int __ret = socket(family, sotype, flags); \ + if (__ret == -1) \ + FAIL_ERRNO("socket"); \ + __ret; \ + }) + +static inline void close_fd(int *fd) +{ + if (*fd >= 0) + xclose(*fd); +} + +#define __close_fd __attribute__((cleanup(close_fd))) + +static inline struct sockaddr *sockaddr(struct sockaddr_storage *ss) +{ + return (struct sockaddr *)ss; +} + +static inline void init_addr_loopback4(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_in *addr4 = memset(ss, 0, sizeof(*ss)); + + addr4->sin_family = AF_INET; + addr4->sin_port = 0; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + *len = sizeof(*addr4); +} + +static inline void init_addr_loopback6(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_in6 *addr6 = memset(ss, 0, sizeof(*ss)); + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = 0; + addr6->sin6_addr = in6addr_loopback; + *len = sizeof(*addr6); +} + +static inline void init_addr_loopback_vsock(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_vm *addr = memset(ss, 0, sizeof(*ss)); + + addr->svm_family = AF_VSOCK; + addr->svm_port = VMADDR_PORT_ANY; + addr->svm_cid = VMADDR_CID_LOCAL; + *len = sizeof(*addr); +} + +static inline void init_addr_loopback(int family, struct sockaddr_storage *ss, + socklen_t *len) +{ + switch (family) { + case AF_INET: + init_addr_loopback4(ss, len); + return; + case AF_INET6: + init_addr_loopback6(ss, len); + return; + case AF_VSOCK: + init_addr_loopback_vsock(ss, len); + return; + default: + FAIL("unsupported address family %d", family); + } +} + +static inline int enable_reuseport(int s, int progfd) +{ + 
int err, one = 1; + + err = xsetsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); + if (err) + return -1; + err = xsetsockopt(s, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &progfd, + sizeof(progfd)); + if (err) + return -1; + + return 0; +} + +static inline int socket_loopback_reuseport(int family, int sotype, int progfd) +{ + struct sockaddr_storage addr; + socklen_t len = 0; + int err, s; + + init_addr_loopback(family, &addr, &len); + + s = xsocket(family, sotype, 0); + if (s == -1) + return -1; + + if (progfd >= 0) + enable_reuseport(s, progfd); + + err = xbind(s, sockaddr(&addr), len); + if (err) + goto close; + + if (sotype & SOCK_DGRAM) + return s; + + err = xlisten(s, SOMAXCONN); + if (err) + goto close; + + return s; +close: + xclose(s); + return -1; +} + +static inline int socket_loopback(int family, int sotype) +{ + return socket_loopback_reuseport(family, sotype, -1); +} + +static inline int poll_connect(int fd, unsigned int timeout_sec) +{ + struct timeval timeout = { .tv_sec = timeout_sec }; + fd_set wfds; + int r, eval; + socklen_t esize = sizeof(eval); + + FD_ZERO(&wfds); + FD_SET(fd, &wfds); + + r = select(fd + 1, NULL, &wfds, NULL, &timeout); + if (r == 0) + errno = ETIME; + if (r != 1) + return -1; + + if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &eval, &esize) < 0) + return -1; + if (eval != 0) { + errno = eval; + return -1; + } + + return 0; +} + +static inline int poll_read(int fd, unsigned int timeout_sec) +{ + struct timeval timeout = { .tv_sec = timeout_sec }; + fd_set rfds; + int r; + + FD_ZERO(&rfds); + FD_SET(fd, &rfds); + + r = select(fd + 1, &rfds, NULL, NULL, &timeout); + if (r == 0) + errno = ETIME; + + return r == 1 ? 
0 : -1; +} + +static inline int accept_timeout(int fd, struct sockaddr *addr, socklen_t *len, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return accept(fd, addr, len); +} + +static inline int recv_timeout(int fd, void *buf, size_t len, int flags, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return recv(fd, buf, len, flags); +} + + +static inline int create_pair(int family, int sotype, int *p0, int *p1) +{ + __close_fd int s, c = -1, p = -1; + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int err; + + s = socket_loopback(family, sotype); + if (s < 0) + return s; + + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + return err; + + c = xsocket(family, sotype, 0); + if (c < 0) + return c; + + err = connect(c, sockaddr(&addr), len); + if (err) { + if (errno != EINPROGRESS) { + FAIL_ERRNO("connect"); + return err; + } + + err = poll_connect(c, IO_TIMEOUT_SEC); + if (err) { + FAIL_ERRNO("poll_connect"); + return err; + } + } + + switch (sotype & SOCK_TYPE_MASK) { + case SOCK_DGRAM: + err = xgetsockname(c, sockaddr(&addr), &len); + if (err) + return err; + + err = xconnect(s, sockaddr(&addr), len); + if (err) + return err; + + *p0 = take_fd(s); + break; + case SOCK_STREAM: + case SOCK_SEQPACKET: + p = xaccept_nonblock(s, NULL, NULL); + if (p < 0) + return p; + + *p0 = take_fd(p); + break; + default: + FAIL("Unsupported socket type %#x", sotype); + return -EOPNOTSUPP; + } + + *p1 = take_fd(c); + return 0; +} + +static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, + int *p0, int *p1) +{ + int err; + + err = create_pair(family, sotype, c0, p0); + if (err) + return err; + + err = create_pair(family, sotype, c1, p1); + if (err) { + close(*c0); + close(*p0); + } + + return err; +} + +#endif // __SOCKET_HELPERS__ diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index 
38e35c72bdaa..3e5571dd578d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -1,139 +1,12 @@ #ifndef __SOCKMAP_HELPERS__ #define __SOCKMAP_HELPERS__ -#include +#include "socket_helpers.h" -/* include/linux/net.h */ -#define SOCK_TYPE_MASK 0xf - -#define IO_TIMEOUT_SEC 30 -#define MAX_STRERR_LEN 256 #define MAX_TEST_NAME 80 -/* workaround for older vm_sockets.h */ -#ifndef VMADDR_CID_LOCAL -#define VMADDR_CID_LOCAL 1 -#endif - #define __always_unused __attribute__((__unused__)) -/* include/linux/cleanup.h */ -#define __get_and_null(p, nullvalue) \ - ({ \ - __auto_type __ptr = &(p); \ - __auto_type __val = *__ptr; \ - *__ptr = nullvalue; \ - __val; \ - }) - -#define take_fd(fd) __get_and_null(fd, -EBADF) - -#define _FAIL(errnum, fmt...) \ - ({ \ - error_at_line(0, (errnum), __func__, __LINE__, fmt); \ - CHECK_FAIL(true); \ - }) -#define FAIL(fmt...) _FAIL(0, fmt) -#define FAIL_ERRNO(fmt...) _FAIL(errno, fmt) -#define FAIL_LIBBPF(err, msg) \ - ({ \ - char __buf[MAX_STRERR_LEN]; \ - libbpf_strerror((err), __buf, sizeof(__buf)); \ - FAIL("%s: %s", (msg), __buf); \ - }) - -/* Wrappers that fail the test on error and report it. 
*/ - -#define xaccept_nonblock(fd, addr, len) \ - ({ \ - int __ret = \ - accept_timeout((fd), (addr), (len), IO_TIMEOUT_SEC); \ - if (__ret == -1) \ - FAIL_ERRNO("accept"); \ - __ret; \ - }) - -#define xbind(fd, addr, len) \ - ({ \ - int __ret = bind((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("bind"); \ - __ret; \ - }) - -#define xclose(fd) \ - ({ \ - int __ret = close((fd)); \ - if (__ret == -1) \ - FAIL_ERRNO("close"); \ - __ret; \ - }) - -#define xconnect(fd, addr, len) \ - ({ \ - int __ret = connect((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("connect"); \ - __ret; \ - }) - -#define xgetsockname(fd, addr, len) \ - ({ \ - int __ret = getsockname((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("getsockname"); \ - __ret; \ - }) - -#define xgetsockopt(fd, level, name, val, len) \ - ({ \ - int __ret = getsockopt((fd), (level), (name), (val), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("getsockopt(" #name ")"); \ - __ret; \ - }) - -#define xlisten(fd, backlog) \ - ({ \ - int __ret = listen((fd), (backlog)); \ - if (__ret == -1) \ - FAIL_ERRNO("listen"); \ - __ret; \ - }) - -#define xsetsockopt(fd, level, name, val, len) \ - ({ \ - int __ret = setsockopt((fd), (level), (name), (val), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("setsockopt(" #name ")"); \ - __ret; \ - }) - -#define xsend(fd, buf, len, flags) \ - ({ \ - ssize_t __ret = send((fd), (buf), (len), (flags)); \ - if (__ret == -1) \ - FAIL_ERRNO("send"); \ - __ret; \ - }) - -#define xrecv_nonblock(fd, buf, len, flags) \ - ({ \ - ssize_t __ret = recv_timeout((fd), (buf), (len), (flags), \ - IO_TIMEOUT_SEC); \ - if (__ret == -1) \ - FAIL_ERRNO("recv"); \ - __ret; \ - }) - -#define xsocket(family, sotype, flags) \ - ({ \ - int __ret = socket(family, sotype, flags); \ - if (__ret == -1) \ - FAIL_ERRNO("socket"); \ - __ret; \ - }) - #define xbpf_map_delete_elem(fd, key) \ ({ \ int __ret = bpf_map_delete_elem((fd), (key)); \ @@ -193,130 +66,6 @@ __ret; \ }) -static inline void 
close_fd(int *fd) -{ - if (*fd >= 0) - xclose(*fd); -} - -#define __close_fd __attribute__((cleanup(close_fd))) - -static inline int poll_connect(int fd, unsigned int timeout_sec) -{ - struct timeval timeout = { .tv_sec = timeout_sec }; - fd_set wfds; - int r, eval; - socklen_t esize = sizeof(eval); - - FD_ZERO(&wfds); - FD_SET(fd, &wfds); - - r = select(fd + 1, NULL, &wfds, NULL, &timeout); - if (r == 0) - errno = ETIME; - if (r != 1) - return -1; - - if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &eval, &esize) < 0) - return -1; - if (eval != 0) { - errno = eval; - return -1; - } - - return 0; -} - -static inline int poll_read(int fd, unsigned int timeout_sec) -{ - struct timeval timeout = { .tv_sec = timeout_sec }; - fd_set rfds; - int r; - - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - - r = select(fd + 1, &rfds, NULL, NULL, &timeout); - if (r == 0) - errno = ETIME; - - return r == 1 ? 0 : -1; -} - -static inline int accept_timeout(int fd, struct sockaddr *addr, socklen_t *len, - unsigned int timeout_sec) -{ - if (poll_read(fd, timeout_sec)) - return -1; - - return accept(fd, addr, len); -} - -static inline int recv_timeout(int fd, void *buf, size_t len, int flags, - unsigned int timeout_sec) -{ - if (poll_read(fd, timeout_sec)) - return -1; - - return recv(fd, buf, len, flags); -} - -static inline void init_addr_loopback4(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_in *addr4 = memset(ss, 0, sizeof(*ss)); - - addr4->sin_family = AF_INET; - addr4->sin_port = 0; - addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - *len = sizeof(*addr4); -} - -static inline void init_addr_loopback6(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_in6 *addr6 = memset(ss, 0, sizeof(*ss)); - - addr6->sin6_family = AF_INET6; - addr6->sin6_port = 0; - addr6->sin6_addr = in6addr_loopback; - *len = sizeof(*addr6); -} - -static inline void init_addr_loopback_vsock(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_vm *addr = memset(ss, 0, 
sizeof(*ss)); - - addr->svm_family = AF_VSOCK; - addr->svm_port = VMADDR_PORT_ANY; - addr->svm_cid = VMADDR_CID_LOCAL; - *len = sizeof(*addr); -} - -static inline void init_addr_loopback(int family, struct sockaddr_storage *ss, - socklen_t *len) -{ - switch (family) { - case AF_INET: - init_addr_loopback4(ss, len); - return; - case AF_INET6: - init_addr_loopback6(ss, len); - return; - case AF_VSOCK: - init_addr_loopback_vsock(ss, len); - return; - default: - FAIL("unsupported address family %d", family); - } -} - -static inline struct sockaddr *sockaddr(struct sockaddr_storage *ss) -{ - return (struct sockaddr *)ss; -} - static inline int add_to_sockmap(int sock_mapfd, int fd1, int fd2) { u64 value; @@ -334,136 +83,4 @@ static inline int add_to_sockmap(int sock_mapfd, int fd1, int fd2) return xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); } -static inline int enable_reuseport(int s, int progfd) -{ - int err, one = 1; - - err = xsetsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); - if (err) - return -1; - err = xsetsockopt(s, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &progfd, - sizeof(progfd)); - if (err) - return -1; - - return 0; -} - -static inline int socket_loopback_reuseport(int family, int sotype, int progfd) -{ - struct sockaddr_storage addr; - socklen_t len = 0; - int err, s; - - init_addr_loopback(family, &addr, &len); - - s = xsocket(family, sotype, 0); - if (s == -1) - return -1; - - if (progfd >= 0) - enable_reuseport(s, progfd); - - err = xbind(s, sockaddr(&addr), len); - if (err) - goto close; - - if (sotype & SOCK_DGRAM) - return s; - - err = xlisten(s, SOMAXCONN); - if (err) - goto close; - - return s; -close: - xclose(s); - return -1; -} - -static inline int socket_loopback(int family, int sotype) -{ - return socket_loopback_reuseport(family, sotype, -1); -} - -static inline int create_pair(int family, int sotype, int *p0, int *p1) -{ - __close_fd int s, c = -1, p = -1; - struct sockaddr_storage addr; - socklen_t len = 
sizeof(addr); - int err; - - s = socket_loopback(family, sotype); - if (s < 0) - return s; - - err = xgetsockname(s, sockaddr(&addr), &len); - if (err) - return err; - - c = xsocket(family, sotype, 0); - if (c < 0) - return c; - - err = connect(c, sockaddr(&addr), len); - if (err) { - if (errno != EINPROGRESS) { - FAIL_ERRNO("connect"); - return err; - } - - err = poll_connect(c, IO_TIMEOUT_SEC); - if (err) { - FAIL_ERRNO("poll_connect"); - return err; - } - } - - switch (sotype & SOCK_TYPE_MASK) { - case SOCK_DGRAM: - err = xgetsockname(c, sockaddr(&addr), &len); - if (err) - return err; - - err = xconnect(s, sockaddr(&addr), len); - if (err) - return err; - - *p0 = take_fd(s); - break; - case SOCK_STREAM: - case SOCK_SEQPACKET: - p = xaccept_nonblock(s, NULL, NULL); - if (p < 0) - return p; - - *p0 = take_fd(p); - break; - default: - FAIL("Unsupported socket type %#x", sotype); - return -EOPNOTSUPP; - } - - *p1 = take_fd(c); - return 0; -} - -static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, - int *p0, int *p1) -{ - int err; - - err = create_pair(family, sotype, c0, p0); - if (err) - return err; - - err = create_pair(family, sotype, c1, p1); - if (err) { - close(*c0); - close(*p0); - } - - return err; -} - #endif // __SOCKMAP_HELPERS__ From 4a58963d10fa3cb654b859e3f9a8aecbcf9f4982 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:57 -0800 Subject: [PATCH 433/807] selftests/bpf: Test bpf_skb_change_tail() in TC ingress Similarly to the previous test, we also need a test case to cover positive offsets as well, TC is an excellent hook for this. 
Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Tested-by: Zijian Zhang Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-5-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/tc_change_tail.c | 62 ++++++++++ .../selftests/bpf/progs/test_tc_change_tail.c | 106 ++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_change_tail.c create mode 100644 tools/testing/selftests/bpf/progs/test_tc_change_tail.c diff --git a/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c b/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c new file mode 100644 index 000000000000..74752233e779 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "test_tc_change_tail.skel.h" +#include "socket_helpers.h" + +#define LO_IFINDEX 1 + +void test_tc_change_tail(void) +{ + LIBBPF_OPTS(bpf_tcx_opts, tcx_opts); + struct test_tc_change_tail *skel = NULL; + struct bpf_link *link; + int c1, p1; + char buf[2]; + int ret; + + skel = test_tc_change_tail__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tc_change_tail__open_and_load")) + return; + + link = bpf_program__attach_tcx(skel->progs.change_tail, LO_IFINDEX, + &tcx_opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_tcx")) + goto destroy; + + skel->links.change_tail = link; + ret = create_pair(AF_INET, SOCK_DGRAM, &c1, &p1); + if (!ASSERT_OK(ret, "create_pair")) + goto destroy; + + ret = xsend(p1, "Tr", 2, 0); + ASSERT_EQ(ret, 2, "xsend(p1)"); + ret = recv(c1, buf, 2, 0); + ASSERT_EQ(ret, 2, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + ret = xsend(p1, "G", 1, 0); + ASSERT_EQ(ret, 1, "xsend(p1)"); + ret = recv(c1, buf, 2, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + ret = xsend(p1, "E", 1, 0); + ASSERT_EQ(ret, 1, 
"xsend(p1)"); + ret = recv(c1, buf, 1, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + + ret = xsend(p1, "Z", 1, 0); + ASSERT_EQ(ret, 1, "xsend(p1)"); + ret = recv(c1, buf, 1, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + + close(c1); + close(p1); +destroy: + test_tc_change_tail__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_tc_change_tail.c b/tools/testing/selftests/bpf/progs/test_tc_change_tail.c new file mode 100644 index 000000000000..28edafe803f0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_change_tail.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +long change_tail_ret = 1; + +static __always_inline struct iphdr *parse_ip_header(struct __sk_buff *skb, int *ip_proto) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + struct ethhdr *eth = data; + struct iphdr *iph; + + /* Verify Ethernet header */ + if ((void *)(data + sizeof(*eth)) > data_end) + return NULL; + + /* Skip Ethernet header to get to IP header */ + iph = (void *)(data + sizeof(struct ethhdr)); + + /* Verify IP header */ + if ((void *)(data + sizeof(struct ethhdr) + sizeof(*iph)) > data_end) + return NULL; + + /* Basic IP header validation */ + if (iph->version != 4) /* Only support IPv4 */ + return NULL; + + if (iph->ihl < 5) /* Minimum IP header length */ + return NULL; + + *ip_proto = iph->protocol; + return iph; +} + +static __always_inline struct udphdr *parse_udp_header(struct __sk_buff *skb, struct iphdr *iph) +{ + void *data_end = (void *)(long)skb->data_end; + void *hdr = (void *)iph; + struct udphdr *udp; + + /* Calculate UDP header position */ + udp = hdr + (iph->ihl * 4); + hdr = (void *)udp; + + /* Verify UDP header bounds */ + if ((void *)(hdr + sizeof(*udp)) > data_end) + return NULL; + + return udp; 
+} + +SEC("tc/ingress") +int change_tail(struct __sk_buff *skb) +{ + int len = skb->len; + struct udphdr *udp; + struct iphdr *iph; + void *data_end; + char *payload; + int ip_proto; + + bpf_skb_pull_data(skb, len); + + data_end = (void *)(long)skb->data_end; + iph = parse_ip_header(skb, &ip_proto); + if (!iph) + return TCX_PASS; + + if (ip_proto != IPPROTO_UDP) + return TCX_PASS; + + udp = parse_udp_header(skb, iph); + if (!udp) + return TCX_PASS; + + payload = (char *)udp + (sizeof(struct udphdr)); + if (payload + 1 > (char *)data_end) + return TCX_PASS; + + if (payload[0] == 'T') { /* Trim the packet */ + change_tail_ret = bpf_skb_change_tail(skb, len - 1, 0); + if (!change_tail_ret) + bpf_skb_change_tail(skb, len, 0); + return TCX_PASS; + } else if (payload[0] == 'G') { /* Grow the packet */ + change_tail_ret = bpf_skb_change_tail(skb, len + 1, 0); + if (!change_tail_ret) + bpf_skb_change_tail(skb, len, 0); + return TCX_PASS; + } else if (payload[0] == 'E') { /* Error */ + change_tail_ret = bpf_skb_change_tail(skb, 65535, 0); + return TCX_PASS; + } else if (payload[0] == 'Z') { /* Zero */ + change_tail_ret = bpf_skb_change_tail(skb, 0, 0); + return TCX_PASS; + } + return TCX_DROP; +} + +char _license[] SEC("license") = "GPL"; From c384481006476ac65478fa3584c7245782e52f34 Mon Sep 17 00:00:00 2001 From: Nikolaus Voss Date: Thu, 19 Dec 2024 11:54:11 +0100 Subject: [PATCH 434/807] clk: clk-imx8mp-audiomix: fix function signature clk_imx8mp_audiomix_reset_controller_register() in the "if !CONFIG_RESET_CONTROLLER" branch had the first argument missing. It is an empty function for this branch so it wasn't immediately apparent. 
Fixes: 6f0e817175c5 ("clk: imx: clk-audiomix: Add reset controller") Cc: # 6.12.x Signed-off-by: Nikolaus Voss Link: https://lore.kernel.org/r/20241219105447.889CB11FE@mail.steuer-voss.de Reviewed-by: Daniel Baluta Acked-by: Shengjiu Wang Reviewed-by: Peng Fan Signed-off-by: Stephen Boyd --- drivers/clk/imx/clk-imx8mp-audiomix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clk/imx/clk-imx8mp-audiomix.c b/drivers/clk/imx/clk-imx8mp-audiomix.c index b2cb157703c5..c409fc7e0618 100644 --- a/drivers/clk/imx/clk-imx8mp-audiomix.c +++ b/drivers/clk/imx/clk-imx8mp-audiomix.c @@ -278,7 +278,8 @@ static int clk_imx8mp_audiomix_reset_controller_register(struct device *dev, #else /* !CONFIG_RESET_CONTROLLER */ -static int clk_imx8mp_audiomix_reset_controller_register(struct clk_imx8mp_audiomix_priv *priv) +static int clk_imx8mp_audiomix_reset_controller_register(struct device *dev, + struct clk_imx8mp_audiomix_priv *priv) { return 0; } From d67393f4d28ef0544eaf382f1123dcaf56495dc9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 5 Dec 2024 14:20:43 +0100 Subject: [PATCH 435/807] kbuild: Drop support for include/asm- in headers_check.pl "include/asm-" was replaced by "arch//include/asm" a long time ago. All assembler header files are now included using "#include ", so there is no longer a need to rewrite paths. 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Masahiro Yamada --- usr/include/Makefile | 2 +- usr/include/headers_check.pl | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/usr/include/Makefile b/usr/include/Makefile index 771e32872b2a..6c6de1b1622b 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -78,7 +78,7 @@ quiet_cmd_hdrtest = HDRTEST $< cmd_hdrtest = \ $(CC) $(c_flags) -fsyntax-only -x c /dev/null \ $(if $(filter-out $(no-header-test), $*.h), -include $< -include $<); \ - $(PERL) $(src)/headers_check.pl $(obj) $(SRCARCH) $<; \ + $(PERL) $(src)/headers_check.pl $(obj) $<; \ touch $@ $(obj)/%.hdrtest: $(obj)/%.h FORCE diff --git a/usr/include/headers_check.pl b/usr/include/headers_check.pl index b6aec5e4365f..2b70bfa5558e 100755 --- a/usr/include/headers_check.pl +++ b/usr/include/headers_check.pl @@ -3,9 +3,8 @@ # # headers_check.pl execute a number of trivial consistency checks # -# Usage: headers_check.pl dir arch [files...] +# Usage: headers_check.pl dir [files...] # dir: dir to look for included files -# arch: architecture # files: list of files to check # # The script reads the supplied files line by line and: @@ -23,7 +22,7 @@ use warnings; use strict; use File::Basename; -my ($dir, $arch, @files) = @ARGV; +my ($dir, @files) = @ARGV; my $ret = 0; my $line; @@ -54,10 +53,6 @@ sub check_include my $inc = $1; my $found; $found = stat($dir . "/" . $inc); - if (!$found) { - $inc =~ s#asm/#asm-$arch/#; - $found = stat($dir . "/" . $inc); - } if (!$found) { printf STDERR "$filename:$lineno: included file '$inc' is not exported\n"; $ret = 1; From a34e92d2e831729f0ed5df20d15b4df419cd0ba4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 3 Dec 2024 20:14:45 +0900 Subject: [PATCH 436/807] kbuild: deb-pkg: add debarch for ARCH=um 'make ARCH=um bindeb-pkg' shows the following warning. 
$ make ARCH=um bindeb-pkg [snip] GEN debian ** ** ** WARNING ** ** ** Your architecture doesn't have its equivalent Debian userspace architecture defined! Falling back to the current host architecture (amd64). Please add support for um to ./scripts/package/mkdebian ... This commit hard-codes i386/amd64 because UML is only supported for x86. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/mkdebian | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index 4ffcc70f8e31..b038a1380b8a 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -70,6 +70,13 @@ set_debarch() { debarch=sh4$(if_enabled_echo CONFIG_CPU_BIG_ENDIAN eb) fi ;; + um) + if is_enabled CONFIG_64BIT; then + debarch=amd64 + else + debarch=i386 + fi + ;; esac if [ -z "$debarch" ]; then debarch=$(dpkg-architecture -qDEB_HOST_ARCH) From 54956567a055345d17438f08c895c68aff3f4cf2 Mon Sep 17 00:00:00 2001 From: Nicolas Schier Date: Thu, 12 Dec 2024 14:05:29 +0100 Subject: [PATCH 437/807] kbuild: deb-pkg: Do not install maint scripts for arch 'um' Stop installing Debian maintainer scripts when building a user-mode-linux Debian package. Debian maintainer scripts are used for e.g. requesting rebuilds of initrd, rebuilding DKMS modules and updating of grub configuration. As all of this is not relevant for UML but also may lead to failures while processing the kernel hooks, do no more install maintainer scripts for the UML package. 
Suggested-by: Masahiro Yamada Signed-off-by: Nicolas Schier Signed-off-by: Masahiro Yamada --- scripts/package/builddeb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index fb686fd3266f..ad7aba0f268e 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -63,6 +63,12 @@ install_linux_image () { esac cp "$(${MAKE} -s -f ${srctree}/Makefile image_name)" "${pdir}/${installed_image_path}" + if [ "${ARCH}" != um ]; then + install_maint_scripts "${pdir}" + fi +} + +install_maint_scripts () { # Install the maintainer scripts # Note: hook scripts under /etc/kernel are also executed by official Debian # kernel packages, as well as kernel packages built using make-kpkg. From 9435dc77a33fa20afec7cd35ceaae5f7f42dbbe2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 13 Dec 2024 00:46:15 +0900 Subject: [PATCH 438/807] modpost: distinguish same module paths from different dump files Since commit 13b25489b6f8 ("kbuild: change working directory to external module directory with M="), module paths are always relative to the top of the external module tree. The module paths recorded in Module.symvers are no longer globally unique when they are passed via KBUILD_EXTRA_SYMBOLS for building other external modules, which may result in false-positive "exported twice" errors. Such errors should not occur because external modules should be able to override in-tree modules. To address this, record the dump file path in struct module and check it when searching for a module. 
Fixes: 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") Reported-by: Jon Hunter Closes: https://lore.kernel.org/all/eb21a546-a19c-40df-b821-bbba80f19a3d@nvidia.com/ Signed-off-by: Masahiro Yamada Tested-by: Jon Hunter --- scripts/mod/modpost.c | 17 +++++++++-------- scripts/mod/modpost.h | 3 ++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index fb787a5715f5..94ee49207a45 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -155,12 +155,13 @@ char *get_line(char **stringp) /* A list of all modules we processed */ LIST_HEAD(modules); -static struct module *find_module(const char *modname) +static struct module *find_module(const char *filename, const char *modname) { struct module *mod; list_for_each_entry(mod, &modules, list) { - if (strcmp(mod->name, modname) == 0) + if (!strcmp(mod->dump_file, filename) && + !strcmp(mod->name, modname)) return mod; } return NULL; @@ -2030,10 +2031,10 @@ static void read_dump(const char *fname) continue; } - mod = find_module(modname); + mod = find_module(fname, modname); if (!mod) { mod = new_module(modname, strlen(modname)); - mod->from_dump = true; + mod->dump_file = fname; } s = sym_add_exported(symname, mod, gpl_only, namespace); sym_set_crc(s, crc); @@ -2052,7 +2053,7 @@ static void write_dump(const char *fname) struct symbol *sym; list_for_each_entry(mod, &modules, list) { - if (mod->from_dump) + if (mod->dump_file) continue; list_for_each_entry(sym, &mod->exported_symbols, list) { if (trim_unused_exports && !sym->used) @@ -2076,7 +2077,7 @@ static void write_namespace_deps_files(const char *fname) list_for_each_entry(mod, &modules, list) { - if (mod->from_dump || list_empty(&mod->missing_namespaces)) + if (mod->dump_file || list_empty(&mod->missing_namespaces)) continue; buf_printf(&ns_deps_buf, "%s.ko:", mod->name); @@ -2194,7 +2195,7 @@ int main(int argc, char **argv) read_symbols_from_files(files_source); 
list_for_each_entry(mod, &modules, list) { - if (mod->from_dump || mod->is_vmlinux) + if (mod->dump_file || mod->is_vmlinux) continue; check_modname_len(mod); @@ -2205,7 +2206,7 @@ int main(int argc, char **argv) handle_white_list_exports(unused_exports_white_list); list_for_each_entry(mod, &modules, list) { - if (mod->from_dump) + if (mod->dump_file) continue; if (mod->is_vmlinux) diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 49848fcbe2a1..8b72c227ebf4 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -95,14 +95,15 @@ struct module_alias { /** * struct module - represent a module (vmlinux or *.ko) * + * @dump_file: path to the .symvers file if loaded from a file * @aliases: list head for module_aliases */ struct module { struct list_head list; struct list_head exported_symbols; struct list_head unresolved_symbols; + const char *dump_file; bool is_gpl_compatible; - bool from_dump; /* true if module was loaded from *.symvers */ bool is_vmlinux; bool seen; bool has_init; From e84a3bf7f4aa669c05e3884497774148ac111468 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Dec 2024 10:19:04 -0500 Subject: [PATCH 439/807] staging: gpib: Fix allyesconfig build failures My tests run an allyesconfig build and it failed with the following errors: LD [M] samples/kfifo/dma-example.ko ld.lld: error: undefined symbol: nec7210_board_reset ld.lld: error: undefined symbol: nec7210_read ld.lld: error: undefined symbol: nec7210_write It appears that some modules call the function nec7210_board_reset() that is defined in nec7210.c. In an allyesconfig build, these other modules are built in. But the file that holds nec7210_board_reset() has: obj-m += nec7210.o Where that "-m" means it only gets built as a module. With the other modules built in, they have no access to nec7210_board_reset() and the build fails. This isn't the only function. 
After fixing that one, I hit another: ld.lld: error: undefined symbol: push_gpib_event ld.lld: error: undefined symbol: gpib_match_device_path Where push_gpib_event() was also used outside of the file it was defined in, and that file too only was built as a module. Since the directory that holds nec7210.c is only traversed when CONFIG_GPIB_NEC7210 is set, and the directory with gpib_common.c is only traversed when CONFIG_GPIB_COMMON is set, use those configs as the option to build those modules. When it is an allyesconfig, then they will both be built in and their functions will be available to the other modules that are also built in. Fixes: 3ba84ac69b53e ("staging: gpib: Add nec7210 GPIB chip driver") Fixes: 9dde4559e9395 ("staging: gpib: Add GPIB common core driver") Signed-off-by: Steven Rostedt (Google) Reviewed-by: Palmer Dabbelt Signed-off-by: Linus Torvalds --- drivers/staging/gpib/common/Makefile | 2 +- drivers/staging/gpib/nec7210/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/gpib/common/Makefile b/drivers/staging/gpib/common/Makefile index 0c4c77bea75b..460586edb574 100644 --- a/drivers/staging/gpib/common/Makefile +++ b/drivers/staging/gpib/common/Makefile @@ -1,5 +1,5 @@ -obj-m += gpib_common.o +obj-$(CONFIG_GPIB_COMMON) += gpib_common.o gpib_common-objs := gpib_os.o iblib.o diff --git a/drivers/staging/gpib/nec7210/Makefile b/drivers/staging/gpib/nec7210/Makefile index 8d4d90f21109..64330f2e89d1 100644 --- a/drivers/staging/gpib/nec7210/Makefile +++ b/drivers/staging/gpib/nec7210/Makefile @@ -1,4 +1,4 @@ -obj-m += nec7210.o +obj-$(CONFIG_GPIB_NEC7210) += nec7210.o From 37d1d99b8806b24ffe4a2b453620df932994a5c0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 17 Dec 2024 08:05:40 +0100 Subject: [PATCH 440/807] KVM: VMX: don't include '' directly The header clearly states that it does not want to be included directly, only via ''. Replace the include accordingly.
Signed-off-by: Wolfram Sang Message-ID: <20241217070539.2433-2-wsa+renesas@sang-engineering.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 1715d2ab07be..ad9116a99bcc 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -2,7 +2,7 @@ #ifndef __KVM_X86_VMX_POSTED_INTR_H #define __KVM_X86_VMX_POSTED_INTR_H -#include +#include #include void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); From 398b7b6cb9e046f137a188670da12f790492b56b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 19 Dec 2024 07:43:20 -0500 Subject: [PATCH 441/807] KVM: x86: let it be known that ignore_msrs is a bad idea When running KVM with ignore_msrs=1 and report_ignored_msrs=0, the user has no clue that the guest is being lied to. This may cause bug reports such as https://gitlab.com/qemu-project/qemu/-/issues/2571, where enabling a CPUID bit in QEMU caused Linux guests to try reading MSR_CU_DEF_ERR; and being lied to about the existence of MSR_CU_DEF_ERR caused the guest to assume other things about the local APIC which were not true: Sep 14 12:02:53 kernel: mce: [Firmware Bug]: Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly. Sep 14 12:02:53 kernel: unchecked MSR access error: RDMSR from 0x852 at rIP: 0xffffffffb548ffa7 (native_read_msr+0x7/0x40) Sep 14 12:02:53 kernel: Call Trace: ... Sep 14 12:02:53 kernel: native_apic_msr_read+0x20/0x30 Sep 14 12:02:53 kernel: setup_APIC_eilvt+0x47/0x110 Sep 14 12:02:53 kernel: mce_amd_feature_init+0x485/0x4e0 ... Sep 14 12:02:53 kernel: [Firmware Bug]: cpu 0, try to use APIC520 (LVT offset 2) for vector 0xf4, but the register is already in use for vector 0x0 on this cpu Without report_ignored_msrs=0 at least the host kernel log will contain enough information to avoid going on a wild goose chase.
But if reports about individual MSR accesses are being silenced too, at least complain loudly the first time a VM is started. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8160baf3838..12fa68a06966 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12724,6 +12724,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_xen_init_vm(kvm); + if (ignore_msrs && !report_ignored_msrs) { + pr_warn_once("Running KVM with ignore_msrs=1 and report_ignored_msrs=0 is not a\n" + "a supported configuration. Lying to the guest about the existence of MSRs\n" + "may cause the guest operating system to hang or produce errors. If a guest\n" + "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n"); + } + return 0; out_uninit_mmu: From 4bbf9020becbfd8fc2c3da790855b7042fad455b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 Dec 2024 13:22:21 -0800 Subject: [PATCH 442/807] Linux 6.13-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e5b8a8832c0c..5c9b1d2d59b4 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* From 0b7a66a2c864859fbf9bb16229c03172eef02c05 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 5 Dec 2024 17:06:02 +0100 Subject: [PATCH 443/807] preempt: Move PREEMPT_RT before PREEMPT in vermagic. Since the dynamic preemption has been enabled for PREEMPT_RT we have now CONFIG_PREEMPT and CONFIG_PREEMPT_RT set simultaneously. This affects the vermagic strings which comes now PREEMPT with PREEMPT_RT enabled. The PREEMPT_RT module usually can not be loaded on a PREEMPT kernel because some symbols are missing. However if the symbols are fine then it continues and it crashes later. 
The problem is that the struct module has a different layout and the num_exentries or init members are at a different position leading to a crash later on. This is not necessarily caught by the size check in elf_validity_cache_index_mod() because the mem member has an alignment requirement of __module_memory_align which is big enough to keep the total size unchanged. Therefore we should keep the string accurate instead of removing it. Move the PREEMPT_RT check before the PREEMPT so that it takes precedence if both symbols are enabled. Fixes: 35772d627b55c ("sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20241205160602.3lIAsJRT@linutronix.de Signed-off-by: Petr Pavlu --- include/linux/vermagic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index a54046bf37e5..939ceabcaf06 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -15,10 +15,10 @@ #else #define MODULE_VERMAGIC_SMP "" #endif -#ifdef CONFIG_PREEMPT_BUILD -#define MODULE_VERMAGIC_PREEMPT "preempt " -#elif defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_PREEMPT_RT #define MODULE_VERMAGIC_PREEMPT "preempt_rt " +#elif defined(CONFIG_PREEMPT_BUILD) +#define MODULE_VERMAGIC_PREEMPT "preempt " #else #define MODULE_VERMAGIC_PREEMPT "" #endif From b8ea3b1ff544b47c1d64a22860f33b755638164e Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Fri, 13 Dec 2024 22:50:21 +0530 Subject: [PATCH 444/807] smb: enable reuse of deferred file handles for write operations Previously, deferred file handles were reused only for read operations, this commit extends to reusing deferred handles for write operations. By reusing these handles we can reduce the need for open/close operations over the wire.
Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index a58a3333ecc3..3b2d33291a7e 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -990,7 +990,11 @@ int cifs_open(struct inode *inode, struct file *file) } /* Get the cached handle as SMB2 close is deferred */ - rc = cifs_get_readable_path(tcon, full_path, &cfile); + if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) { + rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile); + } else { + rc = cifs_get_readable_path(tcon, full_path, &cfile); + } if (rc == 0) { if (file->f_flags == cfile->f_flags) { file->private_data = cfile; From f17224c2a7bdc11a17c96d9d8cb2d829f54d40bb Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 20 Dec 2024 21:59:37 +0000 Subject: [PATCH 445/807] cifs: Remove unused is_server_using_iface() The last use of is_server_using_iface() was removed in 2022 by commit aa45dadd34e4 ("cifs: change iface_list from array to sorted linked list") Remove it. Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: Steve French --- fs/smb/client/cifsproto.h | 2 -- fs/smb/client/sess.c | 25 ------------------------- 2 files changed, 27 deletions(-) diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 754417cb3294..d26f9bbb5382 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -614,8 +614,6 @@ int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); int cifs_try_adding_channels(struct cifs_ses *ses); -bool is_server_using_iface(struct TCP_Server_Info *server, - struct cifs_server_iface *iface); bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); void cifs_ses_mark_for_reconnect(struct cifs_ses *ses); diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 3306fb655136..91d4d409cb1d 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -27,31 +27,6 @@ static int cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface); -bool -is_server_using_iface(struct TCP_Server_Info *server, - struct cifs_server_iface *iface) -{ - struct sockaddr_in *i4 = (struct sockaddr_in *)&iface->sockaddr; - struct sockaddr_in6 *i6 = (struct sockaddr_in6 *)&iface->sockaddr; - struct sockaddr_in *s4 = (struct sockaddr_in *)&server->dstaddr; - struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&server->dstaddr; - - if (server->dstaddr.ss_family != iface->sockaddr.ss_family) - return false; - if (server->dstaddr.ss_family == AF_INET) { - if (s4->sin_addr.s_addr != i4->sin_addr.s_addr) - return false; - } else if (server->dstaddr.ss_family == AF_INET6) { - if (memcmp(&s6->sin6_addr, &i6->sin6_addr, - sizeof(i6->sin6_addr)) != 0) - return false; - } else { - /* unknown family.. 
*/ - return false; - } - return true; -} - bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) { int i; From d08555758fb1dbfb48f0cb58176fdc98009e6070 Mon Sep 17 00:00:00 2001 From: Chun-Kuang Hu Date: Thu, 12 Dec 2024 00:19:08 +0000 Subject: [PATCH 446/807] Revert "drm/mediatek: dsi: Correct calculation formula of PHY Timing" This reverts commit 417d8c47271d5cf1a705e997065873b2a9a36fd4. With that patch the panel in the Tentacruel ASUS Chromebook CM14 (CM1402F) flickers. There are 1 or 2 times per second a black panel. Stable Kernel 6.11.5 and mainline 6.12-rc4 works only when reverse that patch. Fixes: 417d8c47271d ("drm/mediatek: dsi: Correct calculation formula of PHY Timing") Cc: stable@vger.kernel.org Cc: Shuijing Li Reported-by: Jens Ziller Closes: https://patchwork.kernel.org/project/dri-devel/patch/20240412031208.30688-1-shuijing.li@mediatek.com/ Link: https://patchwork.kernel.org/project/dri-devel/patch/20241212001908.6056-1-chunkuang.hu@kernel.org/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_dsi.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_dsi.c b/drivers/gpu/drm/mediatek/mtk_dsi.c index 33ceeb8d6925..3907863579b9 100644 --- a/drivers/gpu/drm/mediatek/mtk_dsi.c +++ b/drivers/gpu/drm/mediatek/mtk_dsi.c @@ -246,23 +246,22 @@ static void mtk_dsi_phy_timconfig(struct mtk_dsi *dsi) u32 data_rate_mhz = DIV_ROUND_UP(dsi->data_rate, HZ_PER_MHZ); struct mtk_phy_timing *timing = &dsi->phy_timing; - timing->lpx = (80 * data_rate_mhz / (8 * 1000)) + 1; - timing->da_hs_prepare = (59 * data_rate_mhz + 4 * 1000) / 8000 + 1; - timing->da_hs_zero = (163 * data_rate_mhz + 11 * 1000) / 8000 + 1 - + timing->lpx = (60 * data_rate_mhz / (8 * 1000)) + 1; + timing->da_hs_prepare = (80 * data_rate_mhz + 4 * 1000) / 8000; + timing->da_hs_zero = (170 * data_rate_mhz + 10 * 1000) / 8000 + 1 - timing->da_hs_prepare; - timing->da_hs_trail = (78 * data_rate_mhz 
+ 7 * 1000) / 8000 + 1; + timing->da_hs_trail = timing->da_hs_prepare + 1; - timing->ta_go = 4 * timing->lpx; - timing->ta_sure = 3 * timing->lpx / 2; - timing->ta_get = 5 * timing->lpx; - timing->da_hs_exit = (118 * data_rate_mhz / (8 * 1000)) + 1; + timing->ta_go = 4 * timing->lpx - 2; + timing->ta_sure = timing->lpx + 2; + timing->ta_get = 4 * timing->lpx; + timing->da_hs_exit = 2 * timing->lpx + 1; - timing->clk_hs_prepare = (57 * data_rate_mhz / (8 * 1000)) + 1; - timing->clk_hs_post = (65 * data_rate_mhz + 53 * 1000) / 8000 + 1; - timing->clk_hs_trail = (78 * data_rate_mhz + 7 * 1000) / 8000 + 1; - timing->clk_hs_zero = (330 * data_rate_mhz / (8 * 1000)) + 1 - - timing->clk_hs_prepare; - timing->clk_hs_exit = (118 * data_rate_mhz / (8 * 1000)) + 1; + timing->clk_hs_prepare = 70 * data_rate_mhz / (8 * 1000); + timing->clk_hs_post = timing->clk_hs_prepare + 8; + timing->clk_hs_trail = timing->clk_hs_prepare; + timing->clk_hs_zero = timing->clk_hs_trail * 4; + timing->clk_hs_exit = 2 * timing->clk_hs_trail; timcon0 = FIELD_PREP(LPX, timing->lpx) | FIELD_PREP(HS_PREP, timing->da_hs_prepare) | From 8673a6c2d9e483dfeeef83a1f06f59e05636f4d1 Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Fri, 20 Dec 2024 13:52:46 +0800 Subject: [PATCH 447/807] RDMA/hns: Fix mapping error of zero-hop WQE buffer Due to HW limitation, the three region of WQE buffer must be mapped and set to HW in a fixed order: SQ buffer, SGE buffer, and RQ buffer. Currently when one region is zero-hop while the other two are not, the zero-hop region will not be mapped. This violate the limitation above and leads to address error. 
Fixes: 38389eaa4db1 ("RDMA/hns: Add mtr support for mixed multihop addressing") Signed-off-by: wenglianfa Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hem.c | 43 ++++++++++++++++-------- drivers/infiniband/hw/hns/hns_roce_mr.c | 5 --- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index f84521be3bea..605562122ecc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -931,6 +931,7 @@ struct hns_roce_hem_item { size_t count; /* max ba numbers */ int start; /* start buf offset in this hem */ int end; /* end buf offset in this hem */ + bool exist_bt; }; /* All HEM items are linked in a tree structure */ @@ -959,6 +960,7 @@ hem_list_alloc_item(struct hns_roce_dev *hr_dev, int start, int end, int count, } } + hem->exist_bt = exist_bt; hem->count = count; hem->start = start; hem->end = end; @@ -969,22 +971,22 @@ hem_list_alloc_item(struct hns_roce_dev *hr_dev, int start, int end, int count, } static void hem_list_free_item(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_item *hem, bool exist_bt) + struct hns_roce_hem_item *hem) { - if (exist_bt) + if (hem->exist_bt) dma_free_coherent(hr_dev->dev, hem->count * BA_BYTE_LEN, hem->addr, hem->dma_addr); kfree(hem); } static void hem_list_free_all(struct hns_roce_dev *hr_dev, - struct list_head *head, bool exist_bt) + struct list_head *head) { struct hns_roce_hem_item *hem, *temp_hem; list_for_each_entry_safe(hem, temp_hem, head, list) { list_del(&hem->list); - hem_list_free_item(hr_dev, hem, exist_bt); + hem_list_free_item(hr_dev, hem); } } @@ -1084,6 +1086,10 @@ int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions, for (i = 0; i < region_cnt; i++) { r = (struct hns_roce_buf_region *)&regions[i]; + /* when r->hopnum = 0,
the region should not occupy root_ba. */ + if (!r->hopnum) + continue; + if (r->hopnum > 1) { step = hem_list_calc_ba_range(r->hopnum, 1, unit); if (step > 0) @@ -1177,7 +1183,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, err_exit: for (level = 1; level < hopnum; level++) - hem_list_free_all(hr_dev, &temp_list[level], true); + hem_list_free_all(hr_dev, &temp_list[level]); return ret; } @@ -1218,16 +1224,26 @@ static int alloc_fake_root_bt(struct hns_roce_dev *hr_dev, void *cpu_base, { struct hns_roce_hem_item *hem; + /* This is on the has_mtt branch, if r->hopnum + * is 0, there is no root_ba to reuse for the + * region's fake hem, so a dma_alloc request is + * necessary here. + */ hem = hem_list_alloc_item(hr_dev, r->offset, r->offset + r->count - 1, - r->count, false); + r->count, !r->hopnum); if (!hem) return -ENOMEM; - hem_list_assign_bt(hem, cpu_base, phy_base); + /* The root_ba can be reused only when r->hopnum > 0. */ + if (r->hopnum) + hem_list_assign_bt(hem, cpu_base, phy_base); list_add(&hem->list, branch_head); list_add(&hem->sibling, leaf_head); - return r->count; + /* If r->hopnum == 0, 0 is returned, + * so that the root_bt entry is not occupied. + */ + return r->hopnum ? 
r->count : 0; } static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, @@ -1271,7 +1287,7 @@ setup_root_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, return -ENOMEM; total = 0; - for (i = 0; i < region_cnt && total < max_ba_num; i++) { + for (i = 0; i < region_cnt && total <= max_ba_num; i++) { r = &regions[i]; if (!r->count) continue; @@ -1337,9 +1353,9 @@ static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev, region_cnt); if (ret) { for (i = 0; i < region_cnt; i++) - hem_list_free_all(hr_dev, &head.branch[i], false); + hem_list_free_all(hr_dev, &head.branch[i]); - hem_list_free_all(hr_dev, &head.root, true); + hem_list_free_all(hr_dev, &head.root); } return ret; @@ -1402,10 +1418,9 @@ void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev, for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++) for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++) - hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j], - j != 0); + hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j]); - hem_list_free_all(hr_dev, &hem_list->root_bt, true); + hem_list_free_all(hr_dev, &hem_list->root_bt); INIT_LIST_HEAD(&hem_list->btm_bt); hem_list->root_ba = 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index bf30b3a65a9b..55b9283bfc6f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -814,11 +814,6 @@ int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, for (i = 0, mapped_cnt = 0; i < mtr->hem_cfg.region_count && mapped_cnt < page_cnt; i++) { r = &mtr->hem_cfg.region[i]; - /* if hopnum is 0, no need to map pages in this region */ - if (!r->hopnum) { - mapped_cnt += r->count; - continue; - } if (r->offset + r->count > page_cnt) { ret = -EINVAL; From 0572eccf239ce4bd89bd531767ec5ab20e249290 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 20 Dec 2024 13:52:47 +0800 Subject: [PATCH 448/807] RDMA/hns: Fix accessing invalid dip_ctx during
destroying QP If it fails to modify QP to RTR, dip_ctx will not be attached. And during destroying QP, the invalid dip_ctx pointer will be accessed. Fixes: faa62440a577 ("RDMA/hns: Fix different dgids mapping to the same dip_idx") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-3-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 697b17cca02e..6dddadb90e02 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5619,6 +5619,9 @@ static void put_dip_ctx_idx(struct hns_roce_dev *hr_dev, { struct hns_roce_dip *hr_dip = hr_qp->dip; + if (!hr_dip) + return; + xa_lock(&hr_dev->qp_table.dip_xa); hr_dip->qp_cnt--; From fa5c4ba8cdbfd2c2d6422e001311c8213283ebbf Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 20 Dec 2024 13:52:48 +0800 Subject: [PATCH 449/807] RDMA/hns: Fix warning storm caused by invalid input in IO path WARN_ON() is called in the IO path. And it could lead to a warning storm. Use WARN_ON_ONCE() instead of WARN_ON().
Fixes: 12542f1de179 ("RDMA/hns: Refactor process about opcode in post_send()") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-4-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 6dddadb90e02..d0469d27c63c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -468,7 +468,7 @@ static inline int set_ud_wqe(struct hns_roce_qp *qp, valid_num_sge = calc_wr_sge_num(wr, &msg_len); ret = set_ud_opcode(ud_sq_wqe, wr); - if (WARN_ON(ret)) + if (WARN_ON_ONCE(ret)) return ret; ud_sq_wqe->msg_len = cpu_to_le32(msg_len); @@ -572,7 +572,7 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, rc_sq_wqe->msg_len = cpu_to_le32(msg_len); ret = set_rc_opcode(hr_dev, rc_sq_wqe, wr); - if (WARN_ON(ret)) + if (WARN_ON_ONCE(ret)) return ret; hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SO, From e3debdd48423d3d75b9d366399228d7225d902cd Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 20 Dec 2024 13:52:49 +0800 Subject: [PATCH 450/807] RDMA/hns: Fix missing flush CQE for DWQE Flush CQE handler has not been called if QP state gets into errored mode in DWQE path. So, the new added outstanding WQEs will never be flushed. 
It leads to a hung task timeout when using NFS over RDMA: __switch_to+0x7c/0xd0 __schedule+0x350/0x750 schedule+0x50/0xf0 schedule_timeout+0x2c8/0x340 wait_for_common+0xf4/0x2b0 wait_for_completion+0x20/0x40 __ib_drain_sq+0x140/0x1d0 [ib_core] ib_drain_sq+0x98/0xb0 [ib_core] rpcrdma_xprt_disconnect+0x68/0x270 [rpcrdma] xprt_rdma_close+0x20/0x60 [rpcrdma] xprt_autoclose+0x64/0x1cc [sunrpc] process_one_work+0x1d8/0x4e0 worker_thread+0x154/0x420 kthread+0x108/0x150 ret_from_fork+0x10/0x18 Fixes: 01584a5edcc4 ("RDMA/hns: Add support of direct wqe") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-5-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index d0469d27c63c..0144e7210d05 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -670,6 +670,10 @@ static void write_dwqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, #define HNS_ROCE_SL_SHIFT 2 struct hns_roce_v2_rc_send_wqe *rc_sq_wqe = wqe; + if (unlikely(qp->state == IB_QPS_ERR)) { + flush_cqe(hr_dev, qp); + return; + } /* All kinds of DirectWQE have the same header field layout */ hr_reg_enable(rc_sq_wqe, RC_SEND_WQE_FLAG); hr_reg_write(rc_sq_wqe, RC_SEND_WQE_DB_SL_L, qp->sl); From d685d55dfc86b1a4bdcec77c3c1f8a83f181264e Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 11 Dec 2024 09:10:55 +0900 Subject: [PATCH 451/807] tracing/kprobe: Make trace_kprobe's module callback called after jump_label update Make sure the trace_kprobe's module notifier callback function is called after jump_label's callback is called.
Since the trace_kprobe's callback eventually checks jump_label address during registering new kprobe on the loading module, jump_label must be updated before this registration happens. Link: https://lore.kernel.org/all/173387585556.995044.3157941002975446119.stgit@devnote2/ Fixes: 614243181050 ("tracing/kprobes: Support module init function probing") Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 263fac44d3ca..935a886af40c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -725,7 +725,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, static struct notifier_block trace_kprobe_module_nb = { .notifier_call = trace_kprobe_module_callback, - .priority = 1 /* Invoked after kprobe module callback */ + .priority = 2 /* Invoked after kprobe and jump_label module callback */ }; static int trace_kprobe_register_module_notifier(void) { From a53da2fb25a31f4fb8eaeb93c7b1134fc14fd209 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 13 Dec 2024 09:28:33 -0800 Subject: [PATCH 452/807] drm/xe: Revert some changes that break a mesa debug tool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a mesa debug tool for decoding devcoredump files. Recent changes to improve the devcoredump output broke that tool. So revert the changes until the tool can be extended to support the new fields. 
Signed-off-by: John Harrison Fixes: c28fd6c358db ("drm/xe/devcoredump: Improve section headings and add tile info") Fixes: ec1455ce7e35 ("drm/xe/devcoredump: Add ASCII85 dump helper function") Cc: John Harrison Cc: Julia Filipchuk Cc: Lucas De Marchi Cc: Thomas Hellström Cc: Rodrigo Vivi Cc: intel-xe@lists.freedesktop.org Reviewed-by: Jonathan Cavitt Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20241213172833.1733376-1-John.C.Harrison@Intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 70fb86a85dc9fd66014d7eb2fe356f50702ceeb6) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_devcoredump.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index f8947e7e917e..21a50d539426 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -109,7 +109,11 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, drm_puts(&p, "\n**** GuC CT ****\n"); xe_guc_ct_snapshot_print(ss->guc.ct, &p); - drm_puts(&p, "\n**** Contexts ****\n"); + /* + * Don't add a new section header here because the mesa debug decoder + * tool expects the context information to be in the 'GuC CT' section. + */ + /* drm_puts(&p, "\n**** Contexts ****\n"); */ xe_guc_exec_queue_snapshot_print(ss->ge, &p); drm_puts(&p, "\n**** Job ****\n"); @@ -363,6 +367,15 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char buff[ASCII85_BUFSZ], *line_buff; size_t line_pos = 0; + /* + * Splitting blobs across multiple lines is not compatible with the mesa + * debug decoder tool. Note that even dropping the explicit '\n' below + * doesn't help because the GuC log is so big some underlying implementation + * still splits the lines at 512K characters. So just bail completely for + * the moment. 
+ */ + return; + #define DMESG_MAX_LINE_LEN 800 #define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "\n\0" */ From 528cef1b4170f328d28d4e9b437380d8e5a2d18f Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Fri, 13 Dec 2024 13:24:14 +0100 Subject: [PATCH 453/807] drm/xe: Use non-interruptible wait when moving BO to system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure a non-interruptible wait is used when moving a bo to XE_PL_SYSTEM. This prevents dma_mappings from being removed prematurely while a GPU job is still in progress, even if the CPU receives a signal during the operation. Fixes: 75521e8b56e8 ("drm/xe: Perform dma_map when moving system buffer objects to TT") Cc: Thomas Hellström Cc: Matthew Brost Cc: Lucas De Marchi Cc: stable@vger.kernel.org # v6.11+ Suggested-by: Matthew Auld Reviewed-by: Matthew Auld Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20241213122415.3880017-1-nirmoy.das@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit dc5e20ae1f8a7c354dc9833faa2720254e5a5443) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_bo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index ae6b337cdc54..1aec4133008e 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -724,7 +724,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict, new_mem->mem_type == XE_PL_SYSTEM) { long timeout = dma_resv_wait_timeout(ttm_bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, - true, + false, MAX_SCHEDULE_TIMEOUT); if (timeout < 0) { ret = timeout; From 5e0a67fdb894d34c5f109e969320eef9ddae7480 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Fri, 13 Dec 2024 13:24:15 +0100 Subject: [PATCH 454/807] drm/xe: Wait for migration job before unmapping pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a potential GPU page fault during tt -> 
system moves by waiting for migration jobs to complete before unmapping SG. This ensures that IOMMU mappings are not prematurely torn down while a migration job is still in progress. v2: Use intr=false(Matt A) v3: Update commit message(Matt A) v4: s/DMA_RESV_USAGE_BOOKKEEP/DMA_RESV_USAGE_KERNEL(Thomas) Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3466 Fixes: 75521e8b56e8 ("drm/xe: Perform dma_map when moving system buffer objects to TT") Cc: Thomas Hellström Cc: Matthew Brost Cc: Lucas De Marchi Cc: stable@vger.kernel.org # v6.11+ Cc: Matthew Auld Reviewed-by: Matthew Auld Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20241213122415.3880017-2-nirmoy.das@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit cda06412c06893a6f07a2fbf89d42a0972ec9e8e) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_bo.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 1aec4133008e..f61a8ef38094 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -848,8 +848,16 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict, out: if ((!ttm_bo->resource || ttm_bo->resource->mem_type == XE_PL_SYSTEM) && - ttm_bo->ttm) + ttm_bo->ttm) { + long timeout = dma_resv_wait_timeout(ttm_bo->base.resv, + DMA_RESV_USAGE_KERNEL, + false, + MAX_SCHEDULE_TIMEOUT); + if (timeout < 0) + ret = timeout; + xe_tt_unmap_sg(ttm_bo->ttm); + } return ret; } From af12ba67d09ebe2b31ab997cea1a930864028562 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Mon, 16 Dec 2024 23:32:53 +0100 Subject: [PATCH 455/807] drm/xe/pf: Use correct function to check LMEM provisioning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a typo in function call and instead of VF LMEM we were looking at VF GGTT provisioning. Fix that. 
Fixes: 234670cea9a2 ("drm/xe/pf: Skip fair VFs provisioning if already provisioned") Signed-off-by: Michal Wajdeczko Cc: Piotr Piórkowski Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20241216223253.819-1-michal.wajdeczko@intel.com (cherry picked from commit a8d0aa0e7fcd20c9f1992688c0f0d07a68287403) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 192643d63d22..ca49860168f6 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -2046,7 +2046,7 @@ static int pf_validate_vf_config(struct xe_gt *gt, unsigned int vfid) valid_any = valid_any || (valid_ggtt && is_primary); if (IS_DGFX(xe)) { - bool valid_lmem = pf_get_vf_config_ggtt(primary_gt, vfid); + bool valid_lmem = pf_get_vf_config_lmem(primary_gt, vfid); valid_any = valid_any || (valid_lmem && is_primary); valid_all = valid_all && valid_lmem; From fe39b222a4139354d32ff9d46b88757f63f71d63 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Tue, 17 Dec 2024 21:31:21 -0800 Subject: [PATCH 456/807] drm/xe: Fix fault on fd close after unbind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If userspace holds an fd open, unbinds the device and then closes it, the driver shouldn't try to access the hardware. Protect it by using drm_dev_enter()/drm_dev_exit(). This fixes the following page fault: <6> [IGT] xe_wedged: exiting, ret=98 <1> BUG: unable to handle page fault for address: ffffc901bc5e508c <1> #PF: supervisor read access in kernel mode <1> #PF: error_code(0x0000) - not-present page ... 
<4> xe_lrc_update_timestamp+0x1c/0xd0 [xe] <4> xe_exec_queue_update_run_ticks+0x50/0xb0 [xe] <4> xe_exec_queue_fini+0x16/0xb0 [xe] <4> __guc_exec_queue_fini_async+0xc4/0x190 [xe] <4> guc_exec_queue_fini_async+0xa0/0xe0 [xe] <4> guc_exec_queue_fini+0x23/0x40 [xe] <4> xe_exec_queue_destroy+0xb3/0xf0 [xe] <4> xe_file_close+0xd4/0x1a0 [xe] <4> drm_file_free+0x210/0x280 [drm] <4> drm_close_helper.isra.0+0x6d/0x80 [drm] <4> drm_release_noglobal+0x20/0x90 [drm] Fixes: 514447a12190 ("drm/xe: Stop accumulating LRC timestamp on job_free") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3421 Reviewed-by: Umesh Nerlige Ramappa Link: https://patchwork.freedesktop.org/patch/msgid/20241218053122.2730195-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 4ca1fd418338d4d135428a0eb1e16e3b3ce17ee8) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_exec_queue.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index fd0f3b3c9101..268cd3123be9 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -762,9 +763,11 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q) */ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) { + struct xe_device *xe = gt_to_xe(q->gt); struct xe_file *xef; struct xe_lrc *lrc; u32 old_ts, new_ts; + int idx; /* * Jobs that are run during driver load may use an exec_queue, but are @@ -774,6 +777,10 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) if (!q->vm || !q->vm->xef) return; + /* Synchronize with unbind while holding the xe file open */ + if (!drm_dev_enter(&xe->drm, &idx)) + return; + xef = q->vm->xef; /* @@ -787,6 +794,8 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) lrc = q->lrc[0]; new_ts = xe_lrc_update_timestamp(lrc, &old_ts); xef->run_ticks[q->class] += (new_ts - old_ts) * 
q->width; + + drm_dev_exit(idx); } /** From 01ea6bf5cb58b20cc1bd159f0cf74a76cf04bb69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Mon, 9 Dec 2024 11:49:53 +0000 Subject: [PATCH 457/807] usb: dwc3: gadget: fix writing NYET threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before writing a new value to the register, the old value needs to be masked out for the new value to be programmed as intended, because at least in some cases the reset value of that field is 0xf (max value). At the moment, the dwc3 core initialises the threshold to the maximum value (0xf), with the option to override it via a DT. No upstream DTs seem to override it, therefore this commit doesn't change behaviour for any upstream platform. Nevertheless, the code should be fixed to have the desired outcome. Do so. Fixes: 80caf7d21adc ("usb: dwc3: add lpm erratum support") Cc: stable@vger.kernel.org # 5.10+ (needs adjustment for 5.4) Signed-off-by: André Draszik Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20241209-dwc3-nyet-fix-v2-1-02755683345b@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.h | 1 + drivers/usb/dwc3/gadget.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index ee73789326bc..f11570c8ffd0 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -464,6 +464,7 @@ #define DWC3_DCTL_TRGTULST_SS_INACT (DWC3_DCTL_TRGTULST(6)) /* These apply for core versions 1.94a and later */ +#define DWC3_DCTL_NYET_THRES_MASK (0xf << 20) #define DWC3_DCTL_NYET_THRES(n) (((n) & 0xf) << 20) #define DWC3_DCTL_KEEP_CONNECT BIT(19) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 83dc7304d701..31a654c6f15b 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -4195,8 +4195,10 @@ static void dwc3_gadget_conndone_interrupt(struct dwc3 *dwc) WARN_ONCE(DWC3_VER_IS_PRIOR(DWC3, 
240A) && dwc->has_lpm_erratum, "LPM Erratum not available on dwc3 revisions < 2.40a\n"); - if (dwc->has_lpm_erratum && !DWC3_VER_IS_PRIOR(DWC3, 240A)) + if (dwc->has_lpm_erratum && !DWC3_VER_IS_PRIOR(DWC3, 240A)) { + reg &= ~DWC3_DCTL_NYET_THRES_MASK; reg |= DWC3_DCTL_NYET_THRES(dwc->lpm_nyet_threshold); + } dwc3_gadget_dctl_write_safe(dwc, reg); } else { From 625e70ccb7bbbb2cc912e23c63390946170c085c Mon Sep 17 00:00:00 2001 From: Prashanth K Date: Mon, 9 Dec 2024 16:27:28 +0530 Subject: [PATCH 458/807] usb: dwc3-am62: Disable autosuspend during remove Runtime PM documentation (Section 5) mentions, during remove() callbacks, drivers should undo the runtime PM changes done in probe(). Usually this means calling pm_runtime_disable(), pm_runtime_dont_use_autosuspend() etc. Hence add missing function to disable autosuspend on dwc3-am62 driver unbind. Fixes: e8784c0aec03 ("drivers: usb: dwc3: Add AM62 USB wrapper driver") Cc: stable Signed-off-by: Prashanth K Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20241209105728.3216872-1-quic_prashk@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-am62.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc3/dwc3-am62.c b/drivers/usb/dwc3/dwc3-am62.c index 5e3d1741701f..7d43da5f2897 100644 --- a/drivers/usb/dwc3/dwc3-am62.c +++ b/drivers/usb/dwc3/dwc3-am62.c @@ -309,6 +309,7 @@ static void dwc3_ti_remove(struct platform_device *pdev) pm_runtime_put_sync(dev); pm_runtime_disable(dev); + pm_runtime_dont_use_autosuspend(dev); pm_runtime_set_suspended(dev); } From e19852d0bfecbc80976b1423cf2af87ca514a58c Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 9 Dec 2024 19:14:23 +0800 Subject: [PATCH 459/807] usb: host: xhci-plat: set skip_phy_initialization if software node has XHCI_SKIP_PHY_INIT property The source of quirk XHCI_SKIP_PHY_INIT comes from xhci_plat_priv.quirks or software node property. This will set skip_phy_initialization if software node also has XHCI_SKIP_PHY_INIT property. 
Fixes: a6cd2b3fa894 ("usb: host: xhci-plat: Parse xhci-missing_cas_quirk and apply quirk") Cc: stable Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20241209111423.4085548-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-plat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c index e6c9006bd568..db109b570c5c 100644 --- a/drivers/usb/host/xhci-plat.c +++ b/drivers/usb/host/xhci-plat.c @@ -290,7 +290,8 @@ int xhci_plat_probe(struct platform_device *pdev, struct device *sysdev, const s hcd->tpl_support = of_usb_host_tpl_support(sysdev->of_node); - if (priv && (priv->quirks & XHCI_SKIP_PHY_INIT)) + if ((priv && (priv->quirks & XHCI_SKIP_PHY_INIT)) || + (xhci->quirks & XHCI_SKIP_PHY_INIT)) hcd->skip_phy_initialization = 1; if (priv && (priv->quirks & XHCI_SG_TRB_CACHE_SIZE_QUIRK)) From b9711ff7cde0cfbcdd44cb1fac55b6eec496e690 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 6 Dec 2024 16:09:18 +0300 Subject: [PATCH 460/807] usb: typec: tcpm/tcpci_maxim: fix error code in max_contaminant_read_resistance_kohm() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If max_contaminant_read_adc_mv() fails, then return the error code. Don't return zero. 
Fixes: 02b332a06397 ("usb: typec: maxim_contaminant: Implement check_contaminant callback") Cc: stable Signed-off-by: Dan Carpenter Reviewed-by: André Draszik Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/f1bf3768-419e-40dd-989c-f7f455d6c824@stanley.mountain Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/maxim_contaminant.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/tcpm/maxim_contaminant.c b/drivers/usb/typec/tcpm/maxim_contaminant.c index 22163d8f9eb0..0cdda06592fd 100644 --- a/drivers/usb/typec/tcpm/maxim_contaminant.c +++ b/drivers/usb/typec/tcpm/maxim_contaminant.c @@ -135,7 +135,7 @@ static int max_contaminant_read_resistance_kohm(struct max_tcpci_chip *chip, mv = max_contaminant_read_adc_mv(chip, channel, sleep_msec, raw, true); if (mv < 0) - return ret; + return mv; /* OVP enable */ ret = regmap_update_bits(regmap, TCPC_VENDOR_CC_CTRL2, CCOVPDIS, 0); @@ -157,7 +157,7 @@ static int max_contaminant_read_resistance_kohm(struct max_tcpci_chip *chip, mv = max_contaminant_read_adc_mv(chip, channel, sleep_msec, raw, true); if (mv < 0) - return ret; + return mv; /* Disable current source */ ret = regmap_update_bits(regmap, TCPC_VENDOR_CC_CTRL2, SBURPCTRL, 0); if (ret < 0) From a072ffd896efa6a6c8a0334c712fbc98a63c789c Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Wed, 18 Dec 2024 15:25:58 -0800 Subject: [PATCH 461/807] eth: fbnic: fix csr boundary for RPM RAM section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CSR dump support leverages the FBNIC_BOUNDS macro, which pads the end condition for each section by adding an offset of 1. However, the RPC RAM section, which is dumped differently from other sections, does not rely on this macro and instead directly uses end boundary address. Hence, subtracting 1 from the end address results in skipping a register. 
Fixes: 3d12862b216d ("eth: fbnic: Add support to dump registers") Signed-off-by: Mohsin Bashir Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20241218232614.439329-1-mohsin.bashr@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/fbnic_csr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.c b/drivers/net/ethernet/meta/fbnic/fbnic_csr.c index 2118901b25e9..aeb9f333f4c7 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.c @@ -64,7 +64,7 @@ static void fbnic_csr_get_regs_rpc_ram(struct fbnic_dev *fbd, u32 **data_p) u32 i, j; *(data++) = start; - *(data++) = end - 1; + *(data++) = end; /* FBNIC_RPC_TCAM_ACT */ for (i = 0; i < FBNIC_RPC_TCAM_ACT_NUM_ENTRIES; i++) { From 057bd54dfcf68b1f67e6dfc32a47a72e12198495 Mon Sep 17 00:00:00 2001 From: Prashanth K Date: Wed, 11 Dec 2024 17:29:15 +0530 Subject: [PATCH 462/807] usb: gadget: f_uac2: Fix incorrect setting of bNumEndpoints Currently afunc_bind sets std_ac_if_desc.bNumEndpoints to 1 if controls (mute/volume) are enabled. During next afunc_bind call, bNumEndpoints would be unchanged and incorrectly set to 1 even if the controls aren't enabled. Fix this by resetting the value of bNumEndpoints to 0 on every afunc_bind call.
Fixes: eaf6cbe09920 ("usb: gadget: f_uac2: add volume and mute support") Cc: stable Signed-off-by: Prashanth K Link: https://lore.kernel.org/r/20241211115915.159864-1-quic_prashk@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_uac2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c index ce5b77f89190..9b324821c93b 100644 --- a/drivers/usb/gadget/function/f_uac2.c +++ b/drivers/usb/gadget/function/f_uac2.c @@ -1185,6 +1185,7 @@ afunc_bind(struct usb_configuration *cfg, struct usb_function *fn) uac2->as_in_alt = 0; } + std_ac_if_desc.bNumEndpoints = 0; if (FUOUT_EN(uac2_opts) || FUIN_EN(uac2_opts)) { uac2->int_ep = usb_ep_autoconfig(gadget, &fs_ep_int_desc); if (!uac2->int_ep) { From 7a3d76a0b60b3f6fc3375e4de2174bab43f64545 Mon Sep 17 00:00:00 2001 From: Jun Yan Date: Thu, 12 Dec 2024 22:38:52 +0800 Subject: [PATCH 463/807] USB: usblp: return error when setting unsupported protocol Fix the regression introduced by commit d8c6edfa3f4e ("USB: usblp: don't call usb_set_interface if there's a single alt"), which causes that unsupported protocols can also be set via ioctl when the num_altsetting of the device is 1. Move the check for protocol support to the earlier stage. 
Fixes: d8c6edfa3f4e ("USB: usblp: don't call usb_set_interface if there's a single alt") Cc: stable Signed-off-by: Jun Yan Link: https://lore.kernel.org/r/20241212143852.671889-1-jerrysteve1101@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/usblp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c index 5a2e43331064..ff1a941fd2ed 100644 --- a/drivers/usb/class/usblp.c +++ b/drivers/usb/class/usblp.c @@ -1337,11 +1337,12 @@ static int usblp_set_protocol(struct usblp *usblp, int protocol) if (protocol < USBLP_FIRST_PROTOCOL || protocol > USBLP_LAST_PROTOCOL) return -EINVAL; + alts = usblp->protocol[protocol].alt_setting; + if (alts < 0) + return -EINVAL; + /* Don't unnecessarily set the interface if there's a single alt. */ if (usblp->intf->num_altsetting > 1) { - alts = usblp->protocol[protocol].alt_setting; - if (alts < 0) - return -EINVAL; r = usb_set_interface(usblp->dev, usblp->ifnum, alts); if (r < 0) { printk(KERN_ERR "usblp: can't set desired altsetting %d on interface %d\n", From 9466545720e231fc02acd69b5f4e9138e09a26f6 Mon Sep 17 00:00:00 2001 From: Ingo Rohloff Date: Thu, 12 Dec 2024 16:41:14 +0100 Subject: [PATCH 464/807] usb: gadget: configfs: Ignore trailing LF for user strings to cdev Since commit c033563220e0f7a8 ("usb: gadget: configfs: Attach arbitrary strings to cdev") a user can provide extra string descriptors to a USB gadget via configfs. For "manufacturer", "product", "serialnumber", setting the string via configfs ignores a trailing LF. For the arbitrary strings the LF was not ignored. This patch ignores a trailing LF to make this consistent with the existing behavior for "manufacturer", ... string descriptors. 
Fixes: c033563220e0 ("usb: gadget: configfs: Attach arbitrary strings to cdev") Cc: stable Signed-off-by: Ingo Rohloff Link: https://lore.kernel.org/r/20241212154114.29295-1-ingo.rohloff@lauterbach.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/configfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 6499a88d346c..fba2a56dae97 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -827,11 +827,15 @@ static ssize_t gadget_string_s_store(struct config_item *item, const char *page, { struct gadget_string *string = to_gadget_string(item); int size = min(sizeof(string->string), len + 1); + ssize_t cpy_len; if (len > USB_MAX_STRING_LEN) return -EINVAL; - return strscpy(string->string, page, size); + cpy_len = strscpy(string->string, page, size); + if (cpy_len > 0 && string->string[cpy_len - 1] == '\n') + string->string[cpy_len - 1] = 0; + return len; } CONFIGFS_ATTR(gadget_string_, s); From f47eba045e6cb97f9ee154c68dbf7c3c756919aa Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 12 Dec 2024 19:37:43 +0200 Subject: [PATCH 465/807] usb: typec: ucsi: Set orientation as none when connector is unplugged The current implementation of the ucsi glink client connector_status() callback is only relying on the state of the gpio. This means that even when the cable is unplugged, the orientation propagated to the switches along the graph is "orientation normal", instead of "orientation none", which would be the correct one in this case. One of the Qualcomm DP-USB PHY combo drivers, which needs to be aware of the orientation change, is relying on the "orientation none" to skip the reinitialization of the entire PHY. Since the ucsi glink client advertises "orientation normal" even when the cable is unplugged, the mentioned PHY is taken down and reinitialized when in fact it should be left as-is. 
This triggers a crash within the displayport controller driver in turn, which brings the whole system down on some Qualcomm platforms. Propagating "orientation none" from the ucsi glink client on the connector_status() callback hides the problem of the mentioned PHY driver away for now. But the "orientation none" is nonetheless the correct one to be used in this case. So propagate the "orientation none" instead when the connector status flags says cable is disconnected. Fixes: 76716fd5bf09 ("usb: typec: ucsi: glink: move GPIO reading into connector_status callback") Cc: stable # 6.10 Reviewed-by: Bryan O'Donoghue Reviewed-by: Heikki Krogerus Reviewed-by: Neil Armstrong Signed-off-by: Abel Vesa Reviewed-by: Johan Hovold Tested-by: Johan Hovold Link: https://lore.kernel.org/r/20241212-usb-typec-ucsi-glink-add-orientation-none-v2-1-db5a50498a77@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi_glink.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c index 90948cd6d297..fed39d458090 100644 --- a/drivers/usb/typec/ucsi/ucsi_glink.c +++ b/drivers/usb/typec/ucsi/ucsi_glink.c @@ -185,6 +185,11 @@ static void pmic_glink_ucsi_connector_status(struct ucsi_connector *con) struct pmic_glink_ucsi *ucsi = ucsi_get_drvdata(con->ucsi); int orientation; + if (!UCSI_CONSTAT(con, CONNECTED)) { + typec_set_orientation(con->port, TYPEC_ORIENTATION_NONE); + return; + } + if (con->num > PMIC_GLINK_MAX_PORTS || !ucsi->port_orientation[con->num - 1]) return; From 74adad500346fb07d69af2c79acbff4adb061134 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Mon, 16 Dec 2024 10:55:39 +0900 Subject: [PATCH 466/807] usb: chipidea: ci_hdrc_imx: decrement device's refcount in .remove() and in the error path of .probe() Current implementation of ci_hdrc_imx_driver does not decrement the refcount of the device obtained in usbmisc_get_init_data(). 
Add a put_device() call in .remove() and in .probe() before returning an error. This bug was found by an experimental static analysis tool that I am developing. Cc: stable Fixes: f40017e0f332 ("chipidea: usbmisc_imx: Add USB support for VF610 SoCs") Signed-off-by: Joe Hattori Acked-by: Peter Chen Link: https://lore.kernel.org/r/20241216015539.352579-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/ci_hdrc_imx.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c index f2801700be8e..1a7fc638213e 100644 --- a/drivers/usb/chipidea/ci_hdrc_imx.c +++ b/drivers/usb/chipidea/ci_hdrc_imx.c @@ -370,25 +370,29 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev) data->pinctrl = devm_pinctrl_get(dev); if (PTR_ERR(data->pinctrl) == -ENODEV) data->pinctrl = NULL; - else if (IS_ERR(data->pinctrl)) - return dev_err_probe(dev, PTR_ERR(data->pinctrl), + else if (IS_ERR(data->pinctrl)) { + ret = dev_err_probe(dev, PTR_ERR(data->pinctrl), "pinctrl get failed\n"); + goto err_put; + } data->hsic_pad_regulator = devm_regulator_get_optional(dev, "hsic"); if (PTR_ERR(data->hsic_pad_regulator) == -ENODEV) { /* no pad regulator is needed */ data->hsic_pad_regulator = NULL; - } else if (IS_ERR(data->hsic_pad_regulator)) - return dev_err_probe(dev, PTR_ERR(data->hsic_pad_regulator), + } else if (IS_ERR(data->hsic_pad_regulator)) { + ret = dev_err_probe(dev, PTR_ERR(data->hsic_pad_regulator), "Get HSIC pad regulator error\n"); + goto err_put; + } if (data->hsic_pad_regulator) { ret = regulator_enable(data->hsic_pad_regulator); if (ret) { dev_err(dev, "Failed to enable HSIC pad regulator\n"); - return ret; + goto err_put; } } } @@ -402,13 +406,14 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev) dev_err(dev, "pinctrl_hsic_idle lookup failed, err=%ld\n", PTR_ERR(pinctrl_hsic_idle)); - return PTR_ERR(pinctrl_hsic_idle); + ret = 
PTR_ERR(pinctrl_hsic_idle); + goto err_put; } ret = pinctrl_select_state(data->pinctrl, pinctrl_hsic_idle); if (ret) { dev_err(dev, "hsic_idle select failed, err=%d\n", ret); - return ret; + goto err_put; } data->pinctrl_hsic_active = pinctrl_lookup_state(data->pinctrl, @@ -417,7 +422,8 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev) dev_err(dev, "pinctrl_hsic_active lookup failed, err=%ld\n", PTR_ERR(data->pinctrl_hsic_active)); - return PTR_ERR(data->pinctrl_hsic_active); + ret = PTR_ERR(data->pinctrl_hsic_active); + goto err_put; } } @@ -527,6 +533,8 @@ disable_hsic_regulator: if (pdata.flags & CI_HDRC_PMQOS) cpu_latency_qos_remove_request(&data->pm_qos_req); data->ci_pdev = NULL; +err_put: + put_device(data->usbmisc_data->dev); return ret; } @@ -551,6 +559,7 @@ static void ci_hdrc_imx_remove(struct platform_device *pdev) if (data->hsic_pad_regulator) regulator_disable(data->hsic_pad_regulator); } + put_device(data->usbmisc_data->dev); } static void ci_hdrc_imx_shutdown(struct platform_device *pdev) From 2b6ffcd7873b7e8a62c3e15a6f305bfc747c466b Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Thu, 19 Dec 2024 11:41:19 +0900 Subject: [PATCH 467/807] net: stmmac: restructure the error path of stmmac_probe_config_dt() Current implementation of stmmac_probe_config_dt() does not release the OF node reference obtained by of_parse_phandle() in some error paths. The problem is that some error paths call stmmac_remove_config_dt() to clean up but others use an unwind ladder. These two types of error handling have not been kept in sync and have been a recurring source of bugs. Re-write the error handling in stmmac_probe_config_dt() to use an unwind ladder. Consequently, stmmac_remove_config_dt() is not needed anymore, thus remove it. This bug was found by an experimental verification tool that I am developing.
Fixes: 4838a5405028 ("net: stmmac: Fix wrapper drivers not detecting PHY") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241219024119.2017012-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/stmmac_platform.c | 43 ++++++++----------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 3ac32444e492..dc9884130b91 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -405,22 +405,6 @@ static int stmmac_of_get_mac_mode(struct device_node *np) return -ENODEV; } -/** - * stmmac_remove_config_dt - undo the effects of stmmac_probe_config_dt() - * @pdev: platform_device structure - * @plat: driver data platform structure - * - * Release resources claimed by stmmac_probe_config_dt(). - */ -static void stmmac_remove_config_dt(struct platform_device *pdev, - struct plat_stmmacenet_data *plat) -{ - clk_disable_unprepare(plat->stmmac_clk); - clk_disable_unprepare(plat->pclk); - of_node_put(plat->phy_node); - of_node_put(plat->mdio_node); -} - /** * stmmac_probe_config_dt - parse device-tree driver parameters * @pdev: platform_device structure @@ -490,8 +474,10 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dev_warn(&pdev->dev, "snps,phy-addr property is deprecated\n"); rc = stmmac_mdio_setup(plat, np, &pdev->dev); - if (rc) - return ERR_PTR(rc); + if (rc) { + ret = ERR_PTR(rc); + goto error_put_phy; + } of_property_read_u32(np, "tx-fifo-depth", &plat->tx_fifo_size); @@ -581,8 +567,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg), GFP_KERNEL); if (!dma_cfg) { - stmmac_remove_config_dt(pdev, plat); - return ERR_PTR(-ENOMEM); + ret = ERR_PTR(-ENOMEM); + goto error_put_mdio; } plat->dma_cfg = dma_cfg; @@ -610,8 +596,8 @@ 
stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) rc = stmmac_mtl_setup(pdev, plat); if (rc) { - stmmac_remove_config_dt(pdev, plat); - return ERR_PTR(rc); + ret = ERR_PTR(rc); + goto error_put_mdio; } /* clock setup */ @@ -663,6 +649,10 @@ error_hw_init: clk_disable_unprepare(plat->pclk); error_pclk_get: clk_disable_unprepare(plat->stmmac_clk); +error_put_mdio: + of_node_put(plat->mdio_node); +error_put_phy: + of_node_put(plat->phy_node); return ret; } @@ -671,16 +661,17 @@ static void devm_stmmac_remove_config_dt(void *data) { struct plat_stmmacenet_data *plat = data; - /* Platform data argument is unused */ - stmmac_remove_config_dt(NULL, plat); + clk_disable_unprepare(plat->stmmac_clk); + clk_disable_unprepare(plat->pclk); + of_node_put(plat->mdio_node); + of_node_put(plat->phy_node); } /** * devm_stmmac_probe_config_dt * @pdev: platform_device structure * @mac: MAC address to use - * Description: Devres variant of stmmac_probe_config_dt(). Does not require - * the user to call stmmac_remove_config_dt() at driver detach. + * Description: Devres variant of stmmac_probe_config_dt(). */ struct plat_stmmacenet_data * devm_stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) From 13014969cbf07f18d62ceea40bd8ca8ec9d36cec Mon Sep 17 00:00:00 2001 From: Lianqin Hu Date: Tue, 17 Dec 2024 07:58:44 +0000 Subject: [PATCH 468/807] usb: gadget: u_serial: Disable ep before setting port to null to fix the crash caused by port being null Considering that in some extreme cases, when performing the unbinding operation, gserial_disconnect has cleared gser->ioport, which triggers gadget reconfiguration, and then calls gs_read_complete, resulting in access to a null pointer. Therefore, ep is disabled before gserial_disconnect sets port to null to prevent this from happening. 
Call trace: gs_read_complete+0x58/0x240 usb_gadget_giveback_request+0x40/0x160 dwc3_remove_requests+0x170/0x484 dwc3_ep0_out_start+0xb0/0x1d4 __dwc3_gadget_start+0x25c/0x720 kretprobe_trampoline.cfi_jt+0x0/0x8 kretprobe_trampoline.cfi_jt+0x0/0x8 udc_bind_to_driver+0x1d8/0x300 usb_gadget_probe_driver+0xa8/0x1dc gadget_dev_desc_UDC_store+0x13c/0x188 configfs_write_iter+0x160/0x1f4 vfs_write+0x2d0/0x40c ksys_write+0x7c/0xf0 __arm64_sys_write+0x20/0x30 invoke_syscall+0x60/0x150 el0_svc_common+0x8c/0xf8 do_el0_svc+0x28/0xa0 el0_svc+0x24/0x84 Fixes: c1dca562be8a ("usb gadget: split out serial core") Cc: stable Suggested-by: Greg Kroah-Hartman Signed-off-by: Lianqin Hu Link: https://lore.kernel.org/r/TYUPR06MB621733B5AC690DBDF80A0DCCD2042@TYUPR06MB6217.apcprd06.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/u_serial.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c index 53d9fc41acc5..bc143a86c2dd 100644 --- a/drivers/usb/gadget/function/u_serial.c +++ b/drivers/usb/gadget/function/u_serial.c @@ -1420,6 +1420,10 @@ void gserial_disconnect(struct gserial *gser) /* REVISIT as above: how best to track this? 
*/ port->port_line_coding = gser->port_line_coding; + /* disable endpoints, aborting down any active I/O */ + usb_ep_disable(gser->out); + usb_ep_disable(gser->in); + port->port_usb = NULL; gser->ioport = NULL; if (port->port.count > 0) { @@ -1431,10 +1435,6 @@ void gserial_disconnect(struct gserial *gser) spin_unlock(&port->port_lock); spin_unlock_irqrestore(&serial_port_lock, flags); - /* disable endpoints, aborting down any active I/O */ - usb_ep_disable(gser->out); - usb_ep_disable(gser->in); - /* finally, free any unused/unusable I/O buffers */ spin_lock_irqsave(&port->port_lock, flags); if (port->port.count == 0) From 862a9c0f68487fd6ced15622d9cdcec48f8b5aaa Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Wed, 18 Dec 2024 17:53:28 +0800 Subject: [PATCH 469/807] usb: typec: tcpci: fix NULL pointer issue on shared irq case The tcpci_irq() may meet below NULL pointer dereference issue: [ 2.641851] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010 [ 2.641951] status 0x1, 0x37f [ 2.650659] Mem abort info: [ 2.656490] ESR = 0x0000000096000004 [ 2.660230] EC = 0x25: DABT (current EL), IL = 32 bits [ 2.665532] SET = 0, FnV = 0 [ 2.668579] EA = 0, S1PTW = 0 [ 2.671715] FSC = 0x04: level 0 translation fault [ 2.676584] Data abort info: [ 2.679459] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 2.684936] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 2.689980] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 2.695284] [0000000000000010] user address but active_mm is swapper [ 2.701632] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP [ 2.707883] Modules linked in: [ 2.710936] CPU: 1 UID: 0 PID: 87 Comm: irq/111-2-0051 Not tainted 6.12.0-rc6-06316-g7f63786ad3d1-dirty #4 [ 2.720570] Hardware name: NXP i.MX93 11X11 EVK board (DT) [ 2.726040] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 2.732989] pc : tcpci_irq+0x38/0x318 [ 2.736647] lr : _tcpci_irq+0x14/0x20 [ 2.740295] sp : ffff80008324bd30 [ 2.743597] x29: 
ffff80008324bd70 x28: ffff800080107894 x27: ffff800082198f70 [ 2.750721] x26: ffff0000050e6680 x25: ffff000004d172ac x24: ffff0000050f0000 [ 2.757845] x23: ffff000004d17200 x22: 0000000000000001 x21: ffff0000050f0000 [ 2.764969] x20: ffff000004d17200 x19: 0000000000000000 x18: 0000000000000001 [ 2.772093] x17: 0000000000000000 x16: ffff80008183d8a0 x15: ffff00007fbab040 [ 2.779217] x14: ffff00007fb918c0 x13: 0000000000000000 x12: 000000000000017a [ 2.786341] x11: 0000000000000001 x10: 0000000000000a90 x9 : ffff80008324bd00 [ 2.793465] x8 : ffff0000050f0af0 x7 : ffff00007fbaa840 x6 : 0000000000000031 [ 2.800589] x5 : 000000000000017a x4 : 0000000000000002 x3 : 0000000000000002 [ 2.807713] x2 : ffff80008324bd3a x1 : 0000000000000010 x0 : 0000000000000000 [ 2.814838] Call trace: [ 2.817273] tcpci_irq+0x38/0x318 [ 2.820583] _tcpci_irq+0x14/0x20 [ 2.823885] irq_thread_fn+0x2c/0xa8 [ 2.827456] irq_thread+0x16c/0x2f4 [ 2.830940] kthread+0x110/0x114 [ 2.834164] ret_from_fork+0x10/0x20 [ 2.837738] Code: f9426420 f9001fe0 d2800000 52800201 (f9400a60) This may happen on shared irq case. Such as two Type-C ports share one irq. After the first port finished tcpci_register_port(), it may trigger interrupt. However, if the interrupt comes by chance the 2nd port finishes devm_request_threaded_irq(), the 2nd port interrupt handler will run at first. Then the above issue happens due to tcpci is still a NULL pointer in tcpci_irq() when dereference to regmap. devm_request_threaded_irq() <-- port1 irq comes disable_irq(client->irq); tcpci_register_port() This will restore the logic to the state before commit (77e85107a771 "usb: typec: tcpci: support edge irq"). However, moving tcpci_register_port() earlier creates a problem when use edge irq because tcpci_init() will be called before devm_request_threaded_irq(). 
The tcpci_init() writes the ALERT_MASK to the hardware to tell it to start generating interrupts but we're not ready to deal with them yet, then the ALERT events may be missed and ALERT line will not recover to high level forever. To avoid the issue, this will also set ALERT_MASK register after devm_request_threaded_irq() return. Fixes: 77e85107a771 ("usb: typec: tcpci: support edge irq") Cc: stable Tested-by: Emanuele Ghidoli Signed-off-by: Xu Yang Reviewed-by: Francesco Dolcini Reviewed-by: Heikki Krogerus Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20241218095328.2604607-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpci.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/usb/typec/tcpm/tcpci.c b/drivers/usb/typec/tcpm/tcpci.c index ed32583829be..24a6a4354df8 100644 --- a/drivers/usb/typec/tcpm/tcpci.c +++ b/drivers/usb/typec/tcpm/tcpci.c @@ -700,7 +700,7 @@ static int tcpci_init(struct tcpc_dev *tcpc) tcpci->alert_mask = reg; - return tcpci_write16(tcpci, TCPC_ALERT_MASK, reg); + return 0; } irqreturn_t tcpci_irq(struct tcpci *tcpci) @@ -923,22 +923,27 @@ static int tcpci_probe(struct i2c_client *client) chip->data.set_orientation = err; + chip->tcpci = tcpci_register_port(&client->dev, &chip->data); + if (IS_ERR(chip->tcpci)) + return PTR_ERR(chip->tcpci); + err = devm_request_threaded_irq(&client->dev, client->irq, NULL, _tcpci_irq, IRQF_SHARED | IRQF_ONESHOT, dev_name(&client->dev), chip); if (err < 0) - return err; + goto unregister_port; - /* - * Disable irq while registering port. If irq is configured as an edge - * irq this allow to keep track and process the irq as soon as it is enabled. 
- */ - disable_irq(client->irq); - chip->tcpci = tcpci_register_port(&client->dev, &chip->data); - enable_irq(client->irq); + /* Enable chip interrupts at last */ + err = tcpci_write16(chip->tcpci, TCPC_ALERT_MASK, chip->tcpci->alert_mask); + if (err < 0) + goto unregister_port; - return PTR_ERR_OR_ZERO(chip->tcpci); + return 0; + +unregister_port: + tcpci_unregister_port(chip->tcpci); + return err; } static void tcpci_remove(struct i2c_client *client) From 0df11fa8cee5a9cf8753d4e2672bb3667138c652 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Wed, 18 Dec 2024 15:13:46 +0800 Subject: [PATCH 470/807] usb: fix reference leak in usb_new_device() When device_add(&udev->dev) succeeds and a later call fails, usb_new_device() does not properly call device_del(). As comment of device_add() says, 'if device_add() succeeds, you should call device_del() when you want to get rid of it. If device_add() has not succeeded, use only put_device() to drop the reference count'. Found by code review. Cc: stable Fixes: 9f8b17e643fe ("USB: make usbdevices export their device nodes instead of using a separate class") Signed-off-by: Ma Ke Reviewed-by: Alan Stern Link: https://lore.kernel.org/r/20241218071346.2973980-1-make_ruc2021@163.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 4b93c0bd1d4b..21ac9b464696 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -2663,13 +2663,13 @@ int usb_new_device(struct usb_device *udev) err = sysfs_create_link(&udev->dev.kobj, &port_dev->dev.kobj, "port"); if (err) - goto fail; + goto out_del_dev; err = sysfs_create_link(&port_dev->dev.kobj, &udev->dev.kobj, "device"); if (err) { sysfs_remove_link(&udev->dev.kobj, "port"); - goto fail; + goto out_del_dev; } if (!test_and_set_bit(port1, hub->child_usage_bits)) @@ -2683,6 +2683,8 @@ int usb_new_device(struct usb_device *udev) 
pm_runtime_put_sync_autosuspend(&udev->dev); return err; +out_del_dev: + device_del(&udev->dev); fail: usb_set_device_state(udev, USB_STATE_NOTATTACHED); pm_runtime_disable(&udev->dev); From 59bfeaf5454b7e764288d84802577f4a99bf0819 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 6 Dec 2024 15:48:17 +0800 Subject: [PATCH 471/807] USB: core: Disable LPM only for non-suspended ports There's USB error when tegra board is shutting down: [ 180.919315] usb 2-3: Failed to set U1 timeout to 0x0,error code -113 [ 180.919995] usb 2-3: Failed to set U1 timeout to 0xa,error code -113 [ 180.920512] usb 2-3: Failed to set U2 timeout to 0x4,error code -113 [ 186.157172] tegra-xusb 3610000.usb: xHCI host controller not responding, assume dead [ 186.157858] tegra-xusb 3610000.usb: HC died; cleaning up [ 186.317280] tegra-xusb 3610000.usb: Timeout while waiting for evaluate context command The issue is caused by disabling LPM on already suspended ports. For USB2 LPM, the LPM is already disabled during port suspend. For USB3 LPM, port won't transit to U1/U2 when it's already suspended in U3, hence disabling LPM is only needed for ports that are not suspended. 
Cc: Wayne Chang Cc: stable Fixes: d920a2ed8620 ("usb: Disable USB3 LPM at shutdown") Signed-off-by: Kai-Heng Feng Acked-by: Alan Stern Tested-by: Jon Hunter Link: https://lore.kernel.org/r/20241206074817.89189-1-kaihengf@nvidia.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/port.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/port.c b/drivers/usb/core/port.c index 45d7af00f8d1..e857e532b35a 100644 --- a/drivers/usb/core/port.c +++ b/drivers/usb/core/port.c @@ -453,10 +453,11 @@ static int usb_port_runtime_suspend(struct device *dev) static void usb_port_shutdown(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); + struct usb_device *udev = port_dev->child; - if (port_dev->child) { - usb_disable_usb2_hardware_lpm(port_dev->child); - usb_unlocked_disable_lpm(port_dev->child); + if (udev && !udev->port_is_suspended) { + usb_disable_usb2_hardware_lpm(udev); + usb_unlocked_disable_lpm(udev); } } From dfc51e48bca475bbee984e90f33fdc537ce09699 Mon Sep 17 00:00:00 2001 From: Akash M Date: Thu, 19 Dec 2024 18:22:19 +0530 Subject: [PATCH 472/807] usb: gadget: f_fs: Remove WARN_ON in functionfs_bind This commit addresses an issue related to the kernel panic below, where panic_on_warn is enabled. It is caused by the unnecessary use of WARN_ON in functionfs_bind, which easily leads to the following scenarios. 1.adb_write in adbd 2.
UDC write via configfs ================= ===================== ->usb_ffs_open_thread() ->UDC write ->open_functionfs() ->configfs_write_iter() ->adb_open() ->gadget_dev_desc_UDC_store() ->adb_write() ->usb_gadget_register_driver_owner ->driver_register() ->StartMonitor() ->bus_add_driver() ->adb_read() ->gadget_bind_driver() ->configfs_composite_bind() ->usb_add_function() ->open_functionfs() ->ffs_func_bind() ->adb_open() ->functionfs_bind() state !=FFS_ACTIVE> The adb_open, adb_read, and adb_write operations are invoked from the daemon, but trying to bind the function is a process that is invoked by UDC write through configfs, which opens up the possibility of a race condition between the two paths. In this race scenario, the kernel panic occurs due to the WARN_ON from functionfs_bind when panic_on_warn is enabled. This commit fixes the kernel panic by removing the unnecessary WARN_ON. Kernel panic - not syncing: kernel: panic_on_warn set ... [ 14.542395] Call trace: [ 14.542464] ffs_func_bind+0x1c8/0x14a8 [ 14.542468] usb_add_function+0xcc/0x1f0 [ 14.542473] configfs_composite_bind+0x468/0x588 [ 14.542478] gadget_bind_driver+0x108/0x27c [ 14.542483] really_probe+0x190/0x374 [ 14.542488] __driver_probe_device+0xa0/0x12c [ 14.542492] driver_probe_device+0x3c/0x220 [ 14.542498] __driver_attach+0x11c/0x1fc [ 14.542502] bus_for_each_dev+0x104/0x160 [ 14.542506] driver_attach+0x24/0x34 [ 14.542510] bus_add_driver+0x154/0x270 [ 14.542514] driver_register+0x68/0x104 [ 14.542518] usb_gadget_register_driver_owner+0x48/0xf4 [ 14.542523] gadget_dev_desc_UDC_store+0xf8/0x144 [ 14.542526] configfs_write_iter+0xf0/0x138 Fixes: ddf8abd25994 ("USB: f_fs: the FunctionFS driver") Cc: stable Signed-off-by: Akash M Link: https://lore.kernel.org/r/20241219125221.1679-1-akash.m5@samsung.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_fs.c 
b/drivers/usb/gadget/function/f_fs.c index ad79eb0f729b..2dea9e42a0f8 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -2285,7 +2285,7 @@ static int functionfs_bind(struct ffs_data *ffs, struct usb_composite_dev *cdev) struct usb_gadget_strings **lang; int first_id; - if (WARN_ON(ffs->state != FFS_ACTIVE + if ((ffs->state != FFS_ACTIVE || test_and_set_bit(FFS_FL_BOUND, &ffs->flags))) return -EBADFD; From ed2761958ad77e54791802b07095786150eab844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 10 Dec 2024 19:01:20 +0200 Subject: [PATCH 473/807] tty: serial: 8250: Fix another runtime PM usage counter underflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit f9b11229b79c ("serial: 8250: Fix PM usage_count for console handover") fixed one runtime PM usage counter balance problem that occurs because .dev is not set during univ8250 setup preventing call to pm_runtime_get_sync(). Later, univ8250_console_exit() will trigger the runtime PM usage counter underflow as .dev is already set at that time. Call pm_runtime_get_sync() to balance the RPM usage counter also in serial8250_register_8250_port() before trying to add the port. 
Reported-by: Borislav Petkov (AMD) Fixes: bedb404e91bb ("serial: 8250_port: Don't use power management for kernel console") Cc: stable Tested-by: Borislav Petkov (AMD) Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20241210170120.2231-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 5f9f06911795..68baf75bdadc 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -812,6 +812,9 @@ int serial8250_register_8250_port(const struct uart_8250_port *up) uart->dl_write = up->dl_write; if (uart->port.type != PORT_8250_CIR) { + if (uart_console_registered(&uart->port)) + pm_runtime_get_sync(uart->port.dev); + if (serial8250_isa_config != NULL) serial8250_isa_config(0, &uart->port, &uart->capabilities); From fbd22c4fa737f9559be8b87a73bb1cdfcd39fd11 Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Wed, 11 Dec 2024 07:36:13 +0800 Subject: [PATCH 474/807] serial: imx: Use uart_port_lock_irq() instead of uart_port_lock() When executing 'echo mem > /sys/power/state', the following deadlock occurs. Since there is output during the serial port entering the suspend process, the suspend will be interrupted, resulting in the nesting of locks. Therefore, use uart_port_lock_irq() instead of uart_port_lock(). WARNING: inconsistent lock state 6.12.0-rc2-00002-g3c199ed5bd64-dirty #23 Not tainted -------------------------------- inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
sh/494 [HC0[0]:SC0[0]:HE1:SE1] takes: c4db5850 (&port_lock_key){?.-.}-{3:3}, at: imx_uart_enable_wakeup+0x14/0x254 {IN-HARDIRQ-W} state was registered at: lock_acquire+0x104/0x348 _raw_spin_lock+0x48/0x84 imx_uart_int+0x14/0x4dc __handle_irq_event_percpu+0xac/0x2fc handle_irq_event_percpu+0xc/0x40 handle_irq_event+0x38/0x8c handle_fasteoi_irq+0xb4/0x1b8 handle_irq_desc+0x1c/0x2c gic_handle_irq+0x6c/0xa0 generic_handle_arch_irq+0x2c/0x64 call_with_stack+0x18/0x20 __irq_svc+0x9c/0xbc _raw_spin_unlock_irqrestore+0x2c/0x48 uart_write+0xd8/0x3a0 do_output_char+0x1a8/0x1e4 n_tty_write+0x224/0x440 file_tty_write.constprop.0+0x124/0x250 do_iter_readv_writev+0x100/0x1e0 vfs_writev+0xc4/0x448 do_writev+0x68/0xf8 ret_fast_syscall+0x0/0x1c irq event stamp: 31593 hardirqs last enabled at (31593): [] _raw_spin_unlock_irqrestore+0x44/0x48 hardirqs last disabled at (31592): [] clk_enable_lock+0x60/0x120 softirqs last enabled at (30334): [] handle_softirqs+0x2cc/0x478 softirqs last disabled at (30325): [] __irq_exit_rcu+0x120/0x15c other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&port_lock_key); lock(&port_lock_key); Fixes: 3c199ed5bd64 ("serial: imx: Grab port lock in imx_uart_enable_wakeup()") Signed-off-by: Xiaolei Wang Reviewed-by: Lucas Stach Reviewed-by: Fabio Estevam Link: https://lore.kernel.org/r/20241210233613.2881264-1-xiaolei.wang@windriver.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/imx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 17f70e4bee43..9c59ec128bb4 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -2692,7 +2692,7 @@ static void imx_uart_enable_wakeup(struct imx_port *sport, bool on) { u32 ucr3; - uart_port_lock(&sport->port); + uart_port_lock_irq(&sport->port); ucr3 = imx_uart_readl(sport, UCR3); if (on) { @@ -2714,7 +2714,7 @@ static void imx_uart_enable_wakeup(struct imx_port *sport, bool on) 
imx_uart_writel(sport, ucr1, UCR1); } - uart_port_unlock(&sport->port); + uart_port_unlock_irq(&sport->port); } static int imx_uart_suspend_noirq(struct device *dev) From 0cfc36ea51684b5932cd3951ded523777d807af2 Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Mon, 16 Dec 2024 09:53:23 -0500 Subject: [PATCH 475/807] serial: stm32: use port lock wrappers for break control Commit 30e945861f3b ("serial: stm32: add support for break control") added another usage of the port lock, but was merged on the same day as c5d06662551c ("serial: stm32: Use port lock wrappers"), therefore the latter did not update this usage to use the port lock wrappers. Fixes: c5d06662551c ("serial: stm32: Use port lock wrappers") Cc: stable Signed-off-by: Ben Wolsieffer Reviewed-by: John Ogness Link: https://lore.kernel.org/r/20241216145323.111612-1-ben.wolsieffer@hefring.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/stm32-usart.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index 7dc254546075..1ec5d8c3aef8 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -1051,14 +1051,14 @@ static void stm32_usart_break_ctl(struct uart_port *port, int break_state) const struct stm32_usart_offsets *ofs = &stm32_port->info->ofs; unsigned long flags; - spin_lock_irqsave(&port->lock, flags); + uart_port_lock_irqsave(port, &flags); if (break_state) stm32_usart_set_bits(port, ofs->rqr, USART_RQR_SBKRQ); else stm32_usart_clr_bits(port, ofs->rqr, USART_RQR_SBKRQ); - spin_unlock_irqrestore(&port->lock, flags); + uart_port_unlock_irqrestore(port, flags); } static int stm32_usart_startup(struct uart_port *port) From 4f4aa4aa28142d53f8b06585c478476cfe325cfc Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Thu, 19 Dec 2024 15:28:59 +0800 Subject: [PATCH 476/807] net: fix memory leak in tcp_conn_request() If inet_csk_reqsk_queue_hash_add() return false, tcp_conn_request() 
will return without freeing the dst memory, which was allocated in af_ops->route_req. Here is the kmemleak stack: unreferenced object 0xffff8881198631c0 (size 240): comm "softirq", pid 0, jiffies 4299266571 (age 1802.392s) hex dump (first 32 bytes): 00 10 9b 03 81 88 ff ff 80 98 da bc ff ff ff ff ................ 81 55 18 bb ff ff ff ff 00 00 00 00 00 00 00 00 .U.............. backtrace: [] kmem_cache_alloc+0x60c/0xa80 [] dst_alloc+0x55/0x250 [] rt_dst_alloc+0x46/0x1d0 [] __mkroute_output+0x29a/0xa50 [] ip_route_output_key_hash+0x10b/0x240 [] ip_route_output_flow+0x1d/0x90 [] inet_csk_route_req+0x2c5/0x500 [] tcp_conn_request+0x691/0x12c0 [] tcp_rcv_state_process+0x3c8/0x11b0 [] tcp_v4_do_rcv+0x156/0x3b0 [] tcp_v4_rcv+0x1cf8/0x1d80 [] ip_protocol_deliver_rcu+0xf6/0x360 [] ip_local_deliver_finish+0xe6/0x1e0 [] ip_local_deliver+0xee/0x360 [] ip_rcv+0xad/0x2f0 [] __netif_receive_skb_one_core+0x123/0x140 Call dst_release() to free the dst memory when inet_csk_reqsk_queue_hash_add() returns false in tcp_conn_request(). Fixes: ff46e3b44219 ("Fix race for duplicate reqsk on identical SYN") Signed-off-by: Wang Liang Link: https://patch.msgid.link/20241219072859.3783576-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bdf13ac26ef..4811727b8a02 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7328,6 +7328,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, req->timeout))) { reqsk_free(req); + dst_release(dst); return 0; } From b5a7b661a073727219fedc35f5619f62418ffe72 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Thu, 19 Dec 2024 21:03:36 +0800 Subject: [PATCH 477/807] net: Fix netns for ip_tunnel_init_flow() The device denoted by tunnel->parms.link resides in the underlay net namespace. Therefore pass tunnel->net to ip_tunnel_init_flow(). 
Fixes: db53cd3d88dc ("net: Handle l3mdev in ip_tunnel_init_flow") Signed-off-by: Xiao Liang Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241219130336.103839-1-shaw.leon@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 3 +-- net/ipv4/ip_tunnel.c | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 4b5fd71c897d..32d2e61f2b82 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -423,8 +423,7 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev, parms = mlxsw_sp_ipip_netdev_parms4(to_dev); ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp, - 0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0, - 0); + 0, 0, tun->net, parms.link, tun->fwmark, 0, 0); rt = ip_route_output_key(tun->net, &fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 25505f9b724c..09b73acf037a 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -294,7 +294,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, - iph->tos & INET_DSCP_MASK, dev_net(dev), + iph->tos & INET_DSCP_MASK, tunnel->net, tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); @@ -611,7 +611,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), - tos & INET_DSCP_MASK, dev_net(dev), 0, skb->mark, + tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark, skb_get_hash(skb), key->flow_flags); if (!tunnel_hlen) @@ -774,7 +774,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, 
tos & INET_DSCP_MASK, - dev_net(dev), READ_ONCE(tunnel->parms.link), + tunnel->net, READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) From a4fd163aed2edd967a244499754dec991d8b4c7d Mon Sep 17 00:00:00 2001 From: Ilya Shchipletsov Date: Thu, 19 Dec 2024 08:23:07 +0000 Subject: [PATCH 478/807] netrom: check buffer length before accessing it Syzkaller reports an uninit value read from ax25cmp when sending raw message through ieee802154 implementation. ===================================================== BUG: KMSAN: uninit-value in ax25cmp+0x3a5/0x460 net/ax25/ax25_addr.c:119 ax25cmp+0x3a5/0x460 net/ax25/ax25_addr.c:119 nr_dev_get+0x20e/0x450 net/netrom/nr_route.c:601 nr_route_frame+0x1a2/0xfc0 net/netrom/nr_route.c:774 nr_xmit+0x5a/0x1c0 net/netrom/nr_dev.c:144 __netdev_start_xmit include/linux/netdevice.h:4940 [inline] netdev_start_xmit include/linux/netdevice.h:4954 [inline] xmit_one net/core/dev.c:3548 [inline] dev_hard_start_xmit+0x247/0xa10 net/core/dev.c:3564 __dev_queue_xmit+0x33b8/0x5130 net/core/dev.c:4349 dev_queue_xmit include/linux/netdevice.h:3134 [inline] raw_sendmsg+0x654/0xc10 net/ieee802154/socket.c:299 ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560 
__alloc_skb+0x318/0x740 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6334 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2780 sock_alloc_send_skb include/net/sock.h:1884 [inline] raw_sendmsg+0x36d/0xc10 net/ieee802154/socket.c:282 ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 0 PID: 5037 Comm: syz-executor166 Not tainted 6.7.0-rc7-syzkaller-00003-gfbafc3e621c3 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 ===================================================== This issue occurs because the skb buffer is too small, and its actual allocation is aligned. This hides an actual issue, which is that nr_route_frame does not validate the buffer size before using it. Fix this issue by checking skb->len before accessing any fields in skb->data. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Co-developed-by: Nikita Marushkin Signed-off-by: Nikita Marushkin Signed-off-by: Ilya Shchipletsov Link: https://patch.msgid.link/20241219082308.3942-1-rabbelkin@mail.ru Signed-off-by: Jakub Kicinski --- net/netrom/nr_route.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 2b5e246b8d9a..b94cb2ffbaf8 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -754,6 +754,12 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) int ret; struct sk_buff *skbn; + /* + * Reject malformed packets early. Check that it contains at least 2 + * addresses and 1 byte more for Time-To-Live + */ + if (skb->len < 2 * sizeof(ax25_address) + 1) + return 0; nr_src = (ax25_address *)(skb->data + 0); nr_dest = (ax25_address *)(skb->data + 7); From 4e86729d1ff329815a6e8a920cb554a1d4cb5b8d Mon Sep 17 00:00:00 2001 From: Nikolay Kuratov Date: Thu, 19 Dec 2024 19:21:14 +0300 Subject: [PATCH 479/807] net/sctp: Prevent autoclose integer overflow in sctp_association_init() While by default max_autoclose equals to INT_MAX / HZ, one may set net.sctp.max_autoclose to UINT_MAX. There is code in sctp_association_init() that can consequently trigger overflow. 
Cc: stable@vger.kernel.org Fixes: 9f70f46bd4c7 ("sctp: properly latch and use autoclose value from sock to association") Signed-off-by: Nikolay Kuratov Acked-by: Xin Long Link: https://patch.msgid.link/20241219162114.2863827-1-kniv@yandex-team.ru Signed-off-by: Jakub Kicinski --- net/sctp/associola.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index c45c192b7878..0b0794f164cf 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -137,7 +137,8 @@ static struct sctp_association *sctp_association_init( = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; - asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = + (unsigned long)sp->autoclose * HZ; /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) From 4a4d38ace1fb0586bffd2aab03caaa05d6011748 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Fri, 20 Dec 2024 13:26:14 +0530 Subject: [PATCH 480/807] net: ethernet: ti: am65-cpsw: default to round-robin for host port receive The Host Port (i.e. CPU facing port) of CPSW receives traffic from Linux via TX DMA Channels which are Hardware Queues consisting of traffic categorized according to their priority. The Host Port is configured to dequeue traffic from these Hardware Queues on the basis of priority i.e. as long as traffic exists on a Hardware Queue of a higher priority, the traffic on Hardware Queues of lower priority isn't dequeued. An alternate operation is also supported wherein traffic can be dequeued by the Host Port in a Round-Robin manner. Until commit under Fixes, the am65-cpsw driver enabled a single TX DMA Channel, due to which, unless modified by user via "ethtool", all traffic from Linux is transmitted on DMA Channel 0. 
Therefore, configuring the Host Port for priority based dequeuing or Round-Robin operation is identical since there is a single DMA Channel. Since commit under Fixes, all 8 TX DMA Channels are enabled by default. Additionally, the default "tc mapping" doesn't take into account the possibility of different traffic profiles which various users might have. This results in traffic starvation at the Host Port due to the priority based dequeuing which has been enabled by default since the inception of the driver. The traffic starvation triggers NETDEV WATCHDOG timeout for all TX DMA Channels that haven't been serviced due to the presence of traffic on the higher priority TX DMA Channels. Fix this by defaulting to Round-Robin dequeuing at the Host Port, which shall ensure that traffic is dequeued from all TX DMA Channels irrespective of the traffic profile. This will address the NETDEV WATCHDOG timeouts. At the same time, users can still switch from Round-Robin to Priority based dequeuing at the Host Port with the help of the "p0-rx-ptype-rrobin" private flag of "ethtool". Users are expected to setup an appropriate "tc mapping" that suits their traffic profile when switching to priority based dequeuing at the Host Port. 
Fixes: be397ea3473d ("net: ethernet: am65-cpsw: Set default TX channels to maximum") Cc: Signed-off-by: Siddharth Vadapalli Link: https://patch.msgid.link/20241220075618.228202-1-s-vadapalli@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 14e1df721f2e..5465bf872734 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -3551,7 +3551,7 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev) init_completion(&common->tdown_complete); common->tx_ch_num = AM65_CPSW_DEFAULT_TX_CHNS; common->rx_ch_num_flows = AM65_CPSW_DEFAULT_RX_CHN_FLOWS; - common->pf_p0_rx_ptype_rrobin = false; + common->pf_p0_rx_ptype_rrobin = true; common->default_vlan = 1; common->ports = devm_kcalloc(dev, common->port_num, From 75221e96101fa93390d3db5c23e026f5e3565d9b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 20 Dec 2024 18:04:00 +0100 Subject: [PATCH 481/807] net: pse-pd: tps23881: Fix power on/off issue An issue was present in the initial driver implementation. The driver read the power status of all channels before toggling the bit of the desired one. Using the power status register as a base value introduced a problem, because only the bit corresponding to the concerned channel ID should be set in the write-only power enable register. This led to cases where disabling power for one channel also powered off other channels. This patch removes the power status read and ensures the value is limited to the bit matching the channel index of the PI. 
Fixes: 20e6d190ffe1 ("net: pse-pd: Add TI TPS23881 PSE controller driver") Signed-off-by: Kory Maincent Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20241220170400.291705-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/tps23881.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/net/pse-pd/tps23881.c b/drivers/net/pse-pd/tps23881.c index 5c4e88be46ee..8797ca1a8a21 100644 --- a/drivers/net/pse-pd/tps23881.c +++ b/drivers/net/pse-pd/tps23881.c @@ -64,15 +64,11 @@ static int tps23881_pi_enable(struct pse_controller_dev *pcdev, int id) if (id >= TPS23881_MAX_CHANS) return -ERANGE; - ret = i2c_smbus_read_word_data(client, TPS23881_REG_PW_STATUS); - if (ret < 0) - return ret; - chan = priv->port[id].chan[0]; if (chan < 4) - val = (u16)(ret | BIT(chan)); + val = BIT(chan); else - val = (u16)(ret | BIT(chan + 4)); + val = BIT(chan + 4); if (priv->port[id].is_4p) { chan = priv->port[id].chan[1]; @@ -100,15 +96,11 @@ static int tps23881_pi_disable(struct pse_controller_dev *pcdev, int id) if (id >= TPS23881_MAX_CHANS) return -ERANGE; - ret = i2c_smbus_read_word_data(client, TPS23881_REG_PW_STATUS); - if (ret < 0) - return ret; - chan = priv->port[id].chan[0]; if (chan < 4) - val = (u16)(ret | BIT(chan + 4)); + val = BIT(chan + 4); else - val = (u16)(ret | BIT(chan + 8)); + val = BIT(chan + 8); if (priv->port[id].is_4p) { chan = priv->port[id].chan[1]; From 050a4c011b0dfeb91664a5d7bd3647ff38db08ce Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Fri, 20 Dec 2024 10:15:02 +0200 Subject: [PATCH 482/807] net/mlx5: DR, select MSIX vector 0 for completion queue creation When creating a software steering completion queue (CQ), an arbitrary MSIX vector n is selected. This results in the CQ sharing the same Ethernet traffic channel n associated with the chosen vector. However, the value of n is often unpredictable, which can introduce complications for interrupt monitoring and verification tools. 
Moreover, SW steering uses polling rather than event-driven interrupts. Therefore, there is no need to select any MSIX vector other than the existing vector 0 for CQ creation. In light of these factors, and to enhance predictability, we modify the code to consistently select MSIX vector 0 for CQ creation. Fixes: 297cccebdc5a ("net/mlx5: DR, Expose an internal API to issue RDMA operations") Signed-off-by: Shahar Shitrit Reviewed-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index 6fa06ba2d346..f57c84e5128b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1067,7 +1067,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, int inlen, err, eqn; void *cqc, *in; __be64 *pas; - int vector; u32 i; cq = kzalloc(sizeof(*cq), GFP_KERNEL); @@ -1096,8 +1095,7 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, if (!in) goto err_cqwq; - vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev); - err = mlx5_comp_eqn_get(mdev, vector, &eqn); + err = mlx5_comp_eqn_get(mdev, 0, &eqn); if (err) { kvfree(in); goto err_cqwq; From 8c6254479b3d5bd788d2b5fefaa48fb194331ed0 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 20 Dec 2024 10:15:03 +0200 Subject: [PATCH 483/807] net/mlx5e: macsec: Maintain TX SA from encoding_sa In MACsec, it is possible to create multiple active TX SAs on a SC, but only one such SA can be used at a time for transmission. This SA is selected through the encoding_sa link parameter. 
When there are 2 or more active TX SAs configured (encoding_sa=0): ip macsec add macsec0 tx sa 0 pn 1 on key 00 ip macsec add macsec0 tx sa 1 pn 1 on key 00 ... the traffic should be still sent via TX SA 0 as the encoding_sa was not changed. However, the driver ignores the encoding_sa and overrides it to SA 1 by installing the flow steering id of the newly created TX SA into the SCI -> flow steering id hash map. The future packet tx descriptors will point to the incorrect flow steering rule (SA 1). This patch fixes the issue by avoiding the creation of the flow steering rule for an active TX SA that is not the encoding_sa. The driver side tx_sa object and the FW side macsec object are still created. When the encoding_sa link parameter is changed to another active TX SA, only the new flow steering rule will be created in the mlx5e_macsec_upd_txsa() handler. Fixes: 8ff0ac5be144 ("net/mlx5: Add MACsec offload Tx command support") Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Reviewed-by: Lior Nahmanson Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index cc9bcc420032..6ab02f3fc291 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -339,9 +339,13 @@ static int mlx5e_macsec_init_sa_fs(struct macsec_context *ctx, { struct mlx5e_priv *priv = macsec_netdev_priv(ctx->netdev); struct mlx5_macsec_fs *macsec_fs = priv->mdev->macsec_fs; + const struct macsec_tx_sc *tx_sc = &ctx->secy->tx_sc; struct mlx5_macsec_rule_attrs rule_attrs; union mlx5_macsec_rule *macsec_rule; + if (is_tx && tx_sc->encoding_sa != sa->assoc_num) + return 0; + rule_attrs.macsec_obj_id = 
sa->macsec_obj_id; rule_attrs.sci = sa->sci; rule_attrs.assoc_num = sa->assoc_num; From 5a03b368562a7ff5f5f1f63b5adf8309cbdbd5be Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 20 Dec 2024 10:15:04 +0200 Subject: [PATCH 484/807] net/mlx5e: Skip restore TC rules for vport rep without loaded flag During driver unload, unregister_netdev is called after unloading vport rep. So, the mlx5e_rep_priv is already freed while trying to get rpriv->netdev, or walk rpriv->tc_ht, which results in use-after-free. So add the checking to make sure access the data of vport rep which is still loaded. Fixes: d1569537a837 ("net/mlx5e: Modify and restore TC rules for IPSec TX rules") Signed-off-by: Jianbo Liu Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 --- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c index 5a0047bdcb51..ed977ae75fab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c @@ -150,11 +150,11 @@ void mlx5_esw_ipsec_restore_dest_uplink(struct mlx5_core_dev *mdev) unsigned long i; int err; - xa_for_each(&esw->offloads.vport_reps, i, rep) { - rpriv = rep->rep_data[REP_ETH].priv; - if (!rpriv || !rpriv->netdev) + mlx5_esw_for_each_rep(esw, i, rep) { + if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED) continue; + rpriv = rep->rep_data[REP_ETH].priv; rhashtable_walk_enter(&rpriv->tc_ht, &iter); rhashtable_walk_start(&iter); while ((flow = rhashtable_walk_next(&iter)) != NULL) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index a83d41121db6..8573d36785f4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -714,6 +714,9 @@ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw); MLX5_CAP_GEN_2((esw->dev), ec_vf_vport_base) +\ (last) - 1) +#define mlx5_esw_for_each_rep(esw, i, rep) \ + xa_for_each(&((esw)->offloads.vport_reps), i, rep) + struct mlx5_eswitch *__must_check mlx5_devlink_eswitch_get(struct devlink *devlink); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index d5b42b3a19fd..40359f320724 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -53,9 +53,6 @@ #include "lag/lag.h" #include "en/tc/post_meter.h" -#define mlx5_esw_for_each_rep(esw, i, rep) \ - xa_for_each(&((esw)->offloads.vport_reps), i, rep) - /* There are two match-all miss flows, one for unicast dst mac and * one for multicast. */ From 2a4f56fbcc473d8faeb29b73082df39efbe5893c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 20 Dec 2024 10:15:05 +0200 Subject: [PATCH 485/807] net/mlx5e: Keep netdev when leave switchdev for devlink set legacy only In the cited commit, when changing from switchdev to legacy mode, uplink representor's netdev is kept, and its profile is replaced with nic profile, so netdev is detached from old profile, then attach to new profile. During profile change, the hardware resources allocated by the old profile will be cleaned up. However, the cleanup is relying on the related kernel modules. And they may need to flush themselves first, which is triggered by netdev events, for example, NETDEV_UNREGISTER. However, netdev is kept, or netdev_register is called after the cleanup, which may cause troubles because the resources are still referred by kernel modules. 
The same process applies to all the cases when uplink is leaving switchdev mode, including devlink eswitch mode set legacy, driver unload and devlink reload. For the first one, it can be blocked and returns failure to users, whenever possible. But it's hard for the others. Besides, the attachment to nic profile is unnecessary as the netdev will be unregistered anyway for such cases. So in this patch, the original behavior is kept only for devlink eswitch set mode legacy. For the others, moves netdev unregistration before the profile change. Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 19 +++++++++++++++++-- .../net/ethernet/mellanox/mlx5/core/en_rep.c | 15 +++++++++++++++ .../mellanox/mlx5/core/eswitch_offloads.c | 2 ++ include/linux/mlx5/driver.h | 1 + 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dd16d73000c3..0ec17c276bdd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6542,8 +6542,23 @@ static void _mlx5e_remove(struct auxiliary_device *adev) mlx5_core_uplink_netdev_set(mdev, NULL); mlx5e_dcbnl_delete_app(priv); - unregister_netdev(priv->netdev); - _mlx5e_suspend(adev, false); + /* When unload driver, the netdev is in registered state + * if it's from legacy mode. If from switchdev mode, it + * is already unregistered before changing to NIC profile. 
+ */ + if (priv->netdev->reg_state == NETREG_REGISTERED) { + unregister_netdev(priv->netdev); + _mlx5e_suspend(adev, false); + } else { + struct mlx5_core_dev *pos; + int i; + + if (test_bit(MLX5E_STATE_DESTROYING, &priv->state)) + mlx5_sd_for_each_dev(i, mdev, pos) + mlx5e_destroy_mdev_resources(pos); + else + _mlx5e_suspend(adev, true); + } /* Avoid cleanup if profile rollback failed. */ if (priv->profile) priv->profile->cleanup(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 554f9cb5b53f..fdff9fd8a89e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1509,6 +1509,21 @@ mlx5e_vport_uplink_rep_unload(struct mlx5e_rep_priv *rpriv) priv = netdev_priv(netdev); + /* This bit is set when using devlink to change eswitch mode from + * switchdev to legacy. As need to keep uplink netdev ifindex, we + * detach uplink representor profile and attach NIC profile only. + * The netdev will be unregistered later when unload NIC auxiliary + * driver for this case. + * We explicitly block devlink eswitch mode change if any IPSec rules + * offloaded, but can't block other cases, such as driver unload + * and devlink reload. We have to unregister netdev before profile + * change for those cases. This is to avoid resource leak because + * the offloaded rules don't have the chance to be unoffloaded before + * cleanup which is triggered by detach uplink representor profile. 
+ */ + if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_SWITCH_LEGACY)) + unregister_netdev(netdev); + mlx5e_netdev_attach_nic_profile(priv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 40359f320724..06076dd9ec64 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3777,6 +3777,8 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, esw->eswitch_operation_in_progress = true; up_write(&esw->mode_lock); + if (mode == DEVLINK_ESWITCH_MODE_LEGACY) + esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY; mlx5_eswitch_disable_locked(esw); if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) { if (mlx5_devlink_trap_get_num_active(esw->dev)) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..8f5991168ccd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -524,6 +524,7 @@ enum { * creation/deletion on drivers rescan. Unset during device attach. */ MLX5_PRIV_FLAGS_DETACH = 1 << 2, + MLX5_PRIV_FLAGS_SWITCH_LEGACY = 1 << 3, }; struct mlx5_adev { From d29662695ed7c015521e5fc9387df25aab192a2e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Nov 2024 18:16:49 +0100 Subject: [PATCH 486/807] btrfs: fix use-after-free waiting for encoded read endios Fix a use-after-free in the I/O completion path for encoded reads by using a completion instead of a wait_queue for synchronizing the destruction of 'struct btrfs_encoded_read_private'. 
Fixes: 1881fba89bd5 ("btrfs: add BTRFS_IOC_ENCODED_READ ioctl") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 94c8809e8170..6baa0269a85b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9078,9 +9078,9 @@ out: } struct btrfs_encoded_read_private { - wait_queue_head_t wait; + struct completion done; void *uring_ctx; - atomic_t pending; + refcount_t pending_refs; blk_status_t status; }; @@ -9099,14 +9099,14 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (atomic_dec_and_test(&priv->pending)) { + if (refcount_dec_and_test(&priv->pending_refs)) { int err = blk_status_to_errno(READ_ONCE(priv->status)); if (priv->uring_ctx) { btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - wake_up(&priv->wait); + complete(&priv->done); } } bio_put(&bbio->bio); @@ -9126,8 +9126,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!priv) return -ENOMEM; - init_waitqueue_head(&priv->wait); - atomic_set(&priv->pending, 1); + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9140,7 +9140,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, @@ -9155,11 +9155,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv->pending); + 
refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); if (uring_ctx) { - if (atomic_dec_return(&priv->pending) == 0) { + if (refcount_dec_and_test(&priv->pending_refs)) { ret = blk_status_to_errno(READ_ONCE(priv->status)); btrfs_uring_read_extent_endio(uring_ctx, ret); kfree(priv); @@ -9168,8 +9168,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { - if (atomic_dec_return(&priv->pending) != 0) - io_wait_event(priv->wait, !atomic_read(&priv->pending)); + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); /* See btrfs_encoded_read_endio() for ordering. */ ret = blk_status_to_errno(READ_ONCE(priv->status)); kfree(priv); From 44f52bbe96dfdbe4aca3818a2534520082a07040 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Dec 2024 16:08:07 +0000 Subject: [PATCH 487/807] btrfs: fix use-after-free when COWing tree block and tracing is enabled When COWing a tree block, at btrfs_cow_block(), and we have the tracepoint trace_btrfs_cow_block() enabled and preemption is also enabled (CONFIG_PREEMPT=y), we can trigger a use-after-free in the COWed extent buffer while inside the tracepoint code. This is because in some paths that call btrfs_cow_block(), such as btrfs_search_slot(), we are holding the last reference on the extent buffer @buf so btrfs_force_cow_block() drops the last reference on the @buf extent buffer when it calls free_extent_buffer_stale(buf), which schedules the release of the extent buffer with RCU. This means that if we are on a kernel with preemption, the current task may be preempted before calling trace_btrfs_cow_block() and the extent buffer already released by the time trace_btrfs_cow_block() is called, resulting in a use-after-free. Fix this by moving the trace_btrfs_cow_block() from btrfs_cow_block() to btrfs_force_cow_block() before the COWed extent buffer is freed.
This also has a side effect of invoking the tracepoint in the tree defrag code, at defrag.c:btrfs_realloc_node(), since btrfs_force_cow_block() is called there, but this is fine and it was actually missing there. Reported-by: syzbot+8517da8635307182c8a5@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/6759a9b9.050a0220.1ac542.000d.GAE@google.com/ CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 693dc27ffb89..185985a337b3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -654,6 +654,8 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, goto error_unlock_cow; } } + + trace_btrfs_cow_block(root, buf, cow); if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); @@ -710,7 +712,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; - int ret; if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { btrfs_abort_transaction(trans, -EUCLEAN); @@ -751,12 +752,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, - cow_ret, search_start, 0, nest); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); } ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); From 3e74859ee35edc33a022c3f3971df066ea0ca6b9 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 13 Dec 2024 12:22:32 -0800 Subject: [PATCH 488/807] btrfs: check folio mapping after unlock in relocate_one_folio() When we call btrfs_read_folio() to bring a folio uptodate, we unlock the folio. 
The result of that is that a different thread can modify the mapping (like remove it with invalidate) before we call folio_lock(). This results in an invalid page and we need to try again. In particular, if we are relocating concurrently with aborting a transaction, this can result in a crash like the following: BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] SMP CPU: 76 PID: 1411631 Comm: kworker/u322:5 Workqueue: events_unbound btrfs_reclaim_bgs_work RIP: 0010:set_page_extent_mapped+0x20/0xb0 RSP: 0018:ffffc900516a7be8 EFLAGS: 00010246 RAX: ffffea009e851d08 RBX: ffffea009e0b1880 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffc900516a7b90 RDI: ffffea009e0b1880 RBP: 0000000003573000 R08: 0000000000000001 R09: ffff88c07fd2f3f0 R10: 0000000000000000 R11: 0000194754b575be R12: 0000000003572000 R13: 0000000003572fff R14: 0000000000100cca R15: 0000000005582fff FS: 0000000000000000(0000) GS:ffff88c07fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000407d00f002 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __die+0x78/0xc0 ? page_fault_oops+0x2a8/0x3a0 ? __switch_to+0x133/0x530 ? wq_worker_running+0xa/0x40 ? exc_page_fault+0x63/0x130 ? asm_exc_page_fault+0x22/0x30 ? set_page_extent_mapped+0x20/0xb0 relocate_file_extent_cluster+0x1a7/0x940 relocate_data_extent+0xaf/0x120 relocate_block_group+0x20f/0x480 btrfs_relocate_block_group+0x152/0x320 btrfs_relocate_chunk+0x3d/0x120 btrfs_reclaim_bgs_work+0x2ae/0x4e0 process_scheduled_works+0x184/0x370 worker_thread+0xc6/0x3e0 ? blk_add_timer+0xb0/0xb0 kthread+0xae/0xe0 ? flush_tlb_kernel_range+0x90/0x90 ret_from_fork+0x2f/0x40 ? 
flush_tlb_kernel_range+0x90/0x90 ret_from_fork_asm+0x11/0x20 This occurs because cleanup_one_transaction() calls destroy_delalloc_inodes() which calls invalidate_inode_pages2() which takes the folio_lock before setting mapping to NULL. We fail to check this, and subsequently call set_page_extent_mapped(), which assumes that mapping != NULL (in fact it asserts that in debug mode). Note that the "fixes" patch here is not the one that introduced the race (the very first iteration of this code from 2009) but a more recent change that made this particular crash happen in practice. Fixes: e7f1326cc24e ("btrfs: set page extent mapped after read_folio in relocate_one_page") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index bf267bdfa8f8..db8b42f674b7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2902,6 +2902,7 @@ static int relocate_one_folio(struct reloc_control *rc, const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); +again: folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { @@ -2937,6 +2938,11 @@ static int relocate_one_folio(struct reloc_control *rc, ret = -EIO; goto release_folio; } + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } /* From 0fba7be1ca6df2881e68386e5575fe096f33c4ca Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 13 Dec 2024 12:33:22 -0800 Subject: [PATCH 489/807] btrfs: check folio mapping after unlock in put_file_data() When we call btrfs_read_folio() we get an unlocked folio, so it is possible for a different thread to concurrently modify folio->mapping. We must check that this hasn't happened once we do have the lock.
CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/send.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7254279c3cc9..498c84323253 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5280,6 +5280,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); +again: folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(mapping, @@ -5312,6 +5313,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) ret = -EIO; break; } + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, From 0525064bb82e50d59543b62b9d41a606198a4a44 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 29 Nov 2024 12:25:30 +0000 Subject: [PATCH 490/807] btrfs: fix race with memory mapped writes when activating swap file When activating the swap file we flush all delalloc and wait for ordered extent completion, so that we don't miss any delalloc and extents before we check that the file's extent layout is usable for a swap file and activate the swap file. We are called with the inode's VFS lock acquired, so we won't race with buffered and direct IO writes, however we can still race with memory mapped writes since they don't acquire the inode's VFS lock. The race window is between flushing all delalloc and locking the whole file's extent range, since memory mapped writes lock an extent range with the length of a page. Fix this by acquiring the inode's mmap lock before we flush delalloc. 
CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6baa0269a85b..b2abc0aa5300 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9809,6 +9809,15 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, u64 isize; u64 start; + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); + /* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and @@ -9816,22 +9825,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. 
*/ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* @@ -9846,7 +9858,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -9860,7 +9873,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -9881,7 +9895,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", btrfs_root_id(root)); - return -EPERM; + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); @@ -10036,6 +10051,8 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); if (ret) return ret; From 03018e5d8508254534511d40fb57bc150e6a87f2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 12:54:14 +0000 Subject: [PATCH 491/807] btrfs: fix swap file activation failure due to extents that used to be shared When activating a swap file, to determine if an extent is shared we use can_nocow_extent(), 
which ends up at btrfs_cross_ref_exist(). That helper is meant to be quick because it's used in the NOCOW write path, when flushing delalloc and when doing a direct IO write, however it does return some false positives, meaning it may indicate that an extent is shared even if it's no longer the case. For the write path this is fine, we just do an unnecessary COW operation instead of doing a more rigorous check which would be too heavy (calling btrfs_is_data_extent_shared()). However when activating a swap file, the false positives simply result in a failure, which is confusing for users/applications. One particular case where this happens is when a data extent only has 1 reference but that reference is not inlined in the extent item located in the extent tree - this happens when we create more than 33 references for an extent and then delete those 33 references plus every other non-inline reference except one. The function check_committed_ref() assumes that if the size of an extent item doesn't match the size of struct btrfs_extent_item plus the size of an inline reference (plus an owner reference in case simple quotas are enabled), then the extent is shared - that is not the case however, we can have a single reference but it's not inlined - the reason we do this is to be fast and avoid inspecting non-inline references which may be located in another leaf of the extent tree, slowing down write paths. The following test script reproduces the bug: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi NUM_CLONES=50 umount $DEV &> /dev/null run_test() { local sync_after_add_reflinks=$1 local sync_after_remove_reflinks=$2 mkfs.btrfs -f $DEV > /dev/null #mkfs.xfs -f $DEV > /dev/null mount $DEV $MNT touch $MNT/foo chmod 0600 $MNT/foo # On btrfs the file must be NOCOW.
chattr +C $MNT/foo &> /dev/null xfs_io -s -c "pwrite -b 1M 0 1M" $MNT/foo mkswap $MNT/foo for ((i = 1; i <= $NUM_CLONES; i++)); do touch $MNT/foo_clone_$i chmod 0600 $MNT/foo_clone_$i # On btrfs the file must be NOCOW. chattr +C $MNT/foo_clone_$i &> /dev/null cp --reflink=always $MNT/foo $MNT/foo_clone_$i done if [ $sync_after_add_reflinks -ne 0 ]; then # Flush delayed refs and commit current transaction. sync -f $MNT fi # Remove the original file and all clones except the last. rm -f $MNT/foo for ((i = 1; i < $NUM_CLONES; i++)); do rm -f $MNT/foo_clone_$i done if [ $sync_after_remove_reflinks -ne 0 ]; then # Flush delayed refs and commit current transaction. sync -f $MNT fi # Now use the last clone as a swap file. It should work since # its extent are not shared anymore. swapon $MNT/foo_clone_${NUM_CLONES} swapoff $MNT/foo_clone_${NUM_CLONES} umount $MNT } echo -e "\nTest without sync after creating and removing clones" run_test 0 0 echo -e "\nTest with sync after creating clones" run_test 1 0 echo -e "\nTest with sync after removing clones" run_test 0 1 echo -e "\nTest with sync after creating and removing clones" run_test 1 1 Running the test: $ ./test.sh Test without sync after creating and removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0017 sec (556.793 MiB/sec and 556.7929 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=a6b9c29e-5ef4-4689-a8ac-bc199c750f02 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Test with sync after creating clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0036 sec (271.739 MiB/sec and 271.7391 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=5e9008d6-1f7a-4948-a1b4-3f30aba20a33 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Test with sync after removing clones 
wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0103 sec (96.665 MiB/sec and 96.6651 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=916c2740-fa9f-4385-9f06-29c3f89e4764 Test with sync after creating and removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0031 sec (314.268 MiB/sec and 314.2678 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=06aab1dd-4d90-49c0-bd9f-3a8db4e2f912 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Fix this by reworking btrfs_swap_activate() to instead of using extent maps and checking for shared extents with can_nocow_extent(), iterate over the inode's file extent items and use the accurate btrfs_is_data_extent_shared(). CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 96 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2abc0aa5300..b87f19630b00 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9799,15 +9799,16 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; /* * Acquire the inode's mmap lock to prevent races with memory mapped @@ -9846,6 +9847,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, goto out_unlock_mmap; } + 
path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; + } + /* * Balance or device remove/replace/resize can move stuff around from * under us. The exclop protection makes sure they aren't running/won't @@ -9904,24 +9912,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->disk_bytenr == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). 
+ */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->disk_bytenr == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -9933,23 +9956,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = extent_map_block_start(em) + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } - ret = can_nocow_extent(inode, start, &len, NULL, false, true); + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); + + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. 
+ */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -9984,7 +10029,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -10025,20 +10069,16 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - - start += len; } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); @@ -10053,6 +10093,8 @@ out: out_unlock_mmap: up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; From 9a45022a0efadd99bcc58f7f1cc2b6fb3b808c40 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 16:31:41 +0000 Subject: [PATCH 492/807] btrfs: allow swap activation to be interruptible During swap activation we iterate over the extents of a file, then do several checks for each extent, some of which may take some significant time such as checking if an extent is shared. Since a file can have many thousands of extents, this can be a very slow operation and it's currently not interruptible. I had a bug during development of a previous patch that resulted in an infinite loop when iterating the extents, so a core was busy looping and I couldn't cancel the operation, which is very annoying and requires a reboot. 
So make the loop interruptible by checking for fatal signals at the end of each iteration and stopping immediately if there is one. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b87f19630b00..c4675f4345fd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10073,6 +10073,11 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, bsi.block_start = physical_block_start; bsi.block_len = len; } + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } } if (bsi.block_len) From 2c8507c63f5498d4ee4af404a8e44ceae4345056 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 16:43:44 +0000 Subject: [PATCH 493/807] btrfs: avoid monopolizing a core when activating a swap file During swap activation we iterate over the extents of a file and we can have many thousands of them, so we can end up in a busy loop monopolizing a core. Avoid this by doing a voluntary reschedule after processing each extent. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4675f4345fd..623d9d7ab480 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10078,6 +10078,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINTR; goto out; } + + cond_resched(); } if (bsi.block_len) From f2363e6fcc7938c5f0f6ac066fad0dd247598b51 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Wed, 11 Dec 2024 19:13:15 +0800 Subject: [PATCH 494/807] btrfs: fix transaction atomicity bug when enabling simple quotas Set squota incompat bit before committing the transaction that enables the feature. 
With the config CONFIG_BTRFS_ASSERT enabled, an assertion failure occurs regarding the simple quota feature. [5.596534] assertion failed: btrfs_fs_incompat(fs_info, SIMPLE_QUOTA), in fs/btrfs/qgroup.c:365 [5.597098] ------------[ cut here ]------------ [5.597371] kernel BUG at fs/btrfs/qgroup.c:365! [5.597946] CPU: 1 UID: 0 PID: 268 Comm: mount Not tainted 6.13.0-rc2-00031-gf92f4749861b #146 [5.598450] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [5.599008] RIP: 0010:btrfs_read_qgroup_config+0x74d/0x7a0 [5.604303] [5.605230] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.605538] ? exc_invalid_op+0x56/0x70 [5.605775] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.606066] ? asm_exc_invalid_op+0x1f/0x30 [5.606441] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.606741] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.607038] ? try_to_wake_up+0x317/0x760 [5.607286] open_ctree+0xd9c/0x1710 [5.607509] btrfs_get_tree+0x58a/0x7e0 [5.608002] vfs_get_tree+0x2e/0x100 [5.608224] fc_mount+0x16/0x60 [5.608420] btrfs_get_tree+0x2f8/0x7e0 [5.608897] vfs_get_tree+0x2e/0x100 [5.609121] path_mount+0x4c8/0xbc0 [5.609538] __x64_sys_mount+0x10d/0x150 The issue can be easily reproduced using the following reproducer: root@q:linux# cat repro.sh set -e mkfs.btrfs -q -f /dev/sdb mount /dev/sdb /mnt/btrfs btrfs quota enable -s /mnt/btrfs umount /mnt/btrfs mount /dev/sdb /mnt/btrfs The issue is that when enabling quotas, at btrfs_quota_enable(), we set BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE at fs_info->qgroup_flags and persist it in the quota root in the item with the key BTRFS_QGROUP_STATUS_KEY, but we only set the incompat bit BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA after we commit the transaction used to enable simple quotas. 
This means that if after that transaction commit we unmount the filesystem without starting and committing any other transaction, or we have a power failure, the next time we mount the filesystem we will find the flag BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE set in the item with the key BTRFS_QGROUP_STATUS_KEY but we will not find the incompat bit BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA set in the superblock, triggering an assertion failure at: btrfs_read_qgroup_config() -> qgroup_read_enable_gen() To fix this issue, set the BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA flag immediately after setting the BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE. This ensures that both flags are flushed to disk within the same transaction. Fixes: 182940f4f4db ("btrfs: qgroup: add new quota mode for simple quotas") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Filipe Manana Signed-off-by: Julian Sun Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a6f92836c9b1..f9b214992212 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1121,6 +1121,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -1254,8 +1255,6 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (simple) - btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. 
*/ From fca432e73db2bec0fdbfbf6d98d3ebcd5388a977 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 18 Dec 2024 17:00:56 +1030 Subject: [PATCH 495/807] btrfs: sysfs: fix direct super block member reads The following sysfs entries are reading super block member directly, which can have a different endian and cause wrong values: - sys/fs/btrfs//nodesize - sys/fs/btrfs//sectorsize - sys/fs/btrfs//clone_alignment Thankfully those values (nodesize and sectorsize) are always aligned inside the btrfs_super_block, so it won't trigger unaligned read errors, just endian problems. Fix them by using the native cached members instead. Fixes: df93589a1737 ("btrfs: export more from FS_INFO to sysfs") CC: stable@vger.kernel.org Reviewed-by: Naohiro Aota Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index fdcbf650ac31..7f09b6c9cc2d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1118,7 +1118,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -1128,7 +1128,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -1180,7 +1180,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); 
From 452f4b31e3f70a52b97890888eeb9eaa9a87139a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Mon, 25 Nov 2024 11:50:25 +0100 Subject: [PATCH 496/807] tracing: Constify string literal data member in struct trace_event_call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The name member of the struct trace_event_call is assigned with generated string literals; declare them pointer to read-only. Reported by clang: security/landlock/syscalls.c:179:1: warning: initializing 'char *' with an expression of type 'const char[34]' discards qualifiers [-Wincompatible-pointer-types-discards-qualifiers] 179 | SYSCALL_DEFINE3(landlock_create_ruleset, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 180 | const struct landlock_ruleset_attr __user *const, attr, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 181 | const size_t, size, const __u32, flags) | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:226:36: note: expanded from macro 'SYSCALL_DEFINE3' 226 | #define SYSCALL_DEFINE3(name, ...) 
SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:234:2: note: expanded from macro 'SYSCALL_DEFINEx' 234 | SYSCALL_METADATA(sname, x, __VA_ARGS__) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:184:2: note: expanded from macro 'SYSCALL_METADATA' 184 | SYSCALL_TRACE_ENTER_EVENT(sname); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:151:30: note: expanded from macro 'SYSCALL_TRACE_ENTER_EVENT' 151 | .name = "sys_enter"#sname, \ | ^~~~~~~~~~~~~~~~~ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mickaël Salaün Cc: Günther Noack Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Bill Wendling Cc: Justin Stitt Link: https://lore.kernel.org/20241125105028.42807-1-cgoettsche@seltendoof.de Fixes: b77e38aa240c3 ("tracing: add event trace infrastructure") Signed-off-by: Christian Göttsche Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c..58ad4ead33fc 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -364,7 +364,7 @@ struct trace_event_call { struct list_head list; struct trace_event_class *class; union { - char *name; + const char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ struct tracepoint *tp; }; From 98feccbf32cfdde8c722bc4587aaa60ee5ac33f0 Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Mon, 16 Dec 2024 15:32:38 +0800 Subject: [PATCH 497/807] tracing: Prevent bad count for tracing_cpumask_write If a large count is provided, it will trigger a warning in bitmap_parse_user. Also check zero for it. 
Cc: stable@vger.kernel.org Fixes: 9e01c1b74c953 ("cpumask: convert kernel trace functions") Link: https://lore.kernel.org/20241216073238.2573704-1-lizhi.xu@windriver.com Reported-by: syzbot+0aecfd34fb878546f3fd@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=0aecfd34fb878546f3fd Tested-by: syzbot+0aecfd34fb878546f3fd@syzkaller.appspotmail.com Signed-off-by: Lizhi Xu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 957f941a08e7..f8aebcb01e62 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5087,6 +5087,9 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, cpumask_var_t tracing_cpumask_new; int err; + if (count == 0 || count > KMALLOC_MAX_SIZE) + return -EINVAL; + if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; From ccfa3131d4a0347988e73638edea5c8281b6d2c7 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sat, 21 Dec 2024 16:57:12 +0900 Subject: [PATCH 498/807] dmaengine: fsl-edma: implement the cleanup path of fsl_edma3_attach_pd() Current implementation of fsl_edma3_attach_pd() does not provide a cleanup path, resulting in a memory leak. For example, dev_pm_domain_detach() is not called after dev_pm_domain_attach_by_id(), and the device link created with the DL_FLAG_STATELESS is not released explicitly. Therefore, provide a cleanup function fsl_edma3_detach_pd() and call it upon failure. Also add a devm_add_action_or_reset() call with this function after a successful fsl_edma3_attach_pd(). 
Fixes: 72f5801a4e2b ("dmaengine: fsl-edma: integrate v3 support") Signed-off-by: Joe Hattori Link: https://lore.kernel.org/r/20241221075712.3297200-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Vinod Koul --- drivers/dma/fsl-edma-common.h | 1 + drivers/dma/fsl-edma-main.c | 41 ++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/drivers/dma/fsl-edma-common.h b/drivers/dma/fsl-edma-common.h index ce37e1ee9c46..fe8f103d4a63 100644 --- a/drivers/dma/fsl-edma-common.h +++ b/drivers/dma/fsl-edma-common.h @@ -166,6 +166,7 @@ struct fsl_edma_chan { struct work_struct issue_worker; struct platform_device *pdev; struct device *pd_dev; + struct device_link *pd_dev_link; u32 srcid; struct clk *clk; int priority; diff --git a/drivers/dma/fsl-edma-main.c b/drivers/dma/fsl-edma-main.c index 60de1003193a..1a613236b3e4 100644 --- a/drivers/dma/fsl-edma-main.c +++ b/drivers/dma/fsl-edma-main.c @@ -417,10 +417,33 @@ static const struct of_device_id fsl_edma_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, fsl_edma_dt_ids); +static void fsl_edma3_detach_pd(struct fsl_edma_engine *fsl_edma) +{ + struct fsl_edma_chan *fsl_chan; + int i; + + for (i = 0; i < fsl_edma->n_chans; i++) { + if (fsl_edma->chan_masked & BIT(i)) + continue; + fsl_chan = &fsl_edma->chans[i]; + if (fsl_chan->pd_dev_link) + device_link_del(fsl_chan->pd_dev_link); + if (fsl_chan->pd_dev) { + dev_pm_domain_detach(fsl_chan->pd_dev, false); + pm_runtime_dont_use_autosuspend(fsl_chan->pd_dev); + pm_runtime_set_suspended(fsl_chan->pd_dev); + } + } +} + +static void devm_fsl_edma3_detach_pd(void *data) +{ + fsl_edma3_detach_pd(data); +} + static int fsl_edma3_attach_pd(struct platform_device *pdev, struct fsl_edma_engine *fsl_edma) { struct fsl_edma_chan *fsl_chan; - struct device_link *link; struct device *pd_chan; struct device *dev; int i; @@ -436,15 +459,16 @@ static int fsl_edma3_attach_pd(struct platform_device *pdev, struct fsl_edma_eng pd_chan = dev_pm_domain_attach_by_id(dev, i); 
if (IS_ERR_OR_NULL(pd_chan)) { dev_err(dev, "Failed attach pd %d\n", i); - return -EINVAL; + goto detach; } - link = device_link_add(dev, pd_chan, DL_FLAG_STATELESS | + fsl_chan->pd_dev_link = device_link_add(dev, pd_chan, DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE); - if (!link) { + if (!fsl_chan->pd_dev_link) { dev_err(dev, "Failed to add device_link to %d\n", i); - return -EINVAL; + dev_pm_domain_detach(pd_chan, false); + goto detach; } fsl_chan->pd_dev = pd_chan; @@ -455,6 +479,10 @@ static int fsl_edma3_attach_pd(struct platform_device *pdev, struct fsl_edma_eng } return 0; + +detach: + fsl_edma3_detach_pd(fsl_edma); + return -EINVAL; } static int fsl_edma_probe(struct platform_device *pdev) @@ -544,6 +572,9 @@ static int fsl_edma_probe(struct platform_device *pdev) ret = fsl_edma3_attach_pd(pdev, fsl_edma); if (ret) return ret; + ret = devm_add_action_or_reset(&pdev->dev, devm_fsl_edma3_detach_pd, fsl_edma); + if (ret) + return ret; } if (drvdata->flags & FSL_EDMA_DRV_TCD64) From 2ac5415022d16d63d912a39a06f32f1f51140261 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 20 Dec 2024 23:23:25 +0100 Subject: [PATCH 499/807] RDMA/rxe: Remove the direct link to net_device The similar patch in siw is in the link: https://git.kernel.org/rdma/rdma/c/16b87037b48889 This problem also occurred in RXE. The following analyze this problem. 
In the following Call Traces: " BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0 net/core/dev.c:8782 Read of size 4 at addr ffff8880554640b0 by task kworker/1:4/5295 CPU: 1 UID: 0 PID: 5295 Comm: kworker/1:4 Not tainted 6.12.0-rc3-syzkaller-00399-g9197b73fd7bb #0 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: infiniband ib_cache_event_task Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 dev_get_flags+0x188/0x1d0 net/core/dev.c:8782 rxe_query_port+0x12d/0x260 drivers/infiniband/sw/rxe/rxe_verbs.c:60 __ib_query_port drivers/infiniband/core/device.c:2111 [inline] ib_query_port+0x168/0x7d0 drivers/infiniband/core/device.c:2143 ib_cache_update+0x1a9/0xb80 drivers/infiniband/core/cache.c:1494 ib_cache_event_task+0xf3/0x1e0 drivers/infiniband/core/cache.c:1568 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa65/0x1850 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f2/0x390 kernel/kthread.c:389 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 " 1). In the link [1], " infiniband syz2: set down " This means that on 839.350575, the event ib_cache_event_task was sent and queued in ib_wq. 2). In the link [1], " team0 (unregistering): Port device team_slave_0 removed " It indicates that before 843.251853, the net device should be freed. 3). In the link [1], " BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0 " This means that on 850.559070, this slab-use-after-free problem occurred. In all, on 839.350575, the event ib_cache_event_task was sent and queued in ib_wq, before 843.251853, the net device veth was freed.
on 850.559070, this event was executed, and the mentioned freed net device was called. Thus, the above call trace occurred. [1] https://syzkaller.appspot.com/x/log.txt?x=12e7025f980000 Reported-by: syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=4b87489410b4efd181bf Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Zhu Yanjun Link: https://patch.msgid.link/20241220222325.2487767-1-yanjun.zhu@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe.c | 23 +++++++++++++++++++---- drivers/infiniband/sw/rxe/rxe.h | 3 ++- drivers/infiniband/sw/rxe/rxe_mcast.c | 22 ++++++++++++++++++++-- drivers/infiniband/sw/rxe/rxe_net.c | 24 ++++++++++++++++++++---- drivers/infiniband/sw/rxe/rxe_verbs.c | 26 +++++++++++++++++++++----- drivers/infiniband/sw/rxe/rxe_verbs.h | 11 ++++++++--- 6 files changed, 90 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 255677bc12b2..1ba4a0c8726a 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -40,6 +40,8 @@ void rxe_dealloc(struct ib_device *ib_dev) /* initialize rxe device parameters */ static void rxe_init_device_param(struct rxe_dev *rxe) { + struct net_device *ndev; + rxe->max_inline_data = RXE_MAX_INLINE_DATA; rxe->attr.vendor_id = RXE_VENDOR_ID; @@ -71,8 +73,15 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; rxe->attr.max_pkeys = RXE_MAX_PKEYS; rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; + addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, - rxe->ndev->dev_addr); + ndev->dev_addr); + + dev_put(ndev); rxe->max_ucontext = RXE_MAX_UCONTEXT; } @@ -109,10 +118,15 @@ static void rxe_init_port_param(struct rxe_port *port) static void rxe_init_ports(struct rxe_dev *rxe) { struct rxe_port *port = 
&rxe->port; + struct net_device *ndev; rxe_init_port_param(port); + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; addrconf_addr_eui48((unsigned char *)&port->port_guid, - rxe->ndev->dev_addr); + ndev->dev_addr); + dev_put(ndev); spin_lock_init(&port->port_lock); } @@ -167,12 +181,13 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) /* called by ifc layer to create new rxe device. * The caller should allocate memory for rxe by calling ib_alloc_device. */ -int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, + struct net_device *ndev) { rxe_init(rxe); rxe_set_mtu(rxe, mtu); - return rxe_register_device(rxe, ibdev_name); + return rxe_register_device(rxe, ibdev_name, ndev); } static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index d8fb2c7af30a..fe7f97066732 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -139,7 +139,8 @@ enum resp_states { void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); -int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, + struct net_device *ndev); void rxe_rcv(struct sk_buff *skb); diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 86cc2e18a7fd..07ff47bae31d 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -31,10 +31,19 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; + struct net_device *ndev; + int ret; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return -ENODEV; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - return dev_mc_add(rxe->ndev, ll_addr); + ret = dev_mc_add(ndev, ll_addr); + 
dev_put(ndev); + + return ret; } /** @@ -47,10 +56,19 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; + struct net_device *ndev; + int ret; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return -ENODEV; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - return dev_mc_del(rxe->ndev, ll_addr); + ret = dev_mc_del(ndev, ll_addr); + dev_put(ndev); + + return ret; } /** diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 75d1407db52d..8cc64ceeb356 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -524,7 +524,16 @@ out: */ const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) { - return rxe->ndev->name; + struct net_device *ndev; + char *ndev_name; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return NULL; + ndev_name = ndev->name; + dev_put(ndev); + + return ndev_name; } int rxe_net_add(const char *ibdev_name, struct net_device *ndev) @@ -536,10 +545,9 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) if (!rxe) return -ENOMEM; - rxe->ndev = ndev; ib_mark_name_assigned_by_user(&rxe->ib_dev); - err = rxe_add(rxe, ndev->mtu, ibdev_name); + err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev); if (err) { ib_dealloc_device(&rxe->ib_dev); return err; @@ -587,10 +595,18 @@ void rxe_port_down(struct rxe_dev *rxe) void rxe_set_port_state(struct rxe_dev *rxe) { - if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) + struct net_device *ndev; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; + + if (netif_running(ndev) && netif_carrier_ok(ndev)) rxe_port_up(rxe); else rxe_port_down(rxe); + + dev_put(ndev); } static int rxe_notify(struct notifier_block *not_blk, diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 
5c18f7e342f2..8a5fc20fd186 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -41,6 +41,7 @@ static int rxe_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(ibdev); + struct net_device *ndev; int err, ret; if (port_num != 1) { @@ -49,6 +50,12 @@ static int rxe_query_port(struct ib_device *ibdev, goto err_out; } + ndev = rxe_ib_device_get_netdev(ibdev); + if (!ndev) { + err = -ENODEV; + goto err_out; + } + memcpy(attr, &rxe->port.attr, sizeof(*attr)); mutex_lock(&rxe->usdev_lock); @@ -57,13 +64,14 @@ static int rxe_query_port(struct ib_device *ibdev, if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; - else if (dev_get_flags(rxe->ndev) & IFF_UP) + else if (dev_get_flags(ndev) & IFF_UP) attr->phys_state = IB_PORT_PHYS_STATE_POLLING; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; mutex_unlock(&rxe->usdev_lock); + dev_put(ndev); return ret; err_out: @@ -1425,9 +1433,16 @@ static const struct attribute_group rxe_attr_group = { static int rxe_enable_driver(struct ib_device *ib_dev) { struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); + struct net_device *ndev; + + ndev = rxe_ib_device_get_netdev(ib_dev); + if (!ndev) + return -ENODEV; rxe_set_port_state(rxe); - dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev)); + dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev)); + + dev_put(ndev); return 0; } @@ -1495,7 +1510,8 @@ static const struct ib_device_ops rxe_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw), }; -int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, + struct net_device *ndev) { int err; struct ib_device *dev = &rxe->ib_dev; @@ -1507,13 +1523,13 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) dev->num_comp_vectors = num_possible_cpus(); dev->local_dma_lkey = 0; 
addrconf_addr_eui48((unsigned char *)&dev->node_guid, - rxe->ndev->dev_addr); + ndev->dev_addr); dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ); ib_set_device_ops(dev, &rxe_dev_ops); - err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1); + err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1); if (err) return err; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 3c1354f82283..6573ceec0ef5 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -370,6 +370,7 @@ struct rxe_port { u32 qp_gsi_index; }; +#define RXE_PORT 1 struct rxe_dev { struct ib_device ib_dev; struct ib_device_attr attr; @@ -377,8 +378,6 @@ struct rxe_dev { int max_inline_data; struct mutex usdev_lock; - struct net_device *ndev; - struct rxe_pool uc_pool; struct rxe_pool pd_pool; struct rxe_pool ah_pool; @@ -406,6 +405,11 @@ struct rxe_dev { struct crypto_shash *tfm; }; +static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev) +{ + return ib_device_get_netdev(dev, RXE_PORT); +} + static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index) { atomic64_inc(&rxe->stats_counters[index]); @@ -471,6 +475,7 @@ static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw) return to_rpd(mw->ibmw.pd); } -int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name); +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, + struct net_device *ndev); #endif /* RXE_VERBS_H */ From 385a95cc72941c7f88630a7bc4176048cc03b395 Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Mon, 16 Dec 2024 23:45:54 +0530 Subject: [PATCH 500/807] drm/i915/cx0_phy: Fix C10 pll programming sequence According to spec VDR_CUSTOM_WIDTH register gets programmed after pll specific VDR registers and TX Lane programming registers are done. 
Moreover we only program into C10_VDR_CONTROL1 to update config and setup master lane once all VDR registers are written into. Bspec: 67636 Fixes: 51390cc0e00a ("drm/i915/mtl: Add Support for C10 PHY message bus and pll programming") Signed-off-by: Suraj Kandpal Reviewed-by: Ankit Nautiyal Link: https://patchwork.freedesktop.org/patch/msgid/20241216181554.2861381-1-suraj.kandpal@intel.com (cherry picked from commit f9d418552ba1e3a0e92487ff82eb515dab7516c0) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_cx0_phy.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cx0_phy.c b/drivers/gpu/drm/i915/display/intel_cx0_phy.c index 71dc659228ab..0c7aee13495a 100644 --- a/drivers/gpu/drm/i915/display/intel_cx0_phy.c +++ b/drivers/gpu/drm/i915/display/intel_cx0_phy.c @@ -2115,14 +2115,6 @@ static void intel_c10_pll_program(struct intel_display *display, 0, C10_VDR_CTRL_MSGBUS_ACCESS, MB_WRITE_COMMITTED); - /* Custom width needs to be programmed to 0 for both the phy lanes */ - intel_cx0_rmw(encoder, INTEL_CX0_BOTH_LANES, PHY_C10_VDR_CUSTOM_WIDTH, - C10_VDR_CUSTOM_WIDTH_MASK, C10_VDR_CUSTOM_WIDTH_8_10, - MB_WRITE_COMMITTED); - intel_cx0_rmw(encoder, INTEL_CX0_BOTH_LANES, PHY_C10_VDR_CONTROL(1), - 0, C10_VDR_CTRL_UPDATE_CFG, - MB_WRITE_COMMITTED); - /* Program the pll values only for the master lane */ for (i = 0; i < ARRAY_SIZE(pll_state->pll); i++) intel_cx0_write(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_PLL(i), @@ -2132,6 +2124,10 @@ static void intel_c10_pll_program(struct intel_display *display, intel_cx0_write(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_CMN(0), pll_state->cmn, MB_WRITE_COMMITTED); intel_cx0_write(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_TX(0), pll_state->tx, MB_WRITE_COMMITTED); + /* Custom width needs to be programmed to 0 for both the phy lanes */ + intel_cx0_rmw(encoder, INTEL_CX0_BOTH_LANES, PHY_C10_VDR_CUSTOM_WIDTH, + C10_VDR_CUSTOM_WIDTH_MASK, C10_VDR_CUSTOM_WIDTH_8_10, + 
MB_WRITE_COMMITTED); intel_cx0_rmw(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_CONTROL(1), 0, C10_VDR_CTRL_MASTER_LANE | C10_VDR_CTRL_UPDATE_CFG, MB_WRITE_COMMITTED); From 20e7c5313ffbf11c34a46395345677adbe890bee Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Thu, 19 Dec 2024 16:00:19 -0500 Subject: [PATCH 501/807] drm/i915/dg1: Fix power gate sequence. sub-pipe PG is not present on DG1. Setting these bits can disable other power gates and cause GPU hangs on video playbacks. VLK: 16314, 4304 Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13381 Fixes: 85a12d7eb8fe ("drm/i915/tgl: Fix Media power gate sequence.") Cc: Vinay Belgaumkar Cc: Himal Prasad Ghimiray Reviewed-by: Vinay Belgaumkar Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20241219210019.70532-1-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit de7061947b4ed4be857d452c60d5fb795831d79e) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/intel_rc6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c index c864d101faf9..9378d5901c49 100644 --- a/drivers/gpu/drm/i915/gt/intel_rc6.c +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -133,7 +133,7 @@ static void gen11_rc6_enable(struct intel_rc6 *rc6) GEN9_MEDIA_PG_ENABLE | GEN11_MEDIA_SAMPLER_PG_ENABLE; - if (GRAPHICS_VER(gt->i915) >= 12) { + if (GRAPHICS_VER(gt->i915) >= 12 && !IS_DG1(gt->i915)) { for (i = 0; i < I915_MAX_VCS; i++) if (HAS_ENGINE(gt, _VCS(i))) pg_enable |= (VDN_HCP_POWERGATE_ENABLE(i) | From 362f1bf98a3ecb5a2a4fcbdaa9718c8403beceb2 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Fri, 11 Oct 2024 22:57:59 +0200 Subject: [PATCH 502/807] dmaengine: mv_xor: fix child node refcount handling in early exit The for_each_child_of_node() loop requires explicit calls to of_node_put() to decrement the child's refcount upon early exits (break, goto, return). 
Add the missing calls in the two early exits before the goto instructions. Cc: stable@vger.kernel.org Fixes: f7d12ef53ddf ("dma: mv_xor: add Device Tree binding") Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20241011-dma_mv_xor_of_node_put-v1-1-3c2de819f463@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/mv_xor.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 43efce77bb57..40b76b40bc30 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -1388,6 +1388,7 @@ static int mv_xor_probe(struct platform_device *pdev) irq = irq_of_parse_and_map(np, 0); if (!irq) { ret = -ENODEV; + of_node_put(np); goto err_channel_add; } @@ -1396,6 +1397,7 @@ static int mv_xor_probe(struct platform_device *pdev) if (IS_ERR(chan)) { ret = PTR_ERR(chan); irq_dispose_mapping(irq); + of_node_put(np); goto err_channel_add; } From ebc008699fd95701c9af5ebaeb0793eef81a71d5 Mon Sep 17 00:00:00 2001 From: Akhil R Date: Thu, 12 Dec 2024 18:14:12 +0530 Subject: [PATCH 503/807] dmaengine: tegra: Return correct DMA status when paused Currently, the driver does not return the correct DMA status when a DMA pause is issued by the client drivers. This causes GPCDMA users to assume that DMA is still running, while in reality, the DMA is paused. Return DMA_PAUSED for tx_status() if the channel is paused in the middle of a transfer. 
Fixes: ee17028009d4 ("dmaengine: tegra: Add tegra gpcdma driver") Cc: stable@vger.kernel.org Signed-off-by: Akhil R Signed-off-by: Kartik Rajput Link: https://lore.kernel.org/r/20241212124412.5650-1-kkartik@nvidia.com Signed-off-by: Vinod Koul --- drivers/dma/tegra186-gpc-dma.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/dma/tegra186-gpc-dma.c b/drivers/dma/tegra186-gpc-dma.c index cacf3757adc2..4d6fe0efa76e 100644 --- a/drivers/dma/tegra186-gpc-dma.c +++ b/drivers/dma/tegra186-gpc-dma.c @@ -231,6 +231,7 @@ struct tegra_dma_channel { bool config_init; char name[30]; enum dma_transfer_direction sid_dir; + enum dma_status status; int id; int irq; int slave_id; @@ -393,6 +394,8 @@ static int tegra_dma_pause(struct tegra_dma_channel *tdc) tegra_dma_dump_chan_regs(tdc); } + tdc->status = DMA_PAUSED; + return ret; } @@ -419,6 +422,8 @@ static void tegra_dma_resume(struct tegra_dma_channel *tdc) val = tdc_read(tdc, TEGRA_GPCDMA_CHAN_CSRE); val &= ~TEGRA_GPCDMA_CHAN_CSRE_PAUSE; tdc_write(tdc, TEGRA_GPCDMA_CHAN_CSRE, val); + + tdc->status = DMA_IN_PROGRESS; } static int tegra_dma_device_resume(struct dma_chan *dc) @@ -544,6 +549,7 @@ static void tegra_dma_xfer_complete(struct tegra_dma_channel *tdc) tegra_dma_sid_free(tdc); tdc->dma_desc = NULL; + tdc->status = DMA_COMPLETE; } static void tegra_dma_chan_decode_error(struct tegra_dma_channel *tdc, @@ -716,6 +722,7 @@ static int tegra_dma_terminate_all(struct dma_chan *dc) tdc->dma_desc = NULL; } + tdc->status = DMA_COMPLETE; tegra_dma_sid_free(tdc); vchan_get_all_descriptors(&tdc->vc, &head); spin_unlock_irqrestore(&tdc->vc.lock, flags); @@ -769,6 +776,9 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc, if (ret == DMA_COMPLETE) return ret; + if (tdc->status == DMA_PAUSED) + ret = DMA_PAUSED; + spin_lock_irqsave(&tdc->vc.lock, flags); vd = vchan_find_desc(&tdc->vc, cookie); if (vd) { From fe4bfa9b6d7bd752bfe4700c937f235aa8ce997b Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 
Dec 2024 20:36:41 +0800 Subject: [PATCH 504/807] phy: core: Fix that API devm_phy_put() fails to release the phy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For devm_phy_put(), its comment says it needs to invoke phy_put() to release the phy, but it will not actually invoke the function since devres_destroy() does not call devm_phy_release(), and the missing phy_put() call will cause: - The phy fails to be released. - devm_phy_put() can not fully undo what API devm_phy_get() does. - Leak refcount of both the module and device for below typical usage: devm_phy_get(); // or its variant ... err = do_something(); if (err) goto err_out; ... err_out: devm_phy_put(); // leak refcount here The file(s) affected by this issue are shown below since they have such typical usage. drivers/pci/controller/cadence/pcie-cadence.c drivers/net/ethernet/ti/am65-cpsw-nuss.c Fix by using devres_release() instead of devres_destroy() within the API. Fixes: ff764963479a ("drivers: phy: add generic PHY framework") Cc: stable@vger.kernel.org Cc: Lorenzo Pieralisi Cc: Krzysztof Wilczyński Cc: Bjorn Helgaas Cc: David S. 
Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-1-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index f053b525ccff..f190d7126613 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -737,7 +737,7 @@ void devm_phy_put(struct device *dev, struct phy *phy) if (!phy) return; - r = devres_destroy(dev, devm_phy_release, devm_phy_match, phy); + r = devres_release(dev, devm_phy_release, devm_phy_match, phy); dev_WARN_ONCE(dev, r, "couldn't find PHY resource\n"); } EXPORT_SYMBOL_GPL(devm_phy_put); From c0b82ab95b4f1fbc3e3aeab9d829d012669524b6 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:42 +0800 Subject: [PATCH 505/807] phy: core: Fix that API devm_of_phy_provider_unregister() fails to unregister the phy provider For devm_of_phy_provider_unregister(), its comment says it needs to invoke of_phy_provider_unregister() to unregister the phy provider, but it will not actually invoke the function since devres_destroy() does not call devm_phy_provider_release(), and the missing of_phy_provider_unregister() call will cause: - The phy provider fails to be unregistered. - Leak both memory and the OF node refcount. Fortunately, the faulty API has not been used by current kernel tree. Fix by using devres_release() instead of devres_destroy() within the API. 
Fixes: ff764963479a ("drivers: phy: add generic PHY framework") Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/stable/20241213-phy_core_fix-v6-2-40ae28f5015a%40quicinc.com Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-2-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index f190d7126613..de07e1616b34 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -1259,12 +1259,12 @@ EXPORT_SYMBOL_GPL(of_phy_provider_unregister); * of_phy_provider_unregister to unregister the phy provider. */ void devm_of_phy_provider_unregister(struct device *dev, - struct phy_provider *phy_provider) + struct phy_provider *phy_provider) { int r; - r = devres_destroy(dev, devm_phy_provider_release, devm_phy_match, - phy_provider); + r = devres_release(dev, devm_phy_provider_release, devm_phy_match, + phy_provider); dev_WARN_ONCE(dev, r, "couldn't find PHY provider device resource\n"); } EXPORT_SYMBOL_GPL(devm_of_phy_provider_unregister); From 4dc48c88fcf82b89fdebd83a906aaa64f40fb8a9 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:43 +0800 Subject: [PATCH 506/807] phy: core: Fix that API devm_phy_destroy() fails to destroy the phy For devm_phy_destroy(), its comment says it needs to invoke phy_destroy() to destroy the phy, but it will not actually invoke the function since devres_destroy() does not call devm_phy_consume(), and the missing phy_destroy() call will cause that the phy fails to be destroyed. Fortunately, the faulty API has not been used by current kernel tree. Fix by using devres_release() instead of devres_destroy() within the API. 
Fixes: ff764963479a ("drivers: phy: add generic PHY framework") Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-3-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index de07e1616b34..52ca590a58b9 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -1121,7 +1121,7 @@ void devm_phy_destroy(struct device *dev, struct phy *phy) { int r; - r = devres_destroy(dev, devm_phy_consume, devm_phy_match, phy); + r = devres_release(dev, devm_phy_consume, devm_phy_match, phy); dev_WARN_ONCE(dev, r, "couldn't find PHY resource\n"); } EXPORT_SYMBOL_GPL(devm_phy_destroy); From 5ebdc6be16c2000e37fcb8b4072d442d268ad492 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:44 +0800 Subject: [PATCH 507/807] phy: core: Fix an OF node refcount leakage in _of_phy_get() _of_phy_get() will directly return when suffers of_device_is_compatible() error, but it forgets to decrease refcount of OF node @args.np before error return, the refcount was increased by previous of_parse_phandle_with_args() so causes the OF node's refcount leakage. Fix by decreasing the refcount via of_node_put() before the error return. 
Fixes: b7563e2796f8 ("phy: work around 'phys' references to usb-nop-xceiv devices") Cc: stable@vger.kernel.org Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-4-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index 52ca590a58b9..b88fbda6c046 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -629,8 +629,10 @@ static struct phy *_of_phy_get(struct device_node *np, int index) return ERR_PTR(-ENODEV); /* This phy type handled by the usb-phy subsystem for now */ - if (of_device_is_compatible(args.np, "usb-nop-xceiv")) - return ERR_PTR(-ENODEV); + if (of_device_is_compatible(args.np, "usb-nop-xceiv")) { + phy = ERR_PTR(-ENODEV); + goto out_put_node; + } mutex_lock(&phy_provider_mutex); phy_provider = of_phy_provider_lookup(args.np); @@ -652,6 +654,7 @@ out_put_module: out_unlock: mutex_unlock(&phy_provider_mutex); +out_put_node: of_node_put(args.np); return phy; From a2d633cb1421e679b56f1a9fe1f42f089706f1ed Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:45 +0800 Subject: [PATCH 508/807] phy: core: Fix an OF node refcount leakage in of_phy_provider_lookup() For macro for_each_child_of_node(parent, child), refcount of @child has been increased before entering its loop body, so normally needs to call of_node_put(@child) before returning from the loop body to avoid refcount leakage. of_phy_provider_lookup() has such usage but does not call of_node_put() before returning, so cause leakage of the OF node refcount. Fix by simply calling of_node_put() before returning from the loop body. The APIs affected by this issue are shown below since they indirectly invoke problematic of_phy_provider_lookup(). 
phy_get() of_phy_get() devm_phy_get() devm_of_phy_get() devm_of_phy_get_by_index() Fixes: 2a4c37016ca9 ("phy: core: Fix of_phy_provider_lookup to return PHY provider for sub node") Cc: stable@vger.kernel.org Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-5-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index b88fbda6c046..413f76e2d174 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -145,8 +145,10 @@ static struct phy_provider *of_phy_provider_lookup(struct device_node *node) return phy_provider; for_each_child_of_node(phy_provider->children, child) - if (child == node) + if (child == node) { + of_node_put(child); return phy_provider; + } } return ERR_PTR(-EPROBE_DEFER); From 739214dd1c209e34323814fb815fb17cccb9f95b Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Sun, 15 Dec 2024 16:05:55 -0600 Subject: [PATCH 509/807] phy: freescale: fsl-samsung-hdmi: Fix 64-by-32 division cocci warnings The Kernel test robot returns the following warning: do_div() does a 64-by-32 division, please consider using div64_ul instead. To prevent the 64-by-32 division, consolidate both the multiplication and the do_div into one line which explicitly uses u64 sizes. 
Fixes: 1951dbb41d1d ("phy: freescale: fsl-samsung-hdmi: Support dynamic integer") Signed-off-by: Adam Ford Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412091243.fSObwwPi-lkp@intel.com/ Link: https://lore.kernel.org/r/20241215220555.99113-1-aford173@gmail.com Signed-off-by: Vinod Koul --- drivers/phy/freescale/phy-fsl-samsung-hdmi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/phy/freescale/phy-fsl-samsung-hdmi.c b/drivers/phy/freescale/phy-fsl-samsung-hdmi.c index 2c8038864357..d3ccf547ba1c 100644 --- a/drivers/phy/freescale/phy-fsl-samsung-hdmi.c +++ b/drivers/phy/freescale/phy-fsl-samsung-hdmi.c @@ -424,8 +424,7 @@ static unsigned long fsl_samsung_hdmi_phy_find_pms(unsigned long fout, u8 *p, u1 * Fvco = (M * f_ref) / P, * where f_ref is 24MHz. */ - tmp = (u64)_m * 24 * MHZ; - do_div(tmp, _p); + tmp = div64_ul((u64)_m * 24 * MHZ, _p); if (tmp < 750 * MHZ || tmp > 3000 * MHZ) continue; From 17194c2998d39ab366a2ecbc4d1f3281e00d6a05 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Dec 2024 09:30:51 +0100 Subject: [PATCH 510/807] phy: mediatek: phy-mtk-hdmi: add regulator dependency The driver no longer builds when regulator support is unavailable: arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi.o: in function `mtk_hdmi_phy_register_regulators': phy-mtk-hdmi.c:(.text.unlikely+0x3e): undefined reference to `devm_regulator_register' arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi-mt8195.o: in function `mtk_hdmi_phy_pwr5v_is_enabled': phy-mtk-hdmi-mt8195.c:(.text+0x326): undefined reference to `rdev_get_drvdata' arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi-mt8195.o: in function `mtk_hdmi_phy_pwr5v_disable': phy-mtk-hdmi-mt8195.c:(.text+0x346): undefined reference to `rdev_get_drvdata' arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi-mt8195.o: in function `mtk_hdmi_phy_pwr5v_enable': Fixes: 49393b2da1cd ("phy: mediatek: phy-mtk-hdmi: Register PHY provided 
regulator") Signed-off-by: Arnd Bergmann Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20241213083056.2596499-1-arnd@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/mediatek/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/phy/mediatek/Kconfig b/drivers/phy/mediatek/Kconfig index 60e00057e8bc..ba6461350951 100644 --- a/drivers/phy/mediatek/Kconfig +++ b/drivers/phy/mediatek/Kconfig @@ -65,6 +65,7 @@ config PHY_MTK_HDMI depends on ARCH_MEDIATEK || COMPILE_TEST depends on COMMON_CLK depends on OF + depends on REGULATOR select GENERIC_PHY help Support HDMI PHY for Mediatek SoCs. From d57212f281fda9056412cd6cca983d9d2eb89f53 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Tue, 24 Dec 2024 12:43:58 +0800 Subject: [PATCH 511/807] workqueue: add printf attribute to __alloc_workqueue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a compiler warning with W=1: kernel/workqueue.c: error: function ‘__alloc_workqueue’ might be a candidate for ‘gnu_printf’ format attribute[-Werror=suggest-attribute=format] 5657 | name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); | ^~~~~~~~ Fixes: 9b59a85a84dc ("workqueue: Don't call va_start / va_end twice") Signed-off-by: Su Hui Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8336218ec4b8..f7d8fc204579 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5645,6 +5645,7 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) } while (activated); } +__printf(1, 0) static struct workqueue_struct *__alloc_workqueue(const char *fmt, unsigned int flags, int max_active, va_list args) From 35bf430e08a18fdab6eb94492a06d9ad14c6179b Mon Sep 17 00:00:00 2001 From: Henry Huang Date: Sun, 22 Dec 2024 23:43:16 +0800 Subject: [PATCH 512/807] sched_ext: initialize kit->cursor.flags struct bpf_iter_scx_dsq *it may not be initialized. 
If we didn't call scx_bpf_dsq_move_set_vtime and scx_bpf_dsq_move_set_slice before scx_bpf_dsq_move, it would cause unexpected behaviors: 1. Assign a huge slice into p->scx.slice 2. Assign a invalid vtime into p->scx.dsq_vtime Signed-off-by: Henry Huang Fixes: 6462dd53a260 ("sched_ext: Compact struct bpf_iter_scx_dsq_kern") Cc: stable@vger.kernel.org # v6.12 Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 98519e6d0dcd..19d2699cf638 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7013,7 +7013,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, return -ENOENT; INIT_LIST_HEAD(&kit->cursor.node); - kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags; + kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; kit->cursor.priv = READ_ONCE(kit->dsq->seq); return 0; From 542ed8145e6f9392e3d0a86a0e9027d2ffd183e4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 21 Dec 2024 00:29:20 +0100 Subject: [PATCH 513/807] netfilter: nft_set_hash: unaligned atomic read on struct nft_set_ext Access to genmask field in struct nft_set_ext results in unaligned atomic read: [ 72.130109] Unable to handle kernel paging request at virtual address ffff0000c2bb708c [ 72.131036] Mem abort info: [ 72.131213] ESR = 0x0000000096000021 [ 72.131446] EC = 0x25: DABT (current EL), IL = 32 bits [ 72.132209] SET = 0, FnV = 0 [ 72.133216] EA = 0, S1PTW = 0 [ 72.134080] FSC = 0x21: alignment fault [ 72.135593] Data abort info: [ 72.137194] ISV = 0, ISS = 0x00000021, ISS2 = 0x00000000 [ 72.142351] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 72.145989] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 72.150115] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000237d27000 [ 72.154893] [ffff0000c2bb708c] pgd=0000000000000000, p4d=180000023ffff403, pud=180000023f84b403, pmd=180000023f835403, +pte=0068000102bb7707 [ 72.163021] Internal error: Oops: 
0000000096000021 [#1] SMP [...] [ 72.170041] CPU: 7 UID: 0 PID: 54 Comm: kworker/7:0 Tainted: G E 6.13.0-rc3+ #2 [ 72.170509] Tainted: [E]=UNSIGNED_MODULE [ 72.170720] Hardware name: QEMU QEMU Virtual Machine, BIOS edk2-stable202302-for-qemu 03/01/2023 [ 72.171192] Workqueue: events_power_efficient nft_rhash_gc [nf_tables] [ 72.171552] pstate: 21400005 (nzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 72.171915] pc : nft_rhash_gc+0x200/0x2d8 [nf_tables] [ 72.172166] lr : nft_rhash_gc+0x128/0x2d8 [nf_tables] [ 72.172546] sp : ffff800081f2bce0 [ 72.172724] x29: ffff800081f2bd40 x28: ffff0000c2bb708c x27: 0000000000000038 [ 72.173078] x26: ffff0000c6780ef0 x25: ffff0000c643df00 x24: ffff0000c6778f78 [ 72.173431] x23: 000000000000001a x22: ffff0000c4b1f000 x21: ffff0000c6780f78 [ 72.173782] x20: ffff0000c2bb70dc x19: ffff0000c2bb7080 x18: 0000000000000000 [ 72.174135] x17: ffff0000c0a4e1c0 x16: 0000000000003000 x15: 0000ac26d173b978 [ 72.174485] x14: ffffffffffffffff x13: 0000000000000030 x12: ffff0000c6780ef0 [ 72.174841] x11: 0000000000000000 x10: ffff800081f2bcf8 x9 : ffff0000c3000000 [ 72.175193] x8 : 00000000000004be x7 : 0000000000000000 x6 : 0000000000000000 [ 72.175544] x5 : 0000000000000040 x4 : ffff0000c3000010 x3 : 0000000000000000 [ 72.175871] x2 : 0000000000003a98 x1 : ffff0000c2bb708c x0 : 0000000000000004 [ 72.176207] Call trace: [ 72.176316] nft_rhash_gc+0x200/0x2d8 [nf_tables] (P) [ 72.176653] process_one_work+0x178/0x3d0 [ 72.176831] worker_thread+0x200/0x3f0 [ 72.176995] kthread+0xe8/0xf8 [ 72.177130] ret_from_fork+0x10/0x20 [ 72.177289] Code: 54fff984 d503201f d2800080 91003261 (f820303f) [ 72.177557] ---[ end trace 0000000000000000 ]--- Align struct nft_set_ext to word size to address this and documentation it. pahole reports that this increases the size of elements for rhash and pipapo in 8 bytes on x86_64. 
Fixes: 7ffc7481153b ("netfilter: nft_set_hash: skip duplicated elements pending gc run") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4afa64c81304..0027beca5cd5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -733,15 +733,18 @@ struct nft_set_ext_tmpl { /** * struct nft_set_ext - set extensions * - * @genmask: generation mask + * @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT) * @offset: offsets of individual extension types * @data: beginning of extension data + * + * This structure must be aligned to word size, otherwise atomic bitops + * on genmask field can cause alignment failure on some archs. */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; -}; +} __aligned(BITS_PER_LONG / 8); static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { From ce2b93fc1dfa1c82f2576aa571731c4e5dcc8dd7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Dec 2024 14:09:15 -1000 Subject: [PATCH 514/807] sched_ext: Fix dsq_local_on selftest The dsp_local_on selftest expects the scheduler to fail by trying to schedule an e.g. CPU-affine task to the wrong CPU. However, this isn't guaranteed to happen in the 1 second window that the test is running. Besides, it's odd to have this particular exception path tested when there are no other tests that verify that the interface is working at all - e.g. the test would pass if dsp_local_on interface is completely broken and fails on any attempt. Flip the test so that it verifies that the feature works. While at it, fix a typo in the info message. 
Signed-off-by: Tejun Heo Reported-by: Ihor Solodrai Link: http://lkml.kernel.org/r/Z1n9v7Z6iNJ-wKmq@slm.duckdns.org Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 5 ++++- tools/testing/selftests/sched_ext/dsp_local_on.c | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index 6325bf76f47e..fbda6bf54671 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -43,7 +43,10 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) if (!p) return; - target = bpf_get_prandom_u32() % nr_cpus; + if (p->nr_cpus_allowed == nr_cpus) + target = bpf_get_prandom_u32() % nr_cpus; + else + target = scx_bpf_task_cpu(p); scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); bpf_task_release(p); diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c index 472851b56854..0ff27e57fe43 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.c @@ -34,9 +34,10 @@ static enum scx_test_status run(void *ctx) /* Just sleeping is fine, plenty of scheduling events happening */ sleep(1); - SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); bpf_link__destroy(link); + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG)); + return SCX_TEST_PASS; } @@ -50,7 +51,7 @@ static void cleanup(void *ctx) struct scx_test dsp_local_on = { .name = "dsp_local_on", .description = "Verify we can directly dispatch tasks to a local DSQs " - "from osp.dispatch()", + "from ops.dispatch()", .setup = setup, .run = run, .cleanup = cleanup, From dcd59d0d7d51b2a4b768fc132b0d74a97dfd6d6a Mon Sep 17 00:00:00 2001 From: "Dustin L. 
Howett" Date: Tue, 24 Dec 2024 12:55:58 -0600 Subject: [PATCH 515/807] platform/chrome: cros_ec_lpc: fix product identity for early Framework Laptops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The product names for the Framework Laptop (12th and 13th Generation Intel Core) are incorrect as of 62be134abf42. Fixes: 62be134abf42 ("platform/chrome: cros_ec_lpc: switch primary DMI data for Framework Laptop") Cc: stable@vger.kernel.org # 6.12.x Signed-off-by: Dustin L. Howett Reviewed-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241224-platform-chrome-cros_ec_lpc-fix-product-identity-for-early-framework-laptops-v1-1-0d31d6e1d22c@howett.net Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec_lpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c index 924bf4d3cc77..8470b7f2b135 100644 --- a/drivers/platform/chrome/cros_ec_lpc.c +++ b/drivers/platform/chrome/cros_ec_lpc.c @@ -707,7 +707,7 @@ static const struct dmi_system_id cros_ec_lpc_dmi_table[] __initconst = { /* Framework Laptop (12th Gen Intel Core) */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Framework"), - DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "12th Gen Intel Core"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Laptop (12th Gen Intel Core)"), }, .driver_data = (void *)&framework_laptop_mec_lpc_driver_data, }, @@ -715,7 +715,7 @@ static const struct dmi_system_id cros_ec_lpc_dmi_table[] __initconst = { /* Framework Laptop (13th Gen Intel Core) */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Framework"), - DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "13th Gen Intel Core"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Laptop (13th Gen Intel Core)"), }, .driver_data = (void *)&framework_laptop_mec_lpc_driver_data, }, From 36684e9d88a2e2401ae26715a2e217cb4295cea7 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2024 10:32:27 +0800 Subject: [PATCH 516/807] drm/mediatek: Set 
private->all_drm_private[i]->drm to NULL if mtk_drm_bind returns err The pointer need to be set to NULL, otherwise KASAN complains about use-after-free. Because in mtk_drm_bind, all private's drm are set as follows. private->all_drm_private[i]->drm = drm; And drm will be released by drm_dev_put in case mtk_drm_kms_init returns failure. However, the shutdown path still accesses the previous allocated memory in drm_atomic_helper_shutdown. [ 84.874820] watchdog: watchdog0: watchdog did not stop! [ 86.512054] ================================================================== [ 86.513162] BUG: KASAN: use-after-free in drm_atomic_helper_shutdown+0x33c/0x378 [ 86.514258] Read of size 8 at addr ffff0000d46fc068 by task shutdown/1 [ 86.515213] [ 86.515455] CPU: 1 UID: 0 PID: 1 Comm: shutdown Not tainted 6.13.0-rc1-mtk+gfa1a78e5d24b-dirty #55 [ 86.516752] Hardware name: Unknown Product/Unknown Product, BIOS 2022.10 10/01/2022 [ 86.517960] Call trace: [ 86.518333] show_stack+0x20/0x38 (C) [ 86.518891] dump_stack_lvl+0x90/0xd0 [ 86.519443] print_report+0xf8/0x5b0 [ 86.519985] kasan_report+0xb4/0x100 [ 86.520526] __asan_report_load8_noabort+0x20/0x30 [ 86.521240] drm_atomic_helper_shutdown+0x33c/0x378 [ 86.521966] mtk_drm_shutdown+0x54/0x80 [ 86.522546] platform_shutdown+0x64/0x90 [ 86.523137] device_shutdown+0x260/0x5b8 [ 86.523728] kernel_restart+0x78/0xf0 [ 86.524282] __do_sys_reboot+0x258/0x2f0 [ 86.524871] __arm64_sys_reboot+0x90/0xd8 [ 86.525473] invoke_syscall+0x74/0x268 [ 86.526041] el0_svc_common.constprop.0+0xb0/0x240 [ 86.526751] do_el0_svc+0x4c/0x70 [ 86.527251] el0_svc+0x4c/0xc0 [ 86.527719] el0t_64_sync_handler+0x144/0x168 [ 86.528367] el0t_64_sync+0x198/0x1a0 [ 86.528920] [ 86.529157] The buggy address belongs to the physical page: [ 86.529972] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0xffff0000d46fd4d0 pfn:0x1146fc [ 86.531319] flags: 0xbfffc0000000000(node=0|zone=2|lastcpupid=0xffff) [ 86.532267] raw: 0bfffc0000000000 0000000000000000 
dead000000000122 0000000000000000 [ 86.533390] raw: ffff0000d46fd4d0 0000000000000000 00000000ffffffff 0000000000000000 [ 86.534511] page dumped because: kasan: bad access detected [ 86.535323] [ 86.535559] Memory state around the buggy address: [ 86.536265] ffff0000d46fbf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 86.537314] ffff0000d46fbf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 86.538363] >ffff0000d46fc000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 86.544733] ^ [ 86.551057] ffff0000d46fc080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 86.557510] ffff0000d46fc100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 86.563928] ================================================================== [ 86.571093] Disabling lock debugging due to kernel taint [ 86.577642] Unable to handle kernel paging request at virtual address e0e9c0920000000b [ 86.581834] KASAN: maybe wild-memory-access in range [0x0752049000000058-0x075204900000005f] ... Fixes: 1ef7ed48356c ("drm/mediatek: Modify mediatek-drm for mt8195 multi mmsys support") Signed-off-by: Guoqing Jiang Reviewed-by: AngeloGioacchino Del Regno Link: https://patchwork.kernel.org/project/dri-devel/patch/20241223023227.1258112-1-guoqing.jiang@canonical.com/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index 9a8ef8558da9..0062374f75d5 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -673,6 +673,8 @@ err_deinit: err_free: private->drm = NULL; drm_dev_put(drm); + for (i = 0; i < private->data->mmsys_dev_num; i++) + private->all_drm_private[i]->drm = NULL; return ret; } From da03801ad08f2488c01e684509cd89e1aa5d17ec Mon Sep 17 00:00:00 2001 From: "Jason-JH.Lin" Date: Wed, 11 Dec 2024 11:47:16 +0800 Subject: [PATCH 517/807] drm/mediatek: Move mtk_crtc_finish_page_flip() to ddp_cmdq_cb() 
mtk_crtc_finish_page_flip() is used to notify userspace that a page flip has been completed, allowing userspace to free the frame buffer of the last frame and commit the next frame. In MediaTek's hardware design for configuring display hardware by using GCE, `DRM_EVENT_FLIP_COMPLETE` should be notified to userspace after GCE has finished configuring all display hardware settings for each atomic_commit(). Currently, mtk_crtc_finish_page_flip() cannot guarantee that GCE has configured all the display hardware settings of the last frame. Therefore, to increase the accuracy of the timing for notifying `DRM_EVENT_FLIP_COMPLETE` to userspace, mtk_crtc_finish_page_flip() should be moved to ddp_cmdq_cb(). Fixes: 7f82d9c43879 ("drm/mediatek: Clear pending flag when cmdq packet is done") Signed-off-by: Jason-JH.Lin Reviewed-by: CK Hu Link: https://patchwork.kernel.org/project/dri-devel/patch/20241211034716.29241-1-jason-jh.lin@mediatek.com/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_crtc.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_crtc.c b/drivers/gpu/drm/mediatek/mtk_crtc.c index eb0e1233ad04..5674f5707cca 100644 --- a/drivers/gpu/drm/mediatek/mtk_crtc.c +++ b/drivers/gpu/drm/mediatek/mtk_crtc.c @@ -112,6 +112,11 @@ static void mtk_drm_finish_page_flip(struct mtk_crtc *mtk_crtc) drm_crtc_handle_vblank(&mtk_crtc->base); +#if IS_REACHABLE(CONFIG_MTK_CMDQ) + if (mtk_crtc->cmdq_client.chan) + return; +#endif + spin_lock_irqsave(&mtk_crtc->config_lock, flags); if (!mtk_crtc->config_updating && mtk_crtc->pending_needs_vblank) { mtk_crtc_finish_page_flip(mtk_crtc); @@ -284,10 +289,8 @@ static void ddp_cmdq_cb(struct mbox_client *cl, void *mssg) state = to_mtk_crtc_state(mtk_crtc->base.state); spin_lock_irqsave(&mtk_crtc->config_lock, flags); - if (mtk_crtc->config_updating) { - spin_unlock_irqrestore(&mtk_crtc->config_lock, flags); + if (mtk_crtc->config_updating) goto 
ddp_cmdq_cb_out; - } state->pending_config = false; @@ -315,10 +318,15 @@ static void ddp_cmdq_cb(struct mbox_client *cl, void *mssg) mtk_crtc->pending_async_planes = false; } - spin_unlock_irqrestore(&mtk_crtc->config_lock, flags); - ddp_cmdq_cb_out: + if (mtk_crtc->pending_needs_vblank) { + mtk_crtc_finish_page_flip(mtk_crtc); + mtk_crtc->pending_needs_vblank = false; + } + + spin_unlock_irqrestore(&mtk_crtc->config_lock, flags); + mtk_crtc->cmdq_vblank_cnt = 0; wake_up(&mtk_crtc->cb_blocking_queue); } @@ -606,13 +614,18 @@ static void mtk_crtc_update_config(struct mtk_crtc *mtk_crtc, bool needs_vblank) */ mtk_crtc->cmdq_vblank_cnt = 3; + spin_lock_irqsave(&mtk_crtc->config_lock, flags); + mtk_crtc->config_updating = false; + spin_unlock_irqrestore(&mtk_crtc->config_lock, flags); + mbox_send_message(mtk_crtc->cmdq_client.chan, cmdq_handle); mbox_client_txdone(mtk_crtc->cmdq_client.chan, 0); } -#endif +#else spin_lock_irqsave(&mtk_crtc->config_lock, flags); mtk_crtc->config_updating = false; spin_unlock_irqrestore(&mtk_crtc->config_lock, flags); +#endif mutex_unlock(&mtk_crtc->hw_lock); } From f8d9b91739e1fb436447c437a346a36deb676a36 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 17 Dec 2024 01:18:01 +0000 Subject: [PATCH 518/807] drm/mediatek: Only touch DISP_REG_OVL_PITCH_MSB if AFBC is supported Touching DISP_REG_OVL_PITCH_MSB leads to video overlay on MT2701, MT7623N and probably other older SoCs being broken. Move setting up AFBC layer configuration into a separate function only being called on hardware which actually supports AFBC which restores the behavior as it was before commit c410fa9b07c3 ("drm/mediatek: Add AFBC support to Mediatek DRM driver") on non-AFBC hardware. 
Fixes: c410fa9b07c3 ("drm/mediatek: Add AFBC support to Mediatek DRM driver") Cc: stable@vger.kernel.org Signed-off-by: Daniel Golle Reviewed-by: CK Hu Link: https://patchwork.kernel.org/project/dri-devel/patch/c7fbd3c3e633c0b7dd6d1cd78ccbdded31e1ca0f.1734397800.git.daniel@makrotopia.org/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_disp_ovl.c | 57 +++++++++++++------------ 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_disp_ovl.c b/drivers/gpu/drm/mediatek/mtk_disp_ovl.c index e0c0bb01f65a..0e4da239cbeb 100644 --- a/drivers/gpu/drm/mediatek/mtk_disp_ovl.c +++ b/drivers/gpu/drm/mediatek/mtk_disp_ovl.c @@ -460,6 +460,29 @@ static unsigned int mtk_ovl_fmt_convert(struct mtk_disp_ovl *ovl, } } +static void mtk_ovl_afbc_layer_config(struct mtk_disp_ovl *ovl, + unsigned int idx, + struct mtk_plane_pending_state *pending, + struct cmdq_pkt *cmdq_pkt) +{ + unsigned int pitch_msb = pending->pitch >> 16; + unsigned int hdr_pitch = pending->hdr_pitch; + unsigned int hdr_addr = pending->hdr_addr; + + if (pending->modifier != DRM_FORMAT_MOD_LINEAR) { + mtk_ddp_write_relaxed(cmdq_pkt, hdr_addr, &ovl->cmdq_reg, ovl->regs, + DISP_REG_OVL_HDR_ADDR(ovl, idx)); + mtk_ddp_write_relaxed(cmdq_pkt, + OVL_PITCH_MSB_2ND_SUBBUF | pitch_msb, + &ovl->cmdq_reg, ovl->regs, DISP_REG_OVL_PITCH_MSB(idx)); + mtk_ddp_write_relaxed(cmdq_pkt, hdr_pitch, &ovl->cmdq_reg, ovl->regs, + DISP_REG_OVL_HDR_PITCH(ovl, idx)); + } else { + mtk_ddp_write_relaxed(cmdq_pkt, pitch_msb, + &ovl->cmdq_reg, ovl->regs, DISP_REG_OVL_PITCH_MSB(idx)); + } +} + void mtk_ovl_layer_config(struct device *dev, unsigned int idx, struct mtk_plane_state *state, struct cmdq_pkt *cmdq_pkt) @@ -467,25 +490,13 @@ void mtk_ovl_layer_config(struct device *dev, unsigned int idx, struct mtk_disp_ovl *ovl = dev_get_drvdata(dev); struct mtk_plane_pending_state *pending = &state->pending; unsigned int addr = pending->addr; - unsigned int hdr_addr = pending->hdr_addr; - unsigned 
int pitch = pending->pitch; - unsigned int hdr_pitch = pending->hdr_pitch; + unsigned int pitch_lsb = pending->pitch & GENMASK(15, 0); unsigned int fmt = pending->format; unsigned int offset = (pending->y << 16) | pending->x; unsigned int src_size = (pending->height << 16) | pending->width; unsigned int blend_mode = state->base.pixel_blend_mode; unsigned int ignore_pixel_alpha = 0; unsigned int con; - bool is_afbc = pending->modifier != DRM_FORMAT_MOD_LINEAR; - union overlay_pitch { - struct split_pitch { - u16 lsb; - u16 msb; - } split_pitch; - u32 pitch; - } overlay_pitch; - - overlay_pitch.pitch = pitch; if (!pending->enable) { mtk_ovl_layer_off(dev, idx, cmdq_pkt); @@ -524,11 +535,12 @@ void mtk_ovl_layer_config(struct device *dev, unsigned int idx, } if (ovl->data->supports_afbc) - mtk_ovl_set_afbc(ovl, cmdq_pkt, idx, is_afbc); + mtk_ovl_set_afbc(ovl, cmdq_pkt, idx, + pending->modifier != DRM_FORMAT_MOD_LINEAR); mtk_ddp_write_relaxed(cmdq_pkt, con, &ovl->cmdq_reg, ovl->regs, DISP_REG_OVL_CON(idx)); - mtk_ddp_write_relaxed(cmdq_pkt, overlay_pitch.split_pitch.lsb | ignore_pixel_alpha, + mtk_ddp_write_relaxed(cmdq_pkt, pitch_lsb | ignore_pixel_alpha, &ovl->cmdq_reg, ovl->regs, DISP_REG_OVL_PITCH(idx)); mtk_ddp_write_relaxed(cmdq_pkt, src_size, &ovl->cmdq_reg, ovl->regs, DISP_REG_OVL_SRC_SIZE(idx)); @@ -537,19 +549,8 @@ void mtk_ovl_layer_config(struct device *dev, unsigned int idx, mtk_ddp_write_relaxed(cmdq_pkt, addr, &ovl->cmdq_reg, ovl->regs, DISP_REG_OVL_ADDR(ovl, idx)); - if (is_afbc) { - mtk_ddp_write_relaxed(cmdq_pkt, hdr_addr, &ovl->cmdq_reg, ovl->regs, - DISP_REG_OVL_HDR_ADDR(ovl, idx)); - mtk_ddp_write_relaxed(cmdq_pkt, - OVL_PITCH_MSB_2ND_SUBBUF | overlay_pitch.split_pitch.msb, - &ovl->cmdq_reg, ovl->regs, DISP_REG_OVL_PITCH_MSB(idx)); - mtk_ddp_write_relaxed(cmdq_pkt, hdr_pitch, &ovl->cmdq_reg, ovl->regs, - DISP_REG_OVL_HDR_PITCH(ovl, idx)); - } else { - mtk_ddp_write_relaxed(cmdq_pkt, - overlay_pitch.split_pitch.msb, - &ovl->cmdq_reg, ovl->regs, 
DISP_REG_OVL_PITCH_MSB(idx)); - } + if (ovl->data->supports_afbc) + mtk_ovl_afbc_layer_config(ovl, idx, pending, cmdq_pkt); mtk_ovl_set_bit_depth(dev, idx, fmt, cmdq_pkt); mtk_ovl_layer_on(dev, idx, cmdq_pkt); From 5c9d7e79ba154e8e1f0bfdeb7b495f454c1a3eba Mon Sep 17 00:00:00 2001 From: "Jason-JH.Lin" Date: Mon, 18 Nov 2024 10:51:26 +0800 Subject: [PATCH 519/807] drm/mediatek: Add support for 180-degree rotation in the display driver mediatek-drm driver reported the capability of 180-degree rotation by adding `DRM_MODE_ROTATE_180` to the plane property, as flip-x combined with flip-y equals a 180-degree rotation. However, we did not handle the rotation property in the driver and lead to rotation issues. Fixes: 74608d8feefd ("drm/mediatek: Add DRM_MODE_ROTATE_0 to rotation property") Signed-off-by: Jason-JH.Lin Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: CK Hu Link: https://patchwork.kernel.org/project/dri-devel/patch/20241118025126.30808-1-jason-jh.lin@mediatek.com/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_disp_ovl.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_disp_ovl.c b/drivers/gpu/drm/mediatek/mtk_disp_ovl.c index 0e4da239cbeb..19b0d5083981 100644 --- a/drivers/gpu/drm/mediatek/mtk_disp_ovl.c +++ b/drivers/gpu/drm/mediatek/mtk_disp_ovl.c @@ -492,6 +492,7 @@ void mtk_ovl_layer_config(struct device *dev, unsigned int idx, unsigned int addr = pending->addr; unsigned int pitch_lsb = pending->pitch & GENMASK(15, 0); unsigned int fmt = pending->format; + unsigned int rotation = pending->rotation; unsigned int offset = (pending->y << 16) | pending->x; unsigned int src_size = (pending->height << 16) | pending->width; unsigned int blend_mode = state->base.pixel_blend_mode; @@ -524,12 +525,19 @@ void mtk_ovl_layer_config(struct device *dev, unsigned int idx, ignore_pixel_alpha = OVL_CONST_BLEND; } - if (pending->rotation & DRM_MODE_REFLECT_Y) { + /* + * Treat rotate 180 as flip 
x + flip y, and XOR the original rotation value + * to flip x + flip y to support both in the same time. + */ + if (rotation & DRM_MODE_ROTATE_180) + rotation ^= DRM_MODE_REFLECT_X | DRM_MODE_REFLECT_Y; + + if (rotation & DRM_MODE_REFLECT_Y) { con |= OVL_CON_VIRT_FLIP; addr += (pending->height - 1) * pending->pitch; } - if (pending->rotation & DRM_MODE_REFLECT_X) { + if (rotation & DRM_MODE_REFLECT_X) { con |= OVL_CON_HORZ_FLIP; addr += pending->pitch - 1; } From 924d66011f2401a4145e2e814842c5c4572e439f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Dec 2024 09:58:31 +0100 Subject: [PATCH 520/807] drm/mediatek: stop selecting foreign drivers The PHY portion of the mediatek hdmi driver was originally part of the driver itself and later split out into drivers/phy, with a 'select' to keep the prior behavior. However, this leads to build failures when the PHY driver cannot be built: WARNING: unmet direct dependencies detected for PHY_MTK_HDMI Depends on [n]: (ARCH_MEDIATEK || COMPILE_TEST [=y]) && COMMON_CLK [=y] && OF [=y] && REGULATOR [=n] Selected by [m]: - DRM_MEDIATEK_HDMI [=m] && HAS_IOMEM [=y] && DRM [=m] && DRM_MEDIATEK [=m] ERROR: modpost: "devm_regulator_register" [drivers/phy/mediatek/phy-mtk-hdmi-drv.ko] undefined! ERROR: modpost: "rdev_get_drvdata" [drivers/phy/mediatek/phy-mtk-hdmi-drv.ko] undefined! The best option here is to just not select the phy driver and leave that up to the defconfig. Do the same for the other PHY and memory drivers selected here as well for consistency. 
Fixes: a481bf2f0ca4 ("drm/mediatek: Separate mtk_hdmi_phy to an independent module") Signed-off-by: Arnd Bergmann Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: CK Hu Link: https://patchwork.kernel.org/project/dri-devel/patch/20241218085837.2670434-1-arnd@kernel.org/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/Kconfig | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/mediatek/Kconfig b/drivers/gpu/drm/mediatek/Kconfig index f496e6cfdfe0..e47debd60619 100644 --- a/drivers/gpu/drm/mediatek/Kconfig +++ b/drivers/gpu/drm/mediatek/Kconfig @@ -14,9 +14,6 @@ config DRM_MEDIATEK select DRM_BRIDGE_CONNECTOR select DRM_MIPI_DSI select DRM_PANEL - select MEMORY - select MTK_SMI - select PHY_MTK_MIPI_DSI select VIDEOMODE_HELPERS help Choose this option if you have a Mediatek SoCs. @@ -27,7 +24,6 @@ config DRM_MEDIATEK config DRM_MEDIATEK_DP tristate "DRM DPTX Support for MediaTek SoCs" depends on DRM_MEDIATEK - select PHY_MTK_DP select DRM_DISPLAY_HELPER select DRM_DISPLAY_DP_HELPER select DRM_DISPLAY_DP_AUX_BUS @@ -38,6 +34,5 @@ config DRM_MEDIATEK_HDMI tristate "DRM HDMI Support for Mediatek SoCs" depends on DRM_MEDIATEK select SND_SOC_HDMI_CODEC if SND_SOC - select PHY_MTK_HDMI help DRM/KMS HDMI driver for Mediatek SoCs From 768776dd4efc681cdca33a79e29bb508d6de9bc0 Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Mon, 16 Dec 2024 16:16:40 +0100 Subject: [PATCH 521/807] i2c: imx: fix missing stop condition in single-master mode A regression was introduced with the implementation of single-master mode, preventing proper stop conditions from being generated. Devices that require a valid stop condition, such as EEPROMs, fail to function correctly as a result. The issue only affects devices with the single-master property enabled. This commit resolves the issue by re-enabling I2C bus busy bit (IBB) polling for single-master mode when generating a stop condition. 
The fix further ensures that the i2c_imx->stopped flag is cleared at the start of each transfer, allowing the stop condition to be correctly generated in i2c_imx_stop(). According to the reference manual (IMX8MMRM, Rev. 2, 09/2019, page 5270), polling the IBB bit to determine if the bus is free is only necessary in multi-master mode. Consequently, the IBB bit is not polled for the start condition in single-master mode. Fixes: 6692694aca86 ("i2c: imx: do not poll for bus busy in single master mode") Signed-off-by: Stefan Eichenberger Reviewed-by: Frank Li Reviewed-by: Francesco Dolcini Link: https://lore.kernel.org/r/20241216151829.74056-1-eichest@gmail.com Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-imx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index f751d231ded8..488ee3511314 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -532,22 +532,20 @@ static void i2c_imx_dma_free(struct imx_i2c_struct *i2c_imx) static int i2c_imx_bus_busy(struct imx_i2c_struct *i2c_imx, int for_busy, bool atomic) { + bool multi_master = i2c_imx->multi_master; unsigned long orig_jiffies = jiffies; unsigned int temp; - if (!i2c_imx->multi_master) - return 0; - while (1) { temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); /* check for arbitration lost */ - if (temp & I2SR_IAL) { + if (multi_master && (temp & I2SR_IAL)) { i2c_imx_clear_irq(i2c_imx, I2SR_IAL); return -EAGAIN; } - if (for_busy && (temp & I2SR_IBB)) { + if (for_busy && (!multi_master || (temp & I2SR_IBB))) { i2c_imx->stopped = 0; break; } From e0cec363197e41af870613e8e17b30bf0e3d41b5 Mon Sep 17 00:00:00 2001 From: Carlos Song Date: Wed, 18 Dec 2024 12:42:38 +0800 Subject: [PATCH 522/807] i2c: imx: add imx7d compatible string for applying erratum ERR007805 The compatible string "fsl,imx7d-i2c" does not exist in the i2c-imx driver's compatible string table; as a result, "fsl,imx21-i2c" will be matched, but it
will cause erratum ERR007805 not to be applied. So add the "fsl,imx7d-i2c" compatible string to the i2c-imx driver to apply erratum ERR007805 (https://www.nxp.com/docs/en/errata/IMX7DS_3N09P.pdf). " ERR007805 I2C: When the I2C clock speed is configured for 400 kHz, the SCL low period violates the I2C spec of 1.3 uS min Description: When the I2C module is programmed to operate at the maximum clock speed of 400 kHz (as defined by the I2C spec), the SCL clock low period violates the I2C spec of 1.3 uS min. The user must reduce the clock speed to obtain the SCL low time to meet the 1.3us I2C minimum required. This behavior means the SoC is not compliant to the I2C spec at 400kHz. Workaround: To meet the clock low period requirement in fast speed mode, SCL must be configured to 384KHz or less. " "fsl,imx7d-i2c" is already documented in the binding doc. This erratum fix has been included in imx6_i2c_hwdata and it is the same in all i.MX6/7/8, so just reuse it. Fixes: 39c025721d70 ("i2c: imx: Implement errata ERR007805 or e7805 bus frequency limit") Cc: stable@vger.kernel.org # v5.18+ Signed-off-by: Carlos Song Signed-off-by: Haibo Chen Reviewed-by: Frank Li Fixes: 39c025721d70 ("i2c: imx: Implement errata ERR007805 or e7805 bus frequency limit") Acked-by: Oleksij Rempel Link: https://lore.kernel.org/r/20241218044238.143414-1-carlos.song@nxp.com Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-imx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index 488ee3511314..5c9a8dfbc4a0 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -335,6 +335,7 @@ static const struct of_device_id i2c_imx_dt_ids[] = { { .compatible = "fsl,imx6sll-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx6sx-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx6ul-i2c", .data = &imx6_i2c_hwdata, }, + { .compatible = "fsl,imx7d-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx7s-i2c",
.data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx8mm-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx8mn-i2c", .data = &imx6_i2c_hwdata, }, From 9a8f9320d67b27ddd7f1ee88d91820197a0e908f Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 18 Dec 2024 12:07:40 +0000 Subject: [PATCH 523/807] i2c: microchip-core: actually use repeated sends At present, where repeated sends are intended to be used, the i2c-microchip-core driver sends a stop followed by a start. Lots of i2c devices must not malfunction in the face of this behaviour, because the driver has operated like this for years! Try to keep track of whether or not a repeated send is required, and suppress sending a stop in these cases. CC: stable@vger.kernel.org Fixes: 64a6f1c4987e ("i2c: add support for microchip fpga i2c controllers") Signed-off-by: Conor Dooley Reviewed-by: Andi Shyti Link: https://lore.kernel.org/r/20241218-football-composure-e56df2461461@spud Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-microchip-corei2c.c | 124 ++++++++++++++++----- 1 file changed, 96 insertions(+), 28 deletions(-) diff --git a/drivers/i2c/busses/i2c-microchip-corei2c.c b/drivers/i2c/busses/i2c-microchip-corei2c.c index d1543e7d8380..6a124e903c66 100644 --- a/drivers/i2c/busses/i2c-microchip-corei2c.c +++ b/drivers/i2c/busses/i2c-microchip-corei2c.c @@ -93,27 +93,35 @@ * @base: pointer to register struct * @dev: device reference * @i2c_clk: clock reference for i2c input clock + * @msg_queue: pointer to the messages requiring sending * @buf: pointer to msg buffer for easier use * @msg_complete: xfer completion object * @adapter: core i2c abstraction * @msg_err: error code for completed message * @bus_clk_rate: current i2c bus clock rate * @isr_status: cached copy of local ISR status + * @total_num: total number of messages to be sent/received + * @current_num: index of the current message being sent/received * @msg_len: number of bytes transferred in msg * @addr: address of the current slave + * 
@restart_needed: whether or not a repeated start is required after current message */ struct mchp_corei2c_dev { void __iomem *base; struct device *dev; struct clk *i2c_clk; + struct i2c_msg *msg_queue; u8 *buf; struct completion msg_complete; struct i2c_adapter adapter; int msg_err; + int total_num; + int current_num; u32 bus_clk_rate; u32 isr_status; u16 msg_len; u8 addr; + bool restart_needed; }; static void mchp_corei2c_core_disable(struct mchp_corei2c_dev *idev) @@ -222,6 +230,47 @@ static int mchp_corei2c_fill_tx(struct mchp_corei2c_dev *idev) return 0; } +static void mchp_corei2c_next_msg(struct mchp_corei2c_dev *idev) +{ + struct i2c_msg *this_msg; + u8 ctrl; + + if (idev->current_num >= idev->total_num) { + complete(&idev->msg_complete); + return; + } + + /* + * If there's been an error, the isr needs to return control + * to the "main" part of the driver, so as not to keep sending + * messages once it completes and clears the SI bit. + */ + if (idev->msg_err) { + complete(&idev->msg_complete); + return; + } + + this_msg = idev->msg_queue++; + + if (idev->current_num < (idev->total_num - 1)) { + struct i2c_msg *next_msg = idev->msg_queue; + + idev->restart_needed = next_msg->flags & I2C_M_RD; + } else { + idev->restart_needed = false; + } + + idev->addr = i2c_8bit_addr_from_msg(this_msg); + idev->msg_len = this_msg->len; + idev->buf = this_msg->buf; + + ctrl = readb(idev->base + CORE_I2C_CTRL); + ctrl |= CTRL_STA; + writeb(ctrl, idev->base + CORE_I2C_CTRL); + + idev->current_num++; +} + static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) { u32 status = idev->isr_status; @@ -247,10 +296,14 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) break; case STATUS_M_SLAW_ACK: case STATUS_M_TX_DATA_ACK: - if (idev->msg_len > 0) + if (idev->msg_len > 0) { mchp_corei2c_fill_tx(idev); - else - last_byte = true; + } else { + if (idev->restart_needed) + finished = true; + else + last_byte = true; + } break; case 
STATUS_M_TX_DATA_NACK: case STATUS_M_SLAR_NACK: @@ -287,7 +340,7 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) mchp_corei2c_stop(idev); if (last_byte || finished) - complete(&idev->msg_complete); + mchp_corei2c_next_msg(idev); return IRQ_HANDLED; } @@ -311,21 +364,48 @@ static irqreturn_t mchp_corei2c_isr(int irq, void *_dev) return ret; } -static int mchp_corei2c_xfer_msg(struct mchp_corei2c_dev *idev, - struct i2c_msg *msg) +static int mchp_corei2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, + int num) { - u8 ctrl; + struct mchp_corei2c_dev *idev = i2c_get_adapdata(adap); + struct i2c_msg *this_msg = msgs; unsigned long time_left; - - idev->addr = i2c_8bit_addr_from_msg(msg); - idev->msg_len = msg->len; - idev->buf = msg->buf; - idev->msg_err = 0; - - reinit_completion(&idev->msg_complete); + u8 ctrl; mchp_corei2c_core_enable(idev); + /* + * The isr controls the flow of a transfer, this info needs to be saved + * to a location that it can access the queue information from. + */ + idev->restart_needed = false; + idev->msg_queue = msgs; + idev->total_num = num; + idev->current_num = 0; + + /* + * But the first entry to the isr is triggered by the start in this + * function, so the first message needs to be "dequeued". 
+ */ + idev->addr = i2c_8bit_addr_from_msg(this_msg); + idev->msg_len = this_msg->len; + idev->buf = this_msg->buf; + idev->msg_err = 0; + + if (idev->total_num > 1) { + struct i2c_msg *next_msg = msgs + 1; + + idev->restart_needed = next_msg->flags & I2C_M_RD; + } + + idev->current_num++; + idev->msg_queue++; + + reinit_completion(&idev->msg_complete); + + /* + * Send the first start to pass control to the isr + */ ctrl = readb(idev->base + CORE_I2C_CTRL); ctrl |= CTRL_STA; writeb(ctrl, idev->base + CORE_I2C_CTRL); @@ -335,20 +415,8 @@ static int mchp_corei2c_xfer_msg(struct mchp_corei2c_dev *idev, if (!time_left) return -ETIMEDOUT; - return idev->msg_err; -} - -static int mchp_corei2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, - int num) -{ - struct mchp_corei2c_dev *idev = i2c_get_adapdata(adap); - int i, ret; - - for (i = 0; i < num; i++) { - ret = mchp_corei2c_xfer_msg(idev, msgs++); - if (ret) - return ret; - } + if (idev->msg_err) + return idev->msg_err; return num; } From 49e1f0fd0d4cb03a16b8526c4e683e1958f71490 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 18 Dec 2024 12:07:42 +0000 Subject: [PATCH 524/807] i2c: microchip-core: fix "ghost" detections Running i2c-detect currently produces an output akin to: 0 1 2 3 4 5 6 7 8 9 a b c d e f 00: 08 -- 0a -- 0c -- 0e -- 10: 10 -- 12 -- 14 -- 16 -- UU 19 -- 1b -- 1d -- 1f 20: -- 21 -- 23 -- 25 -- 27 -- 29 -- 2b -- 2d -- 2f 30: -- -- -- -- -- -- -- -- 38 -- 3a -- 3c -- 3e -- 40: 40 -- 42 -- 44 -- 46 -- 48 -- 4a -- 4c -- 4e -- 50: -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 60: 60 -- 62 -- 64 -- 66 -- 68 -- 6a -- 6c -- 6e -- 70: 70 -- 72 -- 74 -- 76 -- This happens because for an i2c_msg with a len of 0 the driver will mark the transmission of the message as a success once the START has been sent, without waiting for the devices on the bus to respond with an ACK/NAK. 
Since i2cdetect seems to run in a tight loop over all addresses the NAK is treated as part of the next test for the next address. Delete the fast path that marks a message as complete when idev->msg_len is zero after sending a START/RESTART since this isn't a valid scenario. CC: stable@vger.kernel.org Fixes: 64a6f1c4987e ("i2c: add support for microchip fpga i2c controllers") Signed-off-by: Conor Dooley Reviewed-by: Andi Shyti Link: https://lore.kernel.org/r/20241218-outbid-encounter-b2e78b1cc707@spud Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-microchip-corei2c.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-microchip-corei2c.c b/drivers/i2c/busses/i2c-microchip-corei2c.c index 6a124e903c66..5db73429125c 100644 --- a/drivers/i2c/busses/i2c-microchip-corei2c.c +++ b/drivers/i2c/busses/i2c-microchip-corei2c.c @@ -287,8 +287,6 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) ctrl &= ~CTRL_STA; writeb(idev->addr, idev->base + CORE_I2C_DATA); writeb(ctrl, idev->base + CORE_I2C_CTRL); - if (idev->msg_len == 0) - finished = true; break; case STATUS_M_ARB_LOST: idev->msg_err = -EAGAIN; From 8a6442ec3437083348f32a6159b9a67bf66417bc Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Thu, 19 Dec 2024 10:52:16 +0800 Subject: [PATCH 525/807] arm64: dts: qcom: sa8775p: fix the secure device bootup issue The secure device(fused) cannot bootup with TPDM_DCC device. So disable it in DT. 
Fixes: 6596118ccdcd ("arm64: dts: qcom: Add coresight nodes for SA8775p") Signed-off-by: Jie Gan Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20241219025216.3463527-1-quic_jiegan@quicinc.com Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/sa8775p.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/qcom/sa8775p.dtsi b/arch/arm64/boot/dts/qcom/sa8775p.dtsi index 368bcf7c9802..9da62d7c4d27 100644 --- a/arch/arm64/boot/dts/qcom/sa8775p.dtsi +++ b/arch/arm64/boot/dts/qcom/sa8775p.dtsi @@ -2440,6 +2440,7 @@ qcom,cmb-element-bits = <32>; qcom,cmb-msrs-num = <32>; + status = "disabled"; out-ports { port { From 75cd4005da5492129917a4a4ee45e81660556104 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 25 Dec 2024 19:06:40 +0800 Subject: [PATCH 526/807] ublk: detach gendisk from ublk device if add_disk() fails Inside ublk_abort_requests(), gendisk is grabbed for aborting all inflight requests. And ublk_abort_requests() is called when exiting the uring context or handling timeout. If add_disk() fails, the gendisk may have been freed when calling ublk_abort_requests(), so a use-after-free can occur when getting the disk's reference in ublk_abort_requests(). Fix the bug by detaching the gendisk from the ublk device if add_disk() fails.
Fixes: bd23f6c2c2d0 ("ublk: quiesce request queue when aborting queue") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241225110640.351531-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d4aed12dd436..934ab9332c80 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1618,6 +1618,21 @@ static void ublk_unquiesce_dev(struct ublk_device *ub) blk_mq_kick_requeue_list(ub->ub_disk->queue); } +static struct gendisk *ublk_detach_disk(struct ublk_device *ub) +{ + struct gendisk *disk; + + /* Sync with ublk_abort_queue() by holding the lock */ + spin_lock(&ub->lock); + disk = ub->ub_disk; + ub->dev_info.state = UBLK_S_DEV_DEAD; + ub->dev_info.ublksrv_pid = -1; + ub->ub_disk = NULL; + spin_unlock(&ub->lock); + + return disk; +} + static void ublk_stop_dev(struct ublk_device *ub) { struct gendisk *disk; @@ -1631,14 +1646,7 @@ static void ublk_stop_dev(struct ublk_device *ub) ublk_unquiesce_dev(ub); } del_gendisk(ub->ub_disk); - - /* Sync with ublk_abort_queue() by holding the lock */ - spin_lock(&ub->lock); - disk = ub->ub_disk; - ub->dev_info.state = UBLK_S_DEV_DEAD; - ub->dev_info.ublksrv_pid = -1; - ub->ub_disk = NULL; - spin_unlock(&ub->lock); + disk = ublk_detach_disk(ub); put_disk(disk); unlock: mutex_unlock(&ub->mutex); @@ -2336,7 +2344,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) out_put_cdev: if (ret) { - ub->dev_info.state = UBLK_S_DEV_DEAD; + ublk_detach_disk(ub); ublk_put_device(ub); } if (ret) From e33ac68e5e21ec1292490dfe061e75c0dbdd3bd4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Dec 2024 16:49:23 +0000 Subject: [PATCH 527/807] io_uring/sqpoll: fix sqpoll error handling races BUG: KASAN: slab-use-after-free in __lock_acquire+0x370b/0x4a10 kernel/locking/lockdep.c:5089 Call Trace: ... 
_raw_spin_lock_irqsave+0x3d/0x60 kernel/locking/spinlock.c:162 class_raw_spinlock_irqsave_constructor include/linux/spinlock.h:551 [inline] try_to_wake_up+0xb5/0x23c0 kernel/sched/core.c:4205 io_sq_thread_park+0xac/0xe0 io_uring/sqpoll.c:55 io_sq_thread_finish+0x6b/0x310 io_uring/sqpoll.c:96 io_sq_offload_create+0x162/0x11d0 io_uring/sqpoll.c:497 io_uring_create io_uring/io_uring.c:3724 [inline] io_uring_setup+0x1728/0x3230 io_uring/io_uring.c:3806 ... Kun Hu reports that the SQPOLL creating error path has UAF, which happens if io_uring_alloc_task_context() fails and then io_sq_thread() manages to run and complete before the rest of error handling code, which means io_sq_thread_finish() is looking at already killed task. Note that this is mostly theoretical, requiring fault injection on the allocation side to trigger in practice. Cc: stable@vger.kernel.org Reported-by: Kun Hu Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0f2f1aa5729332612bd01fe0f2f385fd1f06ce7c.1735231717.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 6df5e649c413..9e5bd79fd2b5 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -405,6 +405,7 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) __cold int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct task_struct *task_to_put = NULL; int ret; /* Retain compatibility with failing for an invalid attach attempt */ @@ -480,6 +481,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, } sqd->thread = tsk; + task_to_put = get_task_struct(tsk); ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) @@ -490,11 +492,15 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, goto err; } + if (task_to_put) + put_task_struct(task_to_put); return 0; err_sqpoll: complete(&ctx->sq_data->exited); err: io_sq_thread_finish(ctx); + if (task_to_put) + 
put_task_struct(task_to_put); return ret; } From 6cc45f8c1f898570916044f606be9890d295e129 Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Wed, 27 Nov 2024 14:41:30 +0100 Subject: [PATCH 528/807] rtla/timerlat: Fix histogram ALL for zero samples rtla timerlat hist currently computes the minimum, maximum and average latency even in cases when there are zero samples. This leads to nonsensical values being calculated for maximum and minimum, and to a division by zero for the average. A similar bug was fixed by 01b05fc0e5f3 ("rtla/timerlat: Fix histogram report when a cpu count is 0") but the bug still remains for printing the sum over all CPUs in timerlat_print_stats_all. The issue can be reproduced with this command: $ rtla timerlat hist -U -d 1s Index over: count: min: avg: max: Floating point exception (core dumped) (There are never any samples with -U unless the user workload is created.) Fix the bug by omitting max/min/avg when sample count is zero, displaying a dash instead, just like we already do for the individual CPUs. The logic is moved into a new function called format_summary_value, which is used for both the individual CPUs and for the overall summary.
Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20241127134130.51171-1-tglozar@redhat.com Fixes: 1462501c7a8 ("rtla/timerlat: Add a summary for hist mode") Signed-off-by: Tomas Glozar Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/timerlat_hist.c | 177 ++++++++++++++----------- 1 file changed, 96 insertions(+), 81 deletions(-) diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 8b66387e5f35..4403cc4eba30 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -281,6 +281,21 @@ static void timerlat_hist_header(struct osnoise_tool *tool) trace_seq_reset(s); } +/* + * format_summary_value - format a line of summary value (min, max or avg) + * of hist data + */ +static void format_summary_value(struct trace_seq *seq, + int count, + unsigned long long val, + bool avg) +{ + if (count) + trace_seq_printf(seq, "%9llu ", avg ? val / count : val); + else + trace_seq_printf(seq, "%9c ", '-'); +} + /* * timerlat_print_summary - print the summary of the hist data to the output */ @@ -328,29 +343,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_irq); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].min_irq, + false); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_thread); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].min_thread, + false); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - 
data->hist[cpu].min_user); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + data->hist[cpu].min_user, + false); } trace_seq_printf(trace->seq, "\n"); @@ -364,29 +373,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_irq / data->hist[cpu].irq_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].sum_irq, + true); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_thread / data->hist[cpu].thread_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].sum_thread, + true); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_user / data->hist[cpu].user_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + data->hist[cpu].sum_user, + true); } trace_seq_printf(trace->seq, "\n"); @@ -400,29 +403,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_irq); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].max_irq, + false); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - 
trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_thread); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].max_thread, + false); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_user); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + data->hist[cpu].max_user, + false); } trace_seq_printf(trace->seq, "\n"); trace_seq_do_printf(trace->seq); @@ -506,16 +503,22 @@ timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "min: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_irq); + format_summary_value(trace->seq, + sum.irq_count, + sum.min_irq, + false); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_thread); + format_summary_value(trace->seq, + sum.thread_count, + sum.min_thread, + false); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_user); + format_summary_value(trace->seq, + sum.user_count, + sum.min_user, + false); trace_seq_printf(trace->seq, "\n"); @@ -523,16 +526,22 @@ timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "avg: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_irq / sum.irq_count); + format_summary_value(trace->seq, + sum.irq_count, + sum.sum_irq, + true); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_thread / sum.thread_count); + format_summary_value(trace->seq, + sum.thread_count, + sum.sum_thread, + true); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_user / sum.user_count); + format_summary_value(trace->seq, + sum.user_count, + sum.sum_user, + true); trace_seq_printf(trace->seq, "\n"); @@ -540,16 +549,22 @@ 
timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "max: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_irq); + format_summary_value(trace->seq, + sum.irq_count, + sum.max_irq, + false); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_thread); + format_summary_value(trace->seq, + sum.thread_count, + sum.max_thread, + false); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_user); + format_summary_value(trace->seq, + sum.user_count, + sum.max_user, + false); trace_seq_printf(trace->seq, "\n"); trace_seq_do_printf(trace->seq); From 6b830c6a023ff6e8fe05dbe47a9e5cd276df09ee Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:14 +0100 Subject: [PATCH 529/807] netlink: specs: mptcp: add missing 'server-side' attr This attribute is added with the 'created' and 'established' events, but the documentation didn't mention it. The documentation in the UAPI header has been auto-generated by: ./tools/net/ynl/ynl-regen.sh Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-1-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 6 ++++-- include/uapi/linux/mptcp_pm.h | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index dc190bf838fe..fc0603f51665 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -23,7 +23,8 @@ definitions: - name: created doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport + token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, + server-side A new MPTCP connection has been created. It is the good time to allocate memory and send ADD_ADDR if needed. 
Depending on the traffic-patterns it can take a long time until the @@ -31,7 +32,8 @@ definitions: - name: established doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport + token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, + server-side A MPTCP connection is established (can start new subflows). - name: closed diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 50589e5dd6a3..b34fd95b6f84 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -13,12 +13,13 @@ * enum mptcp_event_type * @MPTCP_EVENT_UNSPEC: unused event * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport A new MPTCP connection has been created. It is the good time - * to allocate memory and send ADD_ADDR if needed. Depending on the - * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED - * is sent. + * sport, dport, server-side A new MPTCP connection has been created. It is + * the good time to allocate memory and send ADD_ADDR if needed. Depending on + * the traffic-patterns it can take a long time until the + * MPTCP_EVENT_ESTABLISHED is sent. * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport A MPTCP connection is established (can start new subflows). + * sport, dport, server-side A MPTCP connection is established (can start new + * subflows). * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped. * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A * new address has been announced by the peer. From bea87657b5ee8e6f18af2833ee4b88212ef52d28 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:15 +0100 Subject: [PATCH 530/807] netlink: specs: mptcp: clearly mention attributes The rendered version of the MPTCP events [1] looked strange, because the whole content of the 'doc' was displayed in the same block. 
It was then not clear that the first words, not even ended by a period, were the attributes that are defined when such events are emitted. These attributes have now been moved to the end, prefixed by 'Attributes:' and ended with a period. Note that '>-' has been added after 'doc:' to allow ':' in the text below. The documentation in the UAPI header has been auto-generated by: ./tools/net/ynl/ynl-regen.sh Link: https://docs.kernel.org/networking/netlink_spec/mptcp_pm.html#event-type [1] Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-2-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 50 ++++++++++----------- include/uapi/linux/mptcp_pm.h | 53 ++++++++++++----------- 2 files changed, 52 insertions(+), 51 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index fc0603f51665..59087a230565 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -22,67 +22,67 @@ definitions: doc: unused event - name: created - doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, - server-side + doc: >- A new MPTCP connection has been created. It is the good time to allocate memory and send ADD_ADDR if needed. Depending on the traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. + Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, + dport, server-side. - name: established - doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, - server-side + doc: >- A MPTCP connection is established (can start new subflows). + Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, + dport, server-side. - name: closed - doc: - token + doc: >- A MPTCP connection has stopped. + Attribute: token. 
- name: announced value: 6 - doc: - token, rem_id, family, daddr4 | daddr6 [, dport] + doc: >- A new address has been announced by the peer. + Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. - name: removed - doc: - token, rem_id + doc: >- An address has been lost by the peer. + Attributes: token, rem_id. - name: sub-established value: 10 - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- A new subflow has been established. 'error' should not be set. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: sub-closed - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- A subflow has been closed. An error (copy of sk_err) could be set if an error has been detected for this subflow. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: sub-priority value: 13 - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- The priority of a subflow has changed. 'error' should not be set. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: listener-created value: 15 - doc: - family, sport, saddr4 | saddr6 + doc: >- A new PM listener is created. + Attributes: family, sport, saddr4 | saddr6. - name: listener-closed - doc: - family, sport, saddr4 | saddr6 + doc: >- A PM listener is closed. + Attributes: family, sport, saddr4 | saddr6. 
attribute-sets: - diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index b34fd95b6f84..84fa8a21dfd0 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -12,32 +12,33 @@ /** * enum mptcp_event_type * @MPTCP_EVENT_UNSPEC: unused event - * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side A new MPTCP connection has been created. It is - * the good time to allocate memory and send ADD_ADDR if needed. Depending on - * the traffic-patterns it can take a long time until the - * MPTCP_EVENT_ESTABLISHED is sent. - * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side A MPTCP connection is established (can start new - * subflows). - * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped. - * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A - * new address has been announced by the peer. - * @MPTCP_EVENT_REMOVED: token, rem_id An address has been lost by the peer. - * @MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error] A new - * subflow has been established. 'error' should not be set. - * @MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx [, error] A subflow has been - * closed. An error (copy of sk_err) could be set if an error has been - * detected for this subflow. - * @MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx [, error] The priority of a - * subflow has changed. 'error' should not be set. - * @MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6 A new PM - * listener is created. - * @MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6 A PM listener - * is closed. + * @MPTCP_EVENT_CREATED: A new MPTCP connection has been created. 
It is the + * good time to allocate memory and send ADD_ADDR if needed. Depending on the + * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED + * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, server-side. + * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new + * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, server-side. + * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. + * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. + * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. + * @MPTCP_EVENT_REMOVED: An address has been lost by the peer. Attributes: + * token, rem_id. + * @MPTCP_EVENT_SUB_ESTABLISHED: A new subflow has been established. 'error' + * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | + * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_SUB_CLOSED: A subflow has been closed. An error (copy of + * sk_err) could be set if an error has been detected for this subflow. + * Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + * daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_SUB_PRIORITY: The priority of a subflow has changed. 'error' + * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | + * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_LISTENER_CREATED: A new PM listener is created. Attributes: + * family, sport, saddr4 | saddr6. + * @MPTCP_EVENT_LISTENER_CLOSED: A PM listener is closed. Attributes: family, + * sport, saddr4 | saddr6. 
*/ enum mptcp_event_type { MPTCP_EVENT_UNSPEC, From 4f363fe9f6b28ed9b714cd7fe5ce880171927dab Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:16 +0100 Subject: [PATCH 531/807] netlink: specs: mptcp: fix missing doc Two operations didn't have a small description. It looks like something that has been missed in the original commit introducing this file. Replace the two "todo" by a small and simple description: Create/Destroy subflow. While at it, also uniform the capital letters, avoid double spaces, and fix the "announce" event description: a new "address" has been announced, not a new "subflow". Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-3-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index 59087a230565..dfd017780d2f 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -308,8 +308,8 @@ operations: attributes: - addr - - name: flush-addrs - doc: flush addresses + name: flush-addrs + doc: Flush addresses attribute-set: endpoint dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -353,7 +353,7 @@ operations: - addr-remote - name: announce - doc: announce new sf + doc: Announce new address attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -364,7 +364,7 @@ operations: - token - name: remove - doc: announce removal + doc: Announce removal attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -375,7 +375,7 @@ operations: - loc-id - name: subflow-create - doc: todo + doc: Create subflow attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -387,7 +387,7 @@ operations: - addr-remote - name: 
subflow-destroy - doc: todo + doc: Destroy subflow attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] From a024e377efed31ecfb39210bed562932321345b3 Mon Sep 17 00:00:00 2001 From: Antonio Pastor Date: Tue, 24 Dec 2024 20:07:20 -0500 Subject: [PATCH 532/807] net: llc: reset skb->transport_header 802.2+LLC+SNAP frames received by napi_complete_done with GRO and DSA have skb->transport_header set two bytes short, or pointing 2 bytes before network_header & skb->data. As snap_rcv expects transport_header to point to SNAP header (OID:PID) after LLC processing advances offset over LLC header (llc_rcv & llc_fixup_skb), code doesn't find a match and packet is dropped. Between napi_complete_done and snap_rcv, transport_header is not used until __netif_receive_skb_core, where originally it was being reset. Commit fda55eca5a33 ("net: introduce skb_transport_header_was_set()") only does so if not set, on the assumption the value was set correctly by GRO (and also on assumption that "network stacks usually reset the transport header anyway"). Afterwards it is moved forward by llc_fixup_skb. Locally generated traffic shows up at __netif_receive_skb_core with no transport_header set and is processed without issue. On a setup with GRO but no DSA, transport_header and network_header are both set to point to skb->data which is also correct. As issue is LLC specific, to avoid impacting non-LLC traffic, and to follow up on original assumption made on previous code change, llc_fixup_skb to reset the offset after skb pull. llc_fixup_skb assumes the LLC header is at skb->data, and by definition SNAP header immediately follows. 
Fixes: fda55eca5a33 ("net: introduce skb_transport_header_was_set()") Signed-off-by: Antonio Pastor Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241225010723.2830290-1-antonio.pastor@gmail.com Signed-off-by: Jakub Kicinski --- net/llc/llc_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 51bccfb00a9c..61b0159b2fbe 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -124,8 +124,8 @@ static inline int llc_fixup_skb(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; - skb->transport_header += llc_len; skb_pull(skb, llc_len); + skb_reset_transport_header(skb); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen; s32 data_size; From 4db3d750ac7e894278ef1cb1c53cc7d883060496 Mon Sep 17 00:00:00 2001 From: Leo Stone Date: Wed, 18 Dec 2024 10:49:57 -0800 Subject: [PATCH 533/807] nvmet: Don't overflow subsysnqn nvmet_root_discovery_nqn_store treats the subsysnqn string like a fixed size buffer, even though it is dynamically allocated to the size of the string. Create a new string with kstrndup instead of using the old buffer. 
Reported-by: syzbot+ff4aab278fa7e27e0f9e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=ff4aab278fa7e27e0f9e Fixes: 95409e277d83 ("nvmet: implement unique discovery NQN") Signed-off-by: Leo Stone Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index eeee9e9b854c..9c109b93ffbf 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -2254,12 +2254,17 @@ static ssize_t nvmet_root_discovery_nqn_store(struct config_item *item, const char *page, size_t count) { struct list_head *entry; + char *old_nqn, *new_nqn; size_t len; len = strcspn(page, "\n"); if (!len || len > NVMF_NQN_FIELD_LEN - 1) return -EINVAL; + new_nqn = kstrndup(page, len, GFP_KERNEL); + if (!new_nqn) + return -ENOMEM; + down_write(&nvmet_config_sem); list_for_each(entry, &nvmet_subsystems_group.cg_children) { struct config_item *item = @@ -2268,13 +2273,15 @@ static ssize_t nvmet_root_discovery_nqn_store(struct config_item *item, if (!strncmp(config_item_name(item), page, len)) { pr_err("duplicate NQN %s\n", config_item_name(item)); up_write(&nvmet_config_sem); + kfree(new_nqn); return -EINVAL; } } - memset(nvmet_disc_subsys->subsysnqn, 0, NVMF_NQN_FIELD_LEN); - memcpy(nvmet_disc_subsys->subsysnqn, page, len); + old_nqn = nvmet_disc_subsys->subsysnqn; + nvmet_disc_subsys->subsysnqn = new_nqn; up_write(&nvmet_config_sem); + kfree(old_nqn); return len; } From b579d6fdc3a9149bb4d2b3133cc0767130ed13e6 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 17 Dec 2024 18:33:25 -0800 Subject: [PATCH 534/807] nvmet: propagate npwg topology Ensure we propagate npwg to the target as well instead of assuming it's the same as the logical blocks per physical block. This ensures that information for devices with large IUs is properly propagated on the target.
Signed-off-by: Luis Chamberlain Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/io-cmd-bdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 0bda83d0fc3e..eaf31c823cbe 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -36,7 +36,7 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) */ id->nsfeat |= 1 << 4; /* NPWG = Namespace Preferred Write Granularity. 0's based */ - id->npwg = lpp0b; + id->npwg = to0based(bdev_io_min(bdev) / bdev_logical_block_size(bdev)); /* NPWA = Namespace Preferred Write Alignment. 0's based */ id->npwa = id->npwg; /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */ From 74d16965d7ac378d28ebd833ae6d6a097186a4ec Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 11 Dec 2024 14:28:06 +0530 Subject: [PATCH 535/807] nvmet-loop: avoid using mutex in IO hotpath Using mutex lock in IO hot path causes the kernel BUG sleeping while atomic. Shinichiro[1], first encountered this issue while running blktest nvme/052 shown below: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:585 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 996, name: (udev-worker) preempt_count: 0, expected: 0 RCU nest depth: 1, expected: 0 2 locks held by (udev-worker)/996: #0: ffff8881004570c8 (mapping.invalidate_lock){.+.+}-{3:3}, at: page_cache_ra_unbounded+0x155/0x5c0 #1: ffffffff8607eaa0 (rcu_read_lock){....}-{1:2}, at: blk_mq_flush_plug_list+0xa75/0x1950 CPU: 2 UID: 0 PID: 996 Comm: (udev-worker) Not tainted 6.12.0-rc3+ #339 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014 Call Trace: dump_stack_lvl+0x6a/0x90 __might_resched.cold+0x1f7/0x23d ? __pfx___might_resched+0x10/0x10 ? vsnprintf+0xdeb/0x18f0 __mutex_lock+0xf4/0x1220 ? nvmet_subsys_nsid_exists+0xb9/0x150 [nvmet] ? __pfx_vsnprintf+0x10/0x10 ? 
__pfx___mutex_lock+0x10/0x10 ? snprintf+0xa5/0xe0 ? xas_load+0x1ce/0x3f0 ? nvmet_subsys_nsid_exists+0xb9/0x150 [nvmet] nvmet_subsys_nsid_exists+0xb9/0x150 [nvmet] ? __pfx_nvmet_subsys_nsid_exists+0x10/0x10 [nvmet] nvmet_req_find_ns+0x24e/0x300 [nvmet] nvmet_req_init+0x694/0xd40 [nvmet] ? blk_mq_start_request+0x11c/0x750 ? nvme_setup_cmd+0x369/0x990 [nvme_core] nvme_loop_queue_rq+0x2a7/0x7a0 [nvme_loop] ? __pfx___lock_acquire+0x10/0x10 ? __pfx_nvme_loop_queue_rq+0x10/0x10 [nvme_loop] __blk_mq_issue_directly+0xe2/0x1d0 ? __pfx___blk_mq_issue_directly+0x10/0x10 ? blk_mq_request_issue_directly+0xc2/0x140 blk_mq_plug_issue_direct+0x13f/0x630 ? lock_acquire+0x2d/0xc0 ? blk_mq_flush_plug_list+0xa75/0x1950 blk_mq_flush_plug_list+0xa9d/0x1950 ? __pfx_blk_mq_flush_plug_list+0x10/0x10 ? __pfx_mpage_readahead+0x10/0x10 __blk_flush_plug+0x278/0x4d0 ? __pfx___blk_flush_plug+0x10/0x10 ? lock_release+0x460/0x7a0 blk_finish_plug+0x4e/0x90 read_pages+0x51b/0xbc0 ? __pfx_read_pages+0x10/0x10 ? lock_release+0x460/0x7a0 page_cache_ra_unbounded+0x326/0x5c0 force_page_cache_ra+0x1ea/0x2f0 filemap_get_pages+0x59e/0x17b0 ? __pfx_filemap_get_pages+0x10/0x10 ? lock_is_held_type+0xd5/0x130 ? __pfx___might_resched+0x10/0x10 ? find_held_lock+0x2d/0x110 filemap_read+0x317/0xb70 ? up_write+0x1ba/0x510 ? __pfx_filemap_read+0x10/0x10 ? inode_security+0x54/0xf0 ? selinux_file_permission+0x36d/0x420 blkdev_read_iter+0x143/0x3b0 vfs_read+0x6ac/0xa20 ? __pfx_vfs_read+0x10/0x10 ? __pfx_vm_mmap_pgoff+0x10/0x10 ? __pfx___seccomp_filter+0x10/0x10 ksys_read+0xf7/0x1d0 ? __pfx_ksys_read+0x10/0x10 do_syscall_64+0x93/0x180 ? lockdep_hardirqs_on_prepare+0x16d/0x400 ? do_syscall_64+0x9f/0x180 ? lockdep_hardirqs_on+0x78/0x100 ? do_syscall_64+0x9f/0x180 ? 
lockdep_hardirqs_on_prepare+0x16d/0x400 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f565bd1ce11 Code: 00 48 8b 15 09 90 0d 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd e8 d0 ad 01 00 f3 0f 1e fa 80 3d 35 12 0e 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec RSP: 002b:00007ffd6e7a20c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000001000 RCX: 00007f565bd1ce11 RDX: 0000000000001000 RSI: 00007f565babb000 RDI: 0000000000000014 RBP: 00007ffd6e7a2130 R08: 00000000ffffffff R09: 0000000000000000 R10: 0000556000bfa610 R11: 0000000000000246 R12: 000000003ffff000 R13: 0000556000bfa5b0 R14: 0000000000000e00 R15: 0000556000c07328 Apparently, the above issue is caused due to using mutex lock while we're in IO hot path. It's a regression caused with commit 505363957fad ("nvmet: fix nvme status code when namespace is disabled"). The mutex ->su_mutex is used to find whether a disabled nsid exists in the config group or not. This is to differentiate between a nsid that is disabled vs non-existent. To mitigate the above issue, we've worked upon a fix[2] where we now insert nsid in subsys Xarray as soon as it's created under config group and later when that nsid is enabled, we add an Xarray mark on it and set ns->enabled to true. The Xarray mark is useful while we need to loop through all enabled namespaces under a subsystem using xa_for_each_marked() API. If later a nsid is disabled then we clear Xarray mark from it and also set ns->enabled to false. It's only when nsid is deleted from the config group we delete it from the Xarray. So with this change, now we could easily differentiate a nsid is disabled (i.e. Xarray entry for ns exists but ns->enabled is set to false) vs non-existent (i.e. Xarray entry for ns doesn't exist).
Link: https://lore.kernel.org/linux-nvme/20241022070252.GA11389@lst.de/ [2] Reported-by: Shinichiro Kawasaki Closes: https://lore.kernel.org/linux-nvme/tqcy3sveity7p56v7ywp7ssyviwcb3w4623cnxj3knoobfcanq@yxgt2mjkbkam/ [1] Fixes: 505363957fad ("nvmet: fix nvme status code when namespace is disabled") Fix-suggested-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 9 +-- drivers/nvme/target/configfs.c | 12 ---- drivers/nvme/target/core.c | 108 +++++++++++++++++++------------- drivers/nvme/target/nvmet.h | 7 +++ drivers/nvme/target/pr.c | 8 +-- 5 files changed, 79 insertions(+), 65 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 2962794ce881..fa89b0549c36 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -139,7 +139,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, unsigned long idx; ctrl = req->sq->ctrl; - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { /* we don't have the right data for file backed ns */ if (!ns->bdev) continue; @@ -331,9 +331,10 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, u32 count = 0; if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) { - xa_for_each(&ctrl->subsys->namespaces, idx, ns) + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->anagrpid == grpid) desc->nsids[count++] = cpu_to_le32(ns->nsid); + } } desc->grpid = cpu_to_le32(grpid); @@ -772,7 +773,7 @@ static void nvmet_execute_identify_endgrp_list(struct nvmet_req *req) goto out; } - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->nsid <= min_endgid) continue; @@ -815,7 +816,7 @@ static void 
nvmet_execute_identify_nslist(struct nvmet_req *req, bool match_css) goto out; } - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->nsid <= min_nsid) continue; if (match_css && req->ns->csi != req->cmd->identify.csi) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 9c109b93ffbf..2b030f0efc38 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -810,18 +810,6 @@ static struct configfs_attribute *nvmet_ns_attrs[] = { NULL, }; -bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid) -{ - struct config_item *ns_item; - char name[12]; - - snprintf(name, sizeof(name), "%u", nsid); - mutex_lock(&subsys->namespaces_group.cg_subsys->su_mutex); - ns_item = config_group_find_item(&subsys->namespaces_group, name); - mutex_unlock(&subsys->namespaces_group.cg_subsys->su_mutex); - return ns_item != NULL; -} - static void nvmet_ns_release(struct config_item *item) { struct nvmet_ns *ns = to_nvmet_ns(item); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 1f4e9989663b..fde6c555af61 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -127,7 +127,7 @@ static u32 nvmet_max_nsid(struct nvmet_subsys *subsys) unsigned long idx; u32 nsid = 0; - xa_for_each(&subsys->namespaces, idx, cur) + nvmet_for_each_enabled_ns(&subsys->namespaces, idx, cur) nsid = cur->nsid; return nsid; @@ -441,11 +441,14 @@ u16 nvmet_req_find_ns(struct nvmet_req *req) struct nvmet_subsys *subsys = nvmet_req_subsys(req); req->ns = xa_load(&subsys->namespaces, nsid); - if (unlikely(!req->ns)) { + if (unlikely(!req->ns || !req->ns->enabled)) { req->error_loc = offsetof(struct nvme_common_command, nsid); - if (nvmet_subsys_nsid_exists(subsys, nsid)) - return NVME_SC_INTERNAL_PATH_ERROR; - return NVME_SC_INVALID_NS | NVME_STATUS_DNR; + if (!req->ns) /* ns doesn't exist! 
*/ + return NVME_SC_INVALID_NS | NVME_STATUS_DNR; + + /* ns exists but it's disabled */ + req->ns = NULL; + return NVME_SC_INTERNAL_PATH_ERROR; } percpu_ref_get(&req->ns->ref); @@ -583,8 +586,6 @@ int nvmet_ns_enable(struct nvmet_ns *ns) goto out_unlock; ret = -EMFILE; - if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES) - goto out_unlock; ret = nvmet_bdev_ns_enable(ns); if (ret == -ENOTBLK) @@ -599,38 +600,19 @@ int nvmet_ns_enable(struct nvmet_ns *ns) list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) nvmet_p2pmem_ns_add_p2p(ctrl, ns); - ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, - 0, GFP_KERNEL); - if (ret) - goto out_dev_put; - - if (ns->nsid > subsys->max_nsid) - subsys->max_nsid = ns->nsid; - - ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL); - if (ret) - goto out_restore_subsys_maxnsid; - if (ns->pr.enable) { ret = nvmet_pr_init_ns(ns); if (ret) - goto out_remove_from_subsys; + goto out_dev_put; } - subsys->nr_namespaces++; - nvmet_ns_changed(subsys, ns->nsid); ns->enabled = true; + xa_set_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED); ret = 0; out_unlock: mutex_unlock(&subsys->lock); return ret; - -out_remove_from_subsys: - xa_erase(&subsys->namespaces, ns->nsid); -out_restore_subsys_maxnsid: - subsys->max_nsid = nvmet_max_nsid(subsys); - percpu_ref_exit(&ns->ref); out_dev_put: list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); @@ -649,15 +631,37 @@ void nvmet_ns_disable(struct nvmet_ns *ns) goto out_unlock; ns->enabled = false; - xa_erase(&ns->subsys->namespaces, ns->nsid); - if (ns->nsid == subsys->max_nsid) - subsys->max_nsid = nvmet_max_nsid(subsys); + xa_clear_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED); list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); mutex_unlock(&subsys->lock); + if (ns->pr.enable) + nvmet_pr_exit_ns(ns); + + mutex_lock(&subsys->lock); + 
nvmet_ns_changed(subsys, ns->nsid); + nvmet_ns_dev_disable(ns); +out_unlock: + mutex_unlock(&subsys->lock); +} + +void nvmet_ns_free(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + + nvmet_ns_disable(ns); + + mutex_lock(&subsys->lock); + + xa_erase(&subsys->namespaces, ns->nsid); + if (ns->nsid == subsys->max_nsid) + subsys->max_nsid = nvmet_max_nsid(subsys); + + mutex_unlock(&subsys->lock); + /* * Now that we removed the namespaces from the lookup list, we * can kill the per_cpu ref and wait for any remaining references @@ -671,21 +675,9 @@ void nvmet_ns_disable(struct nvmet_ns *ns) wait_for_completion(&ns->disable_done); percpu_ref_exit(&ns->ref); - if (ns->pr.enable) - nvmet_pr_exit_ns(ns); - mutex_lock(&subsys->lock); - subsys->nr_namespaces--; - nvmet_ns_changed(subsys, ns->nsid); - nvmet_ns_dev_disable(ns); -out_unlock: mutex_unlock(&subsys->lock); -} - -void nvmet_ns_free(struct nvmet_ns *ns) -{ - nvmet_ns_disable(ns); down_write(&nvmet_ana_sem); nvmet_ana_group_enabled[ns->anagrpid]--; @@ -699,15 +691,33 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) { struct nvmet_ns *ns; + mutex_lock(&subsys->lock); + + if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES) + goto out_unlock; + ns = kzalloc(sizeof(*ns), GFP_KERNEL); if (!ns) - return NULL; + goto out_unlock; init_completion(&ns->disable_done); ns->nsid = nsid; ns->subsys = subsys; + if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL)) + goto out_free; + + if (ns->nsid > subsys->max_nsid) + subsys->max_nsid = nsid; + + if (xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL)) + goto out_exit; + + subsys->nr_namespaces++; + + mutex_unlock(&subsys->lock); + down_write(&nvmet_ana_sem); ns->anagrpid = NVMET_DEFAULT_ANA_GRPID; nvmet_ana_group_enabled[ns->anagrpid]++; @@ -718,6 +728,14 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->csi = NVME_CSI_NVM; return ns; +out_exit: + subsys->max_nsid = 
nvmet_max_nsid(subsys); + percpu_ref_exit(&ns->ref); +out_free: + kfree(ns); +out_unlock: + mutex_unlock(&subsys->lock); + return NULL; } static void nvmet_update_sq_head(struct nvmet_req *req) @@ -1394,7 +1412,7 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, ctrl->p2p_client = get_device(req->p2p_client); - xa_for_each(&ctrl->subsys->namespaces, idx, ns) + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) nvmet_p2pmem_ns_add_p2p(ctrl, ns); } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 58328b35dc96..7233549f7c8a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -24,6 +24,7 @@ #define NVMET_DEFAULT_VS NVME_VS(2, 1, 0) +#define NVMET_NS_ENABLED XA_MARK_1 #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 #define NVMET_NO_ERROR_LOC ((u16)-1) @@ -33,6 +34,12 @@ #define NVMET_FR_MAX_SIZE 8 #define NVMET_PR_LOG_QUEUE_SIZE 64 +#define nvmet_for_each_ns(xa, index, entry) \ + xa_for_each(xa, index, entry) + +#define nvmet_for_each_enabled_ns(xa, index, entry) \ + xa_for_each_marked(xa, index, entry, NVMET_NS_ENABLED) + /* * Supported optional AENs: */ diff --git a/drivers/nvme/target/pr.c b/drivers/nvme/target/pr.c index 90e9f5bbe581..cd22d8333314 100644 --- a/drivers/nvme/target/pr.c +++ b/drivers/nvme/target/pr.c @@ -60,7 +60,7 @@ u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask) goto success; } - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->pr.enable) WRITE_ONCE(ns->pr.notify_mask, mask); } @@ -1056,7 +1056,7 @@ int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl) * nvmet_pr_init_ns(), see more details in nvmet_ns_enable(). * So just check ns->pr.enable. 
*/ - xa_for_each(&subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&subsys->namespaces, idx, ns) { if (ns->pr.enable) { ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid, &ctrl->hostid); @@ -1067,7 +1067,7 @@ int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl) return 0; free_per_ctrl_refs: - xa_for_each(&subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&subsys->namespaces, idx, ns) { if (ns->pr.enable) { pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid); if (pc_ref) @@ -1087,7 +1087,7 @@ void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl) kfifo_free(&ctrl->pr_log_mgr.log_queue); mutex_destroy(&ctrl->pr_log_mgr.lock); - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->pr.enable) { pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid); if (pc_ref) From 36e3b1f9abe359b2bc25e81bc47b64354e42c9b1 Mon Sep 17 00:00:00 2001 From: "Chunguang.xu" Date: Tue, 3 Dec 2024 11:39:55 +0800 Subject: [PATCH 536/807] nvme-tcp: remove nvme_tcp_destroy_io_queues() Now when destroying the IO queue we call nvme_tcp_stop_io_queues() twice, nvme_tcp_destroy_io_queues() has an unnecessary call. Here we try to remove nvme_tcp_destroy_io_queues() and merge it into nvme_tcp_teardown_io_queues(), simplify the code and align with nvme-rdma, make it easy to maintain.
Signed-off-by: Chunguang.xu Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 28c76a3e1bd2..b127d41dbbfe 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2024,14 +2024,6 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) return __nvme_tcp_alloc_io_queues(ctrl); } -static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) -{ - nvme_tcp_stop_io_queues(ctrl); - if (remove) - nvme_remove_io_tag_set(ctrl); - nvme_tcp_free_io_queues(ctrl); -} - static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) { int ret, nr_queues; @@ -2176,9 +2168,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); - if (remove) + if (remove) { nvme_unquiesce_io_queues(ctrl); - nvme_tcp_destroy_io_queues(ctrl, remove); + nvme_remove_io_tag_set(ctrl); + } + nvme_tcp_free_io_queues(ctrl); } static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl, @@ -2267,7 +2261,9 @@ destroy_io: nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); - nvme_tcp_destroy_io_queues(ctrl, new); + if (new) + nvme_remove_io_tag_set(ctrl); + nvme_tcp_free_io_queues(ctrl); } destroy_admin: nvme_stop_keep_alive(ctrl); From 7a6c355b55c051eb37cb15d191241da3aa3d6cba Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Mon, 23 Dec 2024 12:44:53 +0000 Subject: [PATCH 537/807] scripts/mksysmap: Fix escape chars '$' Commit b18b047002b7 ("kbuild: change scripts/mksysmap into sed script") changed the invocation of the script, to call sed directly without shell. 
That means, the current extra escape that was added in: commit ec336aa83162 ("scripts/mksysmap: Fix badly escaped '$'") for the shell is not correct any more, at the moment the stack traces for nvhe are corrupted: [ 22.840904] kvm [190]: [] __kvm_nvhe_$x.220+0x58/0x9c [ 22.842913] kvm [190]: [] __kvm_nvhe_$x.9+0x44/0x50 [ 22.844112] kvm [190]: [] __kvm_nvhe___skip_pauth_save+0x4/0x4 With this patch: [ 25.793513] kvm [192]: nVHE call trace: [ 25.794141] kvm [192]: [] __kvm_nvhe_hyp_panic+0xb0/0xf4 [ 25.796590] kvm [192]: [] __kvm_nvhe_handle_trap+0xe4/0x188 [ 25.797553] kvm [192]: [] __kvm_nvhe___skip_pauth_save+0x4/0x4 Fixes: b18b047002b7 ("kbuild: change scripts/mksysmap into sed script") Signed-off-by: Mostafa Saleh Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- scripts/mksysmap | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/mksysmap b/scripts/mksysmap index c12723a04655..3accbdb269ac 100755 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -26,7 +26,7 @@ # (do not forget a space before each pattern) # local symbols for ARM, MIPS, etc. -/ \\$/d +/ \$/d # local labels, .LBB, .Ltmpxxx, .L__unnamed_xx, .LASANPC, etc. / \.L/d @@ -39,7 +39,7 @@ / __pi_\.L/d # arm64 local symbols in non-VHE KVM namespace -/ __kvm_nvhe_\\$/d +/ __kvm_nvhe_\$/d / __kvm_nvhe_\.L/d # lld arm/aarch64/mips thunks From bf36b4bf1b9a7a0015610e2f038ee84ddb085de2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 26 Dec 2024 00:33:35 +0900 Subject: [PATCH 538/807] modpost: fix the missed iteration for the max bit in do_input() This loop should iterate over the range from 'min' to 'max' inclusively. The last iteration is missed.
Fixes: 1d8f430c15b3 ("[PATCH] Input: add modalias support") Signed-off-by: Masahiro Yamada Tested-by: John Paul Adrian Glaubitz --- scripts/mod/file2alias.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 5b5745f00eb3..ff263c285977 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -656,7 +656,7 @@ static void do_input(char *alias, for (i = min / BITS_PER_LONG; i < max / BITS_PER_LONG + 1; i++) arr[i] = TO_NATIVE(arr[i]); - for (i = min; i < max; i++) + for (i = min; i <= max; i++) if (arr[i / BITS_PER_LONG] & (1ULL << (i%BITS_PER_LONG))) sprintf(alias + strlen(alias), "%X,*", i); } From e1352d7ead2b8803689823cd4059c1ec72609ed4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 26 Dec 2024 00:33:36 +0900 Subject: [PATCH 539/807] modpost: refactor do_vmbus_entry() Optimize the size of guid_name[], as it only requires 1 additional byte for '\0' instead of 2. Simplify the loop by incrementing the iterator by 1 instead of 2. Remove the unnecessary TO_NATIVE() call, as the guid is represented as a byte stream. Signed-off-by: Masahiro Yamada Tested-by: John Paul Adrian Glaubitz --- scripts/mod/file2alias.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index ff263c285977..2c7b76d4e8ec 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -812,15 +812,13 @@ static void do_virtio_entry(struct module *mod, void *symval) * Each byte of the guid will be represented by two hex characters * in the name. 
*/ - static void do_vmbus_entry(struct module *mod, void *symval) { - int i; DEF_FIELD_ADDR(symval, hv_vmbus_device_id, guid); - char guid_name[(sizeof(*guid) + 1) * 2]; + char guid_name[sizeof(*guid) * 2 + 1]; - for (i = 0; i < (sizeof(*guid) * 2); i += 2) - sprintf(&guid_name[i], "%02x", TO_NATIVE((guid->b)[i/2])); + for (int i = 0; i < sizeof(*guid); i++) + sprintf(&guid_name[i * 2], "%02x", guid->b[i]); module_alias_printf(mod, false, "vmbus:%s", guid_name); } From 8fe1a63d3d99d86f1bdc034505aad6fc70424737 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 26 Dec 2024 00:33:37 +0900 Subject: [PATCH 540/807] modpost: work around unaligned data access error With the latest binutils, modpost fails with a bus error on some architectures such as ARM and sparc64. Since binutils commit 1f1b5e506bf0 ("bfd/ELF: restrict file alignment for object files"), the byte offset to each section (sh_offset) in relocatable ELF is no longer guaranteed to be aligned. modpost parses MODULE_DEVICE_TABLE() data structures, which are usually located in the .rodata section. If it is not properly aligned, unaligned access errors may occur. To address the issue, this commit imports the get_unaligned() helper from include/linux/unaligned.h. The get_unaligned_native() helper caters to the endianness in addition to handling the unaligned access. I slightly refactored do_pcmcia_entry() and do_input() to avoid writing back to an unaligned address. (We would need the put_unaligned() helper to do that.) The addend_*_rel() functions need similar adjustments because the .text sections are not aligned either. It seems that the .symtab, .rel.* and .rela.* sections are still aligned. Keep normal pointer access for these sections to avoid unnecessary performance costs. 
Reported-by: Paulo Pisati Reported-by: Matthias Klose Closes: https://sourceware.org/bugzilla/show_bug.cgi?id=32435 Reported-by: John Paul Adrian Glaubitz Closes: https://sourceware.org/bugzilla/show_bug.cgi?id=32493 Signed-off-by: Masahiro Yamada Tested-by: John Paul Adrian Glaubitz --- scripts/mod/file2alias.c | 26 +++++++++++++------------- scripts/mod/modpost.c | 24 ++++++++++++------------ scripts/mod/modpost.h | 14 ++++++++++++++ 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 2c7b76d4e8ec..19ec72a69e90 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -132,7 +132,8 @@ struct devtable { * based at address m. */ #define DEF_FIELD(m, devid, f) \ - typeof(((struct devid *)0)->f) f = TO_NATIVE(*(typeof(f) *)((m) + OFF_##devid##_##f)) + typeof(((struct devid *)0)->f) f = \ + get_unaligned_native((typeof(f) *)((m) + OFF_##devid##_##f)) /* Define a variable f that holds the address of field f of struct devid * based at address m. 
Due to the way typeof works, for a field of type @@ -600,7 +601,7 @@ static void do_pnp_card_entry(struct module *mod, void *symval) static void do_pcmcia_entry(struct module *mod, void *symval) { char alias[256] = {}; - unsigned int i; + DEF_FIELD(symval, pcmcia_device_id, match_flags); DEF_FIELD(symval, pcmcia_device_id, manf_id); DEF_FIELD(symval, pcmcia_device_id, card_id); @@ -609,10 +610,6 @@ static void do_pcmcia_entry(struct module *mod, void *symval) DEF_FIELD(symval, pcmcia_device_id, device_no); DEF_FIELD_ADDR(symval, pcmcia_device_id, prod_id_hash); - for (i=0; i<4; i++) { - (*prod_id_hash)[i] = TO_NATIVE((*prod_id_hash)[i]); - } - ADD(alias, "m", match_flags & PCMCIA_DEV_ID_MATCH_MANF_ID, manf_id); ADD(alias, "c", match_flags & PCMCIA_DEV_ID_MATCH_CARD_ID, @@ -623,10 +620,14 @@ static void do_pcmcia_entry(struct module *mod, void *symval) function); ADD(alias, "pfn", match_flags & PCMCIA_DEV_ID_MATCH_DEVICE_NO, device_no); - ADD(alias, "pa", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID1, (*prod_id_hash)[0]); - ADD(alias, "pb", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID2, (*prod_id_hash)[1]); - ADD(alias, "pc", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID3, (*prod_id_hash)[2]); - ADD(alias, "pd", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID4, (*prod_id_hash)[3]); + ADD(alias, "pa", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID1, + get_unaligned_native(*prod_id_hash + 0)); + ADD(alias, "pb", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID2, + get_unaligned_native(*prod_id_hash + 1)); + ADD(alias, "pc", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID3, + get_unaligned_native(*prod_id_hash + 2)); + ADD(alias, "pd", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID4, + get_unaligned_native(*prod_id_hash + 3)); module_alias_printf(mod, true, "pcmcia:%s", alias); } @@ -654,10 +655,9 @@ static void do_input(char *alias, { unsigned int i; - for (i = min / BITS_PER_LONG; i < max / BITS_PER_LONG + 1; i++) - arr[i] = TO_NATIVE(arr[i]); for (i = min; i <= max; i++) - if (arr[i / BITS_PER_LONG] & 
(1ULL << (i%BITS_PER_LONG))) + if (get_unaligned_native(arr + i / BITS_PER_LONG) & + (1ULL << (i % BITS_PER_LONG))) sprintf(alias + strlen(alias), "%X,*", i); } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 94ee49207a45..7ea59dc4926b 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1138,9 +1138,9 @@ static Elf_Addr addend_386_rel(uint32_t *location, unsigned int r_type) { switch (r_type) { case R_386_32: - return TO_NATIVE(*location); + return get_unaligned_native(location); case R_386_PC32: - return TO_NATIVE(*location) + 4; + return get_unaligned_native(location) + 4; } return (Elf_Addr)(-1); @@ -1161,24 +1161,24 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type) switch (r_type) { case R_ARM_ABS32: case R_ARM_REL32: - inst = TO_NATIVE(*(uint32_t *)loc); + inst = get_unaligned_native((uint32_t *)loc); return inst + sym->st_value; case R_ARM_MOVW_ABS_NC: case R_ARM_MOVT_ABS: - inst = TO_NATIVE(*(uint32_t *)loc); + inst = get_unaligned_native((uint32_t *)loc); offset = sign_extend32(((inst & 0xf0000) >> 4) | (inst & 0xfff), 15); return offset + sym->st_value; case R_ARM_PC24: case R_ARM_CALL: case R_ARM_JUMP24: - inst = TO_NATIVE(*(uint32_t *)loc); + inst = get_unaligned_native((uint32_t *)loc); offset = sign_extend32((inst & 0x00ffffff) << 2, 25); return offset + sym->st_value + 8; case R_ARM_THM_MOVW_ABS_NC: case R_ARM_THM_MOVT_ABS: - upper = TO_NATIVE(*(uint16_t *)loc); - lower = TO_NATIVE(*((uint16_t *)loc + 1)); + upper = get_unaligned_native((uint16_t *)loc); + lower = get_unaligned_native((uint16_t *)loc + 1); offset = sign_extend32(((upper & 0x000f) << 12) | ((upper & 0x0400) << 1) | ((lower & 0x7000) >> 4) | @@ -1195,8 +1195,8 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type) * imm11 = lower[10:0] * imm32 = SignExtend(S:J2:J1:imm6:imm11:'0') */ - upper = TO_NATIVE(*(uint16_t *)loc); - lower = TO_NATIVE(*((uint16_t *)loc + 1)); + upper = 
get_unaligned_native((uint16_t *)loc); + lower = get_unaligned_native((uint16_t *)loc + 1); sign = (upper >> 10) & 1; j1 = (lower >> 13) & 1; @@ -1219,8 +1219,8 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type) * I2 = NOT(J2 XOR S) * imm32 = SignExtend(S:I1:I2:imm10:imm11:'0') */ - upper = TO_NATIVE(*(uint16_t *)loc); - lower = TO_NATIVE(*((uint16_t *)loc + 1)); + upper = get_unaligned_native((uint16_t *)loc); + lower = get_unaligned_native((uint16_t *)loc + 1); sign = (upper >> 10) & 1; j1 = (lower >> 13) & 1; @@ -1241,7 +1241,7 @@ static Elf_Addr addend_mips_rel(uint32_t *location, unsigned int r_type) { uint32_t inst; - inst = TO_NATIVE(*location); + inst = get_unaligned_native(location); switch (r_type) { case R_MIPS_LO16: return inst & 0xffff; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 8b72c227ebf4..ffd0a52a606e 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -65,6 +65,20 @@ #define TO_NATIVE(x) \ (target_is_big_endian == host_is_big_endian ? x : bswap(x)) +#define __get_unaligned_t(type, ptr) ({ \ + const struct { type x; } __attribute__((__packed__)) *__pptr = \ + (typeof(__pptr))(ptr); \ + __pptr->x; \ +}) + +#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) + +#define get_unaligned_native(ptr) \ +({ \ + typeof(*(ptr)) _val = get_unaligned(ptr); \ + TO_NATIVE(_val); \ +}) + #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) From 38fc96a58ce40257aec79b32e9b310c86907c63c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 28 Dec 2024 17:44:52 +0000 Subject: [PATCH 541/807] io_uring/rw: fix downgraded mshot read The io-wq path can downgrade a multishot request to oneshot mode, however io_read_mshot() doesn't handle that and would still post multiple CQEs. That's not allowed, because io_req_post_cqe() requires stricter context requirements. 
The described situation can only happen with pollable files that don't support FMODE_NOWAIT, which is an odd combination, so even if allowed it should be fairly rare. Cc: stable@vger.kernel.org Reported-by: chase xd Fixes: bee1d5becdf5b ("io_uring: disable io-wq execution of multishot NOWAIT requests") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c5c8c4a50a882fd581257b81bf52eee260ac29fd.1735407848.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/rw.c b/io_uring/rw.c index 0bcb83e4ce3c..29bb3010f9c0 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -983,6 +983,8 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) io_kbuf_recycle(req, issue_flags); if (ret < 0) req_set_fail(req); + } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + cflags = io_put_kbuf(req, ret, issue_flags); } else { /* * Any successful return value will keep the multishot read From b06a6187ef983f501e93faa56209169752d3bde3 Mon Sep 17 00:00:00 2001 From: Tanya Agarwal Date: Sun, 29 Dec 2024 11:32:42 +0530 Subject: [PATCH 542/807] ALSA: usb-audio: US16x08: Initialize array before use Initialize meter_urb array before use in mixer_us16x08.c. CID 1410197: (#1 of 1): Uninitialized scalar variable (UNINIT) uninit_use_in_call: Using uninitialized value *meter_urb when calling get_meter_levels_from_urb.
Coverity Link: https://scan7.scan.coverity.com/#/project-view/52849/11354?selectedIssue=1410197 Fixes: d2bb390a2081 ("ALSA: usb-audio: Tascam US-16x08 DSP mixer quirk") Signed-off-by: Tanya Agarwal Link: https://patch.msgid.link/20241229060240.1642-1-tanyaagarwal25699@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/mixer_us16x08.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/mixer_us16x08.c b/sound/usb/mixer_us16x08.c index 6eb7d93b358d..20ac32635f1f 100644 --- a/sound/usb/mixer_us16x08.c +++ b/sound/usb/mixer_us16x08.c @@ -687,7 +687,7 @@ static int snd_us16x08_meter_get(struct snd_kcontrol *kcontrol, struct usb_mixer_elem_info *elem = kcontrol->private_data; struct snd_usb_audio *chip = elem->head.mixer->chip; struct snd_us16x08_meter_store *store = elem->private_data; - u8 meter_urb[64]; + u8 meter_urb[64] = {0}; switch (kcontrol->private_value) { case 0: { From 31ad36a271290648e7c2288a03d7b933d20254d6 Mon Sep 17 00:00:00 2001 From: chenchangcheng Date: Fri, 20 Dec 2024 15:48:47 +0800 Subject: [PATCH 543/807] objtool: Add bch2_trans_unlocked_error() to bcachefs noreturns Fix the following objtool warning during build time: fs/bcachefs/btree_trans_commit.o: warning: objtool: bch2_trans_commit_write_locked.isra.0() falls through to next function do_bch2_trans_commit.isra.0() fs/bcachefs/btree_trans_commit.o: warning: objtool: .text: unexpected end of section ...... fs/bcachefs/btree_update.o: warning: objtool: bch2_trans_update_get_key_cache() falls through to next function flush_new_cached_update() fs/bcachefs/btree_update.o: warning: objtool: flush_new_cached_update() falls through to next function bch2_trans_update_by_path() bch2_trans_unlocked_error() is an Obviously Correct (tm) panic() wrapper, add it to the list of known noreturns. 
[ mingo: Improved the changelog ] Fixes: fd104e2967b7 ("bcachefs: bch2_trans_verify_not_unlocked()") Signed-off-by: chenchangcheng Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20241220074847.3418134-1-ccc194101@163.com --- tools/objtool/noreturns.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index f37614cc2c1b..b2174894f9f7 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -19,6 +19,7 @@ NORETURN(__x64_sys_exit_group) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) +NORETURN(bch2_trans_unlocked_error) NORETURN(cpu_bringup_and_idle) NORETURN(cpu_startup_entry) NORETURN(do_exit) From f718faf3940e95d5d34af9041f279f598396ab7d Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 17 Dec 2024 00:48:18 +0000 Subject: [PATCH 544/807] freezer, sched: Report frozen tasks as 'D' instead of 'R' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before commit: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") the frozen task stat was reported as 'D' in cgroup v1. However, after rewriting the core freezer logic, the frozen task stat is reported as 'R'. This is confusing, especially when a task with stat of 'S' is frozen. This bug can be reproduced with these steps: $ cd /sys/fs/cgroup/freezer/ $ mkdir test $ sleep 1000 & [1] 739 // task whose stat is 'S' $ echo 739 > test/cgroup.procs $ echo FROZEN > test/freezer.state $ ps -aux | grep 739 root 739 0.1 0.0 8376 1812 pts/0 R 10:56 0:00 sleep 1000 As shown above, a task whose stat is 'S' was changed to 'R' when it was frozen. To solve this regression, simply maintain the same reported state as before the rewrite. 
[ mingo: Enhanced the changelog and comments ] Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Signed-off-by: Chen Ridong Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Tejun Heo Acked-by: Michal Koutný Link: https://lore.kernel.org/r/20241217004818.3200515-1-chenridong@huaweicloud.com --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 66b311fbd5d6..64934e0830af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1637,8 +1637,9 @@ static inline unsigned int __task_state_index(unsigned int tsk_state, * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. + * Report frozen tasks as uninterruptible. */ - if (tsk_state & TASK_RTLOCK_WAIT) + if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); From dc81e556f2a017d681251ace21bf06c126d5a192 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Wed, 13 Nov 2024 09:59:34 -0800 Subject: [PATCH 545/807] x86/fred: Clear WFE in missing-ENDBRANCH #CPs An indirect branch instruction sets the CPU indirect branch tracker (IBT) into WAIT_FOR_ENDBRANCH (WFE) state and WFE stays asserted across the instruction boundary. When the decoder finds an inappropriate instruction while WFE is set ENDBR, the CPU raises a #CP fault. For the "kernel IBT no ENDBR" selftest where #CPs are deliberately triggered, the WFE state of the interrupted context needs to be cleared to let execution continue. Otherwise when the CPU resumes from the instruction that just caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU enters a dead loop. This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't set WFE. 
But FRED provides space on the entry stack (in an expanded CS area) to save and restore the WFE state, thus the WFE state is no longer clobbered, so software must clear it. Clear WFE to avoid dead looping in ibt_clear_fred_wfe() and the !ibt_fatal code path when execution is allowed to continue. Clobbering WFE in any other circumstance is a security-relevant bug. [ dhansen: changelog rewording ] Fixes: a5f6c2ace997 ("x86/shstk: Add user control-protection fault handler") Signed-off-by: Xin Li (Intel) Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Acked-by: Dave Hansen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20241113175934.3897541-1-xin%40zytor.com --- arch/x86/kernel/cet.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c index d2c732a34e5d..303bf74d175b 100644 --- a/arch/x86/kernel/cet.c +++ b/arch/x86/kernel/cet.c @@ -81,6 +81,34 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) static __ro_after_init bool ibt_fatal = true; +/* + * By definition, all missing-ENDBRANCH #CPs are a result of WFE && !ENDBR. + * + * For the kernel IBT no ENDBR selftest where #CPs are deliberately triggered, + * the WFE state of the interrupted context needs to be cleared to let execution + * continue. Otherwise when the CPU resumes from the instruction that just + * caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU + * enters a dead loop. + * + * This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't + * set WFE. But FRED provides space on the entry stack (in an expanded CS area) + * to save and restore the WFE state, thus the WFE state is no longer clobbered, + * so software must clear it. + */ +static void ibt_clear_fred_wfe(struct pt_regs *regs) +{ + /* + * No need to do any FRED checks. 
+ * + * For IDT event delivery, the high-order 48 bits of CS are pushed + * as 0s into the stack, and later IRET ignores these bits. + * + * For FRED, a test to check if fred_cs.wfe is set would be dropped + * by compilers. + */ + regs->fred_cs.wfe = 0; +} + static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) { if ((error_code & CP_EC) != CP_ENDBR) { @@ -90,6 +118,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) { regs->ax = 0; + ibt_clear_fred_wfe(regs); return; } @@ -97,6 +126,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) if (!ibt_fatal) { printk(KERN_DEFAULT CUT_HERE); __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + ibt_clear_fred_wfe(regs); return; } BUG(); From 27834971f616c5e154423c578fa95e0444444ce1 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 19 Jun 2024 19:18:01 +0800 Subject: [PATCH 546/807] virt: tdx-guest: Just leak decrypted memory on unrecoverable errors In CoCo VMs it is possible for the untrusted host to cause set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. Leak the decrypted memory when set_memory_decrypted() fails, and don't need to print an error since set_memory_decrypted() will call WARN_ONCE(). Fixes: f4738f56d1dc ("virt: tdx-guest: Add Quote generation support using TSM_REPORTS") Signed-off-by: Li RongQing Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Reviewed-by: Rick Edgecombe Reviewed-by: Kirill A. 
Shutemov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240619111801.25630-1-lirongqing%40baidu.com --- drivers/virt/coco/tdx-guest/tdx-guest.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/virt/coco/tdx-guest/tdx-guest.c b/drivers/virt/coco/tdx-guest/tdx-guest.c index d7db6c824e13..224e7dde9cde 100644 --- a/drivers/virt/coco/tdx-guest/tdx-guest.c +++ b/drivers/virt/coco/tdx-guest/tdx-guest.c @@ -124,10 +124,8 @@ static void *alloc_quote_buf(void) if (!addr) return NULL; - if (set_memory_decrypted((unsigned long)addr, count)) { - free_pages_exact(addr, len); + if (set_memory_decrypted((unsigned long)addr, count)) return NULL; - } return addr; } From 032fe9b0516702599c2dd990a4703f783d5716b8 Mon Sep 17 00:00:00 2001 From: Mingcong Bai Date: Thu, 26 Dec 2024 14:22:05 +0800 Subject: [PATCH 547/807] platform/x86: hp-wmi: mark 8A15 board for timed OMEN thermal profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HP OMEN 8 (2022), corresponding to a board ID of 8A15, supports OMEN thermal profile and requires the timed profile quirk. Upon adding this ID to both the omen_thermal_profile_boards and omen_timed_thermal_profile_boards, significant bump in performance can be observed. For instance, SilverBench (https://silver.urih.com/) results improved from ~56,000 to ~69,000, as a result of higher power draws (and thus core frequencies) whilst under load: Package Power: - Before the patch: ~65W (dropping to about 55W under sustained load). - After the patch: ~115W (dropping to about 105W under sustained load). Core Power: - Before: ~60W (ditto above). - After: ~108W (ditto above). Add 8A15 to omen_thermal_profile_boards and omen_timed_thermal_profile_boards to improve performance. 
Signed-off-by: Xi Xiao <1577912515@qq.com> Signed-off-by: Mingcong Bai Link: https://lore.kernel.org/r/20241226062207.3352629-1-jeffbai@aosc.io Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-wmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index 81ccc96ffe40..20c55bab3b8c 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -64,7 +64,7 @@ static const char * const omen_thermal_profile_boards[] = { "874A", "8603", "8604", "8748", "886B", "886C", "878A", "878B", "878C", "88C8", "88CB", "8786", "8787", "8788", "88D1", "88D2", "88F4", "88FD", "88F5", "88F6", "88F7", "88FE", "88FF", "8900", "8901", "8902", "8912", - "8917", "8918", "8949", "894A", "89EB", "8BAD", "8A42" + "8917", "8918", "8949", "894A", "89EB", "8BAD", "8A42", "8A15" }; /* DMI Board names of Omen laptops that are specifically set to be thermal @@ -80,7 +80,7 @@ static const char * const omen_thermal_profile_force_v0_boards[] = { * "balanced" when reaching zero. */ static const char * const omen_timed_thermal_profile_boards[] = { - "8BAD", "8A42" + "8BAD", "8A42", "8A15" }; /* DMI Board names of Victus laptops */ From 7e16ae558a87ac9099b6a93a43f19b42d809fd78 Mon Sep 17 00:00:00 2001 From: Vishnu Sankar Date: Sat, 28 Dec 2024 08:18:40 +0900 Subject: [PATCH 548/807] platform/x86: thinkpad-acpi: Add support for hotkey 0x1401 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit F8 mode key on Lenovo 2025 platforms use a different key code. Adding support for the new keycode 0x1401. Tested on X1 Carbon Gen 13 and X1 2-in-1 Gen 10. 
Signed-off-by: Vishnu Sankar Reviewed-by: Mark Pearson Link: https://lore.kernel.org/r/20241227231840.21334-1-vishnuocv@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- Documentation/admin-guide/laptops/thinkpad-acpi.rst | 10 +++++++--- drivers/platform/x86/thinkpad_acpi.c | 4 +++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/laptops/thinkpad-acpi.rst b/Documentation/admin-guide/laptops/thinkpad-acpi.rst index 7f674a6cfa8a..4ab0fef7d440 100644 --- a/Documentation/admin-guide/laptops/thinkpad-acpi.rst +++ b/Documentation/admin-guide/laptops/thinkpad-acpi.rst @@ -445,8 +445,10 @@ event code Key Notes 0x1008 0x07 FN+F8 IBM: toggle screen expand Lenovo: configure UltraNav, or toggle screen expand. - On newer platforms (2024+) - replaced by 0x131f (see below) + On 2024 platforms replaced by + 0x131f (see below) and on newer + platforms (2025 +) keycode is + replaced by 0x1401 (see below). 0x1009 0x08 FN+F9 - @@ -506,9 +508,11 @@ event code Key Notes 0x1019 0x18 unknown -0x131f ... FN+F8 Platform Mode change. +0x131f ... FN+F8 Platform Mode change (2024 systems). Implemented in driver. +0x1401 ... FN+F8 Platform Mode change (2025 + systems). + Implemented in driver. ... ... ... 
0x1020 0x1F unknown diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 6371a9f765c1..2cfb2ac3f465 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -184,7 +184,8 @@ enum tpacpi_hkey_event_t { */ TP_HKEY_EV_AMT_TOGGLE = 0x131a, /* Toggle AMT on/off */ TP_HKEY_EV_DOUBLETAP_TOGGLE = 0x131c, /* Toggle trackpoint doubletap on/off */ - TP_HKEY_EV_PROFILE_TOGGLE = 0x131f, /* Toggle platform profile */ + TP_HKEY_EV_PROFILE_TOGGLE = 0x131f, /* Toggle platform profile in 2024 systems */ + TP_HKEY_EV_PROFILE_TOGGLE2 = 0x1401, /* Toggle platform profile in 2025 + systems */ /* Reasons for waking up from S3/S4 */ TP_HKEY_EV_WKUP_S3_UNDOCK = 0x2304, /* undock requested, S3 */ @@ -11200,6 +11201,7 @@ static bool tpacpi_driver_event(const unsigned int hkey_event) tp_features.trackpoint_doubletap = !tp_features.trackpoint_doubletap; return true; case TP_HKEY_EV_PROFILE_TOGGLE: + case TP_HKEY_EV_PROFILE_TOGGLE2: platform_profile_cycle(); return true; } From fc033cf25e612e840e545f8d5ad2edd6ba613ed5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Dec 2024 13:15:45 -0800 Subject: [PATCH 549/807] Linux 6.13-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5c9b1d2d59b4..48e89108aa58 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Baby Opossum Posse # *DOCUMENTATION* From 6a451e2c5c03e27aa3ec36be424fccaa286c3ccd Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Mon, 30 Dec 2024 14:49:10 +0800 Subject: [PATCH 550/807] ALSA: hda/tas2781: Ignore SUBSYS_ID not found for tas2563 projects Driver will return error if no SUBSYS_ID found in BIOS(acpi). It will cause error in tas2563 projects, which have no SUBSYS_ID. 
Fixes: 4e7035a75da9 ("ALSA: hda/tas2781: Add speaker id check for ASUS projects") Signed-off-by: Baojun Xu Link: https://lore.kernel.org/20241223225442.1358491-1-stuart.a.hayhurst@gmail.com Link: https://patch.msgid.link/20241230064910.1583-1-baojun.xu@ti.com Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_i2c.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/tas2781_hda_i2c.c b/sound/pci/hda/tas2781_hda_i2c.c index 0af015806aba..0e42b87dadb8 100644 --- a/sound/pci/hda/tas2781_hda_i2c.c +++ b/sound/pci/hda/tas2781_hda_i2c.c @@ -142,6 +142,9 @@ static int tas2781_read_acpi(struct tasdevice_priv *p, const char *hid) } sub = acpi_get_subsystem_id(ACPI_HANDLE(physdev)); if (IS_ERR(sub)) { + /* No subsys id in older tas2563 projects. */ + if (!strncmp(hid, "INT8866", sizeof("INT8866"))) + goto end_2563; dev_err(p->dev, "Failed to get SUBSYS ID.\n"); ret = PTR_ERR(sub); goto err; @@ -164,6 +167,7 @@ static int tas2781_read_acpi(struct tasdevice_priv *p, const char *hid) p->speaker_id = NULL; } +end_2563: acpi_dev_free_resource_list(&resources); strscpy(p->dev_name, hid, sizeof(p->dev_name)); put_device(physdev); From ac9fae799eda81e24bbf2e0d5cb9e5c33fc9bdcb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 29 Dec 2024 09:39:13 +0100 Subject: [PATCH 551/807] ALSA: compress_offload: Drop unneeded no_free_ptr() The error path for memdup_user() no longer needs the tricky wrap with no_free_ptr() and we can safely return the error pointer directly. 
Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412290846.cncnpGaw-lkp@intel.com/ Link: https://patch.msgid.link/20241229083917.14912-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/compress_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index edf5aadf38e5..4ed6cec5fd5c 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1077,7 +1077,7 @@ static int snd_compr_task_create(struct snd_compr_stream *stream, unsigned long return -EPERM; task = memdup_user((void __user *)arg, sizeof(*task)); if (IS_ERR(task)) - return PTR_ERR(no_free_ptr(task)); + return PTR_ERR(task); retval = snd_compr_task_new(stream, task); if (retval >= 0) if (copy_to_user((void __user *)arg, task, sizeof(*task))) @@ -1138,7 +1138,7 @@ static int snd_compr_task_start_ioctl(struct snd_compr_stream *stream, unsigned return -EPERM; task = memdup_user((void __user *)arg, sizeof(*task)); if (IS_ERR(task)) - return PTR_ERR(no_free_ptr(task)); + return PTR_ERR(task); retval = snd_compr_task_start(stream, task); if (retval >= 0) if (copy_to_user((void __user *)arg, task, sizeof(*task))) @@ -1229,7 +1229,7 @@ static int snd_compr_task_status_ioctl(struct snd_compr_stream *stream, unsigned return -EPERM; status = memdup_user((void __user *)arg, sizeof(*status)); if (IS_ERR(status)) - return PTR_ERR(no_free_ptr(status)); + return PTR_ERR(status); retval = snd_compr_task_status(stream, status); if (retval >= 0) if (copy_to_user((void __user *)arg, status, sizeof(*status))) From 7439b395211874e20c24b2fe0e4903864357a3f5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Dec 2024 18:52:32 +0000 Subject: [PATCH 552/807] ALSA: compress_offload: fix remaining descriptor races in sound/core/compress_offload.c 3d3f43fab4cf ("ALSA: compress_offload: improve file 
descriptors installation for dma-buf") fixed some of descriptor races in snd_compr_task_new(), but there's a couple more left. We need to grab the references to dmabuf before moving them into descriptor table - trying to do that by descriptor afterwards might end up getting a different object, with a dangling reference left in task->{input,output} Fixes: 3d3f43fab4cf ("ALSA: compress_offload: improve file descriptors installation for dma-buf") Signed-off-by: Al Viro Link: https://patch.msgid.link/20241229185232.GA1977892@ZenIV Signed-off-by: Takashi Iwai --- sound/core/compress_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 4ed6cec5fd5c..840bb9cfe789 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1053,13 +1053,13 @@ static int snd_compr_task_new(struct snd_compr_stream *stream, struct snd_compr_ put_unused_fd(fd_i); goto cleanup; } + /* keep dmabuf reference until freed with task free ioctl */ + get_dma_buf(task->input); + get_dma_buf(task->output); fd_install(fd_i, task->input->file); fd_install(fd_o, task->output->file); utask->input_fd = fd_i; utask->output_fd = fd_o; - /* keep dmabuf reference until freed with task free ioctl */ - dma_buf_get(utask->input_fd); - dma_buf_get(utask->output_fd); list_add_tail(&task->list, &stream->runtime->tasks); stream->runtime->total_tasks++; return 0; From 0179488ca992d79908b8e26b9213f1554fc5bacc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 30 Dec 2024 12:05:35 +0100 Subject: [PATCH 553/807] ALSA: seq: oss: Fix races at processing SysEx messages OSS sequencer handles the SysEx messages split in 6 bytes packets, and ALSA sequencer OSS layer tries to combine those. It stores the data in the internal buffer and this access is racy as of now, which may lead to the out-of-bounds access. 
As a temporary band-aid fix, introduce a mutex for serializing the process of the SysEx message packets. Reported-by: Kun Hu Closes: https://lore.kernel.org/2B7E93E4-B13A-4AE4-8E87-306A8EE9BBB7@m.fudan.edu.cn Cc: Link: https://patch.msgid.link/20241230110543.32454-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/oss/seq_oss_synth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/core/seq/oss/seq_oss_synth.c b/sound/core/seq/oss/seq_oss_synth.c index e3394919daa0..51ee4c00a843 100644 --- a/sound/core/seq/oss/seq_oss_synth.c +++ b/sound/core/seq/oss/seq_oss_synth.c @@ -66,6 +66,7 @@ static struct seq_oss_synth midi_synth_dev = { }; static DEFINE_SPINLOCK(register_lock); +static DEFINE_MUTEX(sysex_mutex); /* * prototypes @@ -497,6 +498,7 @@ snd_seq_oss_synth_sysex(struct seq_oss_devinfo *dp, int dev, unsigned char *buf, if (!info) return -ENXIO; + guard(mutex)(&sysex_mutex); sysex = info->sysex; if (sysex == NULL) { sysex = kzalloc(sizeof(*sysex), GFP_KERNEL); From abbff41b6932cde359589fd51f4024b7c85f366b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 30 Dec 2024 12:40:22 +0100 Subject: [PATCH 554/807] Revert "ALSA: ump: Don't enumeration invalid groups for legacy rawmidi" This reverts commit c2d188e137e77294323132a760a4608321a36a70. Although it's fine to filter the invalid UMP groups at the first probe time, this will become a problem when UMP groups are updated and (re-)activated. Then there is no way to re-add the substreams properly for the legacy rawmidi, and the new active groups will be still invisible. So let's revert the change. This will move back to showing the full 16 groups, but it's better than forever lost. 
Link: https://patch.msgid.link/20241230114023.3787-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/ump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/ump.c b/sound/core/ump.c index fe4d39ae1159..9198bff4768c 100644 --- a/sound/core/ump.c +++ b/sound/core/ump.c @@ -1244,7 +1244,7 @@ static int fill_legacy_mapping(struct snd_ump_endpoint *ump) num = 0; for (i = 0; i < SNDRV_UMP_MAX_GROUPS; i++) - if ((group_maps & (1U << i)) && ump->groups[i].valid) + if (group_maps & (1U << i)) ump->legacy_mapping[num++] = i; return num; From a10f26062a9973c38c0a11ea91757f9228e200f2 Mon Sep 17 00:00:00 2001 From: Chun-Kuang Hu Date: Mon, 23 Dec 2024 15:12:18 +0000 Subject: [PATCH 555/807] Revert "drm/mediatek: Switch to for_each_child_of_node_scoped()" This reverts commit fd620fc25d88a1e490eaa9f72bc31962be1b4741. Boot failures reported by KernelCI: [ 4.395400] mediatek-drm mediatek-drm.5.auto: bound 1c014000.merge (ops 0xffffd35fd12975f8) [ 4.396155] mediatek-drm mediatek-drm.5.auto: bound 1c000000.ovl (ops 0xffffd35fd12977b8) [ 4.411951] mediatek-drm mediatek-drm.5.auto: bound 1c002000.rdma (ops 0xffffd35fd12989c0) [ 4.536837] mediatek-drm mediatek-drm.5.auto: bound 1c004000.ccorr (ops 0xffffd35fd1296cf0) [ 4.545181] mediatek-drm mediatek-drm.5.auto: bound 1c005000.aal (ops 0xffffd35fd1296a80) [ 4.553344] mediatek-drm mediatek-drm.5.auto: bound 1c006000.gamma (ops 0xffffd35fd12972b0) [ 4.561680] mediatek-drm mediatek-drm.5.auto: bound 1c014000.merge (ops 0xffffd35fd12975f8) [ 4.570025] ------------[ cut here ]------------ [ 4.574630] refcount_t: underflow; use-after-free. 
[ 4.579416] WARNING: CPU: 6 PID: 81 at lib/refcount.c:28 refcount_warn_saturate+0xf4/0x148 [ 4.587670] Modules linked in: [ 4.590714] CPU: 6 UID: 0 PID: 81 Comm: kworker/u32:3 Tainted: G W 6.12.0 #1 cab58e2e59020ebd4be8ada89a65f465a316c742 [ 4.602695] Tainted: [W]=WARN [ 4.605649] Hardware name: Acer Tomato (rev2) board (DT) [ 4.610947] Workqueue: events_unbound deferred_probe_work_func [ 4.616768] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 4.623715] pc : refcount_warn_saturate+0xf4/0x148 [ 4.628493] lr : refcount_warn_saturate+0xf4/0x148 [ 4.633270] sp : ffff8000807639c0 [ 4.636571] x29: ffff8000807639c0 x28: ffff34ff4116c640 x27: ffff34ff4368e080 [ 4.643693] x26: ffffd35fd1299ac8 x25: ffff34ff46c8c410 x24: 0000000000000000 [ 4.650814] x23: ffff34ff4368e080 x22: 00000000fffffdfb x21: 0000000000000002 [ 4.657934] x20: ffff34ff470c6000 x19: ffff34ff410c7c10 x18: 0000000000000006 [ 4.665055] x17: 666678302073706f x16: 2820656772656d2e x15: ffff800080763440 [ 4.672176] x14: 0000000000000000 x13: 2e656572662d7265 x12: ffffd35fd2ed14f0 [ 4.679297] x11: 0000000000000001 x10: 0000000000000001 x9 : ffffd35fd0342150 [ 4.686418] x8 : c0000000ffffdfff x7 : ffffd35fd2e21450 x6 : 00000000000affa8 [ 4.693539] x5 : ffffd35fd2ed1498 x4 : 0000000000000000 x3 : 0000000000000000 [ 4.700660] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff34ff40932580 [ 4.707781] Call trace: [ 4.710216] refcount_warn_saturate+0xf4/0x148 (P) [ 4.714993] refcount_warn_saturate+0xf4/0x148 (L) [ 4.719772] kobject_put+0x110/0x118 [ 4.723335] put_device+0x1c/0x38 [ 4.726638] mtk_drm_bind+0x294/0x5c0 [ 4.730289] try_to_bring_up_aggregate_device+0x16c/0x1e0 [ 4.735673] __component_add+0xbc/0x1c0 [ 4.739495] component_add+0x1c/0x30 [ 4.743058] mtk_disp_rdma_probe+0x140/0x210 [ 4.747314] platform_probe+0x70/0xd0 [ 4.750964] really_probe+0xc4/0x2a8 [ 4.754527] __driver_probe_device+0x80/0x140 [ 4.758870] driver_probe_device+0x44/0x120 [ 4.763040] __device_attach_driver+0xc0/0x108 [ 
4.767470] bus_for_each_drv+0x8c/0xf0 [ 4.771294] __device_attach+0xa4/0x198 [ 4.775117] device_initial_probe+0x1c/0x30 [ 4.779286] bus_probe_device+0xb4/0xc0 [ 4.783109] deferred_probe_work_func+0xb0/0x100 [ 4.787714] process_one_work+0x18c/0x420 [ 4.791712] worker_thread+0x30c/0x418 [ 4.795449] kthread+0x128/0x138 [ 4.798665] ret_from_fork+0x10/0x20 [ 4.802229] ---[ end trace 0000000000000000 ]--- Fixes: fd620fc25d88 ("drm/mediatek: Switch to for_each_child_of_node_scoped()") Cc: stable@vger.kernel.org Cc: Javier Carrasco Reported-by: Sasha Levin Closes: https://lore.kernel.org/lkml/Z0lNHdwQ3rODHQ2c@sashalap/T/#mfaa6343cfd4d59aae5912b095c0693c0553e746c Link: https://patchwork.kernel.org/project/dri-devel/patch/20241223151218.7958-1-chunkuang.hu@kernel.org/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index 0062374f75d5..11935cf2b39e 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -373,11 +373,12 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) struct mtk_drm_private *temp_drm_priv; struct device_node *phandle = dev->parent->of_node; const struct of_device_id *of_id; + struct device_node *node; struct device *drm_dev; unsigned int cnt = 0; int i, j; - for_each_child_of_node_scoped(phandle->parent, node) { + for_each_child_of_node(phandle->parent, node) { struct platform_device *pdev; of_id = of_match_node(mtk_drm_of_ids, node); @@ -406,8 +407,10 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) if (temp_drm_priv->mtk_drm_bound) cnt++; - if (cnt == MAX_CRTC) + if (cnt == MAX_CRTC) { + of_node_put(node); break; + } } if (drm_priv->data->mmsys_dev_num == cnt) { From ef24fbd8f12015ff827973fffefed3902ffd61cc Mon Sep 17 00:00:00 2001 From: Liankun Yang Date: Fri, 25 Oct 2024 16:28:27 +0800 Subject: [PATCH 556/807] 
drm/mediatek: Fix YCbCr422 color format issue for DP Setting up misc0 for Pixel Encoding Format. According to the definition of YCbCr in spec 1.2a Table 2-96, 0x1 << 1 should be written to the register. Use switch case to distinguish RGB, YCbCr422, and unsupported color formats. Fixes: f70ac097a2cf ("drm/mediatek: Add MT8195 Embedded DisplayPort driver") Signed-off-by: Liankun Yang Link: https://patchwork.kernel.org/project/dri-devel/patch/20241025083036.8829-2-liankun.yang@mediatek.com/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_dp.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_dp.c b/drivers/gpu/drm/mediatek/mtk_dp.c index 1cc916b16471..baa799f10e3b 100644 --- a/drivers/gpu/drm/mediatek/mtk_dp.c +++ b/drivers/gpu/drm/mediatek/mtk_dp.c @@ -543,18 +543,16 @@ static int mtk_dp_set_color_format(struct mtk_dp *mtk_dp, enum dp_pixelformat color_format) { u32 val; - - /* update MISC0 */ - mtk_dp_update_bits(mtk_dp, MTK_DP_ENC0_P0_3034, - color_format << DP_TEST_COLOR_FORMAT_SHIFT, - DP_TEST_COLOR_FORMAT_MASK); + u32 misc0_color; switch (color_format) { case DP_PIXELFORMAT_YUV422: val = PIXEL_ENCODE_FORMAT_DP_ENC0_P0_YCBCR422; + misc0_color = DP_COLOR_FORMAT_YCbCr422; break; case DP_PIXELFORMAT_RGB: val = PIXEL_ENCODE_FORMAT_DP_ENC0_P0_RGB; + misc0_color = DP_COLOR_FORMAT_RGB; break; default: drm_warn(mtk_dp->drm_dev, "Unsupported color format: %d\n", @@ -562,6 +560,11 @@ static int mtk_dp_set_color_format(struct mtk_dp *mtk_dp, return -EINVAL; } + /* update MISC0 */ + mtk_dp_update_bits(mtk_dp, MTK_DP_ENC0_P0_3034, + misc0_color, + DP_TEST_COLOR_FORMAT_MASK); + mtk_dp_update_bits(mtk_dp, MTK_DP_ENC0_P0_303C, val, PIXEL_ENCODE_FORMAT_DP_ENC0_P0_MASK); return 0; From 0d68b55887cedc7487036ed34cb4c2097c4228f1 Mon Sep 17 00:00:00 2001 From: Liankun Yang Date: Fri, 25 Oct 2024 16:28:28 +0800 Subject: [PATCH 557/807] drm/mediatek: Fix mode valid issue for dp Fix dp mode valid issue to avoid 
abnormal display of limit state. After DP passes link training, it can express the lane count of the current link status is good. Calculate the maximum bandwidth supported by DP using the current lane count. The color format will select the best one based on the bandwidth requirements of the current timing mode. If the current timing mode uses RGB and meets the DP link bandwidth requirements, RGB will be used. If the timing mode uses RGB but does not meet the DP link bandwidth requirements, it will continue to check whether YUV422 meets the DP link bandwidth. FEC overhead is approximately 2.4% from DP 1.4a spec 2.2.1.4.2. The down-spread amplitude shall either be disabled (0.0%) or up to 0.5% from 1.4a 3.5.2.6. Add up to approximately 3% total overhead. Because rate is already divided by 10, mode->clock does not need to be multiplied by 10. Fixes: f70ac097a2cf ("drm/mediatek: Add MT8195 Embedded DisplayPort driver") Signed-off-by: Liankun Yang Link: https://patchwork.kernel.org/project/dri-devel/patch/20241025083036.8829-3-liankun.yang@mediatek.com/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_dp.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_dp.c b/drivers/gpu/drm/mediatek/mtk_dp.c index baa799f10e3b..e945c1204837 100644 --- a/drivers/gpu/drm/mediatek/mtk_dp.c +++ b/drivers/gpu/drm/mediatek/mtk_dp.c @@ -2411,12 +2411,19 @@ mtk_dp_bridge_mode_valid(struct drm_bridge *bridge, { struct mtk_dp *mtk_dp = mtk_dp_from_bridge(bridge); u32 bpp = info->color_formats & DRM_COLOR_FORMAT_YCBCR422 ?
16 : 24; - u32 rate = min_t(u32, drm_dp_max_link_rate(mtk_dp->rx_cap) * - drm_dp_max_lane_count(mtk_dp->rx_cap), - drm_dp_bw_code_to_link_rate(mtk_dp->max_linkrate) * - mtk_dp->max_lanes); + u32 lane_count_min = mtk_dp->train_info.lane_count; + u32 rate = drm_dp_bw_code_to_link_rate(mtk_dp->train_info.link_rate) * + lane_count_min; - if (rate < mode->clock * bpp / 8) + /* + *FEC overhead is approximately 2.4% from DP 1.4a spec 2.2.1.4.2. + *The down-spread amplitude shall either be disabled (0.0%) or up + *to 0.5% from 1.4a 3.5.2.6. Add up to approximately 3% total overhead. + * + *Because rate is already divided by 10, + *mode->clock does not need to be multiplied by 10 + */ + if ((rate * 97 / 100) < (mode->clock * bpp / 8)) return MODE_CLOCK_HIGH; return MODE_OK; @@ -2457,10 +2464,9 @@ static u32 *mtk_dp_bridge_atomic_get_input_bus_fmts(struct drm_bridge *bridge, struct drm_display_mode *mode = &crtc_state->adjusted_mode; struct drm_display_info *display_info = &conn_state->connector->display_info; - u32 rate = min_t(u32, drm_dp_max_link_rate(mtk_dp->rx_cap) * - drm_dp_max_lane_count(mtk_dp->rx_cap), - drm_dp_bw_code_to_link_rate(mtk_dp->max_linkrate) * - mtk_dp->max_lanes); + u32 lane_count_min = mtk_dp->train_info.lane_count; + u32 rate = drm_dp_bw_code_to_link_rate(mtk_dp->train_info.link_rate) * + lane_count_min; *num_input_fmts = 0; @@ -2469,8 +2475,8 @@ static u32 *mtk_dp_bridge_atomic_get_input_bus_fmts(struct drm_bridge *bridge, * datarate of YUV422 and sink device supports YUV422, we output YUV422 * format. Use this condition, we can support more resolution. 
*/ - if ((rate < (mode->clock * 24 / 8)) && - (rate > (mode->clock * 16 / 8)) && + if (((rate * 97 / 100) < (mode->clock * 24 / 8)) && + ((rate * 97 / 100) > (mode->clock * 16 / 8)) && (display_info->color_formats & DRM_COLOR_FORMAT_YCBCR422)) { input_fmts = kcalloc(1, sizeof(*input_fmts), GFP_KERNEL); if (!input_fmts) From cc0dc9e871a91aadf5b26a2d7760fb762e0d9203 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Le=20Goffic?= Date: Wed, 18 Dec 2024 10:22:27 +0100 Subject: [PATCH 558/807] watchdog: stm32_iwdg: fix error message during driver probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 3ab1663af6c1 ("watchdog: stm32_iwdg: Add pretimeout support") introduces the support for the pre-timeout interrupt. The support for this interrupt is optional but the driver uses the platform_get_irq() which produces an error message during the driver probe if we don't have any `interrupts` property in the DT. Use the platform_get_irq_optional() API to get rid of the error message as this property is optional. 
Fixes: 3ab1663af6c1 ("watchdog: stm32_iwdg: Add pretimeout support") Signed-off-by: Clément Le Goffic Reviewed-by: Marek Vasut Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20241218092227.771133-1-clement.legoffic@foss.st.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/stm32_iwdg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/stm32_iwdg.c b/drivers/watchdog/stm32_iwdg.c index d700e0d49bb9..8ad06b54c5ad 100644 --- a/drivers/watchdog/stm32_iwdg.c +++ b/drivers/watchdog/stm32_iwdg.c @@ -286,7 +286,7 @@ static int stm32_iwdg_irq_init(struct platform_device *pdev, if (!wdt->data->has_early_wakeup) return 0; - irq = platform_get_irq(pdev, 0); + irq = platform_get_irq_optional(pdev, 0); if (irq <= 0) return 0; From 8fe3ee95da1bf42830e9b02c70f111b53ab65229 Mon Sep 17 00:00:00 2001 From: Fei Shao Date: Tue, 5 Nov 2024 17:00:28 +0800 Subject: [PATCH 559/807] dt-bindings: display: mediatek: dp: Reference common DAI properties The MediaTek DP hardware supports audio and exposes a DAI, so the '#sound-dai-cells' property is needed for describing the DAI links. Reference the dai-common.yaml schema to allow '#sound-dai-cells' to be used, and filter out non-DP compatibles as MediaTek eDP in the same binding doesn't support audio. 
This fixes dtbs_check error: '#sound-dai-cells' does not match any of the regexes: 'pinctrl-[0-9]+' Signed-off-by: Fei Shao Reviewed-by: Rob Herring (Arm) Link: https://patchwork.kernel.org/project/dri-devel/patch/20241105090207.3892242-1-fshao@chromium.org/ Signed-off-by: Chun-Kuang Hu --- .../display/mediatek/mediatek,dp.yaml | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,dp.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,dp.yaml index 2aef1eb32e11..75ce92f4a5fd 100644 --- a/Documentation/devicetree/bindings/display/mediatek/mediatek,dp.yaml +++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,dp.yaml @@ -42,6 +42,9 @@ properties: interrupts: maxItems: 1 + '#sound-dai-cells': + const: 0 + ports: $ref: /schemas/graph.yaml#/properties/ports properties: @@ -85,7 +88,21 @@ required: - ports - max-linkrate-mhz -additionalProperties: false +allOf: + - $ref: /schemas/sound/dai-common.yaml# + - if: + not: + properties: + compatible: + contains: + enum: + - mediatek,mt8188-dp-tx + - mediatek,mt8195-dp-tx + then: + properties: + '#sound-dai-cells': false + +unevaluatedProperties: false examples: - | From 76aed5e00ff2625e0ec4b40c75f3514bdb27fae4 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Thu, 19 Dec 2024 12:27:33 +0100 Subject: [PATCH 560/807] drm/mediatek: mtk_dsi: Add registers to pdata to fix MT8186/MT8188 Registers DSI_VM_CMD and DSI_SHADOW_DEBUG start at different addresses in both MT8186 and MT8188 compared to the older IPs. Add two members in struct mtk_dsi_driver_data to specify the offsets for these two registers on a per-SoC basis, then do specify those in all of the currently present SoC driver data. 
This fixes writes to the Video Mode Command Packet Control register, fixing enablement of command packet transmission (VM_CMD_EN) and allowance of this transmission during the VFP period (TS_VFP_EN) on both MT8186 and MT8188. Fixes: 03d7adc41027 ("drm/mediatek: Add mt8186 dsi compatible to mtk_dsi.c") Fixes: 814d5341f314 ("drm/mediatek: Add mt8188 dsi compatible to mtk_dsi.c") Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: CK Hu Link: https://patchwork.kernel.org/project/dri-devel/patch/20241219112733.47907-1-angelogioacchino.delregno@collabora.com/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_dsi.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_dsi.c b/drivers/gpu/drm/mediatek/mtk_dsi.c index 3907863579b9..39f0623a6f0a 100644 --- a/drivers/gpu/drm/mediatek/mtk_dsi.c +++ b/drivers/gpu/drm/mediatek/mtk_dsi.c @@ -139,11 +139,11 @@ #define CLK_HS_POST GENMASK(15, 8) #define CLK_HS_EXIT GENMASK(23, 16) -#define DSI_VM_CMD_CON 0x130 +/* DSI_VM_CMD_CON */ #define VM_CMD_EN BIT(0) #define TS_VFP_EN BIT(5) -#define DSI_SHADOW_DEBUG 0x190U +/* DSI_SHADOW_DEBUG */ #define FORCE_COMMIT BIT(0) #define BYPASS_SHADOW BIT(1) @@ -187,6 +187,8 @@ struct phy; struct mtk_dsi_driver_data { const u32 reg_cmdq_off; + const u32 reg_vm_cmd_off; + const u32 reg_shadow_dbg_off; bool has_shadow_ctl; bool has_size_ctl; bool cmdq_long_packet_ctl; @@ -366,8 +368,8 @@ static void mtk_dsi_set_mode(struct mtk_dsi *dsi) static void mtk_dsi_set_vm_cmd(struct mtk_dsi *dsi) { - mtk_dsi_mask(dsi, DSI_VM_CMD_CON, VM_CMD_EN, VM_CMD_EN); - mtk_dsi_mask(dsi, DSI_VM_CMD_CON, TS_VFP_EN, TS_VFP_EN); + mtk_dsi_mask(dsi, dsi->driver_data->reg_vm_cmd_off, VM_CMD_EN, VM_CMD_EN); + mtk_dsi_mask(dsi, dsi->driver_data->reg_vm_cmd_off, TS_VFP_EN, TS_VFP_EN); } static void mtk_dsi_rxtx_control(struct mtk_dsi *dsi) @@ -713,7 +715,7 @@ static int mtk_dsi_poweron(struct mtk_dsi *dsi) if (dsi->driver_data->has_shadow_ctl) 
writel(FORCE_COMMIT | BYPASS_SHADOW, - dsi->regs + DSI_SHADOW_DEBUG); + dsi->regs + dsi->driver_data->reg_shadow_dbg_off); mtk_dsi_reset_engine(dsi); mtk_dsi_phy_timconfig(dsi); @@ -1262,26 +1264,36 @@ static void mtk_dsi_remove(struct platform_device *pdev) static const struct mtk_dsi_driver_data mt8173_dsi_driver_data = { .reg_cmdq_off = 0x200, + .reg_vm_cmd_off = 0x130, + .reg_shadow_dbg_off = 0x190 }; static const struct mtk_dsi_driver_data mt2701_dsi_driver_data = { .reg_cmdq_off = 0x180, + .reg_vm_cmd_off = 0x130, + .reg_shadow_dbg_off = 0x190 }; static const struct mtk_dsi_driver_data mt8183_dsi_driver_data = { .reg_cmdq_off = 0x200, + .reg_vm_cmd_off = 0x130, + .reg_shadow_dbg_off = 0x190, .has_shadow_ctl = true, .has_size_ctl = true, }; static const struct mtk_dsi_driver_data mt8186_dsi_driver_data = { .reg_cmdq_off = 0xd00, + .reg_vm_cmd_off = 0x200, + .reg_shadow_dbg_off = 0xc00, .has_shadow_ctl = true, .has_size_ctl = true, }; static const struct mtk_dsi_driver_data mt8188_dsi_driver_data = { .reg_cmdq_off = 0xd00, + .reg_vm_cmd_off = 0x200, + .reg_shadow_dbg_off = 0xc00, .has_shadow_ctl = true, .has_size_ctl = true, .cmdq_long_packet_ctl = true, From de30d74f58cbecb3894c7738985bd0086d04bec1 Mon Sep 17 00:00:00 2001 From: Steven Davis Date: Mon, 30 Dec 2024 19:34:31 +0000 Subject: [PATCH 561/807] cdrom: Fix typo, 'devicen' to 'device' Fix typo in cd_dbg line to add trailing newline character. 
Signed-off-by: Steven Davis Link: https://lore.kernel.org/lkml/20241229165744.21725-1-goldside000@outlook.com Reviewed-by: Phillip Potter Link: https://lore.kernel.org/lkml/Z3GV2W_MUOw5BrtR@equinox Signed-off-by: Phillip Potter Link: https://lore.kernel.org/r/20241230193431.441120-2-phil@philpotter.co.uk Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 6a99a459b80b..51745ed1bbab 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -1106,7 +1106,7 @@ int open_for_data(struct cdrom_device_info *cdi) } } - cd_dbg(CD_OPEN, "all seems well, opening the devicen"); + cd_dbg(CD_OPEN, "all seems well, opening the device\n"); /* all seems well, we can open the device */ ret = cdo->open(cdi, 0); /* open for data */ From a9c83a0ab66a5b02e914daed502fb8d3a8d3d619 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Dec 2024 14:15:17 -0700 Subject: [PATCH 562/807] io_uring/timeout: flush timeouts outside of the timeout lock syzbot reports that a recent fix causes nesting issues between the (now) raw timeoutlock and the eventfd locking: ============================= [ BUG: Invalid wait context ] 6.13.0-rc4-00080-g9828a4c0901f #29 Not tainted ----------------------------- kworker/u32:0/68094 is trying to lock: ffff000014d7a520 (&ctx->wqh#2){..-.}-{3:3}, at: eventfd_signal_mask+0x64/0x180 other info that might help us debug this: context-{5:5} 6 locks held by kworker/u32:0/68094: #0: ffff0000c1d98148 ((wq_completion)iou_exit){+.+.}-{0:0}, at: process_one_work+0x4e8/0xfc0 #1: ffff80008d927c78 ((work_completion)(&ctx->exit_work)){+.+.}-{0:0}, at: process_one_work+0x53c/0xfc0 #2: ffff0000c59bc3d8 (&ctx->completion_lock){+.+.}-{3:3}, at: io_kill_timeouts+0x40/0x180 #3: ffff0000c59bc358 (&ctx->timeout_lock){-.-.}-{2:2}, at: io_kill_timeouts+0x48/0x180 #4: ffff800085127aa0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire+0x8/0x38 #5: ffff800085127aa0 
(rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire+0x8/0x38 stack backtrace: CPU: 7 UID: 0 PID: 68094 Comm: kworker/u32:0 Not tainted 6.13.0-rc4-00080-g9828a4c0901f #29 Hardware name: linux,dummy-virt (DT) Workqueue: iou_exit io_ring_exit_work Call trace: show_stack+0x1c/0x30 (C) __dump_stack+0x24/0x30 dump_stack_lvl+0x60/0x80 dump_stack+0x14/0x20 __lock_acquire+0x19f8/0x60c8 lock_acquire+0x1a4/0x540 _raw_spin_lock_irqsave+0x90/0xd0 eventfd_signal_mask+0x64/0x180 io_eventfd_signal+0x64/0x108 io_req_local_work_add+0x294/0x430 __io_req_task_work_add+0x1c0/0x270 io_kill_timeout+0x1f0/0x288 io_kill_timeouts+0xd4/0x180 io_uring_try_cancel_requests+0x2e8/0x388 io_ring_exit_work+0x150/0x550 process_one_work+0x5e8/0xfc0 worker_thread+0x7ec/0xc80 kthread+0x24c/0x300 ret_from_fork+0x10/0x20 because after the preempt-rt fix for the timeout lock nesting inside the io-wq lock, we now have the eventfd spinlock nesting inside the raw timeout spinlock. Rather than play whack-a-mole with other nesting on the timeout lock, split the deletion and killing of timeouts so queueing the task_work for the timeout cancelations can get done outside of the timeout lock. 
Reported-by: syzbot+b1fc199a40b65d601b65@syzkaller.appspotmail.com Fixes: 020b40f35624 ("io_uring: make ctx->timeout_lock a raw spinlock") Signed-off-by: Jens Axboe --- io_uring/timeout.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index bbe58638eca7..362689b17ccc 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -85,7 +85,27 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_task_complete(req, ts); } -static bool io_kill_timeout(struct io_kiocb *req, int status) +static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) +{ + if (list_empty(list)) + return false; + + while (!list_empty(list)) { + struct io_timeout *timeout; + struct io_kiocb *req; + + timeout = list_first_entry(list, struct io_timeout, list); + list_del_init(&timeout->list); + req = cmd_to_io_kiocb(timeout); + if (err) + req_set_fail(req); + io_req_queue_tw_complete(req, err); + } + + return true; +} + +static void io_kill_timeout(struct io_kiocb *req, struct list_head *list) __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; @@ -93,21 +113,17 @@ static bool io_kill_timeout(struct io_kiocb *req, int status) if (hrtimer_try_to_cancel(&io->timer) != -1) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); - if (status) - req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&timeout->list); - io_req_queue_tw_complete(req, status); - return true; + list_move_tail(&timeout->list, list); } - return false; } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) { - u32 seq; struct io_timeout *timeout, *tmp; + LIST_HEAD(list); + u32 seq; raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); @@ -131,10 +147,11 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) if 
(events_got < events_needed) break; - io_kill_timeout(req, 0); + io_kill_timeout(req, &list); } ctx->cq_last_tm_flush = seq; raw_spin_unlock_irq(&ctx->timeout_lock); + io_flush_killed_timeouts(&list, 0); } static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) @@ -661,7 +678,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx bool cancel_all) { struct io_timeout *timeout, *tmp; - int canceled = 0; + LIST_HEAD(list); /* * completion_lock is needed for io_match_task(). Take it before @@ -672,11 +689,11 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); - if (io_match_task(req, tctx, cancel_all) && - io_kill_timeout(req, -ECANCELED)) - canceled++; + if (io_match_task(req, tctx, cancel_all)) + io_kill_timeout(req, &list); } raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); - return canceled != 0; + + return io_flush_killed_timeouts(&list, -ECANCELED); } From b255ef45fcc2141c1bf98456796abb956d843a27 Mon Sep 17 00:00:00 2001 From: Vitalii Mordan Date: Fri, 27 Dec 2024 15:30:07 +0300 Subject: [PATCH 563/807] eth: bcmsysport: fix call balance of priv->clk handling routines Check the return value of clk_prepare_enable to ensure that priv->clk has been successfully enabled. If priv->clk was not enabled during bcm_sysport_probe, bcm_sysport_resume, or bcm_sysport_open, it must not be disabled in any subsequent execution paths. 
Fixes: 31bc72d97656 ("net: systemport: fetch and use clock resources") Signed-off-by: Vitalii Mordan Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20241227123007.2333397-1-mordan@ispras.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcmsysport.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 42672c63f108..bc4e1f3b3752 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1933,7 +1933,11 @@ static int bcm_sysport_open(struct net_device *dev) unsigned int i; int ret; - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + netdev_err(dev, "could not enable priv clock\n"); + return ret; + } /* Reset UniMAC */ umac_reset(priv); @@ -2591,7 +2595,11 @@ static int bcm_sysport_probe(struct platform_device *pdev) goto err_deregister_notifier; } - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + dev_err(&pdev->dev, "could not enable priv clock\n"); + goto err_deregister_netdev; + } priv->rev = topctrl_readl(priv, REV_CNTL) & REV_MASK; dev_info(&pdev->dev, @@ -2605,6 +2613,8 @@ static int bcm_sysport_probe(struct platform_device *pdev) return 0; +err_deregister_netdev: + unregister_netdev(dev); err_deregister_notifier: unregister_netdevice_notifier(&priv->netdev_notifier); err_deregister_fixed_link: @@ -2774,7 +2784,12 @@ static int __maybe_unused bcm_sysport_resume(struct device *d) if (!netif_running(dev)) return 0; - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + netdev_err(dev, "could not enable priv clock\n"); + return ret; + } + if (priv->wolopts) clk_disable_unprepare(priv->wol_clk); From fb3a9a1165cea104b5ab3753e88218e4497b01c1 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Fri, 20 Dec 2024 19:28:06 -0800 Subject: [PATCH 564/807] gve: 
trigger RX NAPI instead of TX NAPI in gve_xsk_wakeup Commit ba0925c34e0f ("gve: process XSK TX descriptors as part of RX NAPI") moved XSK TX processing to be part of the RX NAPI. However, that commit did not include triggering the RX NAPI in gve_xsk_wakeup. This is necessary because the TX NAPI only processes TX completions, meaning that a TX wakeup would not actually trigger XSK descriptor processing. Also, the branch on XDP_WAKEUP_TX was supposed to have been removed, as the NAPI should be scheduled whether the wakeup is for RX or TX. Fixes: ba0925c34e0f ("gve: process XSK TX descriptors as part of RX NAPI") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Link: https://patch.msgid.link/20241221032807.302244-1-pkaligineedi@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 09fb7f16f73e..8a8f6ab12a98 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1714,7 +1714,7 @@ done: static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) { struct gve_priv *priv = netdev_priv(dev); - int tx_queue_id = gve_xdp_tx_queue_id(priv, queue_id); + struct napi_struct *napi; if (!gve_get_napi_enabled(priv)) return -ENETDOWN; @@ -1722,19 +1722,12 @@ static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog) return -EINVAL; - if (flags & XDP_WAKEUP_TX) { - struct gve_tx_ring *tx = &priv->tx[tx_queue_id]; - struct napi_struct *napi = - &priv->ntfy_blocks[tx->ntfy_id].napi; - - if (!napi_if_scheduled_mark_missed(napi)) { - /* Call local_bh_enable to trigger SoftIRQ processing */ - local_bh_disable(); - napi_schedule(napi); - local_bh_enable(); - } - - tx->xdp_xsk_wakeup++; + 
napi = &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_id)].napi; + if (!napi_if_scheduled_mark_missed(napi)) { + /* Call local_bh_enable to trigger SoftIRQ processing */ + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); } return 0; From ad5c318086e2e23b577eca33559c5ebf89bc7eb9 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sat, 21 Dec 2024 17:14:48 +0900 Subject: [PATCH 565/807] net: mv643xx_eth: fix an OF node reference leak Current implementation of mv643xx_eth_shared_of_add_port() calls of_parse_phandle(), but does not release the refcount on error. Call of_node_put() in the error path and in mv643xx_eth_shared_of_remove(). This bug was found by an experimental verification tool that I am developing. Fixes: 76723bca2802 ("net: mv643xx_eth: add DT parsing support") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241221081448.3313163-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mv643xx_eth.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index a06048719e84..67a6ff07c83d 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2704,9 +2704,15 @@ static struct platform_device *port_platdev[3]; static void mv643xx_eth_shared_of_remove(void) { + struct mv643xx_eth_platform_data *pd; int n; for (n = 0; n < 3; n++) { + if (!port_platdev[n]) + continue; + pd = dev_get_platdata(&port_platdev[n]->dev); + if (pd) + of_node_put(pd->phy_node); platform_device_del(port_platdev[n]); port_platdev[n] = NULL; } @@ -2769,8 +2775,10 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, } ppdev = platform_device_alloc(MV643XX_ETH_NAME, dev_num); - if (!ppdev) - return -ENOMEM; + if (!ppdev) { + ret = -ENOMEM; + goto put_err; + } ppdev->dev.coherent_dma_mask = DMA_BIT_MASK(32); ppdev->dev.of_node = pnp; @@ -2792,6 
+2800,8 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, port_err: platform_device_put(ppdev); +put_err: + of_node_put(ppd.phy_node); return ret; } From cbb26f7d8451fe56ccac802c6db48d16240feebd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 21 Dec 2024 09:51:46 +0100 Subject: [PATCH 566/807] mptcp: fix TCP options overflow. Syzbot reported the following splat: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 1 UID: 0 PID: 5836 Comm: sshd Not tainted 6.13.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 RIP: 0010:_compound_head include/linux/page-flags.h:242 [inline] RIP: 0010:put_page+0x23/0x260 include/linux/mm.h:1552 Code: 90 90 90 90 90 90 90 55 41 57 41 56 53 49 89 fe 48 bd 00 00 00 00 00 fc ff df e8 f8 5e 12 f8 49 8d 5e 08 48 89 d8 48 c1 e8 03 <80> 3c 28 00 74 08 48 89 df e8 8f c7 78 f8 48 8b 1b 48 89 de 48 83 RSP: 0000:ffffc90003916c90 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000008 RCX: ffff888030458000 RDX: 0000000000000100 RSI: 0000000000000000 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffffff898ca81d R09: 1ffff110054414ac R10: dffffc0000000000 R11: ffffed10054414ad R12: 0000000000000007 R13: ffff88802a20a542 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f34f496e800(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f9d6ec9ec28 CR3: 000000004d260000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_page_unref include/linux/skbuff_ref.h:43 [inline] __skb_frag_unref include/linux/skbuff_ref.h:56 [inline] skb_release_data+0x483/0x8a0 net/core/skbuff.c:1119 skb_release_all net/core/skbuff.c:1190 [inline] 
__kfree_skb+0x55/0x70 net/core/skbuff.c:1204 tcp_clean_rtx_queue net/ipv4/tcp_input.c:3436 [inline] tcp_ack+0x2442/0x6bc0 net/ipv4/tcp_input.c:4032 tcp_rcv_state_process+0x8eb/0x44e0 net/ipv4/tcp_input.c:6805 tcp_v4_do_rcv+0x77d/0xc70 net/ipv4/tcp_ipv4.c:1939 tcp_v4_rcv+0x2dc0/0x37f0 net/ipv4/tcp_ipv4.c:2351 ip_protocol_deliver_rcu+0x22e/0x440 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x341/0x5f0 net/ipv4/ip_input.c:233 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 __netif_receive_skb_one_core net/core/dev.c:5672 [inline] __netif_receive_skb+0x2bf/0x650 net/core/dev.c:5785 process_backlog+0x662/0x15b0 net/core/dev.c:6117 __napi_poll+0xcb/0x490 net/core/dev.c:6883 napi_poll net/core/dev.c:6952 [inline] net_rx_action+0x89b/0x1240 net/core/dev.c:7074 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 __do_softirq kernel/softirq.c:595 [inline] invoke_softirq kernel/softirq.c:435 [inline] __irq_exit_rcu+0xf7/0x220 kernel/softirq.c:662 irq_exit_rcu+0x9/0x30 kernel/softirq.c:678 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0x57/0xc0 arch/x86/kernel/apic/apic.c:1049 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702 RIP: 0033:0x7f34f4519ad5 Code: 85 d2 74 0d 0f 10 02 48 8d 54 24 20 0f 11 44 24 20 64 8b 04 25 18 00 00 00 85 c0 75 27 41 b8 08 00 00 00 b8 0f 01 00 00 0f 05 <48> 3d 00 f0 ff ff 76 75 48 8b 15 24 73 0d 00 f7 d8 64 89 02 48 83 RSP: 002b:00007ffec5b32ce0 EFLAGS: 00000246 RAX: 0000000000000001 RBX: 00000000000668a0 RCX: 00007f34f4519ad5 RDX: 00007ffec5b32d00 RSI: 0000000000000004 RDI: 0000564f4bc6cae0 RBP: 0000564f4bc6b5a0 R08: 0000000000000008 R09: 0000000000000000 R10: 00007ffec5b32de8 R11: 0000000000000246 R12: 0000564f48ea8aa4 R13: 0000000000000001 R14: 0000564f48ea93e8 R15: 00007ffec5b32d68 Eric noted a probable shinfo->nr_frags corruption, which indeed occurs. 
The root cause is a buggy MPTCP option len computation in some circumstances: the ADD_ADDR option should be mutually exclusive with DSS since the blamed commit. Still, mptcp_established_options_add_addr() tries to set the relevant info in mptcp_out_options, if the remaining space is large enough even when DSS is present. Since the ADD_ADDR infos and the DSS share the same union fields, adding first corrupts the latter. In the worst-case scenario, such corruption increases the DSS binary layout, exceeding the computed length and possibly overwriting the skb shared info. Address the issue by enforcing mutual exclusion in mptcp_established_options_add_addr(), too. Cc: stable@vger.kernel.org Reported-by: syzbot+38a095a81f30d82884c1@syzkaller.appspotmail.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/538 Fixes: 1bff1e43a30e ("mptcp: optimize out option generation") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/025d9df8cde3c9a557befc47e9bc08fbbe3476e5.1734771049.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1603b3702e22..a62bc874bf1e 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -667,8 +667,15 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * &echo, &drop_other_suboptions)) return false; + /* + * Later on, mptcp_write_options() will enforce mutually exclusion with + * DSS, bail out if such option is set and we can't drop it. 
+ */ if (drop_other_suboptions) remaining += opt_size; + else if (opts->suboptions & OPTION_MPTCP_DSS) + return false; + len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); if (remaining < len) return false; From 03c8d0af2e409e15c16130b185e12b5efba0a6b9 Mon Sep 17 00:00:00 2001 From: Pascal Hambourg Date: Mon, 23 Dec 2024 17:44:01 +0100 Subject: [PATCH 567/807] sky2: Add device ID 11ab:4373 for Marvell 88E8075 A Marvell 88E8075 ethernet controller has this device ID instead of 11ab:4370 and works fine with the sky2 driver. Signed-off-by: Pascal Hambourg Cc: stable@vger.kernel.org Link: https://patch.msgid.link/10165a62-99fb-4be6-8c64-84afd6234085@plouf.fr.eu.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/sky2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 3914cd9210d4..988fa28cfb5f 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -130,6 +130,7 @@ static const struct pci_device_id sky2_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x436C) }, /* 88E8072 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x436D) }, /* 88E8055 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4370) }, /* 88E8075 */ + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4373) }, /* 88E8075 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4380) }, /* 88E8057 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4381) }, /* 88E8059 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4382) }, /* 88E8079 */ From 8ec396d05d1b737c87311fb7311f753b02c2a6b1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 28 Nov 2024 15:06:17 +0000 Subject: [PATCH 568/807] mm: reinstate ability to map write-sealed memfd mappings read-only Patch series "mm: reinstate ability to map write-sealed memfd mappings read-only". 
In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. This series reworks how we both permit write-sealed mappings being mapped read-only and disallow mprotect() from undoing the write-seal, fixing this regression. We also add a regression test to ensure that we do not accidentally regress this in future. Thanks to Julian Orth for reporting this regression. This patch (of 2): In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. This was previously unnecessarily disallowed, despite the man page documentation indicating that it would be, thereby limiting the usefulness of F_SEAL_WRITE logic. We fixed this by adapting logic that existed for the F_SEAL_FUTURE_WRITE seal (one which disallows future writes to the memfd) to also be used for F_SEAL_WRITE. For background - the F_SEAL_FUTURE_WRITE seal clears VM_MAYWRITE for a read-only mapping to disallow mprotect() from overriding the seal - an operation performed by seal_check_write(), invoked from shmem_mmap(), the f_op->mmap() hook used by shmem mappings. By extending this to F_SEAL_WRITE and critically - checking mapping_map_writable() to determine if we may map the memfd AFTER we invoke shmem_mmap() - the desired logic becomes possible. This is because mapping_map_writable() explicitly checks for VM_MAYWRITE, which we will have cleared. 
Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. We reinstate this functionality by moving the check out of shmem_mmap() and instead performing it in do_mmap() at the point at which VMA flags are being determined, which seems in any case to be a more appropriate place in which to make this determination. In order to achieve this we rework memfd seal logic to allow us access to this information using existing logic and eliminate the clearing of VM_MAYWRITE from seal_check_write() which we are performing in do_mmap() instead. Link: https://lkml.kernel.org/r/99fc35d2c62bd2e05571cf60d9f8b843c56069e0.1732804776.git.lorenzo.stoakes@oracle.com Fixes: 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") Signed-off-by: Lorenzo Stoakes Reported-by: Julian Orth Closes: https://lore.kernel.org/all/CAHijbEUMhvJTN9Xw1GmbM266FXXv=U7s4L_Jem5x3AaPZxrYpQ@mail.gmail.com/ Cc: Jann Horn Cc: Liam R. 
Howlett Cc: Linus Torvalds Cc: Shuah Khan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/memfd.h | 14 +++++++++++ include/linux/mm.h | 56 ++++++++++++++++++++++++++++++------------- mm/memfd.c | 2 +- mm/mmap.c | 4 ++++ 4 files changed, 58 insertions(+), 18 deletions(-) diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 3f2cf339ceaf..d437e3070850 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,6 +7,7 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); +unsigned int *memfd_file_seals_ptr(struct file *file); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -16,6 +17,19 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } + +static inline unsigned int *memfd_file_seals_ptr(struct file *file) +{ + return NULL; +} #endif +/* Retrieve memfd seals associated with the file, if any. */ +static inline unsigned int memfd_file_seals(struct file *file) +{ + unsigned int *sealsp = memfd_file_seals_ptr(file); + + return sealsp ? *sealsp : 0; +} + #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 338a76ce9083..fb397918c43d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4101,6 +4101,37 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif +static inline bool is_write_sealed(int seals) +{ + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +/** + * is_readonly_sealed - Checks whether write-sealed but mapped read-only, + * in which case writes should be disallowing moving + * forwards. + * @seals: the seals to check + * @vm_flags: the VMA flags to check + * + * Returns whether readonly sealed, in which case writess should be disallowed + * going forward. 
+ */ +static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) +{ + /* + * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. + */ + if (is_write_sealed(seals) && + ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) + return true; + + return false; +} + /** * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and * handle them. @@ -4112,24 +4143,15 @@ static inline void mem_dump_obj(void *object) {} */ static inline int seal_check_write(int seals, struct vm_area_struct *vma) { - if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; + if (!is_write_sealed(seals)) + return 0; - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (vma->vm_flags & VM_SHARED) - vm_flags_clear(vma, VM_MAYWRITE); - } + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. 
+ */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; return 0; } diff --git a/mm/memfd.c b/mm/memfd.c index c17c3ea701a1..35a370d75c9a 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) return error; } -static unsigned int *memfd_file_seals_ptr(struct file *file) +unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; diff --git a/mm/mmap.c b/mm/mmap.c index d32b7e701058..16f8e8be01f8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -368,6 +369,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); + unsigned int seals = memfd_file_seals(file); unsigned long flags_mask; if (!file_mmap_ok(file, inode, pgoff, len)) @@ -408,6 +410,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + else if (is_readonly_sealed(seals, vm_flags)) + vm_flags &= ~VM_MAYWRITE; fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) From ea0916e01d0b0f2cce1369ac1494239a79827270 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 28 Nov 2024 15:06:18 +0000 Subject: [PATCH 569/807] selftests/memfd: add test for mapping write-sealed memfd read-only Now we have reinstated the ability to map F_SEAL_WRITE mappings read-only, assert that we are able to do this in a test to ensure that we do not regress this again. Link: https://lkml.kernel.org/r/a6377ec470b14c0539b4600cf8fa24bf2e4858ae.1732804776.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Jann Horn Cc: Julian Orth Cc: Liam R. 
Howlett Cc: Linus Torvalds Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 43 ++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 0a0b55516028..c0c53451a16d 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -282,6 +282,24 @@ static void *mfd_assert_mmap_shared(int fd) return p; } +static void *mfd_assert_mmap_read_shared(int fd) +{ + void *p; + + p = mmap(NULL, + mfd_def_size, + PROT_READ, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + + return p; +} + static void *mfd_assert_mmap_private(int fd) { void *p; @@ -980,6 +998,30 @@ static void test_seal_future_write(void) close(fd); } +static void test_seal_write_map_read_shared(void) +{ + int fd; + void *p; + + printf("%s SEAL-WRITE-MAP-READ\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_write_map_read", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_WRITE); + + p = mfd_assert_mmap_read_shared(fd); + + mfd_assert_read(fd); + mfd_assert_read_shared(fd); + mfd_fail_write(fd); + + munmap(p, mfd_def_size); + close(fd); +} + /* * Test SEAL_SHRINK * Test whether SEAL_SHRINK actually prevents shrinking @@ -1593,6 +1635,7 @@ int main(int argc, char **argv) test_seal_write(); test_seal_future_write(); + test_seal_write_map_read_shared(); test_seal_shrink(); test_seal_grow(); test_seal_resize(); From 6aaced5abd32e2a57cd94fd64f824514d0361da8 Mon Sep 17 00:00:00 2001 From: Seiji Nishikawa Date: Sun, 1 Dec 2024 01:12:34 +0900 Subject: [PATCH 570/807] mm: vmscan: account for free pages to prevent infinite Loop in throttle_direct_reclaim() The task sometimes continues looping in throttle_direct_reclaim() because allow_direct_reclaim(pgdat) keeps returning 
false. #0 [ffff80002cb6f8d0] __switch_to at ffff8000080095ac #1 [ffff80002cb6f900] __schedule at ffff800008abbd1c #2 [ffff80002cb6f990] schedule at ffff800008abc50c #3 [ffff80002cb6f9b0] throttle_direct_reclaim at ffff800008273550 #4 [ffff80002cb6fa20] try_to_free_pages at ffff800008277b68 #5 [ffff80002cb6fae0] __alloc_pages_nodemask at ffff8000082c4660 #6 [ffff80002cb6fc50] alloc_pages_vma at ffff8000082e4a98 #7 [ffff80002cb6fca0] do_anonymous_page at ffff80000829f5a8 #8 [ffff80002cb6fce0] __handle_mm_fault at ffff8000082a5974 #9 [ffff80002cb6fd90] handle_mm_fault at ffff8000082a5bd4 At this point, the pgdat contains the following two zones: NODE: 4 ZONE: 0 ADDR: ffff00817fffe540 NAME: "DMA32" SIZE: 20480 MIN/LOW/HIGH: 11/28/45 VM_STAT: NR_FREE_PAGES: 359 NR_ZONE_INACTIVE_ANON: 18813 NR_ZONE_ACTIVE_ANON: 0 NR_ZONE_INACTIVE_FILE: 50 NR_ZONE_ACTIVE_FILE: 0 NR_ZONE_UNEVICTABLE: 0 NR_ZONE_WRITE_PENDING: 0 NR_MLOCK: 0 NR_BOUNCE: 0 NR_ZSPAGES: 0 NR_FREE_CMA_PAGES: 0 NODE: 4 ZONE: 1 ADDR: ffff00817fffec00 NAME: "Normal" SIZE: 8454144 PRESENT: 98304 MIN/LOW/HIGH: 68/166/264 VM_STAT: NR_FREE_PAGES: 146 NR_ZONE_INACTIVE_ANON: 94668 NR_ZONE_ACTIVE_ANON: 3 NR_ZONE_INACTIVE_FILE: 735 NR_ZONE_ACTIVE_FILE: 78 NR_ZONE_UNEVICTABLE: 0 NR_ZONE_WRITE_PENDING: 0 NR_MLOCK: 0 NR_BOUNCE: 0 NR_ZSPAGES: 0 NR_FREE_CMA_PAGES: 0 In allow_direct_reclaim(), while processing ZONE_DMA32, the sum of inactive/active file-backed pages calculated in zone_reclaimable_pages() based on the result of zone_page_state_snapshot() is zero. Additionally, since this system lacks swap, the calculation of inactive/ active anonymous pages is skipped. crash> p nr_swap_pages nr_swap_pages = $1937 = { counter = 0 } As a result, ZONE_DMA32 is deemed unreclaimable and skipped, moving on to the processing of the next zone, ZONE_NORMAL, despite ZONE_DMA32 having free pages significantly exceeding the high watermark. The problem is that the pgdat->kswapd_failures hasn't been incremented. 
crash> px ((struct pglist_data *) 0xffff00817fffe540)->kswapd_failures $1935 = 0x0 This is because the node is deemed balanced. The node balancing logic in balance_pgdat() evaluates all zones collectively. If one or more zones (e.g., ZONE_DMA32) have enough free pages to meet their watermarks, the entire node is deemed balanced. This causes balance_pgdat() to exit early before incrementing the kswapd_failures, as it considers the overall memory state acceptable, even though some zones (like ZONE_NORMAL) remain under significant pressure. The patch ensures that zone_reclaimable_pages() includes free pages (NR_FREE_PAGES) in its calculation when no other reclaimable pages are available (e.g., file-backed or anonymous pages). This change prevents zones like ZONE_DMA32, which have sufficient free pages, from being mistakenly deemed unreclaimable. By doing so, the patch ensures proper node balancing, avoids masking pressure on other zones like ZONE_NORMAL, and prevents infinite loops in throttle_direct_reclaim() caused by allow_direct_reclaim(pgdat) repeatedly returning false. The kernel hangs due to a task stuck in throttle_direct_reclaim(), caused by a node being incorrectly deemed balanced despite pressure in certain zones, such as ZONE_NORMAL. This issue arises from zone_reclaimable_pages() returning 0 for zones without reclaimable file-backed or anonymous pages, causing zones like ZONE_DMA32 with sufficient free pages to be skipped. The lack of swap or reclaimable pages results in ZONE_DMA32 being ignored during reclaim, masking pressure in other zones. Consequently, pgdat->kswapd_failures remains 0 in balance_pgdat(), preventing fallback mechanisms in allow_direct_reclaim() from being triggered, leading to an infinite loop in throttle_direct_reclaim(). This patch modifies zone_reclaimable_pages() to account for free pages (NR_FREE_PAGES) when no other reclaimable pages exist.
This ensures zones with sufficient free pages are not skipped, enabling proper balancing and reclaim behavior. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20241130164346.436469-1-snishika@redhat.com Link: https://lkml.kernel.org/r/20241130161236.433747-2-snishika@redhat.com Fixes: 5a1c84b404a7 ("mm: remove reclaim and compaction retry approximations") Signed-off-by: Seiji Nishikawa Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 76378bc257e3..9a859b7d18d7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -374,7 +374,14 @@ unsigned long zone_reclaimable_pages(struct zone *zone) if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); - + /* + * If there are no reclaimable file-backed or anonymous pages, + * ensure zones with sufficient free pages are not skipped. + * This prevents zones like DMA32 from being ignored in reclaim + * scenarios where they can still help alleviate memory pressure. + */ + if (nr == 0) + nr = zone_page_state_snapshot(zone, NR_FREE_PAGES); return nr; } From 34d7cf637c437d5c2a8a6ef23ea45193bad8a91c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 6 Dec 2024 15:03:45 +0800 Subject: [PATCH 571/807] mm: don't try THP alignment for FS without get_unmapped_area Commit ed48e87c7df3 ("thp: add thp_get_unmapped_area_vmflags()") changes thp_get_unmapped_area() to thp_get_unmapped_area_vmflags() in __get_unmapped_area(), which doesn't initialize local get_area for anonymous mappings. This leads to us always trying THP alignment even for file_operations which have a NULL ->get_unmapped_area() callback. 
Since commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") we only want to enable THP alignment for anonymous mappings, so add a !file check to avoid attempting THP alignment for file mappings. Found issue by code inspection. THP alignment is used for easy or more pmd mappings, from vma side. This may cause unnecessary VMA fragmentation and potentially worse performance on filesystems that do not actually support THPs and thus cannot benefit from the alignment. Link: https://lkml.kernel.org/r/20241206070345.2526501-1-wangkefeng.wang@huawei.com Fixes: ed48e87c7df3 ("thp: add thp_get_unmapped_area_vmflags()") Signed-off-by: Kefeng Wang Reviewed-by: Vlastimil Babka Reviewed-by: Yang Shi Cc: Christophe Leroy Cc: David Hildenbrand Cc: Jann Horn Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Rick Edgecombe Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 16f8e8be01f8..aec208f90337 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -892,7 +892,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (get_area) { addr = get_area(file, addr, len, pgoff, flags); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ From 158cdce87c8c172787063998ad5dd3e2f658b963 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 6 Dec 2024 16:30:25 +0800 Subject: [PATCH 572/807] mm/readahead: fix large folio support in async readahead When testing large folio support with XFS on our servers, we observed that only a few large folios are mapped when reading large files via mmap. After a thorough analysis, I identified it was caused by the `/sys/block/*/queue/read_ahead_kb` setting. On our test servers, this parameter is set to 128KB. 
After I tune it to 2MB, the large folio can work as expected. However, I believe the large folio behavior should not be dependent on the value of read_ahead_kb. It would be more robust if the kernel can automatically adapt to it. With /sys/block/*/queue/read_ahead_kb set to 128KB and performing a sequential read on a 1GB file using MADV_HUGEPAGE, the differences in /proc/meminfo are as follows: - before this patch FileHugePages: 18432 kB FilePmdMapped: 4096 kB - after this patch FileHugePages: 1067008 kB FilePmdMapped: 1048576 kB This shows that after applying the patch, the entire 1GB file is mapped to huge pages. The stable list is CCed, as without this patch, large folios don't function optimally in the readahead path. It's worth noting that if read_ahead_kb is set to a larger value that isn't aligned with huge page sizes (e.g., 4MB + 128KB), it may still fail to map to hugepages. Link: https://lkml.kernel.org/r/20241108141710.9721-1-laoar.shao@gmail.com Link: https://lkml.kernel.org/r/20241206083025.3478-1-laoar.shao@gmail.com Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings") Signed-off-by: Yafang Shao Tested-by: kernel test robot Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton --- mm/readahead.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/readahead.c b/mm/readahead.c index ea650b8b02fb..e151f4b13ca4 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -646,7 +646,11 @@ void page_cache_async_ra(struct readahead_control *ractl, 1UL << order); if (index == expected) { ra->start += ra->size; - ra->size = get_next_ra_size(ra, max_pages); + /* + * In the case of MADV_HUGEPAGE, the actual size might exceed + * the readahead window.
+ */ + ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); ra->async_size = ra->size; goto readit; } From 1fd8bc7cd889bd73d07a83cb32d674ac68f99153 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Sat, 14 Dec 2024 17:30:05 +0800 Subject: [PATCH 573/807] maple_tree: reload mas before the second call for mas_empty_area Change the LONG_MAX in simple_offset_add to 1024, and do latter: [root@fedora ~]# mkdir /tmp/dir [root@fedora ~]# for i in {1..1024}; do touch /tmp/dir/$i; done touch: cannot touch '/tmp/dir/1024': Device or resource busy [root@fedora ~]# rm /tmp/dir/123 [root@fedora ~]# touch /tmp/dir/1024 [root@fedora ~]# rm /tmp/dir/100 [root@fedora ~]# touch /tmp/dir/1025 touch: cannot touch '/tmp/dir/1025': Device or resource busy After we delete file 100, actually this is a empty entry, but the latter create failed unexpected. mas_alloc_cyclic has two chance to find empty entry. First find the entry with range range_lo and range_hi, if no empty entry exist, and range_lo > min, retry find with range min and range_hi. However, the first call mas_empty_area may mark mas as EBUSY, and the second call for mas_empty_area will return false directly. Fix this by reload mas before second call for mas_empty_area. [Liam.Howlett@Oracle.com: fix mas_alloc_cyclic() second search] Link: https://lore.kernel.org/all/20241216060600.287B4C4CED0@smtp.kernel.org/ Link: https://lkml.kernel.org/r/20241216190113.1226145-2-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20241214093005.72284-1-yangerkun@huaweicloud.com Fixes: 9b6713cc7522 ("maple_tree: Add mtree_alloc_cyclic()") Signed-off-by: Yang Erkun Signed-off-by: Liam R. Howlett Cc: Christian Brauner Cc: Chuck Lever says: Cc: Liam R. 
Howlett Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d0ae808f3a14..047397136f15 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4354,6 +4354,7 @@ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, ret = 1; } if (ret < 0 && range_lo > min) { + mas_reset(mas); ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; From 59d9094df3d79443937add8700b2ef1a866b1081 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 16 Dec 2024 15:11:47 +0800 Subject: [PATCH 574/807] mm: hugetlb: independent PMD page table shared count The folio refcount may be increased unexpectedly through try_get_folio() by a caller such as split_huge_pages. In huge_pmd_unshare(), we use refcount to check whether a pmd page table is shared. The check is incorrect if the refcount is increased by the above caller, and this can cause the page table to be leaked: BUG: Bad page state in process sh pfn:109324 page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324 flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff) page_type: f2(table) raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000 raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000 page dumped because: nonzero mapcount ...
CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7 Tainted: [B]=BAD_PAGE Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 Call trace: show_stack+0x20/0x38 (C) dump_stack_lvl+0x80/0xf8 dump_stack+0x18/0x28 bad_page+0x8c/0x130 free_page_is_bad_report+0xa4/0xb0 free_unref_page+0x3cc/0x620 __folio_put+0xf4/0x158 split_huge_pages_all+0x1e0/0x3e8 split_huge_pages_write+0x25c/0x2d8 full_proxy_write+0x64/0xd8 vfs_write+0xcc/0x280 ksys_write+0x70/0x110 __arm64_sys_write+0x24/0x38 invoke_syscall+0x50/0x120 el0_svc_common.constprop.0+0xc8/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x34/0x128 el0t_64_sync_handler+0xc8/0xd0 el0t_64_sync+0x190/0x198 The issue may be triggered by damon, offline_page, page_idle, etc, which will increase the refcount of page table. 1. The page table itself will be discarded after reporting the "nonzero mapcount". 2. The HugeTLB page mapped by the page table miss freeing since we treat the page table as shared and a shared page table will not be unmapped. Fix it by introducing independent PMD page table shared count. As described by comment, pt_index/pt_mm/pt_frag_refcount are used for s390 gmap, x86 pgds and powerpc, pt_share_count is used for x86/arm64/riscv pmds, so we can reuse the field as pt_share_count. 
Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Liu Shixin Cc: Kefeng Wang Cc: Ken Chen Cc: Muchun Song Cc: Nanyong Sun Cc: Jane Chu Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + include/linux/mm_types.h | 30 ++++++++++++++++++++++++++++++ mm/hugetlb.c | 16 +++++++--------- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index fb397918c43d..b1c3db9cf355 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3125,6 +3125,7 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) if (!pmd_ptlock_init(ptdesc)) return false; __folio_set_pgtable(folio); + ptdesc_pmd_pts_init(ptdesc); lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7361a8f3ab68..332cee285662 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. + * @pt_share_count: Used for HugeTLB PMD page table share count. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. 
@@ -471,6 +472,9 @@ struct ptdesc { pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + atomic_t pt_share_count; +#endif }; union { @@ -516,6 +520,32 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); const struct page *: (const struct ptdesc *)(p), \ struct page *: (struct ptdesc *)(p))) +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ + atomic_set(&ptdesc->pt_share_count, 0); +} + +static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc) +{ + atomic_inc(&ptdesc->pt_share_count); +} + +static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) +{ + atomic_dec(&ptdesc->pt_share_count); +} + +static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) +{ + return atomic_read(&ptdesc->pt_share_count); +} +#else +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ +} +#endif + /* * Used for sizing the vmemmap region on some architectures */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cec4b121193f..c498874a7170 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7211,7 +7211,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { - get_page(virt_to_page(spte)); + ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); break; } } @@ -7226,7 +7226,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { - put_page(virt_to_page(spte)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); } spin_unlock(&mm->page_table_lock); out: @@ -7238,10 +7238,6 @@ out: /* * unmap huge page backed by shared pte. * - * Hugetlb pte page is ref counted at the time of mapping. If pte is shared - * indicated by page_count > 1, unmap is achieved by clearing pud and - * decrementing the ref count. If count == 1, the pte page is not shared. 
- * * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page @@ -7250,18 +7246,20 @@ out: int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + unsigned long sz = huge_page_size(hstate_vma(vma)); pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); - BUG_ON(page_count(virt_to_page(ptep)) == 0); - if (page_count(virt_to_page(ptep)) == 1) + if (sz != PMD_SIZE) + return 0; + if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); - put_page(virt_to_page(ptep)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); mm_dec_nr_pmds(mm); return 1; } From cddc76b165161a02ff14c4d84d0f5266d9d32b9e Mon Sep 17 00:00:00 2001 From: Alessandro Carminati Date: Tue, 17 Dec 2024 14:20:33 +0000 Subject: [PATCH 575/807] mm/kmemleak: fix sleeping function called from invalid context at print message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address a bug in the kernel that triggers a "sleeping function called from invalid context" warning when /sys/kernel/debug/kmemleak is printed under specific conditions: - CONFIG_PREEMPT_RT=y - Set SELinux as the LSM for the system - Set kptr_restrict to 1 - kmemleak buffer contains at least one item BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 136, name: cat preempt_count: 1, expected: 0 RCU nest depth: 2, expected: 2 6 locks held by cat/136: #0: ffff32e64bcbf950 (&p->lock){+.+.}-{3:3}, at: seq_read_iter+0xb8/0xe30 #1: ffffafe6aaa9dea0 (scan_mutex){+.+.}-{3:3}, at: kmemleak_seq_start+0x34/0x128 #3: ffff32e6546b1cd0 (&object->lock){....}-{2:2}, at: kmemleak_seq_show+0x3c/0x1e0 #4: ffffafe6aa8d8560 (rcu_read_lock){....}-{1:2}, at: has_ns_capability_noaudit+0x8/0x1b0 #5: 
ffffafe6aabbc0f8 (notif_lock){+.+.}-{2:2}, at: avc_compute_av+0xc4/0x3d0 irq event stamp: 136660 hardirqs last enabled at (136659): [] _raw_spin_unlock_irqrestore+0xa8/0xd8 hardirqs last disabled at (136660): [] _raw_spin_lock_irqsave+0x8c/0xb0 softirqs last enabled at (0): [] copy_process+0x11d8/0x3df8 softirqs last disabled at (0): [<0000000000000000>] 0x0 Preemption disabled at: [] kmemleak_seq_show+0x3c/0x1e0 CPU: 1 UID: 0 PID: 136 Comm: cat Tainted: G E 6.11.0-rt7+ #34 Tainted: [E]=UNSIGNED_MODULE Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0xa0/0x128 show_stack+0x1c/0x30 dump_stack_lvl+0xe8/0x198 dump_stack+0x18/0x20 rt_spin_lock+0x8c/0x1a8 avc_perm_nonode+0xa0/0x150 cred_has_capability.isra.0+0x118/0x218 selinux_capable+0x50/0x80 security_capable+0x7c/0xd0 has_ns_capability_noaudit+0x94/0x1b0 has_capability_noaudit+0x20/0x30 restricted_pointer+0x21c/0x4b0 pointer+0x298/0x760 vsnprintf+0x330/0xf70 seq_printf+0x178/0x218 print_unreferenced+0x1a4/0x2d0 kmemleak_seq_show+0xd0/0x1e0 seq_read_iter+0x354/0xe30 seq_read+0x250/0x378 full_proxy_read+0xd8/0x148 vfs_read+0x190/0x918 ksys_read+0xf0/0x1e0 __arm64_sys_read+0x70/0xa8 invoke_syscall.constprop.0+0xd4/0x1d8 el0_svc+0x50/0x158 el0t_64_sync+0x17c/0x180 %pS and %pK, in the same back trace line, are redundant, and %pS can void %pK service in certain contexts. %pS alone already provides the necessary information, and if it cannot resolve the symbol, it falls back to printing the raw address voiding the original intent behind the %pK. Additionally, %pK requires a privilege check CAP_SYSLOG enforced through the LSM, which can trigger a "sleeping function called from invalid context" warning under RT_PREEMPT kernels when the check occurs in an atomic context. This issue may also affect other LSMs. This change avoids the unnecessary privilege check and resolves the sleeping function warning without any loss of information. 
Link: https://lkml.kernel.org/r/20241217142032.55793-1-acarmina@redhat.com Fixes: 3a6f33d86baa ("mm/kmemleak: use %pK to display kernel pointers in backtrace") Signed-off-by: Alessandro Carminati Acked-by: Sebastian Andrzej Siewior Acked-by: Catalin Marinas Cc: Clément Léger Cc: Alessandro Carminati Cc: Eric Chanudet Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Steven Rostedt Cc: Thomas Weißschuh Cc: Signed-off-by: Andrew Morton --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2a945c07ae99..737af23f4f4e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -373,7 +373,7 @@ static void print_unreferenced(struct seq_file *seq, for (i = 0; i < nr_entries; i++) { void *ptr = (void *)entries[i]; - warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr); + warn_or_seq_printf(seq, " %pS\n", ptr); } } From 4d9b90df2eb49ab9becdbfd1fd60071bb107406e Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Tue, 17 Dec 2024 11:09:21 +0100 Subject: [PATCH 576/807] mailmap: modify the entry for Mathieu Othacehe Set my gnu address as the main one. Link: https://lkml.kernel.org/r/20241217100924.7821-1-othacehe@gnu.org Signed-off-by: Mathieu Othacehe Cc: Alex Elder Cc: David S. 
Miller Cc: Geliang Tang Cc: Kees Cook Cc: Matthieu Baerts (NGI0) Cc: Neeraj Upadhyay Cc: Quentin Monnet Signed-off-by: Andrew Morton --- .mailmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 7efe43237ca8..f5f97f947020 100644 --- a/.mailmap +++ b/.mailmap @@ -435,7 +435,7 @@ Martin Kepplinger Martin Kepplinger Martin Kepplinger Martyna Szapar-Mudlaw -Mathieu Othacehe +Mathieu Othacehe Mat Martineau Mat Martineau Matthew Wilcox From 472098f23323c39cc6269d7b7bf76cba62830a4c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 17 Dec 2024 16:55:39 +0800 Subject: [PATCH 577/807] docs: mm: fix the incorrect 'FileHugeMapped' field The '/proc/PID/smaps' does not have the 'FileHugeMapped' field to count the file transparent huge pages, instead, the 'FilePmdMapped' field should be used. Fix it. Link: https://lkml.kernel.org/r/d520ce3aba2b03b088be30bece732426a939049a.1734425264.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 5034915f4e8e..8872203df088 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -436,7 +436,7 @@ AnonHugePmdMapped). The number of file transparent huge pages mapped to userspace is available by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``. To identify what applications are mapping file transparent huge pages, it -is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields +is necessary to read ``/proc/PID/smaps`` and count the FilePmdMapped fields for each mapping. 
Note that reading the smaps file is expensive and reading it From cb0ca08b326aa03f87fe94bb91872ce8d2ef1ed8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Dec 2024 08:18:10 +0100 Subject: [PATCH 578/807] kcov: mark in_softirq_really() as __always_inline If gcc decides not to inline in_softirq_really(), objtool warns about a function call with UACCESS enabled: kernel/kcov.o: warning: objtool: __sanitizer_cov_trace_pc+0x1e: call to in_softirq_really() with UACCESS enabled kernel/kcov.o: warning: objtool: check_kcov_mode+0x11: call to in_softirq_really() with UACCESS enabled Mark this as __always_inline to avoid the problem. Link: https://lkml.kernel.org/r/20241217071814.2261620-1-arnd@kernel.org Fixes: 7d4df2dad312 ("kcov: properly check for softirq context") Signed-off-by: Arnd Bergmann Reviewed-by: Marco Elver Cc: Aleksandr Nogikh Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 28a6be6e64fd..187ba1b80bda 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -166,7 +166,7 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, * Unlike in_serving_softirq(), this function returns false when called during * a hardirq or an NMI that happened in the softirq context. */ -static inline bool in_softirq_really(void) +static __always_inline bool in_softirq_really(void) { return in_serving_softirq() && !in_hardirq() && !in_nmi(); } From 3754137d263f52f4b507cf9ae913f8f0497d1b0e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 17 Dec 2024 20:50:00 +0100 Subject: [PATCH 579/807] fs/proc/task_mmu: fix pagemap flags with PMD THP entries on 32bit Entries (including flags) are u64, even on 32bit. So right now we are cutting off the flags on 32bit. This way, for example the cow selftest complains about: # ./cow ... Bail Out!
read and ioctl return unmatched results for populated: 0 1 Link: https://lkml.kernel.org/r/20241217195000.1734039-1-david@redhat.com Fixes: 2c1f057e5be6 ("fs/proc/task_mmu: properly detect PM_MMAP_EXCLUSIVE per page of PMD-mapped THPs") Signed-off-by: David Hildenbrand Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 38a5a3e9cba2..f02cd362309a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1810,7 +1810,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } for (; addr != end; addr += PAGE_SIZE, idx++) { - unsigned long cur_flags = flags; + u64 cur_flags = flags; pagemap_entry_t pme; if (folio && (flags & PM_PRESENT) && From 5f3fd772d152229d94602bca243fbb658068a597 Mon Sep 17 00:00:00 2001 From: Dennis Lam Date: Tue, 17 Dec 2024 21:39:25 -0500 Subject: [PATCH 580/807] ocfs2: fix slab-use-after-free due to dangling pointer dqi_priv When mounting ocfs2 and then remounting it as read-only, a slab-use-after-free occurs after the user uses a syscall to quota_getnextquota. Specifically, sb_dqinfo(sb, type)->dqi_priv is the dangling pointer. During the remounting process, the pointer dqi_priv is freed but is never set as null leaving it to be accessed. Additionally, the read-only option for remounting sets the DQUOT_SUSPENDED flag instead of setting the DQUOT_USAGE_ENABLED flags. Moreover, later in the process of getting the next quota, the function ocfs2_get_next_id is called and only checks the quota usage flags and not the quota suspended flags. To fix this, I set dqi_priv to null when it is freed after remounting with read-only and put a check for DQUOT_SUSPENDED in ocfs2_get_next_id. 
[akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20241218023924.22821-2-dennis.lamerice@gmail.com Fixes: 8f9e8f5fcc05 ("ocfs2: Fix Q_GETNEXTQUOTA for filesystem without quotas") Signed-off-by: Dennis Lam Reported-by: syzbot+d173bf8a5a7faeede34c@syzkaller.appspotmail.com Tested-by: syzbot+d173bf8a5a7faeede34c@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6731d26f.050a0220.1fb99c.014b.GAE@google.com/T/ Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/quota_global.c | 2 +- fs/ocfs2/quota_local.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 2b0daced98eb..3404e7a30c33 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -893,7 +893,7 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) int status = 0; trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type); - if (!sb_has_quota_loaded(sb, type)) { + if (!sb_has_quota_active(sb, type)) { status = -ESRCH; goto out; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 73d3367c533b..2956d888c131 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -867,6 +867,7 @@ out: brelse(oinfo->dqi_libh); brelse(oinfo->dqi_lqi_bh); kfree(oinfo); + info->dqi_priv = NULL; return status; } From eaebeb93922ca6ab0dd92027b73d0112701706ef Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 19 Dec 2024 21:24:37 +0000 Subject: [PATCH 581/807] mm: zswap: fix race between [de]compression and CPU hotunplug In zswap_compress() and zswap_decompress(), the per-CPU acomp_ctx of the current CPU at the beginning of the operation is retrieved and used throughout. However, since neither preemption nor migration are disabled, it is possible that the operation continues on a different CPU. 
If the original CPU is hotunplugged while the acomp_ctx is still in use, we run into a UAF bug as the resources attached to the acomp_ctx are freed during hotunplug in zswap_cpu_comp_dead(). The problem was introduced in commit 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") when the switch to the crypto_acomp API was made. Prior to that, the per-CPU crypto_comp was retrieved using get_cpu_ptr() which disables preemption and makes sure the CPU cannot go away from under us. Preemption cannot be disabled with the crypto_acomp API as a sleepable context is needed. Commit 8ba2f844f050 ("mm/zswap: change per-cpu mutex and buffer to per-acomp_ctx") increased the UAF surface area by making the per-CPU buffers dynamic, adding yet another resource that can be freed from under zswap compression/decompression by CPU hotunplug. There are a few ways to fix this: (a) Add a refcount for acomp_ctx. (b) Disable migration while using the per-CPU acomp_ctx. (c) Disable CPU hotunplug while using the per-CPU acomp_ctx by holding the CPUs read lock. Implement (c) since it's simpler than (a), and (b) involves using migrate_disable() which is apparently undesired (see huge comment in include/linux/preempt.h). 
Link: https://lkml.kernel.org/r/20241219212437.2714151-1-yosryahmed@google.com Fixes: 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") Signed-off-by: Yosry Ahmed Reported-by: Johannes Weiner Closes: https://lore.kernel.org/lkml/20241113213007.GB1564047@cmpxchg.org/ Reported-by: Sam Sun Closes: https://lore.kernel.org/lkml/CAEkJfYMtSdM5HceNsXUDf5haghD5+o2e7Qv4OcuruL4tPg6OaQ@mail.gmail.com/ Reviewed-by: Chengming Zhou Acked-by: Barry Song Reviewed-by: Nhat Pham Cc: Vitaly Wool Cc: Signed-off-by: Andrew Morton --- mm/zswap.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index f6316b66fb23..5a27af8d86ea 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -880,6 +880,18 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } +/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */ +static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx) +{ + cpus_read_lock(); + return raw_cpu_ptr(acomp_ctx); +} + +static void acomp_ctx_put_cpu(void) +{ + cpus_read_unlock(); +} + static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zswap_pool *pool) { @@ -893,8 +905,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, gfp_t gfp; u8 *dst; - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); - + acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); dst = acomp_ctx->buffer; @@ -950,6 +961,7 @@ unlock: zswap_reject_alloc_fail++; mutex_unlock(&acomp_ctx->mutex); + acomp_ctx_put_cpu(); return comp_ret == 0 && alloc_ret == 0; } @@ -960,7 +972,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct crypto_acomp_ctx *acomp_ctx; u8 *src; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, 
ZPOOL_MM_RO); @@ -990,6 +1002,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) if (src != acomp_ctx->buffer) zpool_unmap_handle(zpool, entry->handle); + acomp_ctx_put_cpu(); } /********************************* From 11673247700e2af3a6a95f7b3f1bb80b691c950e Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 19 Dec 2024 14:18:28 +0200 Subject: [PATCH 582/807] percpu: remove intermediate variable in PERCPU_PTR() The intermediate variable in the PERCPU_PTR() macro results in a kernel panic on boot [1] due to a compiler bug seen when compiling the kernel (+ KASAN) with gcc 11.3.1, but not when compiling with latest gcc (v14.2)/clang(v18.1). To solve it, remove the intermediate variable (which is not needed) and keep the casting that resolves the address space checks. [1] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] CPU: 0 UID: 0 PID: 547 Comm: iptables Not tainted 6.13.0-rc1_external_tested-master #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:nf_ct_netns_do_get+0x139/0x540 Code: 03 00 00 48 81 c4 88 00 00 00 5b 5d 41 5c 41 5d 41 5e 41 5f c3 4d 8d 75 08 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 27 03 00 00 41 8b 45 08 83 c0 RSP: 0018:ffff888116df75e8 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: 1ffff11022dbeebe RCX: ffffffff839a2382 RDX: 0000000000000003 RSI: 0000000000000008 RDI: ffff88842ec46d10 RBP: 0000000000000002 R08: 0000000000000000 R09: fffffbfff0b0860c R10: ffff888116df75e8 R11: 0000000000000001 R12: ffffffff879d6a80 R13: 0000000000000016 R14: 000000000000001e R15: ffff888116df7908 FS: 00007fba01646740(0000) GS:ffff88842ec00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055bd901800d8 CR3: 00000001205f0003 CR4: 0000000000172eb0 
Call Trace: ? die_addr+0x3d/0xa0 ? exc_general_protection+0x144/0x220 ? asm_exc_general_protection+0x22/0x30 ? __mutex_lock+0x2c2/0x1d70 ? nf_ct_netns_do_get+0x139/0x540 ? nf_ct_netns_do_get+0xb5/0x540 ? net_generic+0x1f0/0x1f0 ? __create_object+0x5e/0x80 xt_check_target+0x1f0/0x930 ? textify_hooks.constprop.0+0x110/0x110 ? pcpu_alloc_noprof+0x7cd/0xcf0 ? xt_find_target+0x148/0x1e0 find_check_entry.constprop.0+0x6c0/0x920 ? get_info+0x380/0x380 ? __virt_addr_valid+0x1df/0x3b0 ? kasan_quarantine_put+0xe3/0x200 ? kfree+0x13e/0x3d0 ? translate_table+0xaf5/0x1750 translate_table+0xbd8/0x1750 ? ipt_unregister_table_exit+0x30/0x30 ? __might_fault+0xbb/0x170 do_ipt_set_ctl+0x408/0x1340 ? nf_sockopt_find.constprop.0+0x17b/0x1f0 ? lock_downgrade+0x680/0x680 ? lockdep_hardirqs_on_prepare+0x284/0x400 ? ipt_register_table+0x440/0x440 ? bit_wait_timeout+0x160/0x160 nf_setsockopt+0x6f/0xd0 raw_setsockopt+0x7e/0x200 ? raw_bind+0x590/0x590 ? do_user_addr_fault+0x812/0xd20 do_sock_setsockopt+0x1e2/0x3f0 ? move_addr_to_user+0x90/0x90 ? lock_downgrade+0x680/0x680 __sys_setsockopt+0x9e/0x100 __x64_sys_setsockopt+0xb9/0x150 ? 
do_syscall_64+0x33/0x140 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fba015134ce Code: 0f 1f 40 00 48 8b 15 59 69 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b1 0f 1f 00 f3 0f 1e fa 49 89 ca b8 36 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 21 RSP: 002b:00007ffd9de6f388 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 000055bd9017f490 RCX: 00007fba015134ce RDX: 0000000000000040 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 0000000000000500 R08: 0000000000000560 R09: 0000000000000052 R10: 000055bd901800e0 R11: 0000000000000246 R12: 000055bd90180140 R13: 000055bd901800e0 R14: 000055bd9017f498 R15: 000055bd9017ff10 Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc mlx4_ib mlx4_en mlx4_core rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi fuse ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_core ---[ end trace 0000000000000000 ]--- [akpm@linux-foundation.org: simplification, per Uros] Link: https://lkml.kernel.org/r/20241219121828.2120780-1-gal@nvidia.com Fixes: dabddd687c9e ("percpu: cast percpu pointer in PERCPU_PTR() via unsigned long") Signed-off-by: Gal Pressman Closes: https://lore.kernel.org/all/7590f546-4021-4602-9252-0d525de35b52@nvidia.com Cc: Uros Bizjak Cc: Bill Wendling Cc: Christoph Lameter Cc: Dennis Zhou Cc: Justin Stitt Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 35842d1e3879..5b520fe86b60 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,10 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ -({ \ - unsigned long __pcpu_ptr = (__force unsigned long)(__p); \ - (typeof(*(__p)) __force 
__kernel *)(__pcpu_ptr); \ -}) + (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) #ifdef CONFIG_SMP From d0e6983a6d1719738cf8d13982a68094f0a1872a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 19 Dec 2024 15:30:08 +0800 Subject: [PATCH 583/807] mm: shmem: fix incorrect index alignment for within_size policy With enabling the shmem per-size within_size policy, using an incorrect 'order' size to round_up() the index can lead to incorrect i_size checks, resulting in an inappropriate large orders being returned. Changing to use '1 << order' to round_up() the index to fix this issue. Additionally, adding an 'aligned_index' variable to avoid affecting the index checks. Link: https://lkml.kernel.org/r/77d8ef76a7d3d646e9225e9af88a76549a68aab1.1734593154.git.baolin.wang@linux.alibaba.com Fixes: e7a2ab7b3bb5 ("mm: shmem: add mTHP support for anonymous shmem") Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index f6fb053ac50d..dec659e84562 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1689,6 +1689,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma ? vma->vm_flags : 0; + pgoff_t aligned_index; bool global_huge; loff_t i_size; int order; @@ -1723,9 +1724,9 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, /* Allow mTHP that will be fully within i_size. 
*/ order = highest_order(within_size_orders); while (within_size_orders) { - index = round_up(index + 1, order); + aligned_index = round_up(index + 1, 1 << order); i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) { + if (i_size >> PAGE_SHIFT >= aligned_index) { mask |= within_size_orders; break; } From d77b90d2b2642655b5f60953c36ad887257e1802 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 19 Dec 2024 15:30:09 +0800 Subject: [PATCH 584/807] mm: shmem: fix the update of 'shmem_falloc->nr_unswapped' The 'shmem_falloc->nr_unswapped' is used to record how many writepage refused to swap out because fallocate() is allocating, but after shmem supports large folio swap out, the update of 'shmem_falloc->nr_unswapped' does not use the correct number of pages in the large folio, which may lead to fallocate() not exiting as soon as possible. Anyway, this is found through code inspection, and I am not sure whether it would actually cause serious issues. Link: https://lkml.kernel.org/r/f66a0119d0564c2c37c84f045835b870d1b2196f.1734593154.git.baolin.wang@linux.alibaba.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index dec659e84562..ac58d4fb2e6f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1535,7 +1535,7 @@ try_split: !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) - shmem_falloc->nr_unswapped++; + shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); From adcfb264c3ed51fbbf5068ddf10d309a63683868 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Sat, 21 Dec 2024 12:33:20 +0900 Subject: [PATCH 585/807] vmstat: disable vmstat_work on vmstat_cpu_down_prep() Even after mm/vmstat:online teardown, shepherd may still queue work for the dying 
cpu until the cpu is removed from online mask. While it's quite rare, this means that after unbind_workers() unbinds a per-cpu kworker, it potentially runs vmstat_update for the dying CPU on an irrelevant cpu before entering atomic AP states. When CONFIG_DEBUG_PREEMPT=y, it results in the following error with the backtrace. BUG: using smp_processor_id() in preemptible [00000000] code: \ kworker/7:3/1702 caller is refresh_cpu_vm_stats+0x235/0x5f0 CPU: 0 UID: 0 PID: 1702 Comm: kworker/7:3 Tainted: G Tainted: [N]=TEST Workqueue: mm_percpu_wq vmstat_update Call Trace: dump_stack_lvl+0x8d/0xb0 check_preemption_disabled+0xce/0xe0 refresh_cpu_vm_stats+0x235/0x5f0 vmstat_update+0x17/0xa0 process_one_work+0x869/0x1aa0 worker_thread+0x5e5/0x1100 kthread+0x29e/0x380 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 So, for mm/vmstat:online, disable vmstat_work reliably on teardown and symmetrically enable it on startup. Link: https://lkml.kernel.org/r/20241221033321.4154409-1-koichiro.den@canonical.com Signed-off-by: Koichiro Den Cc: Sebastian Andrzej Siewior Cc: Signed-off-by: Andrew Morton --- mm/vmstat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 4d016314a56c..0889b75cef14 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2148,13 +2148,14 @@ static int vmstat_cpu_online(unsigned int cpu) if (!node_state(cpu_to_node(cpu), N_CPU)) { node_set_state(cpu_to_node(cpu), N_CPU); } + enable_delayed_work(&per_cpu(vmstat_work, cpu)); return 0; } static int vmstat_cpu_down_prep(unsigned int cpu) { - cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + disable_delayed_work_sync(&per_cpu(vmstat_work, cpu)); return 0; } From 98a6abc6cec186bdc3d94c162227cc8e003de76c Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 23 Dec 2024 23:09:07 +0800 Subject: [PATCH 586/807] mm/list_lru: fix false warning of negative counter commit 2788cf0c401c ("memcg: reparent list_lrus and free kmemcg_id on css offline") removed sanity checks for 
the nr_items counter's value because it implemented list_lru re-parenting in a way that will redirect children's list_lru to the parent before re-parenting the items in list_lru. This will make item counter uncharging happen in the parent while the item is still being held by the child. As a result, the parent's counter value may become negative. This is acceptable because re-parenting will sum up the children's counter values, and the parent's counter will be fixed. Later commit fb56fdf8b9a2 ("mm/list_lru: split the lock to per-cgroup scope") reworked the re-parenting process, and removed the redirect. So it added the sanity check back, assuming that as long as items are still in the children's list_lru, parent's counter will not be uncharged. But that assumption is incorrect. The xas_store in memcg_reparent_list_lrus will set children's list_lru to NULL before re-parenting the items, it redirects list_lru helpers to use parent's list_lru just like before. But still, it's not a problem as re-parenting will fix the counter. Therefore, remove this sanity check, but add a new check to ensure that the counter won't go negative in a different way: the child's list_lru being re-parented should never have a negative counter, since re-parenting should occur in order and fixes counters. 
Link: https://lkml.kernel.org/r/20241223150907.1591-1-ryncsn@gmail.com Fixes: fb56fdf8b9a2 ("mm/list_lru: split the lock to per-cgroup scope") Signed-off-by: Kairui Song Closes: https://lore.kernel.org/lkml/Z2Bz9t92Be9l1xqj@lappy/ Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Sasha Levin Cc: Shakeel Butt Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/list_lru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index f93ada6a207b..7d69434c70e0 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -77,7 +77,6 @@ again: spin_lock(&l->lock); nr_items = READ_ONCE(l->nr_items); if (likely(nr_items != LONG_MIN)) { - WARN_ON(nr_items < 0); rcu_read_unlock(); return l; } @@ -450,6 +449,7 @@ static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, list_splice_init(&src->list, &dst->list); if (src->nr_items) { + WARN_ON(src->nr_items < 0); dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); } From 8debfc5b1aa569d3d2ac836af2553da037611c61 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Dec 2024 15:12:21 -0800 Subject: [PATCH 587/807] mm/damon/core: fix new damon_target objects leaks on damon_commit_targets() Patch series "mm/damon/core: fix memory leaks and ignored inputs from damon_commit_ctx()". Due to two bugs in damon_commit_targets() and damon_commit_schemes(), which are called from damon_commit_ctx(), some user inputs can be ignored, and some memory objects can be leaked. Fix those. Note that only DAMON sysfs interface users are affected. Other DAMON core API user modules that are focused more on simple and dedicated production usages, including DAMON_RECLAIM and DAMON_LRU_SORT, do not use the buggy function in that way, so they are not affected.
This patch (of 2): When new DAMON targets are added via damon_commit_targets(), the newly created targets are not deallocated when updating the internal data (damon_commit_target()) fails. Worse yet, even if the setup is successfully done, the new target is not linked to the context. Hence, the new targets are always leaked regardless of whether the internal data setup fails. Fix the leaks. Link: https://lkml.kernel.org/r/20241222231222.85060-2-sj@kernel.org Fixes: 9cb3d0b9dfce ("mm/damon/core: implement DAMON context commit function") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 8b8e2933dcd4..dc52361f1863 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -961,8 +961,11 @@ static int damon_commit_targets( return -ENOMEM; err = damon_commit_target(new_target, false, src_target, damon_target_has_pid(src)); - if (err) + if (err) { + damon_destroy_target(new_target); return err; + } + damon_add_target(dst, new_target); } return 0; } From 7d390b53067ef745e2d9bee5a9683df4c96b80a0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Dec 2024 15:12:22 -0800 Subject: [PATCH 588/807] mm/damon/core: fix ignored quota goals and filters of newly committed schemes damon_commit_schemes() ignores quota goals and filters of the newly committed schemes. This confuses users about the behavior. Correctly handle those inputs.
Link: https://lkml.kernel.org/r/20241222231222.85060-3-sj@kernel.org Fixes: 9cb3d0b9dfce ("mm/damon/core: implement DAMON context commit function") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index dc52361f1863..0776452a1abb 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -868,6 +868,11 @@ static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src) NUMA_NO_NODE); if (!new_scheme) return -ENOMEM; + err = damos_commit(new_scheme, src_scheme); + if (err) { + damon_destroy_scheme(new_scheme); + return err; + } damon_add_scheme(dst, new_scheme); } return 0; From 62e72d2cf702a5e2fb53d9c46ed900d9384e4a06 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 22 Dec 2024 20:29:36 +0800 Subject: [PATCH 589/807] mm, madvise: fix potential workingset node list_lru leaks Since commit 5abc1e37afa0 ("mm: list_lru: allocate list_lru_one only when needed"), all list_lru users need to allocate the items using the new infrastructure that provides list_lru info for slab allocation, ensuring that the corresponding memcg list_lru is allocated before use. For workingset shadow nodes (which are xa_node), users are converted to use the new infrastructure by commit 9bbdc0f32409 ("xarray: use kmem_cache_alloc_lru to allocate xa_node"). The xas->xa_lru will be set correctly for filemap users. However, there is a missing case: xa_node allocations caused by madvise(..., MADV_COLLAPSE). madvise(..., MADV_COLLAPSE) will also read in the absent parts of file map, and there will be xa_nodes allocated for the caller's memcg (assuming it's not rootcg). However, these allocations won't trigger memcg list_lru allocation because the proper xas info was not set. 
If nothing else has allocated other xa_nodes for that memcg to trigger list_lru creation, and memory pressure starts to evict file pages, workingset_update_node will try to add these xa_nodes to their corresponding memcg list_lru, and it does not exist (NULL). So they will be added to rootcg's list_lru instead. This shouldn't be a significant issue in practice, but it is indeed unexpected behavior, and these xa_nodes will not be reclaimed effectively. And may lead to incorrect counting of the list_lru->nr_items counter. This problem wasn't exposed until recent commit 28e98022b31ef ("mm/list_lru: simplify reparenting and initial allocation") added a sanity check: only dying memcg could have a NULL list_lru when list_lru_{add,del} is called. This problem triggered this WARNING. So make madvise(..., MADV_COLLAPSE) also call xas_set_lru() to pass the list_lru which we may want to insert xa_node into later. And move mapping_set_update to mm/internal.h, and turn into a macro to avoid including extra headers in mm/internal.h. 
Link: https://lkml.kernel.org/r/20241222122936.67501-1-ryncsn@gmail.com Fixes: 9bbdc0f32409 ("xarray: use kmem_cache_alloc_lru to allocate xa_node") Reported-by: syzbot+38a0cbd267eff2d286ff@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/675d01e9.050a0220.37aaf.00be.GAE@google.com/ Signed-off-by: Kairui Song Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Sasha Levin Cc: Shakeel Butt Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/filemap.c | 9 --------- mm/internal.h | 6 ++++++ mm/khugepaged.c | 3 +++ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index f61cf51c2238..33b60d448fca 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -124,15 +124,6 @@ * ->private_lock (zap_pte_range->block_dirty_folio) */ -static void mapping_set_update(struct xa_state *xas, - struct address_space *mapping) -{ - if (dax_mapping(mapping) || shmem_mapping(mapping)) - return; - xas_set_update(xas, workingset_update_node); - xas_set_lru(xas, &shadow_nodes); -} - static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { diff --git a/mm/internal.h b/mm/internal.h index 3bd08bafad04..9826f7dce607 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1504,6 +1504,12 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; +#define mapping_set_update(xas, mapping) do { \ + if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ + xas_set_update(xas, workingset_update_node); \ + xas_set_lru(xas, &shadow_nodes); \ + } \ +} while (0) /* mremap.c */ unsigned long move_page_tables(struct vm_area_struct *vma, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6f8d46d107b4..653dbb1ff05c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ 
#include #include #include +#include #include #include @@ -1837,6 +1838,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, if (result != SCAN_SUCCEED) goto out; + mapping_set_update(&xas, mapping); + __folio_set_locked(new_folio); if (is_shmem) __folio_set_swapbacked(new_folio); From dd2a5b5514ab0e690f018595e34dd1fcb981d345 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 21 Dec 2024 16:47:29 +0900 Subject: [PATCH 590/807] mm/util: make memdup_user_nul() similar to memdup_user() Since the string data to copy from userspace is likely less than PAGE_SIZE bytes, replace GFP_KERNEL with GFP_USER like commit 6c2c97a24f09 ("memdup_user(): switch to GFP_USER") does and add __GFP_NOWARN like commit 6c8fcc096be9 ("mm: don't let userspace spam allocations warnings") does. Also, use dedicated slab buckets like commit d73778e4b867 ("mm/util: Use dedicated slab buckets for memdup_user()") does. Link: https://lkml.kernel.org/r/014cd694-cc27-4a07-a34a-2ae95d744515@I-love.SAKURA.ne.jp Reported-by: syzbot+7e12e97b36154c54414b@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7e12e97b36154c54414b Signed-off-by: Tetsuo Handa Signed-off-by: Andrew Morton --- mm/util.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/util.c b/mm/util.c index c1c3b06ab4f9..60aa40f612b8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -297,12 +297,7 @@ void *memdup_user_nul(const void __user *src, size_t len) { char *p; - /* - * Always use GFP_KERNEL, since copy_from_user() can sleep and - * cause pagefault, which makes it pointless to use GFP_NOFS - * or GFP_ATOMIC. 
- */ - p = kmalloc_track_caller(len + 1, GFP_KERNEL); + p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); From 0210d251162f4033350a94a43f95b1c39ec84a90 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Thu, 26 Dec 2024 22:03:32 +0800 Subject: [PATCH 591/807] scripts/sorttable: fix orc_sort_cmp() to maintain symmetry and transitivity The orc_sort_cmp() function, used with qsort(), previously violated the symmetry and transitivity rules required by the C standard. Specifically, when both entries are ORC_TYPE_UNDEFINED, it could result in both a < b and b < a, which breaks the required symmetry and transitivity. This can lead to undefined behavior and incorrect sorting results, potentially causing memory corruption in glibc implementations [1]. Symmetry: If x < y, then y > x. Transitivity: If x < y and y < z, then x < z. Fix the comparison logic to return 0 when both entries are ORC_TYPE_UNDEFINED, ensuring compliance with qsort() requirements. 
Link: https://www.qualys.com/2024/01/30/qsort.txt [1] Link: https://lkml.kernel.org/r/20241226140332.2670689-1-visitorckw@gmail.com Fixes: 57fa18994285 ("scripts/sorttable: Implement build-time ORC unwind table sorting") Fixes: fb799447ae29 ("x86,objtool: Split UNWIND_HINT_EMPTY in two") Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Shile Zhang Cc: Steven Rostedt Cc: Signed-off-by: Andrew Morton --- scripts/sorttable.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 7bd0184380d3..a7c5445baf00 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -110,7 +110,7 @@ static inline unsigned long orc_ip(const int *ip) static int orc_sort_cmp(const void *_a, const void *_b) { - struct orc_entry *orc_a; + struct orc_entry *orc_a, *orc_b; const int *a = g_orc_ip_table + *(int *)_a; const int *b = g_orc_ip_table + *(int *)_b; unsigned long a_val = orc_ip(a); @@ -128,6 +128,9 @@ static int orc_sort_cmp(const void *_a, const void *_b) * whitelisted .o files which didn't get objtool generation. */ orc_a = g_orc_table + (a - g_orc_ip_table); + orc_b = g_orc_table + (b - g_orc_ip_table); + if (orc_a->type == ORC_TYPE_UNDEFINED && orc_b->type == ORC_TYPE_UNDEFINED) + return 0; return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1; } From e7404921818d676da4d7143ce78659456b05e2af Mon Sep 17 00:00:00 2001 From: "Chester A. Unal" Date: Wed, 25 Dec 2024 15:50:41 +0300 Subject: [PATCH 592/807] =?UTF-8?q?MAINTAINERS:=20change=20Ar=C4=B1n=C3=A7?= =?UTF-8?q?=20=5FNAL's=20name=20and=20email=20address?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My legal name now includes Chester. Change the name and the email address sections to reflect that. Link: https://lkml.kernel.org/r/20241225-for-unknown-upstream-v1-1-3e35e4d5e161@arinc9.com Signed-off-by: Chester A. 
Unal Signed-off-by: Andrew Morton --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 910305c11e8a..22fa261cb60e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14756,7 +14756,7 @@ F: drivers/memory/mtk-smi.c F: include/soc/mediatek/smi.h MEDIATEK SWITCH DRIVER -M: Arınç ÜNAL +M: Chester A. Unal M: Daniel Golle M: DENG Qingfang M: Sean Wang @@ -18460,7 +18460,7 @@ F: Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml F: drivers/pinctrl/mediatek/ PIN CONTROLLER - MEDIATEK MIPS -M: Arınç ÜNAL +M: Chester A. Unal M: Sergio Paracuellos L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) L: linux-mips@vger.kernel.org @@ -19504,7 +19504,7 @@ S: Maintained F: arch/mips/ralink RALINK MT7621 MIPS ARCHITECTURE -M: Arınç ÜNAL +M: Chester A. Unal M: Sergio Paracuellos L: linux-mips@vger.kernel.org S: Maintained From 4f619d518db9cd1a933c3a095a5f95d0c1584ae8 Mon Sep 17 00:00:00 2001 From: Jinjian Song Date: Tue, 24 Dec 2024 12:15:52 +0800 Subject: [PATCH 593/807] net: wwan: t7xx: Fix FSM command timeout issue When the driver processes an internal state change command, it uses an asynchronous thread to process the command operation. If the main thread detects that the task has timed out, the asynchronous thread will panic when executing the completion notification because the main thread's completion object has been released. BUG: unable to handle page fault for address: fffffffffffffff8 PGD 1f283a067 P4D 1f283a067 PUD 1f283c067 PMD 0 Oops: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:complete_all+0x3e/0xa0 [...] Call Trace: ? __die_body+0x68/0xb0 ? page_fault_oops+0x379/0x3e0 ? exc_page_fault+0x69/0xa0 ? asm_exc_page_fault+0x22/0x30 ? complete_all+0x3e/0xa0 fsm_main_thread+0xa3/0x9c0 [mtk_t7xx (HASH:1400 5)] ? __pfx_autoremove_wake_function+0x10/0x10 kthread+0xd8/0x110 ? __pfx_fsm_main_thread+0x10/0x10 [mtk_t7xx (HASH:1400 5)] ? __pfx_kthread+0x10/0x10 ret_from_fork+0x38/0x50 ?
__pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 [...] CR2: fffffffffffffff8 ---[ end trace 0000000000000000 ]--- Use the reference counter to ensure safe release as Sergey suggests: https://lore.kernel.org/all/da90f64c-260a-4329-87bf-1f9ff20a5951@gmail.com/ Fixes: 13e920d93e37 ("net: wwan: t7xx: Add core components") Signed-off-by: Jinjian Song Acked-by: Sergey Ryazanov Link: https://patch.msgid.link/20241224041552.8711-1-jinjian.song@fibocom.com Signed-off-by: Jakub Kicinski --- drivers/net/wwan/t7xx/t7xx_state_monitor.c | 26 ++++++++++++++-------- drivers/net/wwan/t7xx/t7xx_state_monitor.h | 5 +++-- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/net/wwan/t7xx/t7xx_state_monitor.c b/drivers/net/wwan/t7xx/t7xx_state_monitor.c index 3931c7a13f5a..cbdbb91e8381 100644 --- a/drivers/net/wwan/t7xx/t7xx_state_monitor.c +++ b/drivers/net/wwan/t7xx/t7xx_state_monitor.c @@ -104,14 +104,21 @@ void t7xx_fsm_broadcast_state(struct t7xx_fsm_ctl *ctl, enum md_state state) fsm_state_notify(ctl->md, state); } +static void fsm_release_command(struct kref *ref) +{ + struct t7xx_fsm_command *cmd = container_of(ref, typeof(*cmd), refcnt); + + kfree(cmd); +} + static void fsm_finish_command(struct t7xx_fsm_ctl *ctl, struct t7xx_fsm_command *cmd, int result) { if (cmd->flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { - *cmd->ret = result; - complete_all(cmd->done); + cmd->result = result; + complete_all(&cmd->done); } - kfree(cmd); + kref_put(&cmd->refcnt, fsm_release_command); } static void fsm_del_kf_event(struct t7xx_fsm_event *event) @@ -475,7 +482,6 @@ static int fsm_main_thread(void *data) int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id, unsigned int flag) { - DECLARE_COMPLETION_ONSTACK(done); struct t7xx_fsm_command *cmd; unsigned long flags; int ret; @@ -487,11 +493,13 @@ int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id INIT_LIST_HEAD(&cmd->entry); cmd->cmd_id = cmd_id; cmd->flag = flag; + 
kref_init(&cmd->refcnt); if (flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { - cmd->done = &done; - cmd->ret = &ret; + init_completion(&cmd->done); + kref_get(&cmd->refcnt); } + kref_get(&cmd->refcnt); spin_lock_irqsave(&ctl->command_lock, flags); list_add_tail(&cmd->entry, &ctl->command_queue); spin_unlock_irqrestore(&ctl->command_lock, flags); @@ -501,11 +509,11 @@ int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id if (flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { unsigned long wait_ret; - wait_ret = wait_for_completion_timeout(&done, + wait_ret = wait_for_completion_timeout(&cmd->done, msecs_to_jiffies(FSM_CMD_TIMEOUT_MS)); - if (!wait_ret) - return -ETIMEDOUT; + ret = wait_ret ? cmd->result : -ETIMEDOUT; + kref_put(&cmd->refcnt, fsm_release_command); return ret; } diff --git a/drivers/net/wwan/t7xx/t7xx_state_monitor.h b/drivers/net/wwan/t7xx/t7xx_state_monitor.h index 7b0a9baf488c..6e0601bb752e 100644 --- a/drivers/net/wwan/t7xx/t7xx_state_monitor.h +++ b/drivers/net/wwan/t7xx/t7xx_state_monitor.h @@ -110,8 +110,9 @@ struct t7xx_fsm_command { struct list_head entry; enum t7xx_fsm_cmd_state cmd_id; unsigned int flag; - struct completion *done; - int *ret; + struct completion done; + int result; + struct kref refcnt; }; struct t7xx_fsm_notifier { From afc6717628f959941d7b33728570568b4af1c4b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 31 Dec 2024 00:06:46 -0500 Subject: [PATCH 594/807] tracing: Have process_string() also allow arrays In order to catch a common bug where a TRACE_EVENT() TP_fast_assign() assigns an address of an allocated string to the ring buffer and then references it in TP_printk(), which can be executed hours later when the string is free, the function test_event_printk() runs on all events as they are registered to make sure there's no unwanted dereferencing. It calls process_string() to handle cases in TP_printk() format that has "%s". It returns whether or not the string is safe. 
But it can have some false positives. For instance, xe_bo_move() has: TP_printk("move_lacks_source:%s, migrate object %p [size %zu] from %s to %s device_id:%s", __entry->move_lacks_source ? "yes" : "no", __entry->bo, __entry->size, xe_mem_type_to_name[__entry->old_placement], xe_mem_type_to_name[__entry->new_placement], __get_str(device_id)) Where the "%s" references into xe_mem_type_to_name[]. This is an array of pointers that should be safe for the event to access. Instead of flagging this as a bad reference, if a reference points to an array, where the record field is the index, consider it safe. Link: https://lore.kernel.org/all/9dee19b6185d325d0e6fa5f7cbba81d007d99166.camel@sapience.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241231000646.324fb5f7@gandalf.local.home Fixes: 65a25d9f7ac02 ("tracing: Add "%s" check in test_event_printk()") Reported-by: Genes Lists Tested-by: Gene C Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1545cc8b49d0..770e7ed91716 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -364,6 +364,18 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca s = r + 1; } while (s < e); + /* + * Check for arrays. If the argument has: foo[REC->val] + * then it is very likely that foo is an array of strings + * that are safe to use. + */ + r = strstr(s, "["); + if (r && r < e) { + r = strstr(r, "REC->"); + if (r && r < e) + return true; + } + /* * If there's any strings in the argument consider this arg OK as it * could be: REC->field ? 
"foo" : "bar" and we don't want to get into From fee873761bd978d077d8c55334b4966ac4cb7b59 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Fri, 13 Dec 2024 13:08:37 +0800 Subject: [PATCH 595/807] exfat: fix the infinite loop in exfat_readdir() If the file system is corrupted so that a cluster is linked to itself in the cluster chain, and there is an unused directory entry in the cluster, 'dentry' will not be incremented, causing condition 'dentry < max_dentries' unable to prevent an infinite loop. This infinite loop causes s_lock not to be released, and other tasks will hang, such as exfat_sync_fs(). This commit stops traversing the cluster chain when there is unused directory entry in the cluster to avoid this infinite loop. Reported-by: syzbot+205c2644abdff9d3f9fc@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=205c2644abdff9d3f9fc Tested-by: syzbot+205c2644abdff9d3f9fc@syzkaller.appspotmail.com Fixes: ca06197382bd ("exfat: add directory operations") Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index fe0a9b8a0cd0..3103b932b674 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -122,7 +122,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent type = exfat_get_entry_type(ep); if (type == TYPE_UNUSED) { brelse(bh); - break; + goto out; } if (type != TYPE_FILE && type != TYPE_DIR) { @@ -170,6 +170,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent } } +out: dir_entry->namebuf.lfn[0] = '\0'; *cpos = EXFAT_DEN_TO_B(dentry); return 0; From 98e2fb26d1a9eafe79f46d15d54e68e014d81d8c Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 12 Dec 2024 16:29:23 +0800 Subject: [PATCH 596/807] exfat: fix the new buffer was not zeroed before writing Before writing, if a buffer_head marked as new, its data must be zeroed, otherwise 
uninitialized data in the page cache will be written. So this commit uses folio_zero_new_buffers() to zero the new buffers before ->write_end(). Fixes: 6630ea49103c ("exfat: move extend valid_size into ->page_mkwrite()") Reported-by: syzbot+91ae49e1c1a2634d20c0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=91ae49e1c1a2634d20c0 Tested-by: syzbot+91ae49e1c1a2634d20c0@syzkaller.appspotmail.com Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/file.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/exfat/file.c b/fs/exfat/file.c index fb38769c3e39..05b51e721783 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -545,6 +545,7 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) while (pos < new_valid_size) { u32 len; struct folio *folio; + unsigned long off; len = PAGE_SIZE - (pos & (PAGE_SIZE - 1)); if (pos + len > new_valid_size) @@ -554,6 +555,9 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) if (err) goto out; + off = offset_in_folio(folio, pos); + folio_zero_new_buffers(folio, off, off + len); + err = ops->write_end(file, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; @@ -563,6 +567,8 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) cond_resched(); } + return 0; + out: return err; } From a5324b3a488d883aa2d42f72260054e87d0940a0 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 16 Dec 2024 13:39:42 +0800 Subject: [PATCH 597/807] exfat: fix the infinite loop in __exfat_free_cluster() In __exfat_free_cluster(), the cluster chain is traversed until the EOF cluster. If the cluster chain includes a loop due to file system corruption, the EOF cluster cannot be traversed, resulting in an infinite loop. This commit uses the total number of clusters to prevent this infinite loop. 
Reported-by: syzbot+1de5a37cb85a2d536330@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1de5a37cb85a2d536330 Tested-by: syzbot+1de5a37cb85a2d536330@syzkaller.appspotmail.com Fixes: 31023864e67a ("exfat: add fat entry operations") Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/fatent.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 773c320d68f3..9e5492ac409b 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -216,6 +216,16 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain if (err) goto dec_used_clus; + + if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) { + /* + * The cluster chain includes a loop, scan the + * bitmap to get the number of used clusters. + */ + exfat_count_used_clusters(sb, &sbi->used_clusters); + + return 0; + } } while (clu != EXFAT_EOF_CLUSTER); } From 7b509910b3ad6d7aacead24c8744de10daf8715d Mon Sep 17 00:00:00 2001 From: Daniel Schaefer Date: Tue, 31 Dec 2024 12:59:58 +0800 Subject: [PATCH 598/807] ALSA hda/realtek: Add quirk for Framework F111:000C Similar to commit eb91c456f371 ("ALSA: hda/realtek: Add Framework Laptop 13 (Intel Core Ultra) to quirks") and previous quirks for Framework systems with Realtek codecs. 000C is a new platform that will also have an ALC285 codec and needs the same quirk. Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: linux@frame.work Cc: Dustin L. 
Howett Signed-off-by: Daniel Schaefer Cc: Link: https://patch.msgid.link/20241231045958.14545-1-dhs@frame.work Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 61ba5dc35b8b..b74b566f675e 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -11009,6 +11009,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0xf111, 0x0001, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x0006, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x0009, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0xf111, 0x000c, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), #if 0 /* Below is a quirk table taken from the old code. From fb514b31395946022f13a08e06a435f53cf9e8b3 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 31 Dec 2024 09:34:16 +0800 Subject: [PATCH 599/807] RDMA/rtrs: Ensure 'ib_sge list' is accessible Move the declaration of the 'ib_sge list' variable outside the 'always_invalidate' block to ensure it remains accessible for use throughout the function. Previously, 'ib_sge list' was declared within the 'always_invalidate' block, which limited its accessibility and caused a 'BUG: kernel NULL pointer dereference'[1]. ? __die_body.cold+0x19/0x27 ? page_fault_oops+0x15a/0x2d0 ? search_module_extables+0x19/0x60 ? search_bpf_extables+0x5f/0x80 ? exc_page_fault+0x7e/0x180 ? asm_exc_page_fault+0x26/0x30 ? memcpy_orig+0xd5/0x140 rxe_mr_copy+0x1c3/0x200 [rdma_rxe] ? rxe_pool_get_index+0x4b/0x80 [rdma_rxe] copy_data+0xa5/0x230 [rdma_rxe] rxe_requester+0xd9b/0xf70 [rdma_rxe] ? finish_task_switch.isra.0+0x99/0x2e0 rxe_sender+0x13/0x40 [rdma_rxe] do_task+0x68/0x1e0 [rdma_rxe] process_one_work+0x177/0x330 worker_thread+0x252/0x390 ?
__pfx_worker_thread+0x10/0x10 This change ensures the variable is available for subsequent operations that require it. [1] https://lore.kernel.org/linux-rdma/6a1f3e8f-deb0-49f9-bc69-a9b03ecfcda7@fujitsu.com/ Fixes: 9cb837480424 ("RDMA/rtrs: server: main functionality") Signed-off-by: Li Zhijian Link: https://patch.msgid.link/20241231013416.1290920-1-lizhijian@fujitsu.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index e83d95647852..ef4abdea3c2d 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -349,6 +349,7 @@ static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id, struct rtrs_srv_mr *srv_mr; bool need_inval = false; enum ib_send_flags flags; + struct ib_sge list; u32 imm; int err; @@ -401,7 +402,6 @@ static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id, imm = rtrs_to_io_rsp_imm(id->msg_id, errno, need_inval); imm_wr.wr.next = NULL; if (always_invalidate) { - struct ib_sge list; struct rtrs_msg_rkey_rsp *msg; srv_mr = &srv_path->mrs[id->msg_id]; From e6178bf78d0378c2d397a6aafaf4882d0af643fa Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 31 Dec 2024 08:20:08 +0530 Subject: [PATCH 600/807] RDMA/bnxt_re: Fix error recovery sequence Fixed to return ENXIO from __send_message_basic_sanity() to indicate that device is in error state. In the case of ERR_DEVICE_DETACHED state, the driver should not post the commands to the firmware as it will time out eventually. Removed bnxt_re_modify_qp() call from bnxt_re_dev_stop() as it is a no-op. 
Fixes: cc5b9b48d447 ("RDMA/bnxt_re: Recover the device when FW error is detected") Signed-off-by: Kalesh AP Signed-off-by: Kashyap Desai Link: https://patch.msgid.link/20241231025008.2267162-1-kalesh-anakkur.purayil@broadcom.com Reviewed-by: Selvin Xavier Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 8 +------- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 5 +++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index b7af0d5ff3b6..c143f273b759 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1715,11 +1715,8 @@ static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev, static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) { - int mask = IB_QP_STATE; - struct ib_qp_attr qp_attr; struct bnxt_re_qp *qp; - qp_attr.qp_state = IB_QPS_ERR; mutex_lock(&rdev->qp_lock); list_for_each_entry(qp, &rdev->qp_list, list) { /* Modify the state of all QPs except QP1/Shadow QP */ @@ -1727,12 +1724,9 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) if (qp->qplib_qp.state != CMDQ_MODIFY_QP_NEW_STATE_RESET && qp->qplib_qp.state != - CMDQ_MODIFY_QP_NEW_STATE_ERR) { + CMDQ_MODIFY_QP_NEW_STATE_ERR) bnxt_re_dispatch_event(&rdev->ibdev, &qp->ib_qp, 1, IB_EVENT_QP_FATAL); - bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, mask, - NULL); - } } } mutex_unlock(&rdev->qp_lock); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 5e90ea232de8..17e62f22683b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -424,7 +424,8 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, /* Prevent posting if f/w is not in a state to process */ if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) - return bnxt_qplib_map_rc(opcode); + return -ENXIO; + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) return 
-ETIMEDOUT; @@ -493,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, rc = __send_message_basic_sanity(rcfw, msg, opcode); if (rc) - return rc; + return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc; rc = __send_message(rcfw, msg, opcode); if (rc) From 8765429279e7d3d68d39ace5f84af2815174bb1e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 31 Dec 2024 15:53:58 +0100 Subject: [PATCH 601/807] ALSA: seq: Check UMP support for midi_version change When the kernel is built without UMP support but a user-space app requires the midi_version > 0, the kernel should return an error. Otherwise user-space assumes as if it were possible to deal, eventually hitting serious errors later. Fixes: 46397622a3fa ("ALSA: seq: Add UMP support") Cc: Link: https://patch.msgid.link/20241231145358.21946-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 3930e2f9082f..77b6ac9b5c11 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1275,10 +1275,16 @@ static int snd_seq_ioctl_set_client_info(struct snd_seq_client *client, if (client->type != client_info->type) return -EINVAL; - /* check validity of midi_version field */ - if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3) && - client_info->midi_version > SNDRV_SEQ_CLIENT_UMP_MIDI_2_0) - return -EINVAL; + if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3)) { + /* check validity of midi_version field */ + if (client_info->midi_version > SNDRV_SEQ_CLIENT_UMP_MIDI_2_0) + return -EINVAL; + + /* check if UMP is supported in kernel */ + if (!IS_ENABLED(CONFIG_SND_SEQ_UMP) && + client_info->midi_version > 0) + return -EINVAL; + } /* fill the info fields */ if (client_info->name[0]) From c4bd13be1949020e3b1c9ed6889988e0b30c3d3b Mon Sep 17 00:00:00 2001 From: Chun-Kuang Hu Date: Mon, 
30 Dec 2024 13:53:14 +0000 Subject: [PATCH 602/807] drm/mediatek: Remove unneeded semicolon cocci warnings: (new ones prefixed by >>) >> drivers/gpu/drm/mediatek/mtk_drm_drv.c:1092:2-3: Unneeded semicolon Fixes: 4c932840db1d ("drm/mediatek: Implement OF graphs support for display paths") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412022048.kY2ZhxZ4-lkp@intel.com/ Reviewed-by: AngeloGioacchino Del Regno Link: https://patchwork.kernel.org/project/dri-devel/patch/20241230135314.5419-1-chunkuang.hu@kernel.org/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index 11935cf2b39e..f0f3d545ff19 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -1094,7 +1094,7 @@ static int mtk_drm_probe(struct platform_device *pdev) /* No devicetree graphs support: go with hardcoded paths if present */ dev_dbg(dev, "Using hardcoded paths for MMSYS %u\n", mtk_drm_data->mmsys_id); private->data = mtk_drm_data; - }; + } private->all_drm_private = devm_kmalloc_array(dev, private->data->mmsys_dev_num, sizeof(*private->all_drm_private), From 5cc2db37124bb33914996d6fdbb2ddb3811f2945 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Sat, 21 Dec 2024 21:10:46 +0000 Subject: [PATCH 603/807] x86/static-call: Remove early_boot_irqs_disabled check to fix Xen PVH dom0 __static_call_update_early() has a check for early_boot_irqs_disabled, but is used before early_boot_irqs_disabled is set up in start_kernel(). Xen PV has always special cased early_boot_irqs_disabled, but Xen PVH does not and falls over the BUG when booting as dom0. It is very suspect that early_boot_irqs_disabled starts as 0, becomes 1 for a time, then becomes 0 again, but as this needs backporting to fix a breakage in a security fix, dropping the BUG_ON() is the far safer option. 
Fixes: 0ef8047b737d ("x86/static-call: provide a way to do very early static-call updates") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219620 Reported-by: Alex Zenla Suggested-by: Peter Zijlstra Signed-off-by: Andrew Cooper Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Juergen Gross Acked-by: Peter Zijlstra (Intel) Tested-by: Alex Zenla Link: https://lore.kernel.org/r/20241221211046.6475-1-andrew.cooper3@citrix.com --- arch/x86/kernel/static_call.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 9eed0c144dad..9e51242ed125 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -175,7 +175,6 @@ EXPORT_SYMBOL_GPL(arch_static_call_transform); noinstr void __static_call_update_early(void *tramp, void *func) { BUG_ON(system_state != SYSTEM_BOOTING); - BUG_ON(!early_boot_irqs_disabled); BUG_ON(static_call_initialized); __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE); sync_core(); From 7bac65687510038390a0a54cbe14fba08d037e46 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 19 Dec 2024 22:20:41 +0530 Subject: [PATCH 604/807] scsi: ufs: qcom: Power off the PHY if it was already powered on in ufs_qcom_power_up_sequence() PHY might already be powered on during ufs_qcom_power_up_sequence() in a couple of cases: 1. During UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH quirk 2. Resuming from spm_lvl = 5 suspend In those cases, it is necessary to call phy_power_off() and phy_exit() in ufs_qcom_power_up_sequence() function to power off the PHY before calling phy_init() and phy_power_on(). Case (1) is doing it via ufs_qcom_reinit_notify() callback, but case (2) is not handled. So to satisfy both cases, call phy_power_off() and phy_exit() if the phy_count is non-zero. And with this change, the reinit_notify() callback is no longer needed. 
This fixes the below UFS resume failure with spm_lvl = 5: ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufs_device_wlun 0:0:0:49488: ufshcd_wl_resume failed: -5 ufs_device_wlun 0:0:0:49488: PM: dpm_run_callback(): scsi_bus_resume returns -5 ufs_device_wlun 0:0:0:49488: PM: failed to resume async: error -5 Cc: stable@vger.kernel.org # 6.3 Fixes: baf5ddac90dc ("scsi: ufs: ufs-qcom: Add support for reinitializing the UFS device") Reported-by: Ram Kumar Dwivedi Tested-by: Amit Pundir # on SM8550-HDK Reviewed-by: Bart Van Assche Tested-by: Neil Armstrong # on SM8550-QRD Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20241219-ufs-qcom-suspend-fix-v3-1-63c4b95a70b9@linaro.org Signed-off-by: Martin K. 
Petersen --- drivers/ufs/core/ufshcd-priv.h | 6 ------ drivers/ufs/core/ufshcd.c | 1 - drivers/ufs/host/ufs-qcom.c | 13 +++++-------- include/ufs/ufshcd.h | 2 -- 4 files changed, 5 insertions(+), 17 deletions(-) diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h index 9ffd94ddf8c7..786f20ef2238 100644 --- a/drivers/ufs/core/ufshcd-priv.h +++ b/drivers/ufs/core/ufshcd-priv.h @@ -237,12 +237,6 @@ static inline void ufshcd_vops_config_scaling_param(struct ufs_hba *hba, hba->vops->config_scaling_param(hba, p, data); } -static inline void ufshcd_vops_reinit_notify(struct ufs_hba *hba) -{ - if (hba->vops && hba->vops->reinit_notify) - hba->vops->reinit_notify(hba); -} - static inline int ufshcd_vops_mcq_config_resource(struct ufs_hba *hba) { if (hba->vops && hba->vops->mcq_config_resource) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 8a01e4393159..d1e19a2ccf49 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8858,7 +8858,6 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params) ufshcd_device_reset(hba); ufs_put_device_desc(hba); ufshcd_hba_stop(hba); - ufshcd_vops_reinit_notify(hba); ret = ufshcd_hba_enable(hba); if (ret) { dev_err(hba->dev, "Host controller enable failed\n"); diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 68040b2ab5f8..e770e7b9d239 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -368,6 +368,11 @@ static int ufs_qcom_power_up_sequence(struct ufs_hba *hba) if (ret) return ret; + if (phy->power_count) { + phy_power_off(phy); + phy_exit(phy); + } + /* phy initialization - calibrate the phy */ ret = phy_init(phy); if (ret) { @@ -1579,13 +1584,6 @@ static void ufs_qcom_config_scaling_param(struct ufs_hba *hba, } #endif -static void ufs_qcom_reinit_notify(struct ufs_hba *hba) -{ - struct ufs_qcom_host *host = ufshcd_get_variant(hba); - - phy_power_off(host->generic_phy); -} - /* Resources */ static const 
struct ufshcd_res_info ufs_res_info[RES_MAX] = { {.name = "ufs_mem",}, @@ -1825,7 +1823,6 @@ static const struct ufs_hba_variant_ops ufs_hba_qcom_vops = { .device_reset = ufs_qcom_device_reset, .config_scaling_param = ufs_qcom_config_scaling_param, .program_key = ufs_qcom_ice_program_key, - .reinit_notify = ufs_qcom_reinit_notify, .mcq_config_resource = ufs_qcom_mcq_config_resource, .get_hba_mac = ufs_qcom_get_hba_mac, .op_runtime_config = ufs_qcom_op_runtime_config, diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index d650ae6b58d3..74e5b9960c54 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -329,7 +329,6 @@ struct ufs_pwr_mode_info { * @program_key: program or evict an inline encryption key * @fill_crypto_prdt: initialize crypto-related fields in the PRDT * @event_notify: called to notify important events - * @reinit_notify: called to notify reinit of UFSHCD during max gear switch * @mcq_config_resource: called to configure MCQ platform resources * @get_hba_mac: reports maximum number of outstanding commands supported by * the controller. Should be implemented for UFSHCI 4.0 or later @@ -381,7 +380,6 @@ struct ufs_hba_variant_ops { void *prdt, unsigned int num_segments); void (*event_notify)(struct ufs_hba *hba, enum ufs_event_type evt, void *data); - void (*reinit_notify)(struct ufs_hba *); int (*mcq_config_resource)(struct ufs_hba *hba); int (*get_hba_mac)(struct ufs_hba *hba); int (*op_runtime_config)(struct ufs_hba *hba); From bb9850704c043e48c86cc9df90ee102e8a338229 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 19 Dec 2024 22:20:42 +0530 Subject: [PATCH 605/807] scsi: ufs: core: Honor runtime/system PM levels if set by host controller drivers Otherwise, the default levels will override the levels set by the host controller drivers. Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20241219-ufs-qcom-suspend-fix-v3-2-63c4b95a70b9@linaro.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. 
Petersen --- drivers/ufs/core/ufshcd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index d1e19a2ccf49..9c26e8767515 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10590,14 +10590,17 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) } /* - * Set the default power management level for runtime and system PM. + * Set the default power management level for runtime and system PM if + * not set by the host controller drivers. * Default power saving mode is to keep UFS link in Hibern8 state * and UFS device in sleep state. */ - hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( + if (!hba->rpm_lvl) + hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( UFS_SLEEP_PWR_MODE, UIC_LINK_HIBERN8_STATE); - hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( + if (!hba->spm_lvl) + hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( UFS_SLEEP_PWR_MODE, UIC_LINK_HIBERN8_STATE); From 4f78a56af4c472834681759d4365fb93921da77d Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 19 Dec 2024 22:20:43 +0530 Subject: [PATCH 606/807] scsi: ufs: qcom: Allow passing platform specific OF data In order to allow platform specific flags and configurations, introduce the platform specific OF data and move the existing quirk UFSHCD_QUIRK_BROKEN_LSDBS_CAP for SM8550 and SM8650 SoCs. Reviewed-by: Avri Altman Reviewed-by: Neil Armstrong Tested-by: Amit Pundir # on SM8550-HDK Reviewed-by: Bart Van Assche Tested-by: Neil Armstrong # on SM8550-QRD Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20241219-ufs-qcom-suspend-fix-v3-3-63c4b95a70b9@linaro.org Signed-off-by: Martin K. 
Petersen --- drivers/ufs/host/ufs-qcom.c | 13 +++++++++---- drivers/ufs/host/ufs-qcom.h | 4 ++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index e770e7b9d239..7042322d55e9 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -871,6 +871,7 @@ static u32 ufs_qcom_get_ufs_hci_version(struct ufs_hba *hba) */ static void ufs_qcom_advertise_quirks(struct ufs_hba *hba) { + const struct ufs_qcom_drvdata *drvdata = of_device_get_match_data(hba->dev); struct ufs_qcom_host *host = ufshcd_get_variant(hba); if (host->hw_ver.major == 0x2) @@ -879,9 +880,8 @@ static void ufs_qcom_advertise_quirks(struct ufs_hba *hba) if (host->hw_ver.major > 0x3) hba->quirks |= UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH; - if (of_device_is_compatible(hba->dev->of_node, "qcom,sm8550-ufshc") || - of_device_is_compatible(hba->dev->of_node, "qcom,sm8650-ufshc")) - hba->quirks |= UFSHCD_QUIRK_BROKEN_LSDBS_CAP; + if (drvdata && drvdata->quirks) + hba->quirks |= drvdata->quirks; } static void ufs_qcom_set_phy_gear(struct ufs_qcom_host *host) @@ -1865,9 +1865,14 @@ static void ufs_qcom_remove(struct platform_device *pdev) platform_device_msi_free_irqs_all(hba->dev); } +static const struct ufs_qcom_drvdata ufs_qcom_sm8550_drvdata = { + .quirks = UFSHCD_QUIRK_BROKEN_LSDBS_CAP, +}; + static const struct of_device_id ufs_qcom_of_match[] __maybe_unused = { { .compatible = "qcom,ufshc" }, - { .compatible = "qcom,sm8550-ufshc" }, + { .compatible = "qcom,sm8550-ufshc", .data = &ufs_qcom_sm8550_drvdata }, + { .compatible = "qcom,sm8650-ufshc", .data = &ufs_qcom_sm8550_drvdata }, {}, }; MODULE_DEVICE_TABLE(of, ufs_qcom_of_match); diff --git a/drivers/ufs/host/ufs-qcom.h b/drivers/ufs/host/ufs-qcom.h index b9de170983c9..15f6dad8b27f 100644 --- a/drivers/ufs/host/ufs-qcom.h +++ b/drivers/ufs/host/ufs-qcom.h @@ -217,6 +217,10 @@ struct ufs_qcom_host { bool esi_enabled; }; +struct ufs_qcom_drvdata { + enum 
ufshcd_quirks quirks; +}; + static inline u32 ufs_qcom_get_debug_reg_offset(struct ufs_qcom_host *host, u32 reg) { From 3b2f56860b05bf0cea86af786fd9b7faa8fe3ef3 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 19 Dec 2024 22:20:44 +0530 Subject: [PATCH 607/807] scsi: ufs: qcom: Power down the controller/device during system suspend for SM8550/SM8650 SoCs SM8550 and SM8650 SoCs doesn't support UFS PHY retention. So once these SoCs reaches the low power state (CX power collapse) during system suspend, all the PHY hardware state gets lost. This leads to the UFS resume failure: ufshcd-qcom 1d84000.ufs: ufshcd_uic_hibern8_exit: hibern8 exit failed. ret = 5 ufshcd-qcom 1d84000.ufs: __ufshcd_wl_resume: hibern8 exit failed 5 ufs_device_wlun 0:0:0:49488: ufshcd_wl_resume failed: 5 ufs_device_wlun 0:0:0:49488: PM: dpm_run_callback(): scsi_bus_resume+0x0/0x84 returns 5 ufs_device_wlun 0:0:0:49488: PM: failed to resume async: error 5 With the default system suspend level of UFS_PM_LVL_3, the power domain for UFS PHY needs to be kept always ON to retain the state. But this would prevent these SoCs from reaching the CX power collapse state, leading to poor power saving during system suspend. So to fix this issue without affecting the power saving, set 'ufs_qcom_drvdata::no_phy_retention' to true which sets 'hba->spm_lvl' to UFS_PM_LVL_5 to allow both the controller and device (in turn the PHY) to be powered down during system suspend for these SoCs by default. Cc: stable@vger.kernel.org # 6.3 Fixes: 35cf1aaab169 ("arm64: dts: qcom: sm8550: Add UFS host controller and phy nodes") Fixes: 10e024671295 ("arm64: dts: qcom: sm8650: add interconnect dependent device nodes") Reported-by: Neil Armstrong Tested-by: Amit Pundir # on SM8550-HDK Reviewed-by: Bart Van Assche Tested-by: Neil Armstrong # on SM8550-QRD Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20241219-ufs-qcom-suspend-fix-v3-4-63c4b95a70b9@linaro.org Signed-off-by: Martin K. 
Petersen --- drivers/ufs/host/ufs-qcom.c | 5 +++++ drivers/ufs/host/ufs-qcom.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 7042322d55e9..91e94fe990b4 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -1069,6 +1069,7 @@ static int ufs_qcom_init(struct ufs_hba *hba) struct device *dev = hba->dev; struct ufs_qcom_host *host; struct ufs_clk_info *clki; + const struct ufs_qcom_drvdata *drvdata = of_device_get_match_data(hba->dev); host = devm_kzalloc(dev, sizeof(*host), GFP_KERNEL); if (!host) @@ -1148,6 +1149,9 @@ static int ufs_qcom_init(struct ufs_hba *hba) dev_warn(dev, "%s: failed to configure the testbus %d\n", __func__, err); + if (drvdata && drvdata->no_phy_retention) + hba->spm_lvl = UFS_PM_LVL_5; + return 0; out_variant_clear: @@ -1867,6 +1871,7 @@ static void ufs_qcom_remove(struct platform_device *pdev) static const struct ufs_qcom_drvdata ufs_qcom_sm8550_drvdata = { .quirks = UFSHCD_QUIRK_BROKEN_LSDBS_CAP, + .no_phy_retention = true, }; static const struct of_device_id ufs_qcom_of_match[] __maybe_unused = { diff --git a/drivers/ufs/host/ufs-qcom.h b/drivers/ufs/host/ufs-qcom.h index 15f6dad8b27f..919f53682beb 100644 --- a/drivers/ufs/host/ufs-qcom.h +++ b/drivers/ufs/host/ufs-qcom.h @@ -219,6 +219,7 @@ struct ufs_qcom_host { struct ufs_qcom_drvdata { enum ufshcd_quirks quirks; + bool no_phy_retention; }; static inline u32 From f0ed39830e6064d62f9c5393505677a26569bb56 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 20 Dec 2024 09:19:18 -0800 Subject: [PATCH 608/807] xe/oa: Fix query mode of operation for OAR/OAC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a set of squashed commits to facilitate smooth applying to stable. Each commit message is retained for reference. 
1) Allow a GGTT mapped batch to be submitted to user exec queue For a OA use case, one of the HW registers needs to be modified by submitting an MI_LOAD_REGISTER_IMM command to the users exec queue, so that the register is modified in the user's hardware context. In order to do this a batch that is mapped in GGTT, needs to be submitted to the user exec queue. Since all user submissions use q->vm and hence PPGTT, add some plumbing to enable submission of batches mapped in GGTT. v2: ggtt is zero-initialized, so no need to set it false (Matt Brost) 2) xe/oa: Use MI_LOAD_REGISTER_IMMEDIATE to enable OAR/OAC To enable OAR/OAC, a bit in RING_CONTEXT_CONTROL needs to be set. Setting this bit cause the context image size to change and if not done correct, can cause undesired hangs. Current code uses a separate exec_queue to modify this bit and is error-prone. As per HW recommendation, submit MI_LOAD_REGISTER_IMM to the target hardware context to modify the relevant bit. In v2 version, an attempt to submit everything to the user-queue was made, but it failed the unprivileged-single-ctx-counters test. It appears that the OACTXCONTROL must be modified from a remote context. In v3 version, all context specific register configurations were moved to use LOAD_REGISTER_IMMEDIATE and that seems to work well. This is a cleaner way, since we can now submit all configuration to user exec_queue and the fence handling is simplified. 
v2: (Matt) - set job->ggtt to true if create job is successful - unlock vm on job error (Ashutosh) - don't wait on job submission - use kernel exec queue where possible v3: (Ashutosh) - Fix checkpatch issues - Remove extra spaces/new-lines - Add Fixes: and Cc: tags - Reset context control bit when OA stream is closed - Submit all config via MI_LOAD_REGISTER_IMMEDIATE (Umesh) - Update commit message for v3 experiment - Squash patches for easier port to stable v4: (Ashutosh) - No need to pass q to xe_oa_submit_bb - Do not support exec queues with width > 1 - Fix disabling of CTX_CTRL_OAC_CONTEXT_ENABLE v5: (Ashutosh) - Drop reg_lri related comments - Use XE_OA_SUBMIT_NO_DEPS in xe_oa_load_with_lri Fixes: 8135f1c09dd2 ("drm/xe/oa: Don't reset OAC_CONTEXT_ENABLE on OA stream close") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost # commit 1 Reviewed-by: Ashutosh Dixit Cc: stable@vger.kernel.org Reviewed-by: Jonathan Cavitt Signed-off-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20241220171919.571528-2-umesh.nerlige.ramappa@intel.com (cherry picked from commit 55039832f98c7e05f1cf9e0d8c12b2490abd0f16) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_oa.c | 134 ++++++++---------------- drivers/gpu/drm/xe/xe_ring_ops.c | 5 +- drivers/gpu/drm/xe/xe_sched_job_types.h | 2 + 3 files changed, 51 insertions(+), 90 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index 8dd55798ab31..5cc0f6f9bc11 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -74,12 +74,6 @@ struct xe_oa_config { struct rcu_head rcu; }; -struct flex { - struct xe_reg reg; - u32 offset; - u32 value; -}; - struct xe_oa_open_param { struct xe_file *xef; u32 oa_unit_id; @@ -596,19 +590,38 @@ static __poll_t xe_oa_poll(struct file *file, poll_table *wait) return ret; } +static void xe_oa_lock_vma(struct xe_exec_queue *q) +{ + if (q->vm) { + down_read(&q->vm->lock); + xe_vm_lock(q->vm, false); + } +} + 
+static void xe_oa_unlock_vma(struct xe_exec_queue *q) +{ + if (q->vm) { + xe_vm_unlock(q->vm); + up_read(&q->vm->lock); + } +} + static struct dma_fence *xe_oa_submit_bb(struct xe_oa_stream *stream, enum xe_oa_submit_deps deps, struct xe_bb *bb) { + struct xe_exec_queue *q = stream->exec_q ?: stream->k_exec_q; struct xe_sched_job *job; struct dma_fence *fence; int err = 0; - /* Kernel configuration is issued on stream->k_exec_q, not stream->exec_q */ - job = xe_bb_create_job(stream->k_exec_q, bb); + xe_oa_lock_vma(q); + + job = xe_bb_create_job(q, bb); if (IS_ERR(job)) { err = PTR_ERR(job); goto exit; } + job->ggtt = true; if (deps == XE_OA_SUBMIT_ADD_DEPS) { for (int i = 0; i < stream->num_syncs && !err; i++) @@ -623,10 +636,13 @@ static struct dma_fence *xe_oa_submit_bb(struct xe_oa_stream *stream, enum xe_oa fence = dma_fence_get(&job->drm.s_fence->finished); xe_sched_job_push(job); + xe_oa_unlock_vma(q); + return fence; err_put_job: xe_sched_job_put(job); exit: + xe_oa_unlock_vma(q); return ERR_PTR(err); } @@ -675,63 +691,19 @@ static void xe_oa_free_configs(struct xe_oa_stream *stream) dma_fence_put(stream->last_fence); } -static void xe_oa_store_flex(struct xe_oa_stream *stream, struct xe_lrc *lrc, - struct xe_bb *bb, const struct flex *flex, u32 count) -{ - u32 offset = xe_bo_ggtt_addr(lrc->bo); - - do { - bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); - bb->cs[bb->len++] = offset + flex->offset * sizeof(u32); - bb->cs[bb->len++] = 0; - bb->cs[bb->len++] = flex->value; - - } while (flex++, --count); -} - -static int xe_oa_modify_ctx_image(struct xe_oa_stream *stream, struct xe_lrc *lrc, - const struct flex *flex, u32 count) +static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *reg_lri, u32 count) { struct dma_fence *fence; struct xe_bb *bb; int err; - bb = xe_bb_new(stream->gt, 4 * count, false); + bb = xe_bb_new(stream->gt, 2 * count + 1, false); if (IS_ERR(bb)) { err = PTR_ERR(bb); goto exit; } - 
xe_oa_store_flex(stream, lrc, bb, flex, count); - - fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_NO_DEPS, bb); - if (IS_ERR(fence)) { - err = PTR_ERR(fence); - goto free_bb; - } - xe_bb_free(bb, fence); - dma_fence_put(fence); - - return 0; -free_bb: - xe_bb_free(bb, NULL); -exit: - return err; -} - -static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *reg_lri) -{ - struct dma_fence *fence; - struct xe_bb *bb; - int err; - - bb = xe_bb_new(stream->gt, 3, false); - if (IS_ERR(bb)) { - err = PTR_ERR(bb); - goto exit; - } - - write_cs_mi_lri(bb, reg_lri, 1); + write_cs_mi_lri(bb, reg_lri, count); fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_NO_DEPS, bb); if (IS_ERR(fence)) { @@ -751,71 +723,55 @@ exit: static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable) { const struct xe_oa_format *format = stream->oa_buffer.format; - struct xe_lrc *lrc = stream->exec_q->lrc[0]; - u32 regs_offset = xe_lrc_regs_offset(lrc) / sizeof(u32); u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) | (enable ? OAR_OACONTROL_COUNTER_ENABLE : 0); - struct flex regs_context[] = { + struct xe_oa_reg reg_lri[] = { { OACTXCONTROL(stream->hwe->mmio_base), - stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1, enable ? OA_COUNTER_RESUME : 0, }, + { + OAR_OACONTROL, + oacontrol, + }, { RING_CONTEXT_CONTROL(stream->hwe->mmio_base), - regs_offset + CTX_CONTEXT_CONTROL, - _MASKED_BIT_ENABLE(CTX_CTRL_OAC_CONTEXT_ENABLE), + _MASKED_FIELD(CTX_CTRL_OAC_CONTEXT_ENABLE, + enable ? 
CTX_CTRL_OAC_CONTEXT_ENABLE : 0) }, }; - struct xe_oa_reg reg_lri = { OAR_OACONTROL, oacontrol }; - int err; - /* Modify stream hwe context image with regs_context */ - err = xe_oa_modify_ctx_image(stream, stream->exec_q->lrc[0], - regs_context, ARRAY_SIZE(regs_context)); - if (err) - return err; - - /* Apply reg_lri using LRI */ - return xe_oa_load_with_lri(stream, &reg_lri); + return xe_oa_load_with_lri(stream, reg_lri, ARRAY_SIZE(reg_lri)); } static int xe_oa_configure_oac_context(struct xe_oa_stream *stream, bool enable) { const struct xe_oa_format *format = stream->oa_buffer.format; - struct xe_lrc *lrc = stream->exec_q->lrc[0]; - u32 regs_offset = xe_lrc_regs_offset(lrc) / sizeof(u32); u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) | (enable ? OAR_OACONTROL_COUNTER_ENABLE : 0); - struct flex regs_context[] = { + struct xe_oa_reg reg_lri[] = { { OACTXCONTROL(stream->hwe->mmio_base), - stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1, enable ? OA_COUNTER_RESUME : 0, }, + { + OAC_OACONTROL, + oacontrol + }, { RING_CONTEXT_CONTROL(stream->hwe->mmio_base), - regs_offset + CTX_CONTEXT_CONTROL, - _MASKED_BIT_ENABLE(CTX_CTRL_OAC_CONTEXT_ENABLE) | + _MASKED_FIELD(CTX_CTRL_OAC_CONTEXT_ENABLE, + enable ? CTX_CTRL_OAC_CONTEXT_ENABLE : 0) | _MASKED_FIELD(CTX_CTRL_RUN_ALONE, enable ? 
CTX_CTRL_RUN_ALONE : 0), }, }; - struct xe_oa_reg reg_lri = { OAC_OACONTROL, oacontrol }; - int err; /* Set ccs select to enable programming of OAC_OACONTROL */ xe_mmio_write32(&stream->gt->mmio, __oa_regs(stream)->oa_ctrl, __oa_ccs_select(stream)); - /* Modify stream hwe context image with regs_context */ - err = xe_oa_modify_ctx_image(stream, stream->exec_q->lrc[0], - regs_context, ARRAY_SIZE(regs_context)); - if (err) - return err; - - /* Apply reg_lri using LRI */ - return xe_oa_load_with_lri(stream, &reg_lri); + return xe_oa_load_with_lri(stream, reg_lri, ARRAY_SIZE(reg_lri)); } static int xe_oa_configure_oa_context(struct xe_oa_stream *stream, bool enable) @@ -2066,8 +2022,8 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f if (XE_IOCTL_DBG(oa->xe, !param.exec_q)) return -ENOENT; - if (param.exec_q->width > 1) - drm_dbg(&oa->xe->drm, "exec_q->width > 1, programming only exec_q->lrc[0]\n"); + if (XE_IOCTL_DBG(oa->xe, param.exec_q->width > 1)) + return -EOPNOTSUPP; } /* diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index 0be4f489d3e1..9f327f27c072 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -221,7 +221,10 @@ static int emit_pipe_imm_ggtt(u32 addr, u32 value, bool stall_only, u32 *dw, static u32 get_ppgtt_flag(struct xe_sched_job *job) { - return job->q->vm ? BIT(8) : 0; + if (job->q->vm && !job->ggtt) + return BIT(8); + + return 0; } static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i) diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h index f13f333f00be..d942b20a9f29 100644 --- a/drivers/gpu/drm/xe/xe_sched_job_types.h +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -56,6 +56,8 @@ struct xe_sched_job { u32 migrate_flush_flags; /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */ bool ring_ops_flush_tlb; + /** @ggtt: mapped in ggtt. 
*/ + bool ggtt; /** @ptrs: per instance pointers. */ struct xe_job_ptrs ptrs[]; }; From 0bc21e701a6ffacfdde7f04f87d664d82e8a13bf Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Thu, 2 Jan 2025 06:30:03 -0800 Subject: [PATCH 609/807] MAINTAINERS: Remove Olof from SoC maintainers I haven't been an active participant for a couple of years now, and after discussions at Linux Plumbers in 2024, Arnd is getting fresh help from a few more participants. It's time to remove myself, and spare myself from patches and pull requests in my inbox. Signed-off-by: Olof Johansson Cc: Arnd Bergmann Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 910305c11e8a..c575de4903db 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1797,7 +1797,6 @@ F: include/uapi/linux/if_arcnet.h ARM AND ARM64 SoC SUB-ARCHITECTURES (COMMON PARTS) M: Arnd Bergmann -M: Olof Johansson L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: soc@lists.linux.dev S: Maintained From d65474033740ded0a4fe9a097fce72328655b41d Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Tue, 31 Dec 2024 11:37:31 +0000 Subject: [PATCH 610/807] fgraph: Add READ_ONCE() when accessing fgraph_array[] In __ftrace_return_to_handler(), a loop iterates over the fgraph_array[] elements, which are fgraph_ops. The loop checks if an element is a fgraph_stub to prevent using a fgraph_stub afterward. However, if the compiler reloads fgraph_array[] after this check, it might race with an update to fgraph_array[] that introduces a fgraph_stub. This could result in the stub being processed, but the stub contains a null "func_hash" field, leading to a NULL pointer dereference. To ensure that the gops compared against the fgraph_stub matches the gops processed later, add a READ_ONCE(). A similar patch appears in commit 63a8dfb ("function_graph: Add READ_ONCE() when accessing fgraph_array[]"). 
Cc: stable@vger.kernel.org Fixes: 37238abe3cb47 ("ftrace/function_graph: Pass fgraph_ops to function graph callbacks") Link: https://lore.kernel.org/20241231113731.277668-1-zilin@seu.edu.cn Signed-off-by: Zilin Guan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index ddedcb50917f..30e3ddc8a8a8 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -833,7 +833,7 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs #endif { for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) { - struct fgraph_ops *gops = fgraph_array[i]; + struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]); if (gops == &fgraph_stub) continue; From 789a8cff8d2dbe4b5c617c3004b5eb63fa7a3b35 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Thu, 2 Jan 2025 04:08:20 +0900 Subject: [PATCH 611/807] ftrace: Fix function profiler's filtering functionality Commit c132be2c4fcc ("function_graph: Have the instances use their own ftrace_ops for filtering"), function profiler (enabled via function_profile_enabled) has been showing statistics for all functions, ignoring set_ftrace_filter settings. While tracers are instantiated, the function profiler is not. Therefore, it should use the global set_ftrace_filter for consistency. This patch modifies the function profiler to use the global filter, fixing the filtering functionality. 
Before (filtering not working): ``` root@localhost:~# echo 'vfs*' > /sys/kernel/tracing/set_ftrace_filter root@localhost:~# echo 1 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# sleep 1 root@localhost:~# echo 0 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# head /sys/kernel/tracing/trace_stat/* Function Hit Time Avg s^2 -------- --- ---- --- --- schedule 314 22290594 us 70989.15 us 40372231 us x64_sys_call 1527 8762510 us 5738.382 us 3414354 us schedule_hrtimeout_range 176 8665356 us 49234.98 us 405618876 us __x64_sys_ppoll 324 5656635 us 17458.75 us 19203976 us do_sys_poll 324 5653747 us 17449.83 us 19214945 us schedule_timeout 67 5531396 us 82558.15 us 2136740827 us __x64_sys_pselect6 12 3029540 us 252461.7 us 63296940171 us do_pselect.constprop.0 12 3029532 us 252461.0 us 63296952931 us ``` After (filtering working): ``` root@localhost:~# echo 'vfs*' > /sys/kernel/tracing/set_ftrace_filter root@localhost:~# echo 1 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# sleep 1 root@localhost:~# echo 0 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# head /sys/kernel/tracing/trace_stat/* Function Hit Time Avg s^2 -------- --- ---- --- --- vfs_write 462 68476.43 us 148.217 us 25874.48 us vfs_read 641 9611.356 us 14.994 us 28868.07 us vfs_fstat 890 878.094 us 0.986 us 1.667 us vfs_fstatat 227 757.176 us 3.335 us 18.928 us vfs_statx 226 610.610 us 2.701 us 17.749 us vfs_getattr_nosec 1187 460.919 us 0.388 us 0.326 us vfs_statx_path 297 343.287 us 1.155 us 11.116 us vfs_rename 6 291.575 us 48.595 us 9889.236 us ``` Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250101190820.72534-1-enjuk@amazon.com Fixes: c132be2c4fcc ("function_graph: Have the instances use their own ftrace_ops for filtering") Signed-off-by: Kohei Enju Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ftrace.c 
b/kernel/trace/ftrace.c index 9b17efb1a87d..2e113f8b13a2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -902,16 +902,13 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, } static struct fgraph_ops fprofiler_ops = { - .ops = { - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(fprofiler_ops.ops) - }, .entryfunc = &profile_graph_entry, .retfunc = &profile_graph_return, }; static int register_ftrace_profiler(void) { + ftrace_ops_set_global_filter(&fprofiler_ops.ops); return register_ftrace_graph(&fprofiler_ops); } @@ -922,12 +919,11 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(ftrace_profile_ops) }; static int register_ftrace_profiler(void) { + ftrace_ops_set_global_filter(&ftrace_profile_ops); return register_ftrace_function(&ftrace_profile_ops); } From c6e60a0a68b7e6b3c7e33863a16e8e88ba9eee6f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 2 Jan 2025 16:32:51 -0700 Subject: [PATCH 612/807] io_uring/net: always initialize kmsg->msg.msg_inq upfront syzbot reports that ->msg_inq may get used uninitialized from the following path: BUG: KMSAN: uninit-value in io_recv_buf_select io_uring/net.c:1094 [inline] BUG: KMSAN: uninit-value in io_recv+0x930/0x1f90 io_uring/net.c:1158 io_recv_buf_select io_uring/net.c:1094 [inline] io_recv+0x930/0x1f90 io_uring/net.c:1158 io_issue_sqe+0x420/0x2130 io_uring/io_uring.c:1740 io_queue_sqe io_uring/io_uring.c:1950 [inline] io_req_task_submit+0xfa/0x1d0 io_uring/io_uring.c:1374 io_handle_tw_list+0x55f/0x5c0 io_uring/io_uring.c:1057 tctx_task_work_run+0x109/0x3e0 io_uring/io_uring.c:1121 tctx_task_work+0x6d/0xc0 io_uring/io_uring.c:1139 task_work_run+0x268/0x310 kernel/task_work.c:239 io_run_task_work+0x43a/0x4a0 io_uring/io_uring.h:343 io_cqring_wait io_uring/io_uring.c:2527 [inline] __do_sys_io_uring_enter io_uring/io_uring.c:3439 [inline] 
__se_sys_io_uring_enter+0x204f/0x4ce0 io_uring/io_uring.c:3330 __x64_sys_io_uring_enter+0x11f/0x1a0 io_uring/io_uring.c:3330 x64_sys_call+0xce5/0x3c30 arch/x86/include/generated/asm/syscalls_64.h:427 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f and it is correct, as it's never initialized upfront. Hence the first submission can end up using it uninitialized, if the recv wasn't successful and the networking stack didn't honor ->msg_get_inq being set and filling in the output value of ->msg_inq as requested. Set it to 0 upfront when it's allocated, just to silence this KMSAN warning. There's no side effect of using it uninitialized, it'll just potentially cause the next receive to use a recv value hint that's not accurate. Fixes: c6f32c7d9e09 ("io_uring/net: get rid of ->prep_async() for receive side") Reported-by: syzbot+068ff190354d2f74892f@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/net.c b/io_uring/net.c index df1f7dc6f1c8..c6cd38cc5dc4 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -754,6 +754,7 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) if (req->opcode == IORING_OP_RECV) { kmsg->msg.msg_name = NULL; kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_inq = 0; kmsg->msg.msg_control = NULL; kmsg->msg.msg_get_inq = 1; kmsg->msg.msg_controllen = 0; From a8620de72e5676993ec3a3b975f7c10908f5f60f Mon Sep 17 00:00:00 2001 From: Liang Jie Date: Mon, 30 Dec 2024 17:37:09 +0800 Subject: [PATCH 613/807] net: sfc: Correct key_len for efx_tc_ct_zone_ht_params In efx_tc_ct_zone_ht_params, the key_len was previously set to offsetof(struct efx_tc_ct_zone, linkage). This calculation is incorrect because it includes any padding between the zone field and the linkage field due to structure alignment, which can vary between systems. 
This patch updates key_len to use sizeof_field(struct efx_tc_ct_zone, zone), ensuring that the hash table correctly uses the zone as the key. This fix prevents potential hash lookup errors and improves connection tracking reliability. Fixes: c3bb5c6acd4e ("sfc: functions to register for conntrack zone offload") Signed-off-by: Liang Jie Acked-by: Edward Cree Link: https://patch.msgid.link/20241230093709.3226854-1-buaajxlj@163.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/tc_conntrack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/tc_conntrack.c b/drivers/net/ethernet/sfc/tc_conntrack.c index d90206f27161..c0603f54cec3 100644 --- a/drivers/net/ethernet/sfc/tc_conntrack.c +++ b/drivers/net/ethernet/sfc/tc_conntrack.c @@ -16,7 +16,7 @@ static int efx_tc_flow_block(enum tc_setup_type type, void *type_data, void *cb_priv); static const struct rhashtable_params efx_tc_ct_zone_ht_params = { - .key_len = offsetof(struct efx_tc_ct_zone, linkage), + .key_len = sizeof_field(struct efx_tc_ct_zone, zone), .key_offset = 0, .head_offset = offsetof(struct efx_tc_ct_zone, linkage), }; From 68e068cabd2c6c533ef934c2e5151609cf6ecc6d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 1 Jan 2025 11:47:40 -0500 Subject: [PATCH 614/807] net: reenable NETIF_F_IPV6_CSUM offload for BIG TCP packets The blamed commit disabled hardware offload of IPv6 packets with extension headers on devices that advertise NETIF_F_IPV6_CSUM, based on the definition of that feature in skbuff.h: * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set.
This feature is being * DEPRECATED (see below). The change causes skb_warn_bad_offload to fire for BIG TCP packets. [ 496.310233] WARNING: CPU: 13 PID: 23472 at net/core/dev.c:3129 skb_warn_bad_offload+0xc4/0xe0 [ 496.310297] ? skb_warn_bad_offload+0xc4/0xe0 [ 496.310300] skb_checksum_help+0x129/0x1f0 [ 496.310303] skb_csum_hwoffload_help+0x150/0x1b0 [ 496.310306] validate_xmit_skb+0x159/0x270 [ 496.310309] validate_xmit_skb_list+0x41/0x70 [ 496.310312] sch_direct_xmit+0x5c/0x250 [ 496.310317] __qdisc_run+0x388/0x620 BIG TCP introduced an IPV6_TLV_JUMBO IPv6 extension header to communicate packet length, as this is an IPv6 jumbogram. But, the feature is only enabled on devices that support BIG TCP TSO. The header is only present for PF_PACKET taps like tcpdump, and not transmitted by physical devices. For this specific case of extension headers that are not transmitted, return to the situation before the blamed commit and support hardware offload. ipv6_has_hopopt_jumbo() tests not only whether this header is present, but also that it is the only extension header before a terminal (L4) header. 
Fixes: 04c20a9356f2 ("net: skip offload for NETIF_F_IPV6_CSUM if ipv6 header contains extension") Reported-by: syzbot Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89iK1hdC3Nt8KPhOtTF8vCPc1AHDCtse_BTNki1pWxAByTQ@mail.gmail.com/ Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250101164909.1331680-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 45a8c3dd4a64..faa23042df38 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3642,8 +3642,10 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) goto sw_checksum; + switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): From 5b0af621c3f6ef9261cf6067812f2fd9943acb4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Dec 2024 16:05:27 +0000 Subject: [PATCH 615/807] net: restrict SO_REUSEPORT to inet sockets After blamed commit, crypto sockets could accidentally be destroyed from RCU callback, as spotted by syzbot [1]. Trying to acquire a mutex in RCU callback is not allowed. Restrict SO_REUSEPORT socket option to inet sockets. v1 of this patch supported TCP, UDP and SCTP sockets, but fcnal-test.sh test needed RAW and ICMP support.
[1] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:562 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 24, name: ksoftirqd/1 preempt_count: 100, expected: 0 RCU nest depth: 0, expected: 0 1 lock held by ksoftirqd/1/24: #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_do_batch kernel/rcu/tree.c:2561 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_core+0xa37/0x17a0 kernel/rcu/tree.c:2823 Preemption disabled at: [] softirq_handle_begin kernel/softirq.c:402 [inline] [] handle_softirqs+0x128/0x9b0 kernel/softirq.c:537 CPU: 1 UID: 0 PID: 24 Comm: ksoftirqd/1 Not tainted 6.13.0-rc3-syzkaller-00174-ga024e377efed #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 __might_resched+0x5d4/0x780 kernel/sched/core.c:8758 __mutex_lock_common kernel/locking/mutex.c:562 [inline] __mutex_lock+0x131/0xee0 kernel/locking/mutex.c:735 crypto_put_default_null_skcipher+0x18/0x70 crypto/crypto_null.c:179 aead_release+0x3d/0x50 crypto/algif_aead.c:489 alg_do_release crypto/af_alg.c:118 [inline] alg_sock_destruct+0x86/0xc0 crypto/af_alg.c:502 __sk_destruct+0x58/0x5f0 net/core/sock.c:2260 rcu_do_batch kernel/rcu/tree.c:2567 [inline] rcu_core+0xaaa/0x17a0 kernel/rcu/tree.c:2823 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 run_ksoftirqd+0xca/0x130 kernel/softirq.c:950 smpboot_thread_fn+0x544/0xa30 kernel/smpboot.c:164 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: 8c7138b33e5c ("net: Unpublish sk from sk_reuseport_cb before call_rcu") Reported-by: syzbot+b3e02953598f447d4d2a@syzkaller.appspotmail.com Closes: 
https://lore.kernel.org/netdev/6772f2f4.050a0220.2f3838.04cb.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241231160527.3994168-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index 74729d20cd00..be84885f9290 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1295,7 +1295,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: - sk->sk_reuseport = valbool; + if (valbool && !sk_is_inet(sk)) + ret = -EOPNOTSUPP; + else + sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); From a7af435df0e04cfb4a4004136d597c42639a2ae7 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Sun, 29 Dec 2024 17:46:58 +0100 Subject: [PATCH 616/807] net: wwan: iosm: Properly check for valid exec stage in ipc_mmio_init() ipc_mmio_init() used the post-decrement operator in its loop continuing condition of "retries" counter being "> 0", which meant that when this condition caused loop exit "retries" counter reached -1. But the later valid exec stage failure check only tests for "retries" counter being exactly zero, so it didn't trigger in this case (but would wrongly trigger if the code reaches a valid exec stage in the very last loop iteration). Fix this by using the pre-decrement operator instead, so the loop counter is exactly zero on valid exec stage failure. Fixes: dc0514f5d828 ("net: iosm: mmio scratchpad") Signed-off-by: Maciej S. 
Szmigiero Link: https://patch.msgid.link/8b19125a825f9dcdd81c667c1e5c48ba28d505a6.1735490770.git.mail@maciej.szmigiero.name Signed-off-by: Jakub Kicinski --- drivers/net/wwan/iosm/iosm_ipc_mmio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wwan/iosm/iosm_ipc_mmio.c b/drivers/net/wwan/iosm/iosm_ipc_mmio.c index 63eb08c43c05..6764c13530b9 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_mmio.c +++ b/drivers/net/wwan/iosm/iosm_ipc_mmio.c @@ -104,7 +104,7 @@ struct iosm_mmio *ipc_mmio_init(void __iomem *mmio, struct device *dev) break; msleep(20); - } while (retries-- > 0); + } while (--retries > 0); if (!retries) { dev_err(ipc_mmio->dev, "invalid exec stage %X", stage); From 77ee7a6d16b6ec07b5c3ae2b6b60a24c1afbed09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:03 +0000 Subject: [PATCH 617/807] af_packet: fix vlan_get_tci() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_tci() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8da482 len:32 put:14 head:ffff88807a1d5800 data:ffff88807a1d5810 tail:0x14 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 5880 Comm: syz-executor172 Not tainted 6.13.0-rc3-syzkaller-00762-g9268abe611b0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 9e 6c 26 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 3a 5a 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc90003baf5b8 EFLAGS: 00010286 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 8565c1eec37aa000 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802616fb50 R08: ffffffff817f0a4c R09: 1ffff92000775e50 R10: dffffc0000000000 R11: fffff52000775e51 R12: 0000000000000140 R13: ffff88807a1d5800 R14: ffff88807a1d5810 R15: 0000000000000014 FS: 00007fa03261f6c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffd65753000 CR3: 0000000031720000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_tci+0x272/0x550 net/packet/af_packet.c:565 packet_recvmsg+0x13c9/0x1ef0 net/packet/af_packet.c:3616 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1066 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2814 ___sys_recvmsg net/socket.c:2856 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2951 __sys_recvmmsg net/socket.c:3025 [inline] __do_sys_recvmmsg net/socket.c:3048 [inline] __se_sys_recvmmsg net/socket.c:3041 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3041 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware 
offloading") Reported-by: syzbot+8400677f3fd43f37d3bc@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c6.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 886c0dd47b66..e2e34a49e98d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -538,10 +538,8 @@ static void *packet_current_frame(struct packet_sock *po, return packet_lookup_frame(po, rb, rb->head, status); } -static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev) +static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) { - u8 *skb_orig_data = skb->data; - int skb_orig_len = skb->len; struct vlan_hdr vhdr, *vh; unsigned int header_len; @@ -562,12 +560,8 @@ static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev) else return 0; - skb_push(skb, skb->data - skb_mac_header(skb)); - vh = skb_header_pointer(skb, header_len, sizeof(vhdr), &vhdr); - if (skb_orig_data != skb->data) { - skb->data = skb_orig_data; - skb->len = skb_orig_len; - } + vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len, + sizeof(vhdr), &vhdr); if (unlikely(!vh)) return 0; From f91a5b8089389eb408501af2762f168c3aaa7b79 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:04 +0000 Subject: [PATCH 618/807] af_packet: fix vlan_get_protocol_dgram() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_protocol_dgram() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. 
[1] skbuff: skb_under_panic: text:ffffffff8a8ccd05 len:29 put:14 head:ffff88807fc8e400 data:ffff88807fc8e3f4 tail:0x11 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5892 Comm: syz-executor883 Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 86 d5 25 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 5a 69 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc900038d7638 EFLAGS: 00010282 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 609ffd18ea660600 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802483c8d0 R08: ffffffff817f0a8c R09: 1ffff9200071ae60 R10: dffffc0000000000 R11: fffff5200071ae61 R12: 0000000000000140 R13: ffff88807fc8e400 R14: ffff88807fc8e3f4 R15: 0000000000000011 FS: 00007fbac5e006c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbac5e00d58 CR3: 000000001238e000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_protocol_dgram+0x165/0x290 net/packet/af_packet.c:585 packet_recvmsg+0x948/0x1ef0 net/packet/af_packet.c:3552 sock_recvmsg_nosec net/socket.c:1033 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1055 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2803 ___sys_recvmsg net/socket.c:2845 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2940 __sys_recvmmsg net/socket.c:3014 [inline] __do_sys_recvmmsg net/socket.c:3037 [inline] __se_sys_recvmmsg net/socket.c:3030 [inline] 
__x64_sys_recvmmsg+0x199/0x250 net/socket.c:3030 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+74f70bb1cb968bf09e4f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c5.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 16 +++++++++++++--- net/packet/af_packet.c | 16 ++++------------ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed9..d65b5d71b93b 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -585,13 +585,16 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol + * @mac_offset: MAC offset * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. 
*/ -static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, - int *depth) +static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb, + __be16 type, + int mac_offset, + int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; @@ -610,7 +613,8 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, do { struct vlan_hdr vhdr, *vh; - vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + vh = skb_header_pointer(skb, mac_offset + vlan_depth, + sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; @@ -625,6 +629,12 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, return type; } +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, + int *depth) +{ + return __vlan_get_protocol_offset(skb, type, 0, depth); +} + /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e2e34a49e98d..2d73769d67f4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -568,21 +568,13 @@ static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) return ntohs(vh->h_vlan_TCI); } -static __be16 vlan_get_protocol_dgram(struct sk_buff *skb) +static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) { __be16 proto = skb->protocol; - if (unlikely(eth_type_vlan(proto))) { - u8 *skb_orig_data = skb->data; - int skb_orig_len = skb->len; - - skb_push(skb, skb->data - skb_mac_header(skb)); - proto = __vlan_get_protocol(skb, proto, NULL); - if (skb_orig_data != skb->data) { - skb->data = skb_orig_data; - skb->len = skb_orig_len; - } - } + if (unlikely(eth_type_vlan(proto))) + proto = __vlan_get_protocol_offset(skb, proto, + skb_mac_offset(skb), NULL); return proto; } From 260466b576bca0081a7d4acecc8e93687aa22d0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:28:49 +0000 Subject: [PATCH 
619/807] ila: serialize calls to nf_register_net_hooks() syzbot found a race in ila_add_mapping() [1] commit 031ae72825ce ("ila: call nf_unregister_net_hooks() sooner") attempted to fix a similar issue. Looking at the syzbot repro, we have concurrent ILA_CMD_ADD commands. Add a mutex to make sure at most one thread is calling nf_register_net_hooks(). [1] BUG: KASAN: slab-use-after-free in rht_key_hashfn include/linux/rhashtable.h:159 [inline] BUG: KASAN: slab-use-after-free in __rhashtable_lookup.constprop.0+0x426/0x550 include/linux/rhashtable.h:604 Read of size 4 at addr ffff888028f40008 by task dhcpcd/5501 CPU: 1 UID: 0 PID: 5501 Comm: dhcpcd Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xc3/0x620 mm/kasan/report.c:489 kasan_report+0xd9/0x110 mm/kasan/report.c:602 rht_key_hashfn include/linux/rhashtable.h:159 [inline] __rhashtable_lookup.constprop.0+0x426/0x550 include/linux/rhashtable.h:604 rhashtable_lookup include/linux/rhashtable.h:646 [inline] rhashtable_lookup_fast include/linux/rhashtable.h:672 [inline] ila_lookup_wildcards net/ipv6/ila/ila_xlat.c:127 [inline] ila_xlat_addr net/ipv6/ila/ila_xlat.c:652 [inline] ila_nf_input+0x1ee/0x620 net/ipv6/ila/ila_xlat.c:185 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xbb/0x200 net/netfilter/core.c:626 nf_hook.constprop.0+0x42e/0x750 include/linux/netfilter.h:269 NF_HOOK include/linux/netfilter.h:312 [inline] ipv6_rcv+0xa4/0x680 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core+0x12e/0x1e0 net/core/dev.c:5672 __netif_receive_skb+0x1d/0x160 net/core/dev.c:5785 process_backlog+0x443/0x15f0 net/core/dev.c:6117 __napi_poll.constprop.0+0xb7/0x550 net/core/dev.c:6883 napi_poll net/core/dev.c:6952 [inline] 
net_rx_action+0xa94/0x1010 net/core/dev.c:7074 handle_softirqs+0x213/0x8f0 kernel/softirq.c:561 __do_softirq kernel/softirq.c:595 [inline] invoke_softirq kernel/softirq.c:435 [inline] __irq_exit_rcu+0x109/0x170 kernel/softirq.c:662 irq_exit_rcu+0x9/0x30 kernel/softirq.c:678 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0xa4/0xc0 arch/x86/kernel/apic/apic.c:1049 Fixes: 7f00feaf1076 ("ila: Add generic ILA translation facility") Reported-by: syzbot+47e761d22ecf745f72b9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c9ae.050a0220.2f3838.04c7.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Tom Herbert Link: https://patch.msgid.link/20241230162849.2795486-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ila/ila_xlat.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 7646e401c630..1d41b2ab4884 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -195,6 +195,8 @@ static const struct nf_hook_ops ila_nf_hook_ops[] = { }, }; +static DEFINE_MUTEX(ila_mutex); + static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); @@ -202,16 +204,20 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; - if (!ilan->xlat.hooks_registered) { + if (!READ_ONCE(ilan->xlat.hooks_registered)) { /* We defer registering net hooks in the namespace until the * first mapping is added. 
*/ - err = nf_register_net_hooks(net, ila_nf_hook_ops, - ARRAY_SIZE(ila_nf_hook_ops)); + mutex_lock(&ila_mutex); + if (!ilan->xlat.hooks_registered) { + err = nf_register_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); + if (!err) + WRITE_ONCE(ilan->xlat.hooks_registered, true); + } + mutex_unlock(&ila_mutex); if (err) return err; - - ilan->xlat.hooks_registered = true; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); From 449e6912a2522af672e99992e1201a454910864e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:30 +0100 Subject: [PATCH 620/807] mptcp: fix recvbuffer adjust on sleeping rcvmsg If the recvmsg() blocks after receiving some data - i.e. due to SO_RCVLOWAT - the MPTCP code will attempt multiple times to adjust the receive buffer size, wrongly accounting every time the cumulative of received data - instead of accounting only for the delta. Address the issue moving mptcp_rcv_space_adjust just after the data reception and passing it only the just received bytes. This also removes an unneeded difference between the TCP and MPTCP RX code path implementation. 
Fixes: 581302298524 ("mptcp: error out earlier on disconnect") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-1-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 08a72242428c..27afdb7e2071 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1939,6 +1939,8 @@ do_error: goto out; } +static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); + static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, @@ -1992,6 +1994,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, break; } + mptcp_rcv_space_adjust(msk, copied); return copied; } @@ -2268,7 +2271,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); - mptcp_rcv_space_adjust(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2276,8 +2278,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } - mptcp_rcv_space_adjust(msk, copied); - out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) From 551844f26da2a9f76c0a698baaffa631d1178645 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:31 +0100 Subject: [PATCH 621/807] mptcp: don't always assume copied data in mptcp_cleanup_rbuf() Under some corner cases the MPTCP protocol can end-up invoking mptcp_cleanup_rbuf() when no data has been copied, but such helper assumes the opposite condition. Explicitly drop such assumption and performs the costly call only when strictly needed - before releasing the msk socket lock. 
Fixes: fd8976790a6c ("mptcp: be careful on MPTCP-level ack.") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-2-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 27afdb7e2071..5307fff9d995 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -528,13 +528,13 @@ static void mptcp_send_ack(struct mptcp_sock *msk) mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } -static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) +static void mptcp_subflow_cleanup_rbuf(struct sock *ssk, int copied) { bool slow; slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) - tcp_cleanup_rbuf(ssk, 1); + tcp_cleanup_rbuf(ssk, copied); unlock_sock_fast(ssk, slow); } @@ -551,7 +551,7 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } -static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) +static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) { int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; @@ -559,14 +559,14 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) int space = __mptcp_space(sk); bool cleanup, rx_empty; - cleanup = (space > 0) && (space >= (old_space << 1)); - rx_empty = !__mptcp_rmem(sk); + cleanup = (space > 0) && (space >= (old_space << 1)) && copied; + rx_empty = !__mptcp_rmem(sk) && copied; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) - mptcp_subflow_cleanup_rbuf(ssk); + mptcp_subflow_cleanup_rbuf(ssk, copied); } } @@ -2220,9 +2220,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - 
/* be sure to advertise window change */ - mptcp_cleanup_rbuf(msk); - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; @@ -2271,6 +2268,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); + mptcp_cleanup_rbuf(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2278,6 +2276,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } + mptcp_cleanup_rbuf(msk, copied); + out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) From 56b824eb49d6258aa0bad09a406ceac3f643cdae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:32 +0100 Subject: [PATCH 622/807] mptcp: prevent excessive coalescing on receive Currently the skb size after coalescing is only limited by the skb layout (the skb must not carry frag_list). A single coalesced skb covering several MSS can potentially fill completely the receive buffer. In such a case, the snd win will zero until the receive buffer will be empty again, affecting tput badly. 
Fixes: 8268ed4c9d19 ("mptcp: introduce and use mptcp_try_coalesce()") Cc: stable@vger.kernel.org # please delay 2 weeks after 6.13-final release Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-3-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5307fff9d995..1b2e7cbb577f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -136,6 +136,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, int delta; if (MPTCP_SKB_CB(from)->offset || + ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || !skb_try_coalesce(to, from, &fragstolen, &delta)) return false; From 9facce84f4062f782ebde18daa7006a23d40b607 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Mon, 23 Dec 2024 20:45:49 +0530 Subject: [PATCH 623/807] net: ti: icssg-prueth: Fix firmware load sequence. Timesync related operations are run in PRU0 cores for both ICSSG SLICE0 and SLICE1. Currently whenever any ICSSG interface comes up we load the respective firmwares to PRU cores and whenever interface goes down, we stop the respective cores. Due to this, when SLICE0 goes down while SLICE1 is still active, PRU0 firmwares are unloaded and PRU0 core is stopped. This results in clock jump for SLICE1 interface as the timesync related operations are no longer running. As there are interdependencies between SLICE0 and SLICE1 firmwares, fix this by running both PRU0 and PRU1 firmwares as long as at least 1 ICSSG interface is up. Add new flag in prueth struct to check if all firmwares are running and remove the old flag (fw_running). Use emacs_initialized as reference count to load the firmwares for the first and last interface up/down.
Move the init_emac_mode and fw_offload_mode APIs outside of icssg_config to the icssg_common_start API, as they need to be called only once per firmware boot. Change prueth_emac_restart() to return an error code and add error prints inside the callers of this function in case of any failures. Move prueth_emac_stop() from common to sr1 driver. The sr1 and sr2 drivers have different logic for stopping the firmwares. While the sr1 driver depends on the emac structure to stop the corresponding pru cores for that slice, for sr2 all the pru cores of both slices are stopped, which does not depend on emac. So the prueth_emac_stop() function is no longer common and can be moved to the sr1 driver. Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") Signed-off-by: MD Danish Anwar Signed-off-by: Meghana Malladi Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/icssg/icssg_common.c | 25 -- drivers/net/ethernet/ti/icssg/icssg_config.c | 41 ++- drivers/net/ethernet/ti/icssg/icssg_config.h | 1 + drivers/net/ethernet/ti/icssg/icssg_prueth.c | 281 ++++++++++++------ drivers/net/ethernet/ti/icssg/icssg_prueth.h | 5 +- .../net/ethernet/ti/icssg/icssg_prueth_sr1.c | 24 +- 6 files changed, 246 insertions(+), 131 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_common.c b/drivers/net/ethernet/ti/icssg/icssg_common.c index fdebeb2f84e0..74f0f200a89d 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_common.c +++ b/drivers/net/ethernet/ti/icssg/icssg_common.c @@ -855,31 +855,6 @@ irqreturn_t prueth_rx_irq(int irq, void *dev_id) } EXPORT_SYMBOL_GPL(prueth_rx_irq); -void prueth_emac_stop(struct prueth_emac *emac) -{ - struct prueth *prueth = emac->prueth; - int slice; - - switch (emac->port_id) { - case PRUETH_PORT_MII0: - slice = ICSS_SLICE0; - break; - case PRUETH_PORT_MII1: - slice = ICSS_SLICE1; - break; - default: - netdev_err(emac->ndev, "invalid port\n"); - return; - } - - emac->fw_running = 0; - if (!emac->is_sr1) - rproc_shutdown(prueth->txpru[slice]); - 
rproc_shutdown(prueth->rtu[slice]); - rproc_shutdown(prueth->pru[slice]); -} -EXPORT_SYMBOL_GPL(prueth_emac_stop); - void prueth_cleanup_tx_ts(struct prueth_emac *emac) { int i; diff --git a/drivers/net/ethernet/ti/icssg/icssg_config.c b/drivers/net/ethernet/ti/icssg/icssg_config.c index 5d2491c2943a..ddfd1c02a885 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_config.c +++ b/drivers/net/ethernet/ti/icssg/icssg_config.c @@ -397,7 +397,7 @@ static int prueth_emac_buffer_setup(struct prueth_emac *emac) return 0; } -static void icssg_init_emac_mode(struct prueth *prueth) +void icssg_init_emac_mode(struct prueth *prueth) { /* When the device is configured as a bridge and it is being brought * back to the emac mode, the host mac address has to be set as 0. @@ -406,9 +406,6 @@ static void icssg_init_emac_mode(struct prueth *prueth) int i; u8 mac[ETH_ALEN] = { 0 }; - if (prueth->emacs_initialized) - return; - /* Set VLAN TABLE address base */ regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, SMEM_VLAN_OFFSET_MASK, addr << SMEM_VLAN_OFFSET); @@ -423,15 +420,13 @@ static void icssg_init_emac_mode(struct prueth *prueth) /* Clear host MAC address */ icssg_class_set_host_mac_addr(prueth->miig_rt, mac); } +EXPORT_SYMBOL_GPL(icssg_init_emac_mode); -static void icssg_init_fw_offload_mode(struct prueth *prueth) +void icssg_init_fw_offload_mode(struct prueth *prueth) { u32 addr = prueth->shram.pa + EMAC_ICSSG_SWITCH_DEFAULT_VLAN_TABLE_OFFSET; int i; - if (prueth->emacs_initialized) - return; - /* Set VLAN TABLE address base */ regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, SMEM_VLAN_OFFSET_MASK, addr << SMEM_VLAN_OFFSET); @@ -448,6 +443,7 @@ static void icssg_init_fw_offload_mode(struct prueth *prueth) icssg_class_set_host_mac_addr(prueth->miig_rt, prueth->hw_bridge_dev->dev_addr); icssg_set_pvid(prueth, prueth->default_vlan, PRUETH_PORT_HOST); } +EXPORT_SYMBOL_GPL(icssg_init_fw_offload_mode); int icssg_config(struct prueth *prueth, struct prueth_emac *emac, int slice) { @@ 
-455,11 +451,6 @@ int icssg_config(struct prueth *prueth, struct prueth_emac *emac, int slice) struct icssg_flow_cfg __iomem *flow_cfg; int ret; - if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) - icssg_init_fw_offload_mode(prueth); - else - icssg_init_emac_mode(prueth); - memset_io(config, 0, TAS_GATE_MASK_LIST0); icssg_miig_queues_init(prueth, slice); @@ -786,3 +777,27 @@ void icssg_set_pvid(struct prueth *prueth, u8 vid, u8 port) writel(pvid, prueth->shram.va + EMAC_ICSSG_SWITCH_PORT0_DEFAULT_VLAN_OFFSET); } EXPORT_SYMBOL_GPL(icssg_set_pvid); + +int emac_fdb_flow_id_updated(struct prueth_emac *emac) +{ + struct mgmt_cmd_rsp fdb_cmd_rsp = { 0 }; + int slice = prueth_emac_slice(emac); + struct mgmt_cmd fdb_cmd = { 0 }; + int ret; + + fdb_cmd.header = ICSSG_FW_MGMT_CMD_HEADER; + fdb_cmd.type = ICSSG_FW_MGMT_FDB_CMD_TYPE_RX_FLOW; + fdb_cmd.seqnum = ++(emac->prueth->icssg_hwcmdseq); + fdb_cmd.param = 0; + + fdb_cmd.param |= (slice << 4); + fdb_cmd.cmd_args[0] = 0; + + ret = icssg_send_fdb_msg(emac, &fdb_cmd, &fdb_cmd_rsp); + if (ret) + return ret; + + WARN_ON(fdb_cmd.seqnum != fdb_cmd_rsp.seqnum); + return fdb_cmd_rsp.status == 1 ? 
0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(emac_fdb_flow_id_updated); diff --git a/drivers/net/ethernet/ti/icssg/icssg_config.h b/drivers/net/ethernet/ti/icssg/icssg_config.h index 92c2deaa3068..c884e9fa099e 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_config.h +++ b/drivers/net/ethernet/ti/icssg/icssg_config.h @@ -55,6 +55,7 @@ struct icssg_rxq_ctx { #define ICSSG_FW_MGMT_FDB_CMD_TYPE 0x03 #define ICSSG_FW_MGMT_CMD_TYPE 0x04 #define ICSSG_FW_MGMT_PKT 0x80000000 +#define ICSSG_FW_MGMT_FDB_CMD_TYPE_RX_FLOW 0x05 struct icssg_r30_cmd { u32 cmd[4]; diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index c568c84a032b..d76fe6d05e10 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -164,11 +164,26 @@ static struct icssg_firmwares icssg_emac_firmwares[] = { } }; -static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) +static int prueth_start(struct rproc *rproc, const char *fw_name) +{ + int ret; + + ret = rproc_set_firmware(rproc, fw_name); + if (ret) + return ret; + return rproc_boot(rproc); +} + +static void prueth_shutdown(struct rproc *rproc) +{ + rproc_shutdown(rproc); +} + +static int prueth_emac_start(struct prueth *prueth) { struct icssg_firmwares *firmwares; struct device *dev = prueth->dev; - int slice, ret; + int ret, slice; if (prueth->is_switch_mode) firmwares = icssg_switch_firmwares; @@ -177,49 +192,126 @@ static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) else firmwares = icssg_emac_firmwares; - slice = prueth_emac_slice(emac); - if (slice < 0) { - netdev_err(emac->ndev, "invalid port\n"); - return -EINVAL; + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + ret = prueth_start(prueth->pru[slice], firmwares[slice].pru); + if (ret) { + dev_err(dev, "failed to boot PRU%d: %d\n", slice, ret); + goto unwind_slices; + } + + ret = prueth_start(prueth->rtu[slice], firmwares[slice].rtu); + if (ret) { 
+ dev_err(dev, "failed to boot RTU%d: %d\n", slice, ret); + rproc_shutdown(prueth->pru[slice]); + goto unwind_slices; + } + + ret = prueth_start(prueth->txpru[slice], firmwares[slice].txpru); + if (ret) { + dev_err(dev, "failed to boot TX_PRU%d: %d\n", slice, ret); + rproc_shutdown(prueth->rtu[slice]); + rproc_shutdown(prueth->pru[slice]); + goto unwind_slices; + } } - ret = icssg_config(prueth, emac, slice); - if (ret) - return ret; - - ret = rproc_set_firmware(prueth->pru[slice], firmwares[slice].pru); - ret = rproc_boot(prueth->pru[slice]); - if (ret) { - dev_err(dev, "failed to boot PRU%d: %d\n", slice, ret); - return -EINVAL; - } - - ret = rproc_set_firmware(prueth->rtu[slice], firmwares[slice].rtu); - ret = rproc_boot(prueth->rtu[slice]); - if (ret) { - dev_err(dev, "failed to boot RTU%d: %d\n", slice, ret); - goto halt_pru; - } - - ret = rproc_set_firmware(prueth->txpru[slice], firmwares[slice].txpru); - ret = rproc_boot(prueth->txpru[slice]); - if (ret) { - dev_err(dev, "failed to boot TX_PRU%d: %d\n", slice, ret); - goto halt_rtu; - } - - emac->fw_running = 1; return 0; -halt_rtu: - rproc_shutdown(prueth->rtu[slice]); - -halt_pru: - rproc_shutdown(prueth->pru[slice]); +unwind_slices: + while (--slice >= 0) { + prueth_shutdown(prueth->txpru[slice]); + prueth_shutdown(prueth->rtu[slice]); + prueth_shutdown(prueth->pru[slice]); + } return ret; } +static void prueth_emac_stop(struct prueth *prueth) +{ + int slice; + + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + prueth_shutdown(prueth->txpru[slice]); + prueth_shutdown(prueth->rtu[slice]); + prueth_shutdown(prueth->pru[slice]); + } +} + +static int prueth_emac_common_start(struct prueth *prueth) +{ + struct prueth_emac *emac; + int ret = 0; + int slice; + + if (!prueth->emac[ICSS_SLICE0] && !prueth->emac[ICSS_SLICE1]) + return -EINVAL; + + /* clear SMEM and MSMC settings for all slices */ + memset_io(prueth->msmcram.va, 0, prueth->msmcram.size); + memset_io(prueth->shram.va, 0, 
ICSSG_CONFIG_OFFSET_SLICE1 * PRUETH_NUM_MACS); + + icssg_class_default(prueth->miig_rt, ICSS_SLICE0, 0, false); + icssg_class_default(prueth->miig_rt, ICSS_SLICE1, 0, false); + + if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) + icssg_init_fw_offload_mode(prueth); + else + icssg_init_emac_mode(prueth); + + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + emac = prueth->emac[slice]; + if (!emac) + continue; + ret = icssg_config(prueth, emac, slice); + if (ret) + goto disable_class; + } + + ret = prueth_emac_start(prueth); + if (ret) + goto disable_class; + + emac = prueth->emac[ICSS_SLICE0] ? prueth->emac[ICSS_SLICE0] : + prueth->emac[ICSS_SLICE1]; + ret = icss_iep_init(emac->iep, &prueth_iep_clockops, + emac, IEP_DEFAULT_CYCLE_TIME_NS); + if (ret) { + dev_err(prueth->dev, "Failed to initialize IEP module\n"); + goto stop_pruss; + } + + return 0; + +stop_pruss: + prueth_emac_stop(prueth); + +disable_class: + icssg_class_disable(prueth->miig_rt, ICSS_SLICE0); + icssg_class_disable(prueth->miig_rt, ICSS_SLICE1); + + return ret; +} + +static int prueth_emac_common_stop(struct prueth *prueth) +{ + struct prueth_emac *emac; + + if (!prueth->emac[ICSS_SLICE0] && !prueth->emac[ICSS_SLICE1]) + return -EINVAL; + + icssg_class_disable(prueth->miig_rt, ICSS_SLICE0); + icssg_class_disable(prueth->miig_rt, ICSS_SLICE1); + + prueth_emac_stop(prueth); + + emac = prueth->emac[ICSS_SLICE0] ? 
prueth->emac[ICSS_SLICE0] : + prueth->emac[ICSS_SLICE1]; + icss_iep_exit(emac->iep); + + return 0; +} + /* called back by PHY layer if there is change in link state of hw port*/ static void emac_adjust_link(struct net_device *ndev) { @@ -374,9 +466,6 @@ static void prueth_iep_settime(void *clockops_data, u64 ns) u32 cycletime; int timeout; - if (!emac->fw_running) - return; - sc_descp = emac->prueth->shram.va + TIMESYNC_FW_WC_SETCLOCK_DESC_OFFSET; cycletime = IEP_DEFAULT_CYCLE_TIME_NS; @@ -543,23 +632,17 @@ static int emac_ndo_open(struct net_device *ndev) { struct prueth_emac *emac = netdev_priv(ndev); int ret, i, num_data_chn = emac->tx_ch_num; + struct icssg_flow_cfg __iomem *flow_cfg; struct prueth *prueth = emac->prueth; int slice = prueth_emac_slice(emac); struct device *dev = prueth->dev; int max_rx_flows; int rx_flow; - /* clear SMEM and MSMC settings for all slices */ - if (!prueth->emacs_initialized) { - memset_io(prueth->msmcram.va, 0, prueth->msmcram.size); - memset_io(prueth->shram.va, 0, ICSSG_CONFIG_OFFSET_SLICE1 * PRUETH_NUM_MACS); - } - /* set h/w MAC as user might have re-configured */ ether_addr_copy(emac->mac_addr, ndev->dev_addr); icssg_class_set_mac_addr(prueth->miig_rt, slice, emac->mac_addr); - icssg_class_default(prueth->miig_rt, slice, 0, false); icssg_ft1_set_mac_addr(prueth->miig_rt, slice, emac->mac_addr); /* Notify the stack of the actual queue counts. 
*/ @@ -597,18 +680,23 @@ static int emac_ndo_open(struct net_device *ndev) goto cleanup_napi; } - /* reset and start PRU firmware */ - ret = prueth_emac_start(prueth, emac); - if (ret) - goto free_rx_irq; + if (!prueth->emacs_initialized) { + ret = prueth_emac_common_start(prueth); + if (ret) + goto free_rx_irq; + } + + flow_cfg = emac->dram.va + ICSSG_CONFIG_OFFSET + PSI_L_REGULAR_FLOW_ID_BASE_OFFSET; + writew(emac->rx_flow_id_base, &flow_cfg->rx_base_flow); + ret = emac_fdb_flow_id_updated(emac); + + if (ret) { + netdev_err(ndev, "Failed to update Rx Flow ID %d", ret); + goto stop; + } icssg_mii_update_mtu(prueth->mii_rt, slice, ndev->max_mtu); - if (!prueth->emacs_initialized) { - ret = icss_iep_init(emac->iep, &prueth_iep_clockops, - emac, IEP_DEFAULT_CYCLE_TIME_NS); - } - ret = request_threaded_irq(emac->tx_ts_irq, NULL, prueth_tx_ts_irq, IRQF_ONESHOT, dev_name(dev), emac); if (ret) @@ -653,7 +741,8 @@ reset_rx_chn: free_tx_ts_irq: free_irq(emac->tx_ts_irq, emac); stop: - prueth_emac_stop(emac); + if (!prueth->emacs_initialized) + prueth_emac_common_stop(prueth); free_rx_irq: free_irq(emac->rx_chns.irq[rx_flow], emac); cleanup_napi: @@ -689,8 +778,6 @@ static int emac_ndo_stop(struct net_device *ndev) if (ndev->phydev) phy_stop(ndev->phydev); - icssg_class_disable(prueth->miig_rt, prueth_emac_slice(emac)); - if (emac->prueth->is_hsr_offload_mode) __dev_mc_unsync(ndev, icssg_prueth_hsr_del_mcast); else @@ -728,11 +815,9 @@ static int emac_ndo_stop(struct net_device *ndev) /* Destroying the queued work in ndo_stop() */ cancel_delayed_work_sync(&emac->stats_work); - if (prueth->emacs_initialized == 1) - icss_iep_exit(emac->iep); - /* stop PRUs */ - prueth_emac_stop(emac); + if (prueth->emacs_initialized == 1) + prueth_emac_common_stop(prueth); free_irq(emac->tx_ts_irq, emac); @@ -1053,10 +1138,11 @@ static void prueth_offload_fwd_mark_update(struct prueth *prueth) } } -static void prueth_emac_restart(struct prueth *prueth) +static int prueth_emac_restart(struct 
prueth *prueth) { struct prueth_emac *emac0 = prueth->emac[PRUETH_MAC0]; struct prueth_emac *emac1 = prueth->emac[PRUETH_MAC1]; + int ret; /* Detach the net_device for both PRUeth ports*/ if (netif_running(emac0->ndev)) @@ -1065,36 +1151,46 @@ static void prueth_emac_restart(struct prueth *prueth) netif_device_detach(emac1->ndev); /* Disable both PRUeth ports */ - icssg_set_port_state(emac0, ICSSG_EMAC_PORT_DISABLE); - icssg_set_port_state(emac1, ICSSG_EMAC_PORT_DISABLE); + ret = icssg_set_port_state(emac0, ICSSG_EMAC_PORT_DISABLE); + ret |= icssg_set_port_state(emac1, ICSSG_EMAC_PORT_DISABLE); + if (ret) + return ret; /* Stop both pru cores for both PRUeth ports*/ - prueth_emac_stop(emac0); - prueth->emacs_initialized--; - prueth_emac_stop(emac1); - prueth->emacs_initialized--; + ret = prueth_emac_common_stop(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to stop the firmwares"); + return ret; + } /* Start both pru cores for both PRUeth ports */ - prueth_emac_start(prueth, emac0); - prueth->emacs_initialized++; - prueth_emac_start(prueth, emac1); - prueth->emacs_initialized++; + ret = prueth_emac_common_start(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to start the firmwares"); + return ret; + } /* Enable forwarding for both PRUeth ports */ - icssg_set_port_state(emac0, ICSSG_EMAC_PORT_FORWARD); - icssg_set_port_state(emac1, ICSSG_EMAC_PORT_FORWARD); + ret = icssg_set_port_state(emac0, ICSSG_EMAC_PORT_FORWARD); + ret |= icssg_set_port_state(emac1, ICSSG_EMAC_PORT_FORWARD); /* Attache net_device for both PRUeth ports */ netif_device_attach(emac0->ndev); netif_device_attach(emac1->ndev); + + return ret; } static void icssg_change_mode(struct prueth *prueth) { struct prueth_emac *emac; - int mac; + int mac, ret; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) { emac = 
prueth->emac[mac]; @@ -1173,13 +1269,18 @@ static void prueth_netdevice_port_unlink(struct net_device *ndev) { struct prueth_emac *emac = netdev_priv(ndev); struct prueth *prueth = emac->prueth; + int ret; prueth->br_members &= ~BIT(emac->port_id); if (prueth->is_switch_mode) { prueth->is_switch_mode = false; emac->port_vlan = 0; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } } prueth_offload_fwd_mark_update(prueth); @@ -1228,6 +1329,7 @@ static void prueth_hsr_port_unlink(struct net_device *ndev) struct prueth *prueth = emac->prueth; struct prueth_emac *emac0; struct prueth_emac *emac1; + int ret; emac0 = prueth->emac[PRUETH_MAC0]; emac1 = prueth->emac[PRUETH_MAC1]; @@ -1238,7 +1340,11 @@ static void prueth_hsr_port_unlink(struct net_device *ndev) emac0->port_vlan = 0; emac1->port_vlan = 0; prueth->hsr_dev = NULL; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } netdev_dbg(ndev, "Disabling HSR Offload mode\n"); } } @@ -1413,13 +1519,10 @@ static int prueth_probe(struct platform_device *pdev) prueth->pa_stats = NULL; } - if (eth0_node) { + if (eth0_node || eth1_node) { ret = prueth_get_cores(prueth, ICSS_SLICE0, false); if (ret) goto put_cores; - } - - if (eth1_node) { ret = prueth_get_cores(prueth, ICSS_SLICE1, false); if (ret) goto put_cores; @@ -1618,14 +1721,12 @@ put_pruss: pruss_put(prueth->pruss); put_cores: - if (eth1_node) { - prueth_put_cores(prueth, ICSS_SLICE1); - of_node_put(eth1_node); - } - - if (eth0_node) { + if (eth0_node || eth1_node) { prueth_put_cores(prueth, ICSS_SLICE0); of_node_put(eth0_node); + + prueth_put_cores(prueth, ICSS_SLICE1); + of_node_put(eth1_node); } return ret; diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h index 
f5c1d473e9f9..5473315ea204 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h @@ -140,7 +140,6 @@ struct prueth_rx_chn { /* data for each emac port */ struct prueth_emac { bool is_sr1; - bool fw_running; struct prueth *prueth; struct net_device *ndev; u8 mac_addr[6]; @@ -361,6 +360,8 @@ int icssg_set_port_state(struct prueth_emac *emac, enum icssg_port_state_cmd state); void icssg_config_set_speed(struct prueth_emac *emac); void icssg_config_half_duplex(struct prueth_emac *emac); +void icssg_init_emac_mode(struct prueth *prueth); +void icssg_init_fw_offload_mode(struct prueth *prueth); /* Buffer queue helpers */ int icssg_queue_pop(struct prueth *prueth, u8 queue); @@ -377,6 +378,7 @@ void icssg_vtbl_modify(struct prueth_emac *emac, u8 vid, u8 port_mask, u8 untag_mask, bool add); u16 icssg_get_pvid(struct prueth_emac *emac); void icssg_set_pvid(struct prueth *prueth, u8 vid, u8 port); +int emac_fdb_flow_id_updated(struct prueth_emac *emac); #define prueth_napi_to_tx_chn(pnapi) \ container_of(pnapi, struct prueth_tx_chn, napi_tx) @@ -407,7 +409,6 @@ void emac_rx_timestamp(struct prueth_emac *emac, struct sk_buff *skb, u32 *psdata); enum netdev_tx icssg_ndo_start_xmit(struct sk_buff *skb, struct net_device *ndev); irqreturn_t prueth_rx_irq(int irq, void *dev_id); -void prueth_emac_stop(struct prueth_emac *emac); void prueth_cleanup_tx_ts(struct prueth_emac *emac); int icssg_napi_rx_poll(struct napi_struct *napi_rx, int budget); int prueth_prepare_rx_chan(struct prueth_emac *emac, diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c b/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c index 5024f0647a0d..3dc86397c367 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c @@ -440,7 +440,6 @@ static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) goto halt_pru; } - emac->fw_running = 1; return 0; halt_pru: @@ -449,6 
+448,29 @@ halt_pru: return ret; } +static void prueth_emac_stop(struct prueth_emac *emac) +{ + struct prueth *prueth = emac->prueth; + int slice; + + switch (emac->port_id) { + case PRUETH_PORT_MII0: + slice = ICSS_SLICE0; + break; + case PRUETH_PORT_MII1: + slice = ICSS_SLICE1; + break; + default: + netdev_err(emac->ndev, "invalid port\n"); + return; + } + + if (!emac->is_sr1) + rproc_shutdown(prueth->txpru[slice]); + rproc_shutdown(prueth->rtu[slice]); + rproc_shutdown(prueth->pru[slice]); +} + /** * emac_ndo_open - EMAC device open * @ndev: network adapter device From 9b115361248dc6cce182a2dc030c1c70b0a9639e Mon Sep 17 00:00:00 2001 From: Meghana Malladi Date: Mon, 23 Dec 2024 20:45:50 +0530 Subject: [PATCH 624/807] net: ti: icssg-prueth: Fix clearing of IEP_CMP_CFG registers during iep_init When ICSSG interfaces are brought down and brought up again, the pru cores are shut down and booted again, flushing out all the memories and start again in a clean state. Hence it is expected that the IEP_CMP_CFG register needs to be flushed during iep_init() to ensure that the existing residual configuration doesn't cause any unusual behavior. If the register is not cleared, existing IEP_CMP_CFG set for CMP1 will result in SYNC0_OUT signal based on the SYNC_OUT register values. After bringing the interface up, calling PPS enable doesn't work as the driver believes PPS is already enabled, (iep->pps_enabled is not cleared during interface bring down) and driver will just return true even though there is no signal. Fix this by disabling pps and perout. Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") Signed-off-by: Meghana Malladi Reviewed-by: Roger Quadros Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/icssg/icss_iep.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c index 5d6d1cf78e93..768578c0d958 100644 --- a/drivers/net/ethernet/ti/icssg/icss_iep.c +++ b/drivers/net/ethernet/ti/icssg/icss_iep.c @@ -215,6 +215,9 @@ static void icss_iep_enable_shadow_mode(struct icss_iep *iep) for (cmp = IEP_MIN_CMP; cmp < IEP_MAX_CMP; cmp++) { regmap_update_bits(iep->map, ICSS_IEP_CMP_STAT_REG, IEP_CMP_STATUS(cmp), IEP_CMP_STATUS(cmp)); + + regmap_update_bits(iep->map, ICSS_IEP_CMP_CFG_REG, + IEP_CMP_CFG_CMP_EN(cmp), 0); } /* enable reset counter on CMP0 event */ @@ -780,6 +783,11 @@ int icss_iep_exit(struct icss_iep *iep) } icss_iep_disable(iep); + if (iep->pps_enabled) + icss_iep_pps_enable(iep, false); + else if (iep->perout_enabled) + icss_iep_perout_enable(iep, NULL, false); + return 0; } EXPORT_SYMBOL_GPL(icss_iep_exit); From 45d339fefaa3dcd237038769e0d34584fb867390 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Thu, 19 Dec 2024 14:23:36 +0200 Subject: [PATCH 625/807] RDMA/mlx5: Enable multiplane mode only when it is supported The driver queries vport_cxt.num_plane and enables multiplane when it is greater than 0, but some old FWs (versions from x.40.1000 to x.42.1000) unexpectedly report vport_cxt.num_plane = 1. Fix it by querying num_plane only when the HCA_CAP2.multiplane bit is set. 
Fixes: 2a5db20fa532 ("RDMA/mlx5: Add support to multi-plane device and port") Link: https://patch.msgid.link/r/1ef901acdf564716fcf550453cf5e94f343777ec.1734610916.git.leon@kernel.org Cc: stable@vger.kernel.org Reported-by: Francesco Poli Closes: https://lore.kernel.org/all/nvs4i2v7o6vn6zhmtq4sgazy2hu5kiulukxcntdelggmznnl7h@so3oul6uwgbl/ Signed-off-by: Mark Zhang Signed-off-by: Leon Romanovsky Reviewed-by: Michal Swiatkowski Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c2314797afc9..f5b59d02f4d3 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2839,7 +2839,7 @@ static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) int err; *num_plane = 0; - if (!MLX5_CAP_GEN(mdev, ib_virt)) + if (!MLX5_CAP_GEN(mdev, ib_virt) || !MLX5_CAP_GEN_2(mdev, multiplane)) return 0; err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4fbbcf35498b..48d47181c7cd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2119,7 +2119,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 migration_in_chunks[0x1]; u8 reserved_at_d1[0x1]; u8 sf_eq_usage[0x1]; - u8 reserved_at_d3[0xd]; + u8 reserved_at_d3[0x5]; + u8 multiplane[0x1]; + u8 reserved_at_d9[0x7]; u8 cross_vhca_object_to_object_supported[0x20]; From 09dfc8a5f2ce897005a94bf66cca4f91e4e03700 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 2 Jan 2025 11:32:54 -0700 Subject: [PATCH 626/807] vfio/pci: Fallback huge faults for unaligned pfn The PFN must also be aligned to the fault order to insert a huge pfnmap. Test the alignment and fallback when unaligned. 
Fixes: f9e54c3a2f5b ("vfio/pci: implement huge_fault support") Link: https://bugzilla.kernel.org/show_bug.cgi?id=219619 Reported-by: Athul Krishna Reported-by: Precific Reviewed-by: Peter Xu Tested-by: Precific Link: https://lore.kernel.org/r/20250102183416.1841878-1-alex.williamson@redhat.com Cc: stable@vger.kernel.org Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1ab58da9f38a..1a4ed5a357d3 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1661,14 +1661,15 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff; vm_fault_t ret = VM_FAULT_SIGBUS; - if (order && (vmf->address & ((PAGE_SIZE << order) - 1) || + pfn = vma_to_pfn(vma) + pgoff; + + if (order && (pfn & ((1 << order) - 1) || + vmf->address & ((PAGE_SIZE << order) - 1) || vmf->address + (PAGE_SIZE << order) > vma->vm_end)) { ret = VM_FAULT_FALLBACK; goto out; } - pfn = vma_to_pfn(vma); - down_read(&vdev->memory_lock); if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) @@ -1676,18 +1677,18 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, switch (order) { case 0: - ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff); + ret = vmf_insert_pfn(vma, vmf->address, pfn); break; #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP case PMD_ORDER: - ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff, - PFN_DEV), false); + ret = vmf_insert_pfn_pmd(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); break; #endif #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP case PUD_ORDER: - ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff, - PFN_DEV), false); + ret = vmf_insert_pfn_pud(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); break; #endif default: From 6df90c02bae468a3a6110bafbc659884d0c4966c Mon Sep 17 00:00:00 2001 From: Milan 
Broz Date: Wed, 18 Dec 2024 13:56:58 +0100 Subject: [PATCH 627/807] dm-verity FEC: Fix RS FEC repair for roots unaligned to block size (take 2) This patch fixes an issue that was fixed in the commit df7b59ba9245 ("dm verity: fix FEC for RS roots unaligned to block size") but later broken again in the commit 8ca7cab82bda ("dm verity fec: fix misaligned RS roots IO") If the Reed-Solomon roots setting spans multiple blocks, the code does not use proper parity bytes and randomly fails to repair even trivial errors. This bug cannot happen if the sector size is multiple of RS roots setting (Android case with roots 2). The previous solution was to find a dm-bufio block size that is multiple of the device sector size and roots size. Unfortunately, the optimization in commit 8ca7cab82bda ("dm verity fec: fix misaligned RS roots IO") is incorrect and uses data block size for some roots (for example, it uses 4096 block size for roots = 20). This patch uses a different approach: - It always uses a configured data block size for dm-bufio to avoid possible misaligned IOs. - and it caches the processed parity bytes, so it can join it if it spans two blocks. As the RS calculation is called only if an error is detected and the process is computationally intensive, copying a few more bytes should not introduce performance issues. 
The issue was reported to cryptsetup with trivial reproducer https://gitlab.com/cryptsetup/cryptsetup/-/issues/923 Reproducer (with roots=20): # create verity device with RS FEC dd if=/dev/urandom of=data.img bs=4096 count=8 status=none veritysetup format data.img hash.img --fec-device=fec.img --fec-roots=20 | \ awk '/^Root hash/{ print $3 }' >roothash # create an erasure that should always be repairable with this roots setting dd if=/dev/zero of=data.img conv=notrunc bs=1 count=4 seek=4 status=none # try to read it through dm-verity veritysetup open data.img test hash.img --fec-device=fec.img --fec-roots=20 $(cat roothash) dd if=/dev/mapper/test of=/dev/null bs=4096 status=noxfer Even now the log says it cannot repair it: : verity-fec: 7:1: FEC 0: failed to correct: -74 : device-mapper: verity: 7:1: data block 0 is corrupted ... With this fix, errors are properly repaired. : verity-fec: 7:1: FEC 0: corrected 4 errors Signed-off-by: Milan Broz Fixes: 8ca7cab82bda ("dm verity fec: fix misaligned RS roots IO") Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 40 +++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 62b1a44b8dd2..6bd9848518d4 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -60,15 +60,19 @@ static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, * to the data block. Caller is responsible for releasing buf. 
*/ static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, - unsigned int *offset, struct dm_buffer **buf, - unsigned short ioprio) + unsigned int *offset, unsigned int par_buf_offset, + struct dm_buffer **buf, unsigned short ioprio) { u64 position, block, rem; u8 *res; + /* We have already part of parity bytes read, skip to the next block */ + if (par_buf_offset) + index++; + position = (index + rsb) * v->fec->roots; block = div64_u64_rem(position, v->fec->io_size, &rem); - *offset = (unsigned int)rem; + *offset = par_buf_offset ? 0 : (unsigned int)rem; res = dm_bufio_read_with_ioprio(v->fec->bufio, block, buf, ioprio); if (IS_ERR(res)) { @@ -128,11 +132,12 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, { int r, corrected = 0, res; struct dm_buffer *buf; - unsigned int n, i, offset; - u8 *par, *block; + unsigned int n, i, offset, par_buf_offset = 0; + u8 *par, *block, par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio)); + par = fec_read_parity(v, rsb, block_offset, &offset, + par_buf_offset, &buf, bio_prio(bio)); if (IS_ERR(par)) return PTR_ERR(par); @@ -142,7 +147,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, */ fec_for_each_buffer_rs_block(fio, n, i) { block = fec_buffer_rs_block(v, fio, n, i); - res = fec_decode_rs8(v, fio, block, &par[offset], neras); + memcpy(&par_buf[par_buf_offset], &par[offset], v->fec->roots - par_buf_offset); + res = fec_decode_rs8(v, fio, block, par_buf, neras); if (res < 0) { r = res; goto error; @@ -155,12 +161,21 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, if (block_offset >= 1 << v->data_dev_block_bits) goto done; - /* read the next block when we run out of parity bytes */ - offset += v->fec->roots; + /* Read the next block when we run out of parity bytes */ + offset += 
(v->fec->roots - par_buf_offset); + /* Check if parity bytes are split between blocks */ + if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) { + par_buf_offset = v->fec->io_size - offset; + memcpy(par_buf, &par[offset], par_buf_offset); + offset += par_buf_offset; + } else + par_buf_offset = 0; + if (offset >= v->fec->io_size) { dm_bufio_release(buf); - par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio)); + par = fec_read_parity(v, rsb, block_offset, &offset, + par_buf_offset, &buf, bio_prio(bio)); if (IS_ERR(par)) return PTR_ERR(par); } @@ -724,10 +739,7 @@ int verity_fec_ctr(struct dm_verity *v) return -E2BIG; } - if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1)) - f->io_size = 1 << v->data_dev_block_bits; - else - f->io_size = v->fec->roots << SECTOR_SHIFT; + f->io_size = 1 << v->data_dev_block_bits; f->bufio = dm_bufio_client_create(f->dev->bdev, f->io_size, From 548c6edbed92031baa4aa32cae55628c810c3ebb Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 18 Dec 2024 13:56:59 +0100 Subject: [PATCH 628/807] dm-verity FEC: Avoid copying RS parity bytes twice. Caching RS parity bytes is already done in fec_decode_bufs() now, no need to use yet another buffer for conversion to uint16_t. This patch removes that double copy of RS parity bytes. Signed-off-by: Milan Broz Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 6bd9848518d4..e61855da6461 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -39,22 +39,6 @@ static inline u64 fec_interleave(struct dm_verity *v, u64 offset) return offset + mod * (v->fec->rounds << v->data_dev_block_bits); } -/* - * Decode an RS block using Reed-Solomon. 
- */ -static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, - u8 *data, u8 *fec, int neras) -{ - int i; - uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; - - for (i = 0; i < v->fec->roots; i++) - par[i] = fec[i]; - - return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras, - fio->erasures, 0, NULL); -} - /* * Read error-correcting codes for the requested RS block. Returns a pointer * to the data block. Caller is responsible for releasing buf. @@ -132,8 +116,9 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, { int r, corrected = 0, res; struct dm_buffer *buf; - unsigned int n, i, offset, par_buf_offset = 0; - u8 *par, *block, par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; + unsigned int n, i, j, offset, par_buf_offset = 0; + uint16_t par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; + u8 *par, *block; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); par = fec_read_parity(v, rsb, block_offset, &offset, @@ -147,8 +132,11 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, */ fec_for_each_buffer_rs_block(fio, n, i) { block = fec_buffer_rs_block(v, fio, n, i); - memcpy(&par_buf[par_buf_offset], &par[offset], v->fec->roots - par_buf_offset); - res = fec_decode_rs8(v, fio, block, par_buf, neras); + for (j = 0; j < v->fec->roots - par_buf_offset; j++) + par_buf[par_buf_offset + j] = par[offset + j]; + /* Decode an RS block using Reed-Solomon */ + res = decode_rs8(fio->rs, block, par_buf, v->fec->rsn, + NULL, neras, fio->erasures, 0, NULL); if (res < 0) { r = res; goto error; @@ -166,7 +154,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, /* Check if parity bytes are split between blocks */ if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) { par_buf_offset = v->fec->io_size - offset; - memcpy(par_buf, &par[offset], par_buf_offset); + for (j = 0; j < par_buf_offset; j++) + par_buf[j] = par[offset + j]; 
offset += par_buf_offset; } else par_buf_offset = 0; From a619cba8c69c434258ff4101d463322cd63e1bdc Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 3 Jan 2025 23:18:26 +0900 Subject: [PATCH 629/807] gpio: virtuser: fix missing lookup table cleanups When a virtuser device is created via configfs and the probe fails due to an incorrect lookup table, the table is not removed. This prevents subsequent probe attempts from succeeding, even if the issue is corrected, unless the device is released. Additionally, cleanup is also needed in the less likely case of platform_device_register_full() failure. Besides, a consistent memory leak in lookup_table->dev_id was spotted using kmemleak by toggling the live state between 0 and 1 with a correct lookup table. Introduce gpio_virtuser_remove_lookup_table() as the counterpart to the existing gpio_virtuser_make_lookup_table() and call it from all necessary points to ensure proper cleanup. Fixes: 91581c4b3f29 ("gpio: virtuser: new virtual testing driver for the GPIO API") Signed-off-by: Koichiro Den Link: https://lore.kernel.org/r/20250103141829.430662-2-koichiro.den@canonical.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-virtuser.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/gpio/gpio-virtuser.c b/drivers/gpio/gpio-virtuser.c index 91b6352c957c..e89b1239b635 100644 --- a/drivers/gpio/gpio-virtuser.c +++ b/drivers/gpio/gpio-virtuser.c @@ -1439,6 +1439,15 @@ gpio_virtuser_make_lookup_table(struct gpio_virtuser_device *dev) return 0; } +static void +gpio_virtuser_remove_lookup_table(struct gpio_virtuser_device *dev) +{ + gpiod_remove_lookup_table(dev->lookup_table); + kfree(dev->lookup_table->dev_id); + kfree(dev->lookup_table); + dev->lookup_table = NULL; +} + static struct fwnode_handle * gpio_virtuser_make_device_swnode(struct gpio_virtuser_device *dev) { @@ -1487,10 +1496,8 @@ gpio_virtuser_device_activate(struct gpio_virtuser_device *dev) 
pdevinfo.fwnode = swnode; ret = gpio_virtuser_make_lookup_table(dev); - if (ret) { - fwnode_remove_software_node(swnode); - return ret; - } + if (ret) + goto err_remove_swnode; reinit_completion(&dev->probe_completion); dev->driver_bound = false; @@ -1498,23 +1505,31 @@ gpio_virtuser_device_activate(struct gpio_virtuser_device *dev) pdev = platform_device_register_full(&pdevinfo); if (IS_ERR(pdev)) { + ret = PTR_ERR(pdev); bus_unregister_notifier(&platform_bus_type, &dev->bus_notifier); - fwnode_remove_software_node(swnode); - return PTR_ERR(pdev); + goto err_remove_lookup_table; } wait_for_completion(&dev->probe_completion); bus_unregister_notifier(&platform_bus_type, &dev->bus_notifier); if (!dev->driver_bound) { - platform_device_unregister(pdev); - fwnode_remove_software_node(swnode); - return -ENXIO; + ret = -ENXIO; + goto err_unregister_pdev; } dev->pdev = pdev; return 0; + +err_unregister_pdev: + platform_device_unregister(pdev); +err_remove_lookup_table: + gpio_virtuser_remove_lookup_table(dev); +err_remove_swnode: + fwnode_remove_software_node(swnode); + + return ret; } static void @@ -1526,10 +1541,9 @@ gpio_virtuser_device_deactivate(struct gpio_virtuser_device *dev) swnode = dev_fwnode(&dev->pdev->dev); platform_device_unregister(dev->pdev); + gpio_virtuser_remove_lookup_table(dev); fwnode_remove_software_node(swnode); dev->pdev = NULL; - gpiod_remove_lookup_table(dev->lookup_table); - kfree(dev->lookup_table); } static ssize_t From 656cc2e892f128b03ea9ef19bd11d70f71d5472b Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 3 Jan 2025 23:18:27 +0900 Subject: [PATCH 630/807] gpio: virtuser: fix handling of multiple conn_ids in lookup table Creating a virtuser device via configfs with multiple conn_ids fails due to incorrect indexing of lookup entries. Correct the indexing logic to ensure proper functionality when multiple gpio_virtuser_lookup are created. 
Fixes: 91581c4b3f29 ("gpio: virtuser: new virtual testing driver for the GPIO API") Signed-off-by: Koichiro Den Link: https://lore.kernel.org/r/20250103141829.430662-3-koichiro.den@canonical.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-virtuser.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-virtuser.c b/drivers/gpio/gpio-virtuser.c index e89b1239b635..d6244f0d3bc7 100644 --- a/drivers/gpio/gpio-virtuser.c +++ b/drivers/gpio/gpio-virtuser.c @@ -1410,7 +1410,7 @@ gpio_virtuser_make_lookup_table(struct gpio_virtuser_device *dev) size_t num_entries = gpio_virtuser_get_lookup_count(dev); struct gpio_virtuser_lookup_entry *entry; struct gpio_virtuser_lookup *lookup; - unsigned int i = 0; + unsigned int i = 0, idx; lockdep_assert_held(&dev->lock); @@ -1424,12 +1424,12 @@ gpio_virtuser_make_lookup_table(struct gpio_virtuser_device *dev) return -ENOMEM; list_for_each_entry(lookup, &dev->lookup_list, siblings) { + idx = 0; list_for_each_entry(entry, &lookup->entry_list, siblings) { - table->table[i] = + table->table[i++] = GPIO_LOOKUP_IDX(entry->key, entry->offset < 0 ? U16_MAX : entry->offset, - lookup->con_id, i, entry->flags); - i++; + lookup->con_id, idx++, entry->flags); } } From c7c434c1dba955005f5161dae73f09c0a922cfa7 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 3 Jan 2025 23:18:28 +0900 Subject: [PATCH 631/807] gpio: virtuser: lock up configfs that an instantiated device depends on Once a virtuser device is instantiated and actively used, allowing rmdir for its configfs serves no purpose and can be confusing. Userspace interacts with the virtual consumer at arbitrary times, meaning it depends on its existence. Make the subsystem itself depend on the configfs entry for a virtuser device while it is in active use. 
Signed-off-by: Koichiro Den Link: https://lore.kernel.org/r/20250103141829.430662-4-koichiro.den@canonical.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-virtuser.c | 49 ++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/drivers/gpio/gpio-virtuser.c b/drivers/gpio/gpio-virtuser.c index d6244f0d3bc7..e89f299f2140 100644 --- a/drivers/gpio/gpio-virtuser.c +++ b/drivers/gpio/gpio-virtuser.c @@ -1546,6 +1546,30 @@ gpio_virtuser_device_deactivate(struct gpio_virtuser_device *dev) dev->pdev = NULL; } +static void +gpio_virtuser_device_lockup_configfs(struct gpio_virtuser_device *dev, bool lock) +{ + struct configfs_subsystem *subsys = dev->group.cg_subsys; + struct gpio_virtuser_lookup_entry *entry; + struct gpio_virtuser_lookup *lookup; + + /* + * The device only needs to depend on leaf lookup entries. This is + * sufficient to lock up all the configfs entries that the + * instantiated, alive device depends on. + */ + list_for_each_entry(lookup, &dev->lookup_list, siblings) { + list_for_each_entry(entry, &lookup->entry_list, siblings) { + if (lock) + WARN_ON(configfs_depend_item_unlocked( + subsys, &entry->group.cg_item)); + else + configfs_undepend_item_unlocked( + &entry->group.cg_item); + } + } +} + static ssize_t gpio_virtuser_device_config_live_store(struct config_item *item, const char *page, size_t count) @@ -1558,15 +1582,24 @@ gpio_virtuser_device_config_live_store(struct config_item *item, if (ret) return ret; - guard(mutex)(&dev->lock); - - if (live == gpio_virtuser_device_is_live(dev)) - return -EPERM; - if (live) - ret = gpio_virtuser_device_activate(dev); - else - gpio_virtuser_device_deactivate(dev); + gpio_virtuser_device_lockup_configfs(dev, true); + + scoped_guard(mutex, &dev->lock) { + if (live == gpio_virtuser_device_is_live(dev)) + ret = -EPERM; + else if (live) + ret = gpio_virtuser_device_activate(dev); + else + gpio_virtuser_device_deactivate(dev); + } + + /* + * Undepend is required 
only if device disablement (live == 0) + * succeeds or if device enablement (live == 1) fails. + */ + if (live == !!ret) + gpio_virtuser_device_lockup_configfs(dev, false); return ret ?: count; } From 8bd76b3d3f3af7ac2898b6a27ad90c444fec418f Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 3 Jan 2025 23:18:29 +0900 Subject: [PATCH 632/807] gpio: sim: lock up configfs that an instantiated device depends on Once a sim device is instantiated and actively used, allowing rmdir for its configfs serves no purpose and can be confusing. Effectively, arbitrary users start depending on its existence. Make the subsystem itself depend on the configfs entry for a sim device while it is in active use. Signed-off-by: Koichiro Den Link: https://lore.kernel.org/r/20250103141829.430662-5-koichiro.den@canonical.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sim.c | 48 +++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index f387dad81f29..686ae3d11ba3 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -1027,6 +1027,30 @@ static void gpio_sim_device_deactivate(struct gpio_sim_device *dev) dev->pdev = NULL; } +static void +gpio_sim_device_lockup_configfs(struct gpio_sim_device *dev, bool lock) +{ + struct configfs_subsystem *subsys = dev->group.cg_subsys; + struct gpio_sim_bank *bank; + struct gpio_sim_line *line; + + /* + * The device only needs to depend on leaf line entries. This is + * sufficient to lock up all the configfs entries that the + * instantiated, alive device depends on. 
+ */ + list_for_each_entry(bank, &dev->bank_list, siblings) { + list_for_each_entry(line, &bank->line_list, siblings) { + if (lock) + WARN_ON(configfs_depend_item_unlocked( + subsys, &line->group.cg_item)); + else + configfs_undepend_item_unlocked( + &line->group.cg_item); + } + } +} + static ssize_t gpio_sim_device_config_live_store(struct config_item *item, const char *page, size_t count) @@ -1039,14 +1063,24 @@ gpio_sim_device_config_live_store(struct config_item *item, if (ret) return ret; - guard(mutex)(&dev->lock); + if (live) + gpio_sim_device_lockup_configfs(dev, true); - if (live == gpio_sim_device_is_live(dev)) - ret = -EPERM; - else if (live) - ret = gpio_sim_device_activate(dev); - else - gpio_sim_device_deactivate(dev); + scoped_guard(mutex, &dev->lock) { + if (live == gpio_sim_device_is_live(dev)) + ret = -EPERM; + else if (live) + ret = gpio_sim_device_activate(dev); + else + gpio_sim_device_deactivate(dev); + } + + /* + * Undepend is required only if device disablement (live == 0) + * succeeds or if device enablement (live == 1) fails. + */ + if (live == !!ret) + gpio_sim_device_lockup_configfs(dev, false); return ret ?: count; } From ed123c948d06688d10f3b10a7bce1d6fbfd1ed07 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 09:29:09 -0700 Subject: [PATCH 633/807] io_uring/kbuf: use pre-committed buffer address for non-pollable file For non-pollable files, buffer ring consumption will commit upfront. This is fine, but io_ring_buffer_select() will return the address of the buffer after having committed it. For incrementally consumed buffers, this is incorrect as it will modify the buffer address. Store the pre-committed value and return that. If that isn't done, then the initial part of the buffer is not used and the application will correctly assume the content arrived at the start of the userspace buffer, but the kernel will have put it later in the buffer. 
Or it can cause a spurious -EFAULT returned in the CQE, depending on the buffer size. As bounds are suitably checked for doing the actual IO, no adverse side effects are possible - it's just a data misplacement within the existing buffer. Reported-by: Gwendal Fernet Cc: stable@vger.kernel.org Fixes: ae98dbf43d75 ("io_uring/kbuf: add support for incremental buffer consumption") Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index d407576ddfb7..eec5eb7de843 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -139,6 +139,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_uring_buf_ring *br = bl->buf_ring; __u16 tail, head = bl->head; struct io_uring_buf *buf; + void __user *ret; tail = smp_load_acquire(&br->tail); if (unlikely(tail == head)) @@ -153,6 +154,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_list = bl; req->buf_index = buf->bid; + ret = u64_to_user_ptr(buf->addr); if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { /* @@ -168,7 +170,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, io_kbuf_commit(req, bl, *len, 1); req->buf_list = NULL; } - return u64_to_user_ptr(buf->addr); + return ret; } void __user *io_buffer_select(struct io_kiocb *req, size_t *len, From 03f275adb8fbd7b4ebe96a1ad5044d8e602692dc Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 1 Jan 2025 14:00:37 +0100 Subject: [PATCH 634/807] fuse: respect FOPEN_KEEP_CACHE on opendir The re-factoring of fuse_dir_open() missed the need to invalidate directory inode page cache with open flag FOPEN_KEEP_CACHE. 
Fixes: 7de64d521bf92 ("fuse: break up fuse_open_common()") Reported-by: Prince Kumar Closes: https://lore.kernel.org/linux-fsdevel/CAEW=TRr7CYb4LtsvQPLj-zx5Y+EYBmGfM24SuzwyDoGVNoKm7w@mail.gmail.com/ Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250101130037.96680-1-amir73il@gmail.com Reviewed-by: Bernd Schubert Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 494ac372ace0..e540d05549ff 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1681,6 +1681,8 @@ static int fuse_dir_open(struct inode *inode, struct file *file) */ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) nonseekable_open(inode, file); + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err; From 522908140645865dc3e2fac70fd3b28834dfa7be Mon Sep 17 00:00:00 2001 From: Liankun Yang Date: Wed, 18 Dec 2024 19:34:07 +0800 Subject: [PATCH 635/807] drm/mediatek: Add return value check when reading DPCD Check the return value of drm_dp_dpcd_readb() to confirm that AUX communication is successful. To simplify the code, replace drm_dp_dpcd_readb() and DP_GET_SINK_COUNT() with drm_dp_read_sink_count(). 
Fixes: f70ac097a2cf ("drm/mediatek: Add MT8195 Embedded DisplayPort driver") Signed-off-by: Liankun Yang Reviewed-by: Guillaume Ranquet Link: https://patchwork.kernel.org/project/dri-devel/patch/20241218113448.2992-1-liankun.yang@mediatek.com/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_dp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_dp.c b/drivers/gpu/drm/mediatek/mtk_dp.c index e945c1204837..dcf4d23d11ba 100644 --- a/drivers/gpu/drm/mediatek/mtk_dp.c +++ b/drivers/gpu/drm/mediatek/mtk_dp.c @@ -2103,7 +2103,6 @@ static enum drm_connector_status mtk_dp_bdg_detect(struct drm_bridge *bridge) struct mtk_dp *mtk_dp = mtk_dp_from_bridge(bridge); enum drm_connector_status ret = connector_status_disconnected; bool enabled = mtk_dp->enabled; - u8 sink_count = 0; if (!mtk_dp->train_info.cable_plugged_in) return ret; @@ -2118,8 +2117,8 @@ static enum drm_connector_status mtk_dp_bdg_detect(struct drm_bridge *bridge) * function, we just need to check the HPD connection to check * whether we connect to a sink device. */ - drm_dp_dpcd_readb(&mtk_dp->aux, DP_SINK_COUNT, &sink_count); - if (DP_GET_SINK_COUNT(sink_count)) + + if (drm_dp_read_sink_count(&mtk_dp->aux) > 0) ret = connector_status_connected; if (!enabled) From f563dd9ca6cb6ed52c5fb6e4285d1ef26cfa7e8a Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 12 Nov 2024 11:50:30 +0100 Subject: [PATCH 636/807] drm/mediatek: Initialize pointer in mtk_drm_of_ddp_path_build_one() The struct device_node *next pointer is not initialized, and it is used in an error path in which it may have never been modified by function mtk_drm_of_get_ddp_ep_cid(). Since the error path is relying on that pointer being NULL for the OVL Adaptor and/or invalid component check and since said pointer is being used in prints for %pOF, in the case that it points to a bogus address, the print may cause a KP. 
To resolve that, initialize the *next pointer to NULL before usage. Fixes: 4c932840db1d ("drm/mediatek: Implement OF graphs support for display paths") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/dri-devel/633f3c6d-d09f-447c-95f1-dfb4114c50e6@stanley.mountain/ Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: CK Hu Reviewed-by: Alexandre Mergnat Link: https://patchwork.kernel.org/project/dri-devel/patch/20241112105030.93337-1-angelogioacchino.delregno@collabora.com/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index f0f3d545ff19..c86decee6ec9 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -905,7 +905,7 @@ static int mtk_drm_of_ddp_path_build_one(struct device *dev, enum mtk_crtc_path const unsigned int **out_path, unsigned int *out_path_len) { - struct device_node *next, *prev, *vdo = dev->parent->of_node; + struct device_node *next = NULL, *prev, *vdo = dev->parent->of_node; unsigned int temp_path[DDP_COMPONENT_DRM_ID_MAX] = { 0 }; unsigned int *final_ddp_path; unsigned short int idx = 0; From 1e9b0e1c550c42c13c111d1a31e822057232abc4 Mon Sep 17 00:00:00 2001 From: Antonio Pastor Date: Thu, 2 Jan 2025 20:23:00 -0500 Subject: [PATCH 637/807] net: 802: LLC+SNAP OID:PID lookup on start of skb data 802.2+LLC+SNAP frames received by napi_complete_done() with GRO and DSA have skb->transport_header set two bytes short, or pointing 2 bytes before network_header & skb->data. This was an issue as snap_rcv() expected offset to point to SNAP header (OID:PID), causing packet to be dropped. A fix at llc_fixup_skb() (a024e377efed) resets transport_header for any LLC consumers that may care about it, and stops SNAP packets from being dropped, but doesn't fix the problem which is that LLC and SNAP should not use transport_header offset. 
This patch eliminates the use of transport_header offset for SNAP lookup of OID:PID so that SNAP does not rely on the offset at all. The offset is reset after pull for any SNAP packet consumers that may (but shouldn't) use it. Fixes: fda55eca5a33 ("net: introduce skb_transport_header_was_set()") Signed-off-by: Antonio Pastor Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103012303.746521-1-antonio.pastor@gmail.com Signed-off-by: Jakub Kicinski --- net/802/psnap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/802/psnap.c b/net/802/psnap.c index fca9d454905f..389df460c8c4 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -55,11 +55,11 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, goto drop; rcu_read_lock(); - proto = find_snap_client(skb_transport_header(skb)); + proto = find_snap_client(skb->data); if (proto) { /* Pass the frame on. */ - skb->transport_header += 5; skb_pull_rcsum(skb, 5); + skb_reset_transport_header(skb); rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } rcu_read_unlock(); From 3479c7549fb1dfa7a1db4efb7347c7b8ef50de4b Mon Sep 17 00:00:00 2001 From: Zhongqiu Duan Date: Thu, 2 Jan 2025 17:14:26 +0000 Subject: [PATCH 638/807] tcp/dccp: allow a connection when sk_max_ack_backlog is zero If the backlog of listen() is set to zero, sk_acceptq_is_full() allows one connection to be made, but inet_csk_reqsk_queue_is_full() does not. When the net.ipv4.tcp_syncookies is zero, inet_csk_reqsk_queue_is_full() will cause an immediate drop before the sk_acceptq_is_full() check in tcp_conn_request(), resulting in no connection can be made. This patch tries to keep consistent with 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.").
Link: https://lore.kernel.org/netdev/20250102080258.53858-1-kuniyu@amazon.com/ Fixes: ef547f2ac16b ("tcp: remove max_qlen_log") Signed-off-by: Zhongqiu Duan Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250102171426.915276-1-dzq.aishenghu0@gmail.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 3c82fad904d4..c7f42844c79a 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -282,7 +282,7 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk) static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { - return inet_csk_reqsk_queue_len(sk) >= READ_ONCE(sk->sk_max_ack_backlog); + return inet_csk_reqsk_queue_len(sk) > READ_ONCE(sk->sk_max_ack_backlog); } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); From a039e54397c6a75b713b9ce7894a62e06956aa92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jan 2025 10:45:46 +0000 Subject: [PATCH 639/807] net_sched: cls_flow: validate TCA_FLOW_RSHIFT attribute syzbot found that TCA_FLOW_RSHIFT attribute was not validated. Right shifting a 32bit integer is undefined for large shift values.
UBSAN: shift-out-of-bounds in net/sched/cls_flow.c:329:23 shift exponent 9445 is too large for 32-bit type 'u32' (aka 'unsigned int') CPU: 1 UID: 0 PID: 54 Comm: kworker/u8:3 Not tainted 6.13.0-rc3-syzkaller-00180-g4f619d518db9 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: ipv6_addrconf addrconf_dad_work Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:231 [inline] __ubsan_handle_shift_out_of_bounds+0x3c8/0x420 lib/ubsan.c:468 flow_classify+0x24d5/0x25b0 net/sched/cls_flow.c:329 tc_classify include/net/tc_wrapper.h:197 [inline] __tcf_classify net/sched/cls_api.c:1771 [inline] tcf_classify+0x420/0x1160 net/sched/cls_api.c:1867 sfb_classify net/sched/sch_sfb.c:260 [inline] sfb_enqueue+0x3ad/0x18b0 net/sched/sch_sfb.c:318 dev_qdisc_enqueue+0x4b/0x290 net/core/dev.c:3793 __dev_xmit_skb net/core/dev.c:3889 [inline] __dev_queue_xmit+0xf0e/0x3f50 net/core/dev.c:4400 dev_queue_xmit include/linux/netdevice.h:3168 [inline] neigh_hh_output include/net/neighbour.h:523 [inline] neigh_output include/net/neighbour.h:537 [inline] ip_finish_output2+0xd41/0x1390 net/ipv4/ip_output.c:236 iptunnel_xmit+0x55d/0x9b0 net/ipv4/ip_tunnel_core.c:82 udp_tunnel_xmit_skb+0x262/0x3b0 net/ipv4/udp_tunnel_core.c:173 geneve_xmit_skb drivers/net/geneve.c:916 [inline] geneve_xmit+0x21dc/0x2d00 drivers/net/geneve.c:1039 __netdev_start_xmit include/linux/netdevice.h:5002 [inline] netdev_start_xmit include/linux/netdevice.h:5011 [inline] xmit_one net/core/dev.c:3590 [inline] dev_hard_start_xmit+0x27a/0x7d0 net/core/dev.c:3606 __dev_queue_xmit+0x1b73/0x3f50 net/core/dev.c:4434 Fixes: e5dfb815181f ("[NET_SCHED]: Add flow classifier") Reported-by: syzbot+1dbb57d994e54aaa04d2@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6777bf49.050a0220.178762.0040.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Link: 
https://patch.msgid.link/20250103104546.3714168-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/cls_flow.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5502998aace7..5c2580a07530 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -356,7 +356,8 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_KEYS] = { .type = NLA_U32 }, [TCA_FLOW_MODE] = { .type = NLA_U32 }, [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, - [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, + [TCA_FLOW_RSHIFT] = NLA_POLICY_MAX(NLA_U32, + 31 /* BITS_PER_U32 - 1 */), [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, [TCA_FLOW_MASK] = { .type = NLA_U32 }, [TCA_FLOW_XOR] = { .type = NLA_U32 }, From e95274dfe86490ec2a5633035c24b2de6722841f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Jan 2025 10:24:58 -0800 Subject: [PATCH 640/807] selftests: tc-testing: reduce rshift value After previous change rshift >= 32 is no longer allowed. Modify the test to use 31, the test doesn't seem to send any traffic so the exact value shouldn't matter. 
Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103182458.1213486-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/tc-tests/filters/flow.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json index 996448afe31b..91d120548bf5 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json @@ -78,10 +78,10 @@ "setup": [ "$TC qdisc add dev $DEV1 ingress" ], - "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0xff", + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0x1f", "expExitCode": "0", "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 protocol ip prio 1 flow", - "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 255 baseclass", + "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 31 baseclass", "matchCount": "1", "teardown": [ "$TC qdisc del dev $DEV1 ingress" From c83c846231db8b153bfcb44d552d373c34f78245 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 4 Jan 2025 18:29:02 +0000 Subject: [PATCH 641/807] io_uring/timeout: fix multishot updates After update only the first shot of a multishot timeout request adheres to the new timeout value while all subsequent retries continue to use the old value. Don't forget to update the timeout stored in struct io_timeout_data. 
Cc: stable@vger.kernel.org Fixes: ea97f6c8558e8 ("io_uring: add support for multishot timeouts") Reported-by: Christian Mazakas Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e6516c3304eb654ec234cfa65c88a9579861e597.1736015288.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 362689b17ccc..e9cec9e4dc2f 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -427,10 +427,12 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, timeout->off = 0; /* noseq */ data = req->async_data; + data->ts = *ts; + list_add_tail(&timeout->list, &ctx->timeout_list); hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode); return 0; } From 8ce4f287524c74a118b0af1eebd4b24a8efca57a Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Fri, 3 Jan 2025 16:10:13 +0800 Subject: [PATCH 642/807] net: libwx: fix firmware mailbox abnormal return The existing SW-FW interaction flow on the driver is wrong. Following this wrong flow, the driver would never return an error if there is an unknown command. Since firmware writes back 'firmware ready' and 'unknown command' in the mailbox message if there is an unknown command sent by driver. So reading 'firmware ready' does not timeout. Then driver would mistakenly believe that the interaction has completed successfully. It tends to happen with the use of custom firmware. Move the check for 'unknown command' out of the poll timeout for 'firmware ready'. And adjust the debug log so that mailbox messages are always printed when commands timeout.
Fixes: 1efa9bfe58c5 ("net: libwx: Implement interaction with firmware") Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/20250103081013.1995939-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 24 ++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index 1bf9c38e4125..deaf670c160e 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -334,27 +334,25 @@ int wx_host_interface_command(struct wx *wx, u32 *buffer, status = read_poll_timeout(rd32, hicr, hicr & WX_MNG_MBOX_CTL_FWRDY, 1000, timeout * 1000, false, wx, WX_MNG_MBOX_CTL); + buf[0] = rd32(wx, WX_MNG_MBOX); + if ((buf[0] & 0xff0000) >> 16 == 0x80) { + wx_err(wx, "Unknown FW command: 0x%x\n", buffer[0] & 0xff); + status = -EINVAL; + goto rel_out; + } + /* Check command completion */ if (status) { - wx_dbg(wx, "Command has failed with no status valid.\n"); - - buf[0] = rd32(wx, WX_MNG_MBOX); - if ((buffer[0] & 0xff) != (~buf[0] >> 24)) { - status = -EINVAL; - goto rel_out; - } - if ((buf[0] & 0xff0000) >> 16 == 0x80) { - wx_dbg(wx, "It's unknown cmd.\n"); - status = -EINVAL; - goto rel_out; - } - + wx_err(wx, "Command has failed with no status valid.\n"); wx_dbg(wx, "write value:\n"); for (i = 0; i < dword_len; i++) wx_dbg(wx, "%x ", buffer[i]); wx_dbg(wx, "read value:\n"); for (i = 0; i < dword_len; i++) wx_dbg(wx, "%x ", buf[i]); + wx_dbg(wx, "\ncheck: %x %x\n", buffer[0] & 0xff, ~buf[0] >> 24); + + goto rel_out; } if (!return_data) From 5e7f0efd23238039bcd4fc72ff28d94f364ec26b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thi=C3=A9baud=20Weksteen?= Date: Fri, 20 Dec 2024 15:25:58 +1100 Subject: [PATCH 643/807] selinux: match extended permissions to their base permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit d1d991efaf34 
("selinux: Add netlink xperm support") a new extended permission was added ("nlmsg"). This was the second extended permission implemented in selinux ("ioctl" being the first one). Extended permissions are associated with a base permission. It was found that, in the access vector cache (avc), the extended permission did not keep track of its base permission. This is an issue for a domain that is using both extended permissions (i.e., a domain calling ioctl() on a netlink socket). In this case, the extended permissions were overlapping. Keep track of the base permission in the cache. A new field "base_perm" is added to struct extended_perms_decision to make sure that the extended permission refers to the correct policy permission. A new field "base_perms" is added to struct extended_perms to quickly decide if extended permissions apply. While it is in theory possible to retrieve the base permission from the access vector, the same base permission may not be mapped to the same bit for each class (e.g., "nlmsg" is mapped to a different bit for "netlink_route_socket" and "netlink_audit_socket"). Instead, use a constant (AVC_EXT_IOCTL or AVC_EXT_NLMSG) provided by the caller. Fixes: d1d991efaf34 ("selinux: Add netlink xperm support") Signed-off-by: Thiébaud Weksteen Signed-off-by: Paul Moore --- security/selinux/avc.c | 61 ++++++++++++++++------------- security/selinux/hooks.c | 6 +-- security/selinux/include/avc.h | 5 ++- security/selinux/include/security.h | 3 ++ security/selinux/ss/services.c | 28 +++++++++---- 5 files changed, 65 insertions(+), 38 deletions(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index cc0b0af20296..1f2680bcc43a 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -174,13 +174,15 @@ int avc_get_hash_stats(char *page) * using a linked list for extended_perms_decision lookup because the list is * always small. i.e. 
less than 5, typically 1 */ -static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver, - struct avc_xperms_node *xp_node) +static struct extended_perms_decision * +avc_xperms_decision_lookup(u8 driver, u8 base_perm, + struct avc_xperms_node *xp_node) { struct avc_xperms_decision_node *xpd_node; list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) { - if (xpd_node->xpd.driver == driver) + if (xpd_node->xpd.driver == driver && + xpd_node->xpd.base_perm == base_perm) return &xpd_node->xpd; } return NULL; @@ -205,11 +207,12 @@ avc_xperms_has_perm(struct extended_perms_decision *xpd, } static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node, - u8 driver, u8 perm) + u8 driver, u8 base_perm, u8 perm) { struct extended_perms_decision *xpd; security_xperm_set(xp_node->xp.drivers.p, driver); - xpd = avc_xperms_decision_lookup(driver, xp_node); + xp_node->xp.base_perms |= base_perm; + xpd = avc_xperms_decision_lookup(driver, base_perm, xp_node); if (xpd && xpd->allowed) security_xperm_set(xpd->allowed->p, perm); } @@ -245,6 +248,7 @@ static void avc_xperms_free(struct avc_xperms_node *xp_node) static void avc_copy_xperms_decision(struct extended_perms_decision *dest, struct extended_perms_decision *src) { + dest->base_perm = src->base_perm; dest->driver = src->driver; dest->used = src->used; if (dest->used & XPERMS_ALLOWED) @@ -272,6 +276,7 @@ static inline void avc_quick_copy_xperms_decision(u8 perm, */ u8 i = perm >> 5; + dest->base_perm = src->base_perm; dest->used = src->used; if (dest->used & XPERMS_ALLOWED) dest->allowed->p[i] = src->allowed->p[i]; @@ -357,6 +362,7 @@ static int avc_xperms_populate(struct avc_node *node, memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p)); dest->xp.len = src->xp.len; + dest->xp.base_perms = src->xp.base_perms; /* for each source xpd allocate a destination xpd and copy */ list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) { @@ -807,6 +813,7 @@ out: * @event : Updating event * 
@perms : Permission mask bits * @driver: xperm driver information + * @base_perm: the base permission associated with the extended permission * @xperm: xperm permissions * @ssid: AVC entry source sid * @tsid: AVC entry target sid @@ -820,10 +827,9 @@ out: * otherwise, this function updates the AVC entry. The original AVC-entry object * will release later by RCU. */ -static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, - u32 tsid, u16 tclass, u32 seqno, - struct extended_perms_decision *xpd, - u32 flags) +static int avc_update_node(u32 event, u32 perms, u8 driver, u8 base_perm, + u8 xperm, u32 ssid, u32 tsid, u16 tclass, u32 seqno, + struct extended_perms_decision *xpd, u32 flags) { u32 hvalue; int rc = 0; @@ -880,7 +886,7 @@ static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, case AVC_CALLBACK_GRANT: node->ae.avd.allowed |= perms; if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS)) - avc_xperms_allow_perm(node->ae.xp_node, driver, xperm); + avc_xperms_allow_perm(node->ae.xp_node, driver, base_perm, xperm); break; case AVC_CALLBACK_TRY_REVOKE: case AVC_CALLBACK_REVOKE: @@ -987,10 +993,9 @@ static noinline void avc_compute_av(u32 ssid, u32 tsid, u16 tclass, avc_insert(ssid, tsid, tclass, avd, xp_node); } -static noinline int avc_denied(u32 ssid, u32 tsid, - u16 tclass, u32 requested, - u8 driver, u8 xperm, unsigned int flags, - struct av_decision *avd) +static noinline int avc_denied(u32 ssid, u32 tsid, u16 tclass, u32 requested, + u8 driver, u8 base_perm, u8 xperm, + unsigned int flags, struct av_decision *avd) { if (flags & AVC_STRICT) return -EACCES; @@ -999,7 +1004,7 @@ static noinline int avc_denied(u32 ssid, u32 tsid, !(avd->flags & AVD_FLAGS_PERMISSIVE)) return -EACCES; - avc_update_node(AVC_CALLBACK_GRANT, requested, driver, + avc_update_node(AVC_CALLBACK_GRANT, requested, driver, base_perm, xperm, ssid, tsid, tclass, avd->seqno, NULL, flags); return 0; } @@ -1012,7 +1017,8 @@ static noinline int 
avc_denied(u32 ssid, u32 tsid, * driver field is used to specify which set contains the permission. */ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, - u8 driver, u8 xperm, struct common_audit_data *ad) + u8 driver, u8 base_perm, u8 xperm, + struct common_audit_data *ad) { struct avc_node *node; struct av_decision avd; @@ -1047,22 +1053,23 @@ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, local_xpd.auditallow = &auditallow; local_xpd.dontaudit = &dontaudit; - xpd = avc_xperms_decision_lookup(driver, xp_node); + xpd = avc_xperms_decision_lookup(driver, base_perm, xp_node); if (unlikely(!xpd)) { /* * Compute the extended_perms_decision only if the driver - * is flagged + * is flagged and the base permission is known. */ - if (!security_xperm_test(xp_node->xp.drivers.p, driver)) { + if (!security_xperm_test(xp_node->xp.drivers.p, driver) || + !(xp_node->xp.base_perms & base_perm)) { avd.allowed &= ~requested; goto decision; } rcu_read_unlock(); - security_compute_xperms_decision(ssid, tsid, tclass, - driver, &local_xpd); + security_compute_xperms_decision(ssid, tsid, tclass, driver, + base_perm, &local_xpd); rcu_read_lock(); - avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, - driver, xperm, ssid, tsid, tclass, avd.seqno, + avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver, + base_perm, xperm, ssid, tsid, tclass, avd.seqno, &local_xpd, 0); } else { avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd); @@ -1075,8 +1082,8 @@ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, decision: denied = requested & ~(avd.allowed); if (unlikely(denied)) - rc = avc_denied(ssid, tsid, tclass, requested, - driver, xperm, AVC_EXTENDED_PERMS, &avd); + rc = avc_denied(ssid, tsid, tclass, requested, driver, + base_perm, xperm, AVC_EXTENDED_PERMS, &avd); rcu_read_unlock(); @@ -1110,7 +1117,7 @@ static noinline int avc_perm_nonode(u32 ssid, u32 tsid, u16 tclass, avc_compute_av(ssid, tsid, tclass, 
avd, &xp_node); denied = requested & ~(avd->allowed); if (unlikely(denied)) - return avc_denied(ssid, tsid, tclass, requested, 0, 0, + return avc_denied(ssid, tsid, tclass, requested, 0, 0, 0, flags, avd); return 0; } @@ -1158,7 +1165,7 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid, rcu_read_unlock(); if (unlikely(denied)) - return avc_denied(ssid, tsid, tclass, requested, 0, 0, + return avc_denied(ssid, tsid, tclass, requested, 0, 0, 0, flags, avd); return 0; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index f5a08f94e094..011d9121b3ab 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3688,8 +3688,8 @@ static int ioctl_has_perm(const struct cred *cred, struct file *file, return 0; isec = inode_security(inode); - rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass, - requested, driver, xperm, &ad); + rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass, requested, + driver, AVC_EXT_IOCTL, xperm, &ad); out: return rc; } @@ -5952,7 +5952,7 @@ static int nlmsg_sock_has_extended_perms(struct sock *sk, u32 perms, u16 nlmsg_t xperm = nlmsg_type & 0xff; return avc_has_extended_perms(current_sid(), sksec->sid, sksec->sclass, - perms, driver, xperm, &ad); + perms, driver, AVC_EXT_NLMSG, xperm, &ad); } static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb) diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index 96a614d47df8..281f40103663 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -136,8 +136,11 @@ int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested, int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata); +#define AVC_EXT_IOCTL (1 << 0) /* Cache entry for an ioctl extended permission */ +#define AVC_EXT_NLMSG (1 << 1) /* Cache entry for an nlmsg extended permission */ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, - u8 driver, u8 perm, 
struct common_audit_data *ad); + u8 driver, u8 base_perm, u8 perm, + struct common_audit_data *ad); u32 avc_policy_seqno(void); diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index c7f2731abd03..700bd6c8bb38 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -239,6 +239,7 @@ struct extended_perms_data { struct extended_perms_decision { u8 used; u8 driver; + u8 base_perm; struct extended_perms_data *allowed; struct extended_perms_data *auditallow; struct extended_perms_data *dontaudit; @@ -246,6 +247,7 @@ struct extended_perms_decision { struct extended_perms { u16 len; /* length associated decision chain */ + u8 base_perms; /* which base permissions are covered */ struct extended_perms_data drivers; /* flag drivers that are used */ }; @@ -257,6 +259,7 @@ void security_compute_av(u32 ssid, u32 tsid, u16 tclass, struct extended_perms *xperms); void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 tclass, u8 driver, + u8 base_perm, struct extended_perms_decision *xpermd); void security_compute_av_user(u32 ssid, u32 tsid, u16 tclass, diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 3d5c563cfc4c..d9f58b5d0f49 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -582,7 +582,7 @@ static void type_attribute_bounds_av(struct policydb *policydb, } /* - * Flag which drivers have permissions. + * Flag which drivers have permissions and which base permissions are covered. 
*/ void services_compute_xperms_drivers( struct extended_perms *xperms, @@ -592,12 +592,19 @@ void services_compute_xperms_drivers( switch (node->datum.u.xperms->specified) { case AVTAB_XPERMS_IOCTLDRIVER: + xperms->base_perms |= AVC_EXT_IOCTL; /* if one or more driver has all permissions allowed */ for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++) xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i]; break; case AVTAB_XPERMS_IOCTLFUNCTION: + xperms->base_perms |= AVC_EXT_IOCTL; + /* if allowing permissions within a driver */ + security_xperm_set(xperms->drivers.p, + node->datum.u.xperms->driver); + break; case AVTAB_XPERMS_NLMSG: + xperms->base_perms |= AVC_EXT_NLMSG; /* if allowing permissions within a driver */ security_xperm_set(xperms->drivers.p, node->datum.u.xperms->driver); @@ -631,8 +638,7 @@ static void context_struct_compute_av(struct policydb *policydb, avd->auditallow = 0; avd->auditdeny = 0xffffffff; if (xperms) { - memset(&xperms->drivers, 0, sizeof(xperms->drivers)); - xperms->len = 0; + memset(xperms, 0, sizeof(*xperms)); } if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) { @@ -969,13 +975,19 @@ void services_compute_xperms_decision(struct extended_perms_decision *xpermd, { switch (node->datum.u.xperms->specified) { case AVTAB_XPERMS_IOCTLFUNCTION: - case AVTAB_XPERMS_NLMSG: - if (xpermd->driver != node->datum.u.xperms->driver) + if (xpermd->base_perm != AVC_EXT_IOCTL || + xpermd->driver != node->datum.u.xperms->driver) return; break; case AVTAB_XPERMS_IOCTLDRIVER: - if (!security_xperm_test(node->datum.u.xperms->perms.p, - xpermd->driver)) + if (xpermd->base_perm != AVC_EXT_IOCTL || + !security_xperm_test(node->datum.u.xperms->perms.p, + xpermd->driver)) + return; + break; + case AVTAB_XPERMS_NLMSG: + if (xpermd->base_perm != AVC_EXT_NLMSG || + xpermd->driver != node->datum.u.xperms->driver) return; break; default: @@ -1010,6 +1022,7 @@ void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 orig_tclass, u8 driver, + u8 
base_perm, struct extended_perms_decision *xpermd) { struct selinux_policy *policy; @@ -1023,6 +1036,7 @@ void security_compute_xperms_decision(u32 ssid, struct ebitmap_node *snode, *tnode; unsigned int i, j; + xpermd->base_perm = base_perm; xpermd->driver = driver; xpermd->used = 0; memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p)); From 385443057f475e775fe1c66e77d4be9727f40973 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 3 Jan 2025 19:20:23 +0100 Subject: [PATCH 644/807] kbuild: pacman-pkg: provide versioned linux-api-headers package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Arch Linux glibc package contains a versioned dependency on "linux-api-headers". If the linux-api-headers package provided by pacman-pkg does not specify an explicit version this dependency is not satisfied. Fix the dependency by providing an explicit version. Fixes: c8578539deba ("kbuild: add script and target to generate pacman package") Signed-off-by: Thomas Weißschuh Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- scripts/package/PKGBUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD index f83493838cf9..dca706617adc 100644 --- a/scripts/package/PKGBUILD +++ b/scripts/package/PKGBUILD @@ -103,7 +103,7 @@ _package-headers() { _package-api-headers() { pkgdesc="Kernel headers sanitized for use in userspace" - provides=(linux-api-headers) + provides=(linux-api-headers="${pkgver}") conflicts=(linux-api-headers) _prologue From 9d89551994a430b50c4fffcb1e617a057fa76e20 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 5 Jan 2025 14:13:40 -0800 Subject: [PATCH 645/807] Linux 6.13-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 48e89108aa58..7904d5d88088 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 
-EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Baby Opossum Posse # *DOCUMENTATION* From 4c16e1cadcbcaf3c82d5fc310fbd34d0f5d0db7c Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Mon, 23 Dec 2024 23:30:50 +0800 Subject: [PATCH 646/807] ksmbd: fix a missing return value check bug In the smb2_send_interim_resp(), if ksmbd_alloc_work_struct() fails to allocate a node, it returns a NULL pointer to the in_work pointer. This can lead to an illegal memory write of in_work->response_buf when allocate_interim_rsp_buf() attempts to perform a kzalloc() on it. To address this issue, incorporating a check for the return value of ksmbd_alloc_work_struct() ensures that the function returns immediately upon allocation failure, thereby preventing the aforementioned illegal memory access. Fixes: 041bba4414cd ("ksmbd: fix wrong interim response on compound") Signed-off-by: Wentao Liang Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 23e21845f928..433e33c04039 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -695,6 +695,9 @@ void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status) struct smb2_hdr *rsp_hdr; struct ksmbd_work *in_work = ksmbd_alloc_work_struct(); + if (!in_work) + return; + if (allocate_interim_rsp_buf(in_work)) { pr_err("smb_allocate_rsp_buf failed!\n"); ksmbd_free_work_struct(in_work); From c7f3cd1b245dbdd846ae376cc022c22af8059717 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 30 Dec 2024 13:44:56 +0100 Subject: [PATCH 647/807] ksmbd: Remove unneeded if check in ksmbd_rdma_capable_netdev() Remove the unnecessary if check and assign the result directly. 
Signed-off-by: Thorsten Blum Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 0ef3c9f0bfeb..c3785a5434f9 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -2283,8 +2283,7 @@ out: ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); if (ibdev) { - if (rdma_frwr_is_supported(&ibdev->attrs)) - rdma_capable = true; + rdma_capable = rdma_frwr_is_supported(&ibdev->attrs); ib_device_put(ibdev); } } From dadf03cfd4eaa09f1d0e8b2521de1e11d3e3bec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 15:02:23 +0000 Subject: [PATCH 648/807] io_uring/cmd: rename struct uring_cache to io_uring_cmd_data In preparation for making this more generically available for ->uring_cmd() usage that needs stable command data, rename it and move it to io_uring/cmd.h instead. Signed-off-by: Jens Axboe Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 4 ++++ io_uring/io_uring.c | 2 +- io_uring/opdef.c | 3 ++- io_uring/uring_cmd.c | 10 +++++----- io_uring/uring_cmd.h | 4 ---- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index c189d36ad55e..24cff2b9b9d4 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -18,6 +18,10 @@ struct io_uring_cmd { u8 pdu[32]; /* available inline for free use */ }; +struct io_uring_cmd_data { + struct io_uring_sqe sqes[2]; +}; + static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) { return sqe->cmd; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b2736e3491b8..8ae6bf746fcc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -315,7 +315,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, 
sizeof(struct io_async_rw)); ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct uring_cache)); + sizeof(struct io_uring_cmd_data)); spin_lock_init(&ctx->msg_lock); ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_kiocb)); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index a2be3bbca5ff..c7746f67cc65 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "io_uring.h" #include "opdef.h" @@ -414,7 +415,7 @@ const struct io_issue_def io_issue_defs[] = { .plug = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = 2 * sizeof(struct io_uring_sqe), + .async_size = sizeof(struct io_uring_cmd_data), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index e2e8485932d6..eefc203a1214 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -16,10 +16,10 @@ #include "rsrc.h" #include "uring_cmd.h" -static struct uring_cache *io_uring_async_get(struct io_kiocb *req) +static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct uring_cache *cache; + struct io_uring_cmd_data *cache; cache = io_alloc_cache_get(&ctx->uring_cache); if (cache) { @@ -35,7 +35,7 @@ static struct uring_cache *io_uring_async_get(struct io_kiocb *req) static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct uring_cache *cache = req->async_data; + struct io_uring_cmd_data *cache = req->async_data; if (issue_flags & IO_URING_F_UNLOCKED) return; @@ -183,7 +183,7 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct uring_cache *cache; + struct io_uring_cmd_data *cache; cache = io_uring_async_get(req); if (unlikely(!cache)) @@ -256,7 
+256,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) ret = file->f_op->uring_cmd(ioucmd, issue_flags); if (ret == -EAGAIN) { - struct uring_cache *cache = req->async_data; + struct io_uring_cmd_data *cache = req->async_data; if (ioucmd->sqe != (void *) cache) memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx)); diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index a361f98664d2..515823ca68b8 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -struct uring_cache { - struct io_uring_sqe sqes[2]; -}; - int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); From 3347fa658a1baecd61b007787d031b729cd86537 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 15:02:24 +0000 Subject: [PATCH 649/807] io_uring/cmd: add per-op data to struct io_uring_cmd_data In case an op handler for ->uring_cmd() needs stable storage for user data, it can allocate io_uring_cmd_data->op_data and use it for the duration of the request. When the request gets cleaned up, uring_cmd will free it automatically. 
Signed-off-by: Jens Axboe Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 1 + io_uring/uring_cmd.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 24cff2b9b9d4..3df6636ec3a3 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -20,6 +20,7 @@ struct io_uring_cmd { struct io_uring_cmd_data { struct io_uring_sqe sqes[2]; + void *op_data; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index eefc203a1214..019d6f49ff20 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -23,12 +23,16 @@ static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req) cache = io_alloc_cache_get(&ctx->uring_cache); if (cache) { + cache->op_data = NULL; req->flags |= REQ_F_ASYNC_DATA; req->async_data = cache; return cache; } - if (!io_alloc_async_data(req)) - return req->async_data; + if (!io_alloc_async_data(req)) { + cache = req->async_data; + cache->op_data = NULL; + return cache; + } return NULL; } @@ -37,6 +41,11 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_uring_cmd_data *cache = req->async_data; + if (cache->op_data) { + kfree(cache->op_data); + cache->op_data = NULL; + } + if (issue_flags & IO_URING_F_UNLOCKED) return; if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { From b0af20d33f63c74985a6dd98344326e5111b2fea Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Fri, 3 Jan 2025 15:02:25 +0000 Subject: [PATCH 650/807] io_uring: add io_uring_cmd_get_async_data helper Add a helper function in include/linux/io_uring/cmd.h to read the async_data pointer from a struct io_uring_cmd. 
Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 3df6636ec3a3..b0aeec834c1d 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -118,4 +118,9 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd return cmd_to_io_kiocb(cmd)->task; } +static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->async_data; +} + #endif /* _LINUX_IO_URING_CMD_H */ From c21b89d495bab6ae7ce0a1592bb955e5e80127fd Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Fri, 3 Jan 2025 15:02:26 +0000 Subject: [PATCH 651/807] btrfs: don't read from userspace twice in btrfs_uring_encoded_read() If we return -EAGAIN the first time because we need to block, btrfs_uring_encoded_read() will get called twice. Take a copy of args, the iovs, and the iter the first time, as by the time we are called the second time these may have gone out of scope. 
Reported-by: Jens Axboe Fixes: 34310c442e17 ("btrfs: add io_uring command for encoded reads (ENCODED_READ ioctl)") Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 128 +++++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 60 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f8680e7cc974..149259180faa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4879,25 +4879,29 @@ out_fail: return ret; } +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; - struct btrfs_ioctl_encoded_io_args args = { 0 }; int ret; u64 disk_bytenr, disk_io_size; struct file *file; struct btrfs_inode *inode; struct btrfs_fs_info *fs_info; struct extent_io_tree *io_tree; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; u64 start, lockend; void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4911,43 +4915,64 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_encoded_io_args_32 args32; - copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); - if (copy_from_user(&args32, sqe_addr, copy_end)) { - ret = -EFAULT; - goto out_acct; - } - args.iov = compat_ptr(args32.iov); - args.iovcnt = args32.iovcnt; - args.offset = args32.offset; - args.flags = args32.flags; #else return -ENOTTY; #endif } else { copy_end = copy_end_kernel; - if 
(copy_from_user(&args, sqe_addr, copy_end)) { - ret = -EFAULT; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; goto out_acct; } + + io_uring_cmd_get_async_data(cmd)->op_data = data; + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (data->args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + data->iov = data->iovstack; + ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_free; + } } - if (args.flags != 0) - return -EINVAL; - - ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), - &iov, &iter); - if (ret < 0) - goto out_acct; - - if (iov_iter_count(&iter) == 0) { - ret = 0; - goto out_free; - } - - pos = args.offset; - ret = rw_verify_area(READ, file, &pos, args.len); + pos = data->args.offset; + ret = rw_verify_area(READ, file, &pos, data->args.len); if (ret < 0) goto out_free; @@ -4960,15 +4985,16 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue start = ALIGN_DOWN(pos, fs_info->sectorsize); lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, &disk_bytenr, &disk_io_size); if (ret < 0 && ret != -EIOCBQUEUED) goto out_free; file_accessed(file); - if (copy_to_user(sqe_addr + copy_end, (const char 
*)&args + copy_end_kernel, - sizeof(args) - copy_end_kernel)) { + if (copy_to_user(sqe_addr + copy_end, + (const char *)&data->args + copy_end_kernel, + sizeof(data->args) - copy_end_kernel)) { if (ret == -EIOCBQUEUED) { unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -4978,40 +5004,22 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue } if (ret == -EIOCBQUEUED) { - u64 count; - - /* - * If we've optimized things by storing the iovecs on the stack, - * undo this. - */ - if (!iov) { - iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); - if (!iov) { - unlock_extent(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - ret = -ENOMEM; - goto out_acct; - } - - memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); - } - - count = min_t(u64, iov_iter_count(&iter), disk_io_size); + u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size); /* Match ioctl by not returning past EOF if uncompressed. 
*/ - if (!args.compression) - count = min_t(u64, count, args.len); + if (!data->args.compression) + count = min_t(u64, count, data->args.len); - ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, - cached_state, disk_bytenr, - disk_io_size, count, - args.compression, iov, cmd); + ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend, + cached_state, disk_bytenr, disk_io_size, + count, data->args.compression, + data->iov, cmd); goto out_acct; } out_free: - kfree(iov); + kfree(data->iov); out_acct: if (ret > 0) From 1156b5e8be98c97087f8971609c852e418daf03b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 4 Jan 2025 17:20:57 +0530 Subject: [PATCH 652/807] regulator: Guard of_regulator_bulk_get_all() with CONFIG_OF Since the definition is in drivers/regulator/of_regulator.c and compiled only if CONFIG_OF is enabled, building the consumer driver without CONFIG_OF and with CONFIG_REGULATOR will result in below build error: ERROR: modpost: "of_regulator_bulk_get_all" [drivers/pci/pwrctrl/pci-pwrctl-slot.ko] undefined! 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412181640.12Iufkvd-lkp@intel.com/ Fixes: 27b9ecc7a9ba ("regulator: Add of_regulator_bulk_get_all") Signed-off-by: Manivannan Sadhasivam Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/20250104115058.19216-2-manivannan.sadhasivam@linaro.org Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 8c3c372ad735..85be83c8fa17 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -175,6 +175,8 @@ struct regulator *__must_check of_regulator_get_optional(struct device *dev, struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, struct device_node *node, const char *id); +int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers); #else static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, struct device_node *node, @@ -189,6 +191,13 @@ static inline struct regulator *__must_check devm_of_regulator_get_optional(stru { return ERR_PTR(-ENODEV); } + +static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers) +{ + return 0; +} + #endif int regulator_register_supply_alias(struct device *dev, const char *id, @@ -223,8 +232,6 @@ int regulator_disable_deferred(struct regulator *regulator, int ms); int __must_check regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); -int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers); int __must_check devm_regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); void 
devm_regulator_bulk_put(struct regulator_bulk_data *consumers); @@ -483,12 +490,6 @@ static inline int devm_regulator_bulk_get(struct device *dev, int num_consumers, return 0; } -static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers) -{ - return 0; -} - static inline int devm_regulator_bulk_get_const( struct device *dev, int num_consumers, const struct regulator_bulk_data *in_consumers, From 907af7d6e0c8cf4086b1bc5218281b2ca09f130b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 4 Jan 2025 17:20:58 +0530 Subject: [PATCH 653/807] regulator: Move OF_ API declarations/definitions outside CONFIG_REGULATOR Since these are hidden inside CONFIG_REGULATOR, building the consumer drivers without CONFIG_REGULATOR will result in the following build error: >> drivers/pci/pwrctrl/slot.c:39:15: error: implicit declaration of function 'of_regulator_bulk_get_all'; did you mean 'regulator_bulk_get'? [-Werror=implicit-function-declaration] 39 | ret = of_regulator_bulk_get_all(dev, dev_of_node(dev), | ^~~~~~~~~~~~~~~~~~~~~~~~~ | regulator_bulk_get cc1: some warnings being treated as errors This also removes the duplicated definitions that were possibly added to fix the build issues. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501020407.HmQQQKa0-lkp@intel.com/ Fixes: 27b9ecc7a9ba ("regulator: Add of_regulator_bulk_get_all") Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20250104115058.19216-3-manivannan.sadhasivam@linaro.org Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 78 ++++++++++++------------------ 1 file changed, 32 insertions(+), 46 deletions(-) diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 85be83c8fa17..bcba3935c6f9 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -168,38 +168,6 @@ int devm_regulator_get_enable_read_voltage(struct device *dev, const char *id); void regulator_put(struct regulator *regulator); void devm_regulator_put(struct regulator *regulator); -#if IS_ENABLED(CONFIG_OF) -struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id); -struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id); -int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers); -#else -static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers) -{ - return 0; -} - -#endif - int regulator_register_supply_alias(struct device *dev, const char *id, struct device *alias_dev, const char *alias_id); @@ -380,20 +348,6 @@ devm_regulator_get_optional(struct device *dev, const char 
*id) return ERR_PTR(-ENODEV); } -static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - static inline void regulator_put(struct regulator *regulator) { } @@ -701,6 +655,38 @@ regulator_is_equal(struct regulator *reg1, struct regulator *reg2) } #endif +#if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_REGULATOR) +struct regulator *__must_check of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id); +struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id); +int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers); +#else +static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} + +static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} + +static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers) +{ + return 0; +} + +#endif + static inline int regulator_set_voltage_triplet(struct regulator *regulator, int min_uV, int target_uV, int max_uV) From cd6313beaeaea0b2e6d428afef7a86a986b50abe Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 6 Jan 2025 06:10:24 -0800 Subject: [PATCH 654/807] Revert "vmstat: disable vmstat_work on vmstat_cpu_down_prep()" This reverts commit adcfb264c3ed51fbbf5068ddf10d309a63683868. 
It turns out this just causes a different warning splat instead that seems to be much easier to trigger, so let's revert ASAP. Reported-and-bisected-by: Borislav Petkov Tested-by: Breno Leitao Reported-by: Alexander Gordeev Link: https://lore.kernel.org/all/20250106131817.GAZ3vYGVr3-hWFFPLj@fat_crate.local/ Cc: Koichiro Den Cc: Sebastian Andrzej Siewior Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 0889b75cef14..4d016314a56c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2148,14 +2148,13 @@ static int vmstat_cpu_online(unsigned int cpu) if (!node_state(cpu_to_node(cpu), N_CPU)) { node_set_state(cpu_to_node(cpu), N_CPU); } - enable_delayed_work(&per_cpu(vmstat_work, cpu)); return 0; } static int vmstat_cpu_down_prep(unsigned int cpu) { - disable_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); return 0; } From 07aeefae7ff44d80524375253980b1bdee2396b0 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 5 Jan 2025 17:24:03 +0100 Subject: [PATCH 655/807] ovl: pass realinode to ovl_encode_real_fh() instead of realdentry We want to be able to encode an fid from an inode with no alias. 
Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250105162404.357058-2-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/overlayfs/copy_up.c | 11 ++++++----- fs/overlayfs/export.c | 5 +++-- fs/overlayfs/namei.c | 4 ++-- fs/overlayfs/overlayfs.h | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 56eee9f23ea9..0c28e5fa3407 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -415,13 +415,13 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, return err; } -struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, bool is_upper) { struct ovl_fh *fh; int fh_type, dwords; int buflen = MAX_HANDLE_SZ; - uuid_t *uuid = &real->d_sb->s_uuid; + uuid_t *uuid = &realinode->i_sb->s_uuid; int err; /* Make sure the real fid stays 32bit aligned */ @@ -438,7 +438,8 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, * the price or reconnecting the dentry. 
*/ dwords = buflen >> 2; - fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0); + fh_type = exportfs_encode_inode_fh(realinode, (void *)fh->fb.fid, + &dwords, NULL, 0); buflen = (dwords << 2); err = -EIO; @@ -479,7 +480,7 @@ struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin) if (!ovl_can_decode_fh(origin->d_sb)) return NULL; - return ovl_encode_real_fh(ofs, origin, false); + return ovl_encode_real_fh(ofs, d_inode(origin), false); } int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, @@ -504,7 +505,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, const struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, upper, true); + fh = ovl_encode_real_fh(ofs, d_inode(upper), true); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 5868cb222955..036c9f39a14d 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -223,6 +223,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, u32 *fid, int buflen) { + struct inode *inode = d_inode(dentry); struct ovl_fh *fh = NULL; int err, enc_lower; int len; @@ -236,8 +237,8 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, goto fail; /* Encode an upper or lower file handle */ - fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) : - ovl_dentry_upper(dentry), !enc_lower); + fh = ovl_encode_real_fh(ofs, enc_lower ? 
ovl_inode_lower(inode) : + ovl_inode_upper(inode), !enc_lower); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 7e27b7d4adee..cea820cb3b55 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -542,7 +542,7 @@ int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, real, is_upper); + fh = ovl_encode_real_fh(ofs, d_inode(real), is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) { fh = NULL; @@ -738,7 +738,7 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, origin, false); + fh = ovl_encode_real_fh(ofs, d_inode(origin), false); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index b361f35762be..0021e2025020 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -865,7 +865,7 @@ int ovl_copy_up_with_data(struct dentry *dentry); int ovl_maybe_copy_up(struct dentry *dentry, int flags); int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new); int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); -struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, bool is_upper); struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin); int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, From c45beebfde34aa71afbc48b2c54cdda623515037 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 5 Jan 2025 17:24:04 +0100 Subject: [PATCH 656/807] ovl: support encoding fid from inode with no alias Dmitry Safonov reported that a WARN_ON() assertion can be trigered by userspace when calling inotify_show_fdinfo() for an overlayfs watched inode, whose dentry aliases were discarded with drop_caches. 
The WARN_ON() assertion in inotify_show_fdinfo() was removed, because it is possible for encoding a file handle to fail for other reasons, but the impact of failing to encode an overlayfs file handle goes beyond this assertion. As shown in the LTP test case mentioned in the link below, failure to encode an overlayfs file handle from a non-aliased inode also leads to failure to report an fid with FAN_DELETE_SELF fanotify events. As Dmitry notes in his analysis of the problem, ovl_encode_fh() fails if it cannot find an alias for the inode, but this failure can be fixed. ovl_encode_fh() seldom uses the alias and in the case of non-decodable file handles, as is often the case with fanotify fid info, ovl_encode_fh() never needs to use the alias to encode a file handle. Defer finding an alias until it is actually needed so ovl_encode_fh() will not fail in the common case of FAN_DELETE_SELF fanotify events. Fixes: 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles") Reported-by: Dmitry Safonov Closes: https://lore.kernel.org/linux-fsdevel/CAOQ4uxiie81voLZZi2zXS1BziXZCM24nXqPAxbu8kxXCUWdwOg@mail.gmail.com/ Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250105162404.357058-3-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/overlayfs/export.c | 46 +++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 036c9f39a14d..444aeeccb6da 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -176,35 +176,37 @@ static int ovl_connect_layer(struct dentry *dentry) * * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
*/ -static int ovl_check_encode_origin(struct dentry *dentry) +static int ovl_check_encode_origin(struct inode *inode) { - struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct ovl_fs *ofs = OVL_FS(inode->i_sb); bool decodable = ofs->config.nfs_export; + struct dentry *dentry; + int err; /* No upper layer? */ if (!ovl_upper_mnt(ofs)) return 1; /* Lower file handle for non-upper non-decodable */ - if (!ovl_dentry_upper(dentry) && !decodable) + if (!ovl_inode_upper(inode) && !decodable) return 1; /* Upper file handle for pure upper */ - if (!ovl_dentry_lower(dentry)) + if (!ovl_inode_lower(inode)) return 0; /* * Root is never indexed, so if there's an upper layer, encode upper for * root. */ - if (dentry == dentry->d_sb->s_root) + if (inode == d_inode(inode->i_sb->s_root)) return 0; /* * Upper decodable file handle for non-indexed upper. */ - if (ovl_dentry_upper(dentry) && decodable && - !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + if (ovl_inode_upper(inode) && decodable && + !ovl_test_flag(OVL_INDEX, inode)) return 0; /* @@ -213,17 +215,25 @@ static int ovl_check_encode_origin(struct dentry *dentry) * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. 
*/ - if (d_is_dir(dentry) && decodable) - return ovl_connect_layer(dentry); + if (!decodable || !S_ISDIR(inode->i_mode)) + return 1; + + dentry = d_find_any_alias(inode); + if (!dentry) + return -ENOENT; + + err = ovl_connect_layer(dentry); + dput(dentry); + if (err < 0) + return err; /* Lower file handle for indexed and non-upper dir/non-dir */ return 1; } -static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, +static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct inode *inode, u32 *fid, int buflen) { - struct inode *inode = d_inode(dentry); struct ovl_fh *fh = NULL; int err, enc_lower; int len; @@ -232,7 +242,7 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, * Check if we should encode a lower or upper file handle and maybe * copy up an ancestor to make lower file handle connectable. */ - err = enc_lower = ovl_check_encode_origin(dentry); + err = enc_lower = ovl_check_encode_origin(inode); if (enc_lower < 0) goto fail; @@ -252,8 +262,8 @@ out: return err; fail: - pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n", - dentry, err); + pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n", + inode->i_ino, err); goto out; } @@ -261,19 +271,13 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); - struct dentry *dentry; int bytes, buflen = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) return FILEID_INVALID; - dentry = d_find_any_alias(inode); - if (!dentry) - return FILEID_INVALID; - - bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen); - dput(dentry); + bytes = ovl_dentry_to_fid(ofs, inode, fid, buflen); if (bytes <= 0) return FILEID_INVALID; From 6aecd91a5c5b68939cf4169e32bc49f3cd2dd329 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 2 Jan 2025 14:44:16 +1030 Subject: [PATCH 657/807] btrfs: avoid NULL pointer dereference if no valid extent tree [BUG] Syzbot reported a crash 
with the following call trace: BTRFS info (device loop0): scrub: started on devid 1 BUG: kernel NULL pointer dereference, address: 0000000000000208 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 106e70067 P4D 106e70067 PUD 107143067 PMD 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 UID: 0 PID: 689 Comm: repro Kdump: loaded Tainted: G O 6.13.0-rc4-custom+ #206 Tainted: [O]=OOT_MODULE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 02/02/2022 RIP: 0010:find_first_extent_item+0x26/0x1f0 [btrfs] Call Trace: scrub_find_fill_first_stripe+0x13d/0x3b0 [btrfs] scrub_simple_mirror+0x175/0x260 [btrfs] scrub_stripe+0x5d4/0x6c0 [btrfs] scrub_chunk+0xbb/0x170 [btrfs] scrub_enumerate_chunks+0x2f4/0x5f0 [btrfs] btrfs_scrub_dev+0x240/0x600 [btrfs] btrfs_ioctl+0x1dc8/0x2fa0 [btrfs] ? do_sys_openat2+0xa5/0xf0 __x64_sys_ioctl+0x97/0xc0 do_syscall_64+0x4f/0x120 entry_SYSCALL_64_after_hwframe+0x76/0x7e [CAUSE] The reproducer is using a corrupted image where extent tree root is corrupted, thus forcing to use "rescue=all,ro" mount option to mount the image. Then it triggered a scrub, but since scrub relies on extent tree to find where the data/metadata extents are, scrub_find_fill_first_stripe() relies on a non-empty extent root. But unfortunately scrub_find_fill_first_stripe() doesn't really expect a NULL pointer for extent root, it uses extent_root to grab fs_info and triggers a NULL pointer dereference. [FIX] Add an extra check for a valid extent root at the beginning of scrub_find_fill_first_stripe(). The new error path is introduced by 42437a6386ff ("btrfs: introduce mount option rescue=ignorebadroots"), but that's pretty old, and later commit b979547513ff ("btrfs: scrub: introduce helper to find and fill sector info for a scrub_stripe") changed how we do scrub. So for kernels older than 6.6, the fix will need manual backport.
Reported-by: syzbot+339e9dbe3a2ca419b85d@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/67756935.050a0220.25abdd.0a12.GAE@google.com/ Fixes: 42437a6386ff ("btrfs: introduce mount option rescue=ignorebadroots") Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 204c928beaf9..531312efee8d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1541,6 +1541,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; + if (unlikely(!extent_root)) { + btrfs_err(fs_info, "no valid extent root for scrub"); + return -EUCLEAN; + } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * stripe->nr_sectors); scrub_stripe_reset_bitmaps(stripe); From 7467bc5959bf02ef5210ea7e7948e548565c799c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2024 07:43:43 +0100 Subject: [PATCH 658/807] btrfs: zoned: calculate max_extent_size properly on non-zoned setup Since commit 559218d43ec9 ("block: pre-calculate max_zone_append_sectors"), queue_limits's max_zone_append_sectors is default to be 0 and it is only updated when there is a zoned device. So, we have lim->max_zone_append_sectors = 0 when there is no zoned device in the filesystem. That leads to fs_info->max_zone_append_size and thus fs_info->max_extent_size to be 0, which is wrong and can for example lead to a divide by zero in count_max_extents(). Fix this by only capping fs_info->max_extent_size to fs_info->max_zone_append_size when it is non-zero. Based on a patch from Naohiro Aota , from which much of this commit message is stolen as well. 
Reported-by: Shinichiro Kawasaki Fixes: 559218d43ec9 ("block: pre-calculate max_zone_append_sectors") Tested-by: Shinichiro Kawasaki Reviewed-by: Johannes Thumshirn Reviewed-by: Naohiro Aota Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index cb32966380f5..5f9d3be1234a 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -745,8 +745,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) (u64)lim->max_segments << PAGE_SHIFT), fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; - if (fs_info->max_zone_append_size < fs_info->max_extent_size) - fs_info->max_extent_size = fs_info->max_zone_append_size; + + fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size, + fs_info->max_zone_append_size); /* * Check mount options here, because we might change fs_info->zoned From 0ee4736c003daded513de0ff112d4a1e9c85bbab Mon Sep 17 00:00:00 2001 From: Mikhail Zaslonko Date: Wed, 18 Dec 2024 11:32:51 +0100 Subject: [PATCH 659/807] btrfs: zlib: fix avail_in bytes for s390 zlib HW compression path Since the input data length passed to zlib_compress_folios() can be arbitrary, always setting strm.avail_in to a multiple of PAGE_SIZE may cause read-in bytes to exceed the input range. Currently this triggers an assert in btrfs_compress_folios() on the debug kernel (see below). Fix strm.avail_in calculation for S390 hardware acceleration path. assertion failed: *total_in <= orig_len, in fs/btrfs/compression.c:1041 ------------[ cut here ]------------ kernel BUG at fs/btrfs/compression.c:1041! 
monitor event: 0040 ilc:2 [#1] PREEMPT SMP CPU: 16 UID: 0 PID: 325 Comm: kworker/u273:3 Not tainted 6.13.0-20241204.rc1.git6.fae3b21430ca.300.fc41.s390x+debug #1 Hardware name: IBM 3931 A01 703 (z/VM 7.4.0) Workqueue: btrfs-delalloc btrfs_work_helper Krnl PSW : 0704d00180000000 0000021761df6538 (btrfs_compress_folios+0x198/0x1a0) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3 Krnl GPRS: 0000000080000000 0000000000000001 0000000000000047 0000000000000000 0000000000000006 ffffff01757bb000 000001976232fcc0 000000000000130c 000001976232fcd0 000001976232fcc8 00000118ff4a0e30 0000000000000001 00000111821ab400 0000011100000000 0000021761df6534 000001976232fb58 Krnl Code: 0000021761df6528: c020006f5ef4 larl %r2,0000021762be2310 0000021761df652e: c0e5ffbd09d5 brasl %r14,00000217615978d8 #0000021761df6534: af000000 mc 0,0 >0000021761df6538: 0707 bcr 0,%r7 0000021761df653a: 0707 bcr 0,%r7 0000021761df653c: 0707 bcr 0,%r7 0000021761df653e: 0707 bcr 0,%r7 0000021761df6540: c004004bb7ec brcl 0,000002176276d518 Call Trace: [<0000021761df6538>] btrfs_compress_folios+0x198/0x1a0 ([<0000021761df6534>] btrfs_compress_folios+0x194/0x1a0) [<0000021761d97788>] compress_file_range+0x3b8/0x6d0 [<0000021761dcee7c>] btrfs_work_helper+0x10c/0x160 [<0000021761645760>] process_one_work+0x2b0/0x5d0 [<000002176164637e>] worker_thread+0x20e/0x3e0 [<000002176165221a>] kthread+0x15a/0x170 [<00000217615b859c>] __ret_from_fork+0x3c/0x60 [<00000217626e72d2>] ret_from_fork+0xa/0x38 INFO: lockdep is turned off. 
Last Breaking-Event-Address: [<0000021761597924>] _printk+0x4c/0x58 Kernel panic - not syncing: Fatal exception: panic_on_oops Fixes: fd1e75d0105d ("btrfs: make compression path to be subpage compatible") CC: stable@vger.kernel.org # 6.12+ Acked-by: Ilya Leoshkevich Reviewed-by: Qu Wenruo Signed-off-by: Mikhail Zaslonko Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ddf0d5a448a7..c9e92c6941ec 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -174,10 +174,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; - workspace->strm.avail_in = - (in_buf_folios << PAGE_SHIFT); } workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = min(bytes_left, + in_buf_folios << PAGE_SHIFT); } else { unsigned int pg_off; unsigned int cur_len; From 5009628d8509dbb90e1b88e01eda00430fa24b4b Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Tue, 17 Dec 2024 14:03:50 -0700 Subject: [PATCH 660/807] drm/amd/display: Remove unnecessary amdgpu_irq_get/put [WHY & HOW] commit 7fb363c57522 ("drm/amd/display: Let drm_crtc_vblank_on/off manage interrupts") lets drm_crtc_vblank_* to manage interrupts in amdgpu_dm_crtc_set_vblank, and amdgpu_irq_get/put do not need to be called here. Part of that patch got lost somehow, so fix it up. 
Fixes: 7fb363c57522 ("drm/amd/display: Let drm_crtc_vblank_on/off manage interrupts") Reviewed-by: Alex Deucher Reviewed-by: Leo Li Signed-off-by: Alex Hung Signed-off-by: Alex Deucher (cherry picked from commit 3782305ce5807c18fbf092124b9e8303cf1723ae) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 48be917e7bc5..4c3b6e6151c5 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -8400,16 +8400,6 @@ static void manage_dm_interrupts(struct amdgpu_device *adev, struct amdgpu_crtc *acrtc, struct dm_crtc_state *acrtc_state) { - /* - * We have no guarantee that the frontend index maps to the same - * backend index - some even map to more than one. - * - * TODO: Use a different interrupt or check DC itself for the mapping. 
- */ - int irq_type = - amdgpu_display_crtc_idx_to_irq_type( - adev, - acrtc->crtc_id); struct drm_vblank_crtc_config config = {0}; struct dc_crtc_timing *timing; int offdelay; @@ -8435,28 +8425,7 @@ static void manage_dm_interrupts(struct amdgpu_device *adev, drm_crtc_vblank_on_config(&acrtc->base, &config); - - amdgpu_irq_get( - adev, - &adev->pageflip_irq, - irq_type); -#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) - amdgpu_irq_get( - adev, - &adev->vline0_irq, - irq_type); -#endif } else { -#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) - amdgpu_irq_put( - adev, - &adev->vline0_irq, - irq_type); -#endif - amdgpu_irq_put( - adev, - &adev->pageflip_irq, - irq_type); drm_crtc_vblank_off(&acrtc->base); } } From 7de8d5c90be9ad9f6575e818a674801db2ada794 Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Tue, 17 Dec 2024 17:45:03 -0300 Subject: [PATCH 661/807] drm/amd/display: fix page fault due to max surface definition mismatch DC driver is using two different values to define the maximum number of surfaces: MAX_SURFACES and MAX_SURFACE_NUM. Consolidate MAX_SURFACES as the unique definition for surface updates across DC. It fixes page fault faced by Cosmic users on AMD display versions that support two overlay planes, since the introduction of cursor overlay mode. 
[Nov26 21:33] BUG: unable to handle page fault for address: 0000000051d0f08b [ +0.000015] #PF: supervisor read access in kernel mode [ +0.000006] #PF: error_code(0x0000) - not-present page [ +0.000005] PGD 0 P4D 0 [ +0.000007] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI [ +0.000006] CPU: 4 PID: 71 Comm: kworker/u32:6 Not tainted 6.10.0+ #300 [ +0.000006] Hardware name: Valve Jupiter/Jupiter, BIOS F7A0131 01/30/2024 [ +0.000007] Workqueue: events_unbound commit_work [drm_kms_helper] [ +0.000040] RIP: 0010:copy_stream_update_to_stream.isra.0+0x30d/0x750 [amdgpu] [ +0.000847] Code: 8b 10 49 89 94 24 f8 00 00 00 48 8b 50 08 49 89 94 24 00 01 00 00 8b 40 10 41 89 84 24 08 01 00 00 49 8b 45 78 48 85 c0 74 0b <0f> b6 00 41 88 84 24 90 64 00 00 49 8b 45 60 48 85 c0 74 3b 48 8b [ +0.000010] RSP: 0018:ffffc203802f79a0 EFLAGS: 00010206 [ +0.000009] RAX: 0000000051d0f08b RBX: 0000000000000004 RCX: ffff9f964f0a8070 [ +0.000004] RDX: ffff9f9710f90e40 RSI: ffff9f96600c8000 RDI: ffff9f964f000000 [ +0.000004] RBP: ffffc203802f79f8 R08: 0000000000000000 R09: 0000000000000000 [ +0.000005] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9f96600c8000 [ +0.000004] R13: ffff9f9710f90e40 R14: ffff9f964f000000 R15: ffff9f96600c8000 [ +0.000004] FS: 0000000000000000(0000) GS:ffff9f9970000000(0000) knlGS:0000000000000000 [ +0.000005] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ +0.000005] CR2: 0000000051d0f08b CR3: 00000002e6a20000 CR4: 0000000000350ef0 [ +0.000005] Call Trace: [ +0.000011] [ +0.000010] ? __die_body.cold+0x19/0x27 [ +0.000012] ? page_fault_oops+0x15a/0x2d0 [ +0.000014] ? exc_page_fault+0x7e/0x180 [ +0.000009] ? asm_exc_page_fault+0x26/0x30 [ +0.000013] ? copy_stream_update_to_stream.isra.0+0x30d/0x750 [amdgpu] [ +0.000739] ? dc_commit_state_no_check+0xd6c/0xe70 [amdgpu] [ +0.000470] update_planes_and_stream_state+0x49b/0x4f0 [amdgpu] [ +0.000450] ? srso_return_thunk+0x5/0x5f [ +0.000009] ? 
commit_minimal_transition_state+0x239/0x3d0 [amdgpu] [ +0.000446] update_planes_and_stream_v2+0x24a/0x590 [amdgpu] [ +0.000464] ? srso_return_thunk+0x5/0x5f [ +0.000009] ? sort+0x31/0x50 [ +0.000007] ? amdgpu_dm_atomic_commit_tail+0x159f/0x3a30 [amdgpu] [ +0.000508] ? srso_return_thunk+0x5/0x5f [ +0.000009] ? amdgpu_crtc_get_scanout_position+0x28/0x40 [amdgpu] [ +0.000377] ? srso_return_thunk+0x5/0x5f [ +0.000009] ? drm_crtc_vblank_helper_get_vblank_timestamp_internal+0x160/0x390 [drm] [ +0.000058] ? srso_return_thunk+0x5/0x5f [ +0.000005] ? dma_fence_default_wait+0x8c/0x260 [ +0.000010] ? srso_return_thunk+0x5/0x5f [ +0.000005] ? wait_for_completion_timeout+0x13b/0x170 [ +0.000006] ? srso_return_thunk+0x5/0x5f [ +0.000005] ? dma_fence_wait_timeout+0x108/0x140 [ +0.000010] ? commit_tail+0x94/0x130 [drm_kms_helper] [ +0.000024] ? process_one_work+0x177/0x330 [ +0.000008] ? worker_thread+0x266/0x3a0 [ +0.000006] ? __pfx_worker_thread+0x10/0x10 [ +0.000004] ? kthread+0xd2/0x100 [ +0.000006] ? __pfx_kthread+0x10/0x10 [ +0.000006] ? ret_from_fork+0x34/0x50 [ +0.000004] ? __pfx_kthread+0x10/0x10 [ +0.000005] ? 
ret_from_fork_asm+0x1a/0x30 [ +0.000011] Fixes: 1b04dcca4fb1 ("drm/amd/display: Introduce overlay cursor mode") Suggested-by: Leo Li Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3693 Signed-off-by: Melissa Wen Reviewed-by: Rodrigo Siqueira Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher (cherry picked from commit 1c86c81a86c60f9b15d3e3f43af0363cf56063e7) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/core/dc.c | 2 +- drivers/gpu/drm/amd/display/dc/core/dc_state.c | 8 ++++---- drivers/gpu/drm/amd/display/dc/dc.h | 2 +- drivers/gpu/drm/amd/display/dc/dc_stream.h | 2 +- drivers/gpu/drm/amd/display/dc/dc_types.h | 1 - drivers/gpu/drm/amd/display/dc/dml2/dml2_mall_phantom.c | 2 +- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 49fe7dcf9372..2723558049d6 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -4510,7 +4510,7 @@ static bool commit_minimal_transition_based_on_current_context(struct dc *dc, struct pipe_split_policy_backup policy; struct dc_state *intermediate_context; struct dc_state *old_current_state = dc->current_state; - struct dc_surface_update srf_updates[MAX_SURFACE_NUM] = {0}; + struct dc_surface_update srf_updates[MAX_SURFACES] = {0}; int surface_count; /* diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c b/drivers/gpu/drm/amd/display/dc/core/dc_state.c index e006f816ff2f..1b2cce127981 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c @@ -483,9 +483,9 @@ bool dc_state_add_plane( if (stream_status == NULL) { dm_error("Existing stream not found; failed to attach surface!\n"); goto out; - } else if (stream_status->plane_count == MAX_SURFACE_NUM) { + } else if (stream_status->plane_count == MAX_SURFACES) { dm_error("Surface: can not attach plane_state %p! 
Maximum is: %d\n", - plane_state, MAX_SURFACE_NUM); + plane_state, MAX_SURFACES); goto out; } else if (!otg_master_pipe) { goto out; @@ -600,7 +600,7 @@ bool dc_state_rem_all_planes_for_stream( { int i, old_plane_count; struct dc_stream_status *stream_status = NULL; - struct dc_plane_state *del_planes[MAX_SURFACE_NUM] = { 0 }; + struct dc_plane_state *del_planes[MAX_SURFACES] = { 0 }; for (i = 0; i < state->stream_count; i++) if (state->streams[i] == stream) { @@ -875,7 +875,7 @@ bool dc_state_rem_all_phantom_planes_for_stream( { int i, old_plane_count; struct dc_stream_status *stream_status = NULL; - struct dc_plane_state *del_planes[MAX_SURFACE_NUM] = { 0 }; + struct dc_plane_state *del_planes[MAX_SURFACES] = { 0 }; for (i = 0; i < state->stream_count; i++) if (state->streams[i] == phantom_stream) { diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index e9b9126c0401..1341a6ebd262 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1398,7 +1398,7 @@ struct dc_scratch_space { * store current value in plane states so we can still recover * a valid current state during dc update. 
*/ - struct dc_plane_state plane_states[MAX_SURFACE_NUM]; + struct dc_plane_state plane_states[MAX_SURFACES]; struct dc_stream_state stream_state; }; diff --git a/drivers/gpu/drm/amd/display/dc/dc_stream.h b/drivers/gpu/drm/amd/display/dc/dc_stream.h index 413970588a26..860506c6bda4 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_stream.h +++ b/drivers/gpu/drm/amd/display/dc/dc_stream.h @@ -56,7 +56,7 @@ struct dc_stream_status { int plane_count; int audio_inst; struct timing_sync_info timing_sync_info; - struct dc_plane_state *plane_states[MAX_SURFACE_NUM]; + struct dc_plane_state *plane_states[MAX_SURFACES]; bool is_abm_supported; struct mall_stream_config mall_stream_config; bool fpo_in_use; diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h index edf4df1d03b5..9466b63644d5 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_types.h +++ b/drivers/gpu/drm/amd/display/dc/dc_types.h @@ -76,7 +76,6 @@ struct dc_perf_trace { unsigned long last_entry_write; }; -#define MAX_SURFACE_NUM 6 #define NUM_PIXEL_FORMATS 10 enum tiling_mode { diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_mall_phantom.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_mall_phantom.c index 3d29169dd6bb..6b3b8803e0ae 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_mall_phantom.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_mall_phantom.c @@ -813,7 +813,7 @@ static bool remove_all_phantom_planes_for_stream(struct dml2_context *ctx, struc { int i, old_plane_count; struct dc_stream_status *stream_status = NULL; - struct dc_plane_state *del_planes[MAX_SURFACE_NUM] = { 0 }; + struct dc_plane_state *del_planes[MAX_SURFACES] = { 0 }; for (i = 0; i < context->stream_count; i++) if (context->streams[i] == stream) { From 21541bc6b44241e3f791f9e552352d8440b2b29e Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Tue, 17 Dec 2024 17:45:04 -0300 Subject: [PATCH 662/807] drm/amd/display: increase MAX_SURFACES to the value supported by hw As the hw supports up 
to 4 surfaces, increase the maximum number of surfaces to prevent the DC error when trying to use more than three planes. [drm:dc_state_add_plane [amdgpu]] *ERROR* Surface: can not attach plane_state 000000003e2cb82c! Maximum is: 3 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3693 Signed-off-by: Melissa Wen Reviewed-by: Rodrigo Siqueira Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher (cherry picked from commit b8d6daffc871a42026c3c20bff7b8fa0302298c1) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 1341a6ebd262..08c5a315b3a6 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -57,7 +57,7 @@ struct dmub_notification; #define DC_VER "3.2.310" -#define MAX_SURFACES 3 +#define MAX_SURFACES 4 #define MAX_PLANES 6 #define MAX_STREAMS 6 #define MIN_VIEWPORT_SIZE 12 From 5225fd2a26211d012533acf98a6ad3f983885817 Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Tue, 17 Dec 2024 17:45:05 -0300 Subject: [PATCH 663/807] drm/amd/display: fix divide error in DM plane scale calcs dm_get_plane_scale doesn't take into account plane scaled size equal to zero, leading to a kernel oops due to division by zero. Fix by setting out-scale size as zero when the dst size is zero, similar to what is done by drm_calc_scale(). This issue started with the introduction of cursor overlay mode that uses this function to assess cursor mode changes via dm_crtc_get_cursor_mode() before checking plane state.
[Dec17 17:14] Oops: divide error: 0000 [#1] PREEMPT SMP NOPTI [ +0.000018] CPU: 5 PID: 1660 Comm: surface-DP-1 Not tainted 6.10.0+ #231 [ +0.000007] Hardware name: Valve Jupiter/Jupiter, BIOS F7A0131 01/30/2024 [ +0.000004] RIP: 0010:dm_get_plane_scale+0x3f/0x60 [amdgpu] [ +0.000553] Code: 44 0f b7 41 3a 44 0f b7 49 3e 83 e0 0f 48 0f a3 c2 73 21 69 41 28 e8 03 00 00 31 d2 41 f7 f1 31 d2 89 06 69 41 2c e8 03 00 00 <41> f7 f0 89 07 e9 d7 d8 7e e9 44 89 c8 45 89 c1 41 89 c0 eb d4 66 [ +0.000005] RSP: 0018:ffffa8df0de6b8a0 EFLAGS: 00010246 [ +0.000006] RAX: 00000000000003e8 RBX: ffff9ac65c1f6e00 RCX: ffff9ac65d055500 [ +0.000003] RDX: 0000000000000000 RSI: ffffa8df0de6b8b0 RDI: ffffa8df0de6b8b4 [ +0.000004] RBP: ffff9ac64e7a5800 R08: 0000000000000000 R09: 0000000000000a00 [ +0.000003] R10: 00000000000000ff R11: 0000000000000054 R12: ffff9ac6d0700010 [ +0.000003] R13: ffff9ac65d054f00 R14: ffff9ac65d055500 R15: ffff9ac64e7a60a0 [ +0.000004] FS: 00007f869ea00640(0000) GS:ffff9ac970080000(0000) knlGS:0000000000000000 [ +0.000004] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ +0.000003] CR2: 000055ca701becd0 CR3: 000000010e7f2000 CR4: 0000000000350ef0 [ +0.000004] Call Trace: [ +0.000007] [ +0.000006] ? __die_body.cold+0x19/0x27 [ +0.000009] ? die+0x2e/0x50 [ +0.000007] ? do_trap+0xca/0x110 [ +0.000007] ? do_error_trap+0x6a/0x90 [ +0.000006] ? dm_get_plane_scale+0x3f/0x60 [amdgpu] [ +0.000504] ? exc_divide_error+0x38/0x50 [ +0.000005] ? dm_get_plane_scale+0x3f/0x60 [amdgpu] [ +0.000488] ? asm_exc_divide_error+0x1a/0x20 [ +0.000011] ? dm_get_plane_scale+0x3f/0x60 [amdgpu] [ +0.000593] dm_crtc_get_cursor_mode+0x33f/0x430 [amdgpu] [ +0.000562] amdgpu_dm_atomic_check+0x2ef/0x1770 [amdgpu] [ +0.000501] drm_atomic_check_only+0x5e1/0xa30 [drm] [ +0.000047] drm_mode_atomic_ioctl+0x832/0xcb0 [drm] [ +0.000050] ? __pfx_drm_mode_atomic_ioctl+0x10/0x10 [drm] [ +0.000047] drm_ioctl_kernel+0xb3/0x100 [drm] [ +0.000062] drm_ioctl+0x27a/0x4f0 [drm] [ +0.000049] ? 
__pfx_drm_mode_atomic_ioctl+0x10/0x10 [drm] [ +0.000055] amdgpu_drm_ioctl+0x4e/0x90 [amdgpu] [ +0.000360] __x64_sys_ioctl+0x97/0xd0 [ +0.000010] do_syscall_64+0x82/0x190 [ +0.000008] ? __pfx_drm_mode_createblob_ioctl+0x10/0x10 [drm] [ +0.000044] ? srso_return_thunk+0x5/0x5f [ +0.000006] ? drm_ioctl_kernel+0xb3/0x100 [drm] [ +0.000040] ? srso_return_thunk+0x5/0x5f [ +0.000005] ? __check_object_size+0x50/0x220 [ +0.000007] ? srso_return_thunk+0x5/0x5f [ +0.000005] ? srso_return_thunk+0x5/0x5f [ +0.000005] ? drm_ioctl+0x2a4/0x4f0 [drm] [ +0.000039] ? __pfx_drm_mode_createblob_ioctl+0x10/0x10 [drm] [ +0.000043] ? srso_return_thunk+0x5/0x5f [ +0.000005] ? srso_return_thunk+0x5/0x5f [ +0.000005] ? __pm_runtime_suspend+0x69/0xc0 [ +0.000006] ? srso_return_thunk+0x5/0x5f [ +0.000005] ? amdgpu_drm_ioctl+0x71/0x90 [amdgpu] [ +0.000366] ? srso_return_thunk+0x5/0x5f [ +0.000006] ? syscall_exit_to_user_mode+0x77/0x210 [ +0.000007] ? srso_return_thunk+0x5/0x5f [ +0.000005] ? do_syscall_64+0x8e/0x190 [ +0.000006] ? srso_return_thunk+0x5/0x5f [ +0.000006] ? do_syscall_64+0x8e/0x190 [ +0.000006] ? 
srso_return_thunk+0x5/0x5f [ +0.000007] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ +0.000008] RIP: 0033:0x55bb7cd962bc [ +0.000007] Code: 4c 89 6c 24 18 4c 89 64 24 20 4c 89 74 24 28 0f 57 c0 0f 11 44 24 30 89 c7 48 8d 54 24 08 b8 10 00 00 00 be bc 64 38 c0 0f 05 <49> 89 c7 48 83 3b 00 74 09 4c 89 c7 ff 15 62 64 99 00 48 83 7b 18 [ +0.000005] RSP: 002b:00007f869e9f4da0 EFLAGS: 00000217 ORIG_RAX: 0000000000000010 [ +0.000007] RAX: ffffffffffffffda RBX: 00007f869e9f4fb8 RCX: 000055bb7cd962bc [ +0.000004] RDX: 00007f869e9f4da8 RSI: 00000000c03864bc RDI: 000000000000003b [ +0.000003] RBP: 000055bb9ddcbcc0 R08: 00007f86541b9920 R09: 0000000000000009 [ +0.000004] R10: 0000000000000004 R11: 0000000000000217 R12: 00007f865406c6b0 [ +0.000003] R13: 00007f86541b5290 R14: 00007f865410b700 R15: 000055bb9ddcbc18 [ +0.000009] Fixes: 1b04dcca4fb1 ("drm/amd/display: Introduce overlay cursor mode") Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3729 Reported-by: Fabio Scaccabarozzi Co-developed-by: Fabio Scaccabarozzi Signed-off-by: Fabio Scaccabarozzi Signed-off-by: Melissa Wen Reviewed-by: Rodrigo Siqueira Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher (cherry picked from commit ab75a0d2e07942ae15d32c0a5092fd336451378c) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 4c3b6e6151c5..cd16dae534dc 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -11124,8 +11124,8 @@ dm_get_plane_scale(struct drm_plane_state *plane_state, int plane_src_w, plane_src_h; dm_get_oriented_plane_size(plane_state, &plane_src_w, &plane_src_h); - *out_plane_scale_w = plane_state->crtc_w * 1000 / plane_src_w; - *out_plane_scale_h = plane_state->crtc_h * 1000 / plane_src_h; + *out_plane_scale_w = plane_src_w ? 
plane_state->crtc_w * 1000 / plane_src_w : 0; + *out_plane_scale_h = plane_src_h ? plane_state->crtc_h * 1000 / plane_src_h : 0; } /* From 9738609449c3e44d1afb73eecab4763362b57930 Mon Sep 17 00:00:00 2001 From: "Jesse.zhang@amd.com" Date: Wed, 18 Dec 2024 18:23:52 +0800 Subject: [PATCH 664/807] drm/amdkfd: fixed page fault when enable MES shader debugger Initialize the process context address before setting the shader debugger. [ 260.781212] amdgpu 0000:03:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:32 vmid:0 pasid:0) [ 260.781236] amdgpu 0000:03:00.0: amdgpu: in page starting at address 0x0000000000000000 from client 10 [ 260.781255] amdgpu 0000:03:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00040A40 [ 260.781270] amdgpu 0000:03:00.0: amdgpu: Faulty UTCL2 client ID: CPC (0x5) [ 260.781284] amdgpu 0000:03:00.0: amdgpu: MORE_FAULTS: 0x0 [ 260.781296] amdgpu 0000:03:00.0: amdgpu: WALKER_ERROR: 0x0 [ 260.781308] amdgpu 0000:03:00.0: amdgpu: PERMISSION_FAULTS: 0x4 [ 260.781320] amdgpu 0000:03:00.0: amdgpu: MAPPING_ERROR: 0x0 [ 260.781332] amdgpu 0000:03:00.0: amdgpu: RW: 0x1 [ 260.782017] amdgpu 0000:03:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:32 vmid:0 pasid:0) [ 260.782039] amdgpu 0000:03:00.0: amdgpu: in page starting at address 0x0000000000000000 from client 10 [ 260.782058] amdgpu 0000:03:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00040A41 [ 260.782073] amdgpu 0000:03:00.0: amdgpu: Faulty UTCL2 client ID: CPC (0x5) [ 260.782087] amdgpu 0000:03:00.0: amdgpu: MORE_FAULTS: 0x1 [ 260.782098] amdgpu 0000:03:00.0: amdgpu: WALKER_ERROR: 0x0 [ 260.782110] amdgpu 0000:03:00.0: amdgpu: PERMISSION_FAULTS: 0x4 [ 260.782122] amdgpu 0000:03:00.0: amdgpu: MAPPING_ERROR: 0x0 [ 260.782137] amdgpu 0000:03:00.0: amdgpu: RW: 0x1 [ 260.782155] amdgpu 0000:03:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:32 vmid:0 pasid:0) [ 260.782166] amdgpu 0000:03:00.0: amdgpu: in page starting at address 0x0000000000000000 from client 10 Fixes: 438b39ac74e2 ("drm/amdkfd: 
pause autosuspend when creating pdd") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3849 Signed-off-by: Jesse Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 5b231f5bc9ff02ec5737f2ec95cdf15ac95088e9) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c index 312dfa84f29f..a8abc3091801 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c @@ -350,10 +350,27 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en) { uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode; uint32_t flags = pdd->process->dbg_flags; + struct amdgpu_device *adev = pdd->dev->adev; + int r; if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) return 0; + if (!pdd->proc_ctx_cpu_ptr) { + r = amdgpu_amdkfd_alloc_gtt_mem(adev, + AMDGPU_MES_PROC_CTX_SIZE, + &pdd->proc_ctx_bo, + &pdd->proc_ctx_gpu_addr, + &pdd->proc_ctx_cpu_ptr, + false); + if (r) { + dev_err(adev->dev, + "failed to allocate process context bo\n"); + return r; + } + memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); + } + return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl, pdd->watch_points, flags, sq_trap_en); } From 0881fbc4fd62e00a2b8e102725f76d10351b2ea8 Mon Sep 17 00:00:00 2001 From: Roman Li Date: Fri, 13 Dec 2024 13:51:07 -0500 Subject: [PATCH 665/807] drm/amd/display: Add check for granularity in dml ceil/floor helpers [Why] Wrapper functions for dcn_bw_ceil2() and dcn_bw_floor2() should check for granularity is non zero to avoid assert and divide-by-zero error in dcn_bw_ functions. [How] Add check for granularity 0. 
Cc: Mario Limonciello Reviewed-by: Alvin Lee Signed-off-by: Roman Li Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit f6e09701c3eb2ccb8cb0518e0b67f1c69742a4ec) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dml/dml_inline_defs.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dml_inline_defs.h b/drivers/gpu/drm/amd/display/dc/dml/dml_inline_defs.h index 072bd0539605..6b2ab4ec2b5f 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dml_inline_defs.h +++ b/drivers/gpu/drm/amd/display/dc/dml/dml_inline_defs.h @@ -66,11 +66,15 @@ static inline double dml_max5(double a, double b, double c, double d, double e) static inline double dml_ceil(double a, double granularity) { + if (granularity == 0) + return 0; return (double) dcn_bw_ceil2(a, granularity); } static inline double dml_floor(double a, double granularity) { + if (granularity == 0) + return 0; return (double) dcn_bw_floor2(a, granularity); } @@ -114,11 +118,15 @@ static inline double dml_ceil_2(double f) static inline double dml_ceil_ex(double x, double granularity) { + if (granularity == 0) + return 0; return (double) dcn_bw_ceil2(x, granularity); } static inline double dml_floor_ex(double x, double granularity) { + if (granularity == 0) + return 0; return (double) dcn_bw_floor2(x, granularity); } From a993d319aebb7cce8a10c6e685344b7c2ad5c4c2 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Wed, 11 Dec 2024 11:51:13 +0800 Subject: [PATCH 666/807] drm/amdkfd: wq_release signals dma_fence only when available kfd_process_wq_release() signals eviction fence by dma_fence_signal() which warns if dma_fence is NULL. kfd_process->ef is initialized by kfd_process_device_init_vm() through ioctl. That means the fence is NULL for a newly created kfd_process, and closing a kfd_process right after opening it will trigger the warning. This commit conditionally signals the eviction fence in kfd_process_wq_release() only when it is available.
[ 503.660882] WARNING: CPU: 0 PID: 9 at drivers/dma-buf/dma-fence.c:467 dma_fence_signal+0x74/0xa0 [ 503.782940] Workqueue: kfd_process_wq kfd_process_wq_release [amdgpu] [ 503.789640] RIP: 0010:dma_fence_signal+0x74/0xa0 [ 503.877620] Call Trace: [ 503.880066] [ 503.882168] ? __warn+0xcd/0x260 [ 503.885407] ? dma_fence_signal+0x74/0xa0 [ 503.889416] ? report_bug+0x288/0x2d0 [ 503.893089] ? handle_bug+0x53/0xa0 [ 503.896587] ? exc_invalid_op+0x14/0x50 [ 503.900424] ? asm_exc_invalid_op+0x16/0x20 [ 503.904616] ? dma_fence_signal+0x74/0xa0 [ 503.908626] kfd_process_wq_release+0x6b/0x370 [amdgpu] [ 503.914081] process_one_work+0x654/0x10a0 [ 503.918186] worker_thread+0x6c3/0xe70 [ 503.921943] ? srso_alias_return_thunk+0x5/0xfbef5 [ 503.926735] ? srso_alias_return_thunk+0x5/0xfbef5 [ 503.931527] ? __kthread_parkme+0x82/0x140 [ 503.935631] ? __pfx_worker_thread+0x10/0x10 [ 503.939904] kthread+0x2a8/0x380 [ 503.943132] ? __pfx_kthread+0x10/0x10 [ 503.946882] ret_from_fork+0x2d/0x70 [ 503.950458] ? 
__pfx_kthread+0x10/0x10 [ 503.954210] ret_from_fork_asm+0x1a/0x30 [ 503.958142] [ 503.960328] ---[ end trace 0000000000000000 ]--- Fixes: 967d226eaae8 ("dma-buf: add WARN_ON() illegal dma-fence signaling") Signed-off-by: Zhu Lingshan Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher (cherry picked from commit 2774ef7625adb5fb9e9265c26a59dca7b8fd171e) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index d0ee173acf82..edfe0b4788f4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1160,7 +1160,8 @@ static void kfd_process_wq_release(struct work_struct *work) */ synchronize_rcu(); ef = rcu_access_pointer(p->ef); - dma_fence_signal(ef); + if (ef) + dma_fence_signal(ef); kfd_process_remove_sysfs(p); From 2a238b09bfd04e8155a7a323364bce1c38b28c0f Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Fri, 27 Dec 2024 11:43:22 +0800 Subject: [PATCH 667/807] drm/amd/pm: fix BUG: scheduling while atomic A "scheduling while atomic" BUG is triggered in the interrupt handler for an AC/DC mode switch, as shown in the following backtrace.
Call Trace: dump_stack_lvl __schedule_bug __schedule schedule schedule_preempt_disabled __mutex_lock smu_cmn_send_smc_msg_with_param smu_v13_0_irq_process amdgpu_irq_dispatch amdgpu_ih_process amdgpu_irq_handler __handle_irq_event_percpu handle_irq_event handle_edge_irq __common_interrupt common_interrupt asm_common_interrupt Reviewed-by: Lijo Lazar Reviewed-by: Kenneth Feng Signed-off-by: Kun Liu Signed-off-by: Alex Deucher (cherry picked from commit 03cc84b102d1a832e8dfc59344346dedcebcdf42) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h | 2 ++ drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 12 ++++++------ drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 1 + drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 1 + 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h index ae3563d71fa0..356d9422b411 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h @@ -303,5 +303,7 @@ int smu_v13_0_set_wbrf_exclusion_ranges(struct smu_context *smu, int smu_v13_0_get_boot_freq_by_index(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t *value); + +void smu_v13_0_interrupt_work(struct smu_context *smu); #endif #endif diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index 2bfea740dace..2d1e7ebd1bac 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -1320,11 +1320,11 @@ static int smu_v13_0_set_irq_state(struct amdgpu_device *adev, return 0; } -static int smu_v13_0_ack_ac_dc_interrupt(struct smu_context *smu) +void smu_v13_0_interrupt_work(struct smu_context *smu) { - return smu_cmn_send_smc_msg(smu, - SMU_MSG_ReenableAcDcInterrupt, - NULL); + smu_cmn_send_smc_msg(smu, + SMU_MSG_ReenableAcDcInterrupt, + NULL); } #define THM_11_0__SRCID__THM_DIG_THERM_L2H 0 
/* ASIC_TEMP > CG_THERMAL_INT.DIG_THERM_INTH */ @@ -1377,12 +1377,12 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev, switch (ctxid) { case SMU_IH_INTERRUPT_CONTEXT_ID_AC: dev_dbg(adev->dev, "Switched to AC mode!\n"); - smu_v13_0_ack_ac_dc_interrupt(smu); + schedule_work(&smu->interrupt_work); adev->pm.ac_power = true; break; case SMU_IH_INTERRUPT_CONTEXT_ID_DC: dev_dbg(adev->dev, "Switched to DC mode!\n"); - smu_v13_0_ack_ac_dc_interrupt(smu); + schedule_work(&smu->interrupt_work); adev->pm.ac_power = false; break; case SMU_IH_INTERRUPT_CONTEXT_ID_THERMAL_THROTTLING: diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 3aa705aae4c0..2a09b27788e8 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -3219,6 +3219,7 @@ static const struct pptable_funcs smu_v13_0_0_ppt_funcs = { .is_asic_wbrf_supported = smu_v13_0_0_wbrf_support_check, .enable_uclk_shadow = smu_v13_0_enable_uclk_shadow, .set_wbrf_exclusion_ranges = smu_v13_0_set_wbrf_exclusion_ranges, + .interrupt_work = smu_v13_0_interrupt_work, }; void smu_v13_0_0_set_ppt_funcs(struct smu_context *smu) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index aabb94796005..55ef18517b0f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -2797,6 +2797,7 @@ static const struct pptable_funcs smu_v13_0_7_ppt_funcs = { .is_asic_wbrf_supported = smu_v13_0_7_wbrf_support_check, .enable_uclk_shadow = smu_v13_0_enable_uclk_shadow, .set_wbrf_exclusion_ranges = smu_v13_0_set_wbrf_exclusion_ranges, + .interrupt_work = smu_v13_0_interrupt_work, }; void smu_v13_0_7_set_ppt_funcs(struct smu_context *smu) From 75c8b703e5bded1e33b08fb09b829e7c2c1ed50a Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Tue, 10 Dec 2024 
12:50:08 +0530 Subject: [PATCH 668/807] drm/amdgpu: Add a lock when accessing the buddy trim function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running YouTube videos and Steam games simultaneously, the tester found a system hang / race condition issue with the multi-display configuration setting. Adding a lock to the buddy allocator's trim function would be the solution. [ 7197.250436] general protection fault, probably for non-canonical address 0xdead000000000108 [ 7197.250447] RIP: 0010:__alloc_range+0x8b/0x340 [amddrm_buddy] [ 7197.250470] Call Trace: [ 7197.250472] [ 7197.250475] ? show_regs+0x6d/0x80 [ 7197.250481] ? die_addr+0x37/0xa0 [ 7197.250483] ? exc_general_protection+0x1db/0x480 [ 7197.250488] ? drm_suballoc_new+0x13c/0x93d [drm_suballoc_helper] [ 7197.250493] ? asm_exc_general_protection+0x27/0x30 [ 7197.250498] ? __alloc_range+0x8b/0x340 [amddrm_buddy] [ 7197.250501] ? __alloc_range+0x109/0x340 [amddrm_buddy] [ 7197.250506] amddrm_buddy_block_trim+0x1b5/0x260 [amddrm_buddy] [ 7197.250511] amdgpu_vram_mgr_new+0x4f5/0x590 [amdgpu] [ 7197.250682] amdttm_resource_alloc+0x46/0xb0 [amdttm] [ 7197.250689] ttm_bo_alloc_resource+0xe4/0x370 [amdttm] [ 7197.250696] amdttm_bo_validate+0x9d/0x180 [amdttm] [ 7197.250701] amdgpu_bo_pin+0x15a/0x2f0 [amdgpu] [ 7197.250831] amdgpu_dm_plane_helper_prepare_fb+0xb2/0x360 [amdgpu] [ 7197.251025] ? try_wait_for_completion+0x59/0x70 [ 7197.251030] drm_atomic_helper_prepare_planes.part.0+0x2f/0x1e0 [ 7197.251035] drm_atomic_helper_prepare_planes+0x5d/0x70 [ 7197.251037] drm_atomic_helper_commit+0x84/0x160 [ 7197.251040] drm_atomic_nonblocking_commit+0x59/0x70 [ 7197.251043] drm_mode_atomic_ioctl+0x720/0x850 [ 7197.251047] ? __pfx_drm_mode_atomic_ioctl+0x10/0x10 [ 7197.251049] drm_ioctl_kernel+0xb9/0x120 [ 7197.251053] ? srso_alias_return_thunk+0x5/0xfbef5 [ 7197.251056] drm_ioctl+0x2d4/0x550 [ 7197.251058] ? 
__pfx_drm_mode_atomic_ioctl+0x10/0x10 [ 7197.251063] amdgpu_drm_ioctl+0x4e/0x90 [amdgpu] [ 7197.251186] __x64_sys_ioctl+0xa0/0xf0 [ 7197.251190] x64_sys_call+0x143b/0x25c0 [ 7197.251193] do_syscall_64+0x7f/0x180 [ 7197.251197] ? srso_alias_return_thunk+0x5/0xfbef5 [ 7197.251199] ? amdgpu_display_user_framebuffer_create+0x215/0x320 [amdgpu] [ 7197.251329] ? drm_internal_framebuffer_create+0xb7/0x1a0 [ 7197.251332] ? srso_alias_return_thunk+0x5/0xfbef5 Signed-off-by: Arunpravin Paneer Selvam Fixes: 4a5ad08f5377 ("drm/amdgpu: Add address alignment support to DCC buffers") Acked-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 3318ba94e56b9183d0304577c74b33b6b01ce516) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 7d26a962f811..ff5e52025266 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -567,7 +567,6 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, else remaining_size -= size; } - mutex_unlock(&mgr->lock); if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && adjust_dcc_size) { struct drm_buddy_block *dcc_block; @@ -584,6 +583,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, (u64)vres->base.size, &vres->blocks); } + mutex_unlock(&mgr->lock); vres->base.start = 0; size = max_t(u64, amdgpu_vram_mgr_blocks_size(&vres->blocks), From 8c817eb26230dc0ae553cee16ff43a4a895f6756 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 3 Jan 2025 11:51:47 -0800 Subject: [PATCH 669/807] pds_core: limit loop over fw name list Add an array size limit to the for-loop to be sure we don't try to reference a fw_version string off the end of the fw info names array. 
We know that our firmware only has a limited number of firmware slot names, but we shouldn't leave this unchecked. Fixes: 45d76f492938 ("pds_core: set up device and adminq") Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Reviewed-by: Brett Creeley Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250103195147.7408-1-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c index 2681889162a2..44971e71991f 100644 --- a/drivers/net/ethernet/amd/pds_core/devlink.c +++ b/drivers/net/ethernet/amd/pds_core/devlink.c @@ -118,7 +118,7 @@ int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req, if (err && err != -EIO) return err; - listlen = fw_list.num_fw_slots; + listlen = min(fw_list.num_fw_slots, ARRAY_SIZE(fw_list.fw_names)); for (i = 0; i < listlen; i++) { if (i < ARRAY_SIZE(fw_slotnames)) strscpy(buf, fw_slotnames[i], sizeof(buf)); From c8dafb0e4398dacc362832098a04b97da3b0395b Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Fri, 3 Jan 2025 20:38:47 -0800 Subject: [PATCH 670/807] bnxt_en: Fix possible memory leak when hwrm_req_replace fails When hwrm_req_replace() fails, the driver does not invoke hwrm_req_drop(), which could cause a memory leak.
Fixes: bbf33d1d9805 ("bnxt_en: update all firmware calls to use the new APIs") Reviewed-by: Pavan Chebbi Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250104043849.3482067-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index b771c84cdd89..0ed26e3a28f4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -208,7 +208,7 @@ int bnxt_send_msg(struct bnxt_en_dev *edev, rc = hwrm_req_replace(bp, req, fw_msg->msg, fw_msg->msg_len); if (rc) - return rc; + goto drop_req; hwrm_req_timeout(bp, req, fw_msg->timeout); resp = hwrm_req_hold(bp, req); @@ -220,6 +220,7 @@ int bnxt_send_msg(struct bnxt_en_dev *edev, memcpy(fw_msg->resp, resp, resp_len); } +drop_req: hwrm_req_drop(bp, req); return rc; } From 40452969a50652e3cbf89dac83d54eebf2206d27 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 3 Jan 2025 20:38:48 -0800 Subject: [PATCH 671/807] bnxt_en: Fix DIM shutdown DIM work will call the firmware to adjust the coalescing parameters on the RX rings. We should cancel DIM work before we call the firmware to free the RX rings. Otherwise, FW will reject the call from DIM work if the RX ring has been freed. This will generate an error message like this: bnxt_en 0000:21:00.1 ens2f1np1: hwrm req_type 0x53 seq id 0x6fca error 0x2 and cause unnecessary concern for the user. It is also possible to modify the coalescing parameters of the wrong ring if the ring has been re-allocated. To prevent this, cancel DIM work right before freeing the RX rings. We also have to add a check in NAPI poll to not schedule DIM if the RX rings are shutting down. Check that the VNIC is active before we schedule DIM. The VNIC is always disabled before we free the RX rings. 
Fixes: 0bc0b97fca73 ("bnxt_en: cleanup DIM work on device shutdown") Reviewed-by: Hongguang Gao Reviewed-by: Kalesh AP Reviewed-by: Somnath Kotur Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250104043849.3482067-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 38 ++++++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index b86f980fa7ea..aeaa74f03046 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2897,6 +2897,13 @@ static int bnxt_hwrm_handler(struct bnxt *bp, struct tx_cmp *txcmp) return 0; } +static bool bnxt_vnic_is_active(struct bnxt *bp) +{ + struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + + return vnic->fw_vnic_id != INVALID_HW_RING_ID && vnic->mru > 0; +} + static irqreturn_t bnxt_msix(int irq, void *dev_instance) { struct bnxt_napi *bnapi = dev_instance; @@ -3164,7 +3171,7 @@ static int bnxt_poll(struct napi_struct *napi, int budget) break; } } - if (bp->flags & BNXT_FLAG_DIM) { + if ((bp->flags & BNXT_FLAG_DIM) && bnxt_vnic_is_active(bp)) { struct dim_sample dim_sample = {}; dim_update_sample(cpr->event_ctr, @@ -3295,7 +3302,7 @@ static int bnxt_poll_p5(struct napi_struct *napi, int budget) poll_done: cpr_rx = &cpr->cp_ring_arr[0]; if (cpr_rx->cp_ring_type == BNXT_NQ_HDL_TYPE_RX && - (bp->flags & BNXT_FLAG_DIM)) { + (bp->flags & BNXT_FLAG_DIM) && bnxt_vnic_is_active(bp)) { struct dim_sample dim_sample = {}; dim_update_sample(cpr->event_ctr, @@ -7266,6 +7273,26 @@ err_out: return rc; } +static void bnxt_cancel_dim(struct bnxt *bp) +{ + int i; + + /* DIM work is initialized in bnxt_enable_napi(). Proceed only + * if NAPI is enabled. 
+ */ + if (!bp->bnapi || test_bit(BNXT_STATE_NAPI_DISABLED, &bp->state)) + return; + + /* Make sure NAPI sees that the VNIC is disabled */ + synchronize_net(); + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; + struct bnxt_napi *bnapi = rxr->bnapi; + + cancel_work_sync(&bnapi->cp_ring.dim.work); + } +} + static int hwrm_ring_free_send_msg(struct bnxt *bp, struct bnxt_ring_struct *ring, u32 ring_type, int cmpl_ring_id) @@ -7366,6 +7393,7 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path) } } + bnxt_cancel_dim(bp); for (i = 0; i < bp->rx_nr_rings; i++) { bnxt_hwrm_rx_ring_free(bp, &bp->rx_ring[i], close_path); bnxt_hwrm_rx_agg_ring_free(bp, &bp->rx_ring[i], close_path); @@ -11309,8 +11337,6 @@ static void bnxt_disable_napi(struct bnxt *bp) if (bnapi->in_reset) cpr->sw_stats->rx.rx_resets++; napi_disable(&bnapi->napi); - if (bnapi->rx_ring) - cancel_work_sync(&cpr->dim.work); } } @@ -15572,8 +15598,10 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx) bnxt_hwrm_vnic_update(bp, vnic, VNIC_UPDATE_REQ_ENABLES_MRU_VALID); } - + /* Make sure NAPI sees that the VNIC is disabled */ + synchronize_net(); rxr = &bp->rx_ring[idx]; + cancel_work_sync(&rxr->bnapi->cp_ring.dim.work); bnxt_hwrm_rx_ring_free(bp, rxr, false); bnxt_hwrm_rx_agg_ring_free(bp, rxr, false); rxr->rx_next_cons = 0; From 4c1224501e9d6c5fd12d83752f1c1b444e0e3418 Mon Sep 17 00:00:00 2001 From: Anumula Murali Mohan Reddy Date: Fri, 3 Jan 2025 14:53:27 +0530 Subject: [PATCH 672/807] cxgb4: Avoid removal of uninserted tid During ARP failure, tid is not inserted but _c4iw_free_ep() attempts to remove tid which results in error. This patch fixes the issue by avoiding removal of uninserted tid. 
Fixes: 59437d78f088 ("cxgb4/chtls: fix ULD connection failures due to wrong TID base") Signed-off-by: Anumula Murali Mohan Reddy Signed-off-by: Potnuri Bharat Teja Link: https://patch.msgid.link/20250103092327.1011925-1-anumula@chelsio.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index bc3af0054406..604dcfd49aa4 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -1799,7 +1799,10 @@ void cxgb4_remove_tid(struct tid_info *t, unsigned int chan, unsigned int tid, struct adapter *adap = container_of(t, struct adapter, tids); struct sk_buff *skb; - WARN_ON(tid_out_of_range(&adap->tids, tid)); + if (tid_out_of_range(&adap->tids, tid)) { + dev_err(adap->pdev_dev, "tid %d out of range\n", tid); + return; + } if (t->tid_tab[tid - adap->tids.tid_base]) { t->tid_tab[tid - adap->tids.tid_base] = NULL; From 6f660ffce7c938f2a5d8473c0e0b45e4fb25ef7f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 1 Jan 2025 14:11:19 +0100 Subject: [PATCH 673/807] usb: gadget: midi2: Reverse-select at the right place We should do reverse selection of other components from CONFIG_USB_F_MIDI2 which is tristate, instead of CONFIG_USB_CONFIGFS_F_MIDI2 which is bool, for satisfying subtle module dependencies. 
Fixes: 8b645922b223 ("usb: gadget: Add support for USB MIDI 2.0 function driver") Cc: stable Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20250101131124.27599-1-tiwai@suse.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig index 566ff0b1282a..76521555e3c1 100644 --- a/drivers/usb/gadget/Kconfig +++ b/drivers/usb/gadget/Kconfig @@ -211,6 +211,8 @@ config USB_F_MIDI config USB_F_MIDI2 tristate + select SND_UMP + select SND_UMP_LEGACY_RAWMIDI config USB_F_HID tristate @@ -445,8 +447,6 @@ config USB_CONFIGFS_F_MIDI2 depends on USB_CONFIGFS depends on SND select USB_LIBCOMPOSITE - select SND_UMP - select SND_UMP_LEGACY_RAWMIDI select USB_F_MIDI2 help The MIDI 2.0 function driver provides the generic emulated From cdef30e0774802df2f87024d68a9d86c3b99ca2a Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Wed, 1 Jan 2025 22:22:06 +0100 Subject: [PATCH 674/807] usb-storage: Add max sectors quirk for Nokia 208 This fixes data corruption when accessing the internal SD card in mass storage mode. I am actually not too sure why. I didn't figure out a straightforward way to reproduce the issue, but I seem to get garbage when a lot (over 50) of large reads (over 120 sectors) are done in quick succession. That is, time seems to matter here -- larger reads are fine if they are done with some delay between them. But I'm not great at understanding this sort of thing, so I'll assume the issue other, smarter, folks were seeing with similar phones is the same problem and I'll just put my quirk next to theirs.
The "Software details" screen on the phone is as follows: V 04.06 07-08-13 RM-849 (c) Nokia TL;DR version of the device descriptor: idVendor 0x0421 Nokia Mobile Phones idProduct 0x06c2 bcdDevice 4.06 iManufacturer 1 Nokia iProduct 2 Nokia 208 The patch assumes older firmwares are broken too (I'm unable to test, but no biggie if they aren't I guess), and I have no idea if newer firmware exists. Signed-off-by: Lubomir Rintel Cc: stable Acked-by: Alan Stern Link: https://lore.kernel.org/r/20250101212206.2386207-1-lkundrak@v3.sk Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_devs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index e5ad23d86833..54f0b1c83317 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -255,6 +255,13 @@ UNUSUAL_DEV( 0x0421, 0x06aa, 0x1110, 0x1110, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_MAX_SECTORS_64 ), +/* Added by Lubomir Rintel , a very fine chap */ +UNUSUAL_DEV( 0x0421, 0x06c2, 0x0000, 0x0406, + "Nokia", + "Nokia 208", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_MAX_SECTORS_64 ), + #ifdef NO_SDDR09 UNUSUAL_DEV( 0x0436, 0x0005, 0x0100, 0x0100, "Microtech", From b0e525d7a22ea350e75e2aec22e47fcfafa4cacd Mon Sep 17 00:00:00 2001 From: GONG Ruiqi Date: Tue, 7 Jan 2025 09:57:50 +0800 Subject: [PATCH 675/807] usb: typec: fix pm usage counter imbalance in ucsi_ccg_sync_control() The error handling for the case `con_index == 0` should involve dropping the pm usage counter, as ucsi_ccg_sync_control() gets it at the beginning. Fix it. 
Cc: stable Fixes: e56aac6e5a25 ("usb: typec: fix potential array underflow in ucsi_ccg_sync_control()") Signed-off-by: GONG Ruiqi Reviewed-by: Dan Carpenter Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250107015750.2778646-1-gongruiqi1@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi_ccg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/ucsi/ucsi_ccg.c b/drivers/usb/typec/ucsi/ucsi_ccg.c index fcb8e61136cf..740171f24ef9 100644 --- a/drivers/usb/typec/ucsi/ucsi_ccg.c +++ b/drivers/usb/typec/ucsi/ucsi_ccg.c @@ -646,7 +646,7 @@ static int ucsi_ccg_sync_control(struct ucsi *ucsi, u64 command) UCSI_CMD_CONNECTOR_MASK; if (con_index == 0) { ret = -EINVAL; - goto unlock; + goto err_put; } con = &uc->ucsi->connector[con_index - 1]; ucsi_ccg_update_set_new_cam_cmd(uc, con, &command); @@ -654,8 +654,8 @@ static int ucsi_ccg_sync_control(struct ucsi *ucsi, u64 command) ret = ucsi_sync_control_common(ucsi, command); +err_put: pm_runtime_put_sync(uc->dev); -unlock: mutex_unlock(&uc->lock); return ret; From fd48f071a3d6d51e737e953bb43fe69785cf59a9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Jan 2025 10:32:07 -0800 Subject: [PATCH 676/807] net: don't dump Tx and uninitialized NAPIs We use NAPI ID as the key for continuing dumps. We also depend on the NAPIs being sorted by ID within the driver list. Tx NAPIs (which don't have an ID assigned) break this expectation, it's not currently possible to dump them reliably. Since Tx NAPIs are relatively rare, and can't be used in doit (GET or SET) hide them from the dump API as well. 
Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103183207.1216004-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/core/netdev-genl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b0772d135efb..125b660004d3 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -176,8 +176,7 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, if (!hdr) return -EMSGSIZE; - if (napi->napi_id >= MIN_NAPI_ID && - nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) + if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) @@ -272,6 +271,8 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, return err; list_for_each_entry(napi, &netdev->napi_list, dev_list) { + if (napi->napi_id < MIN_NAPI_ID) + continue; if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; From 60495b08cf7a6920035c5172a22655ca2001270b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 7 Jan 2025 14:11:32 +0000 Subject: [PATCH 677/807] io_uring: silence false positive warnings If we kill a ring and then immediately exit the task, we'll get cancellation running by the task and a kthread in io_ring_exit_work. For DEFER_TASKRUN, we do want to limit it to only one entity executing it, however it's currently not an issue as it's protected by uring_lock. Silence lockdep assertions for now, we'll return to it later.
Reported-by: syzbot+1bcb75613069ad4957fc@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7e5f68281acb0f081f65fde435833c68a3b7e02f.1736257837.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 12abee607e4a..492cbbf2c23b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -125,6 +125,9 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) #if defined(CONFIG_PROVE_LOCKING) lockdep_assert(in_task()); + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + lockdep_assert_held(&ctx->uring_lock); + if (ctx->flags & IORING_SETUP_IOPOLL) { lockdep_assert_held(&ctx->uring_lock); } else if (!ctx->task_complete) { @@ -136,9 +139,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) * Not from an SQE, as those cannot be submitted, but via * updating tagged resources. */ - if (percpu_ref_is_dying(&ctx->refs)) - lockdep_assert(current_work()); - else + if (!percpu_ref_is_dying(&ctx->refs)) lockdep_assert(current == ctx->submitter_task); } #endif From 84b172cea4a23016dc80a44eaa7ff8b7c97b04b3 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 7 Jan 2025 14:50:32 +0100 Subject: [PATCH 678/807] staging: gpib: refer to correct config symbol in tnt4882 Makefile Commit 79d2e1919a27 ("staging: gpib: fix Makefiles") uses the corresponding config symbols to let Makefiles include the driver sources appropriately in the kernel build. Unfortunately, the Makefile in the tnt4882 directory refers to the non-existing config GPIB_TNT4882. The actual config name for this driver is GPIB_NI_PCI_ISA, as can be observed in the gpib Makefile. 
Probably, this is caused by the subtle differences between the config names, directory names and file names in ./drivers/staging/gpib/, where often config names and directory names are identical or at least close in naming, but in this case, it is not. Change the reference in the tnt4882 Makefile from the non-existing config GPIB_TNT4882 to the existing config GPIB_NI_PCI_ISA. Fixes: 79d2e1919a27 ("staging: gpib: fix Makefiles") Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20250107135032.34424-1-lukas.bulwahn@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/tnt4882/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/gpib/tnt4882/Makefile b/drivers/staging/gpib/tnt4882/Makefile index 04a4520ed3b7..a3c3fb96d5ed 100644 --- a/drivers/staging/gpib/tnt4882/Makefile +++ b/drivers/staging/gpib/tnt4882/Makefile @@ -1,5 +1,5 @@ ccflags-$(CONFIG_GPIB_PCMCIA) := -DGPIB_PCMCIA -obj-$(CONFIG_GPIB_TNT4882) += tnt4882.o +obj-$(CONFIG_GPIB_NI_PCI_ISA) += tnt4882.o tnt4882-objs := tnt4882_gpib.o mite.o From 8fd56ad6e7c90ac2bddb0741c6b248c8c5d56ac8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 6 Jan 2025 16:21:00 +0000 Subject: [PATCH 679/807] afs: Fix the maximum cell name length The kafs filesystem limits the maximum length of a cell to 256 bytes, but a problem occurs if someone actually does that: kafs tries to create a directory under /proc/net/afs/ with the name of the cell, but that fails with a warning: WARNING: CPU: 0 PID: 9 at fs/proc/generic.c:405 because procfs limits the maximum filename length to 255. However, the DNS limits the maximum lookup length and, by extension, the maximum cell name, to 255 less two (length count and trailing NUL). Fix this by limiting the maximum acceptable cellname length to 253. This also allows us to be sure we can create the "/afs/./" mountpoint too. 
Further, split the YFS VL record cell name maximum to be the 256 allowed by the protocol and ignore the record retrieved by YFSVL.GetCellName if it exceeds 253. Fixes: c3e9f888263b ("afs: Implement client support for the YFSVL.GetCellName RPC op") Reported-by: syzbot+7848fee1f1e5c53f912b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/6776d25d.050a0220.3a8527.0048.GAE@google.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/376236.1736180460@warthog.procyon.org.uk Tested-by: syzbot+7848fee1f1e5c53f912b@syzkaller.appspotmail.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/afs.h | 2 +- fs/afs/afs_vl.h | 1 + fs/afs/vl_alias.c | 8 ++++++-- fs/afs/vlclient.c | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/afs/afs.h b/fs/afs/afs.h index b488072aee87..ec3db00bd081 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -10,7 +10,7 @@ #include -#define AFS_MAXCELLNAME 256 /* Maximum length of a cell name */ +#define AFS_MAXCELLNAME 253 /* Maximum length of a cell name (DNS limited) */ #define AFS_MAXVOLNAME 64 /* Maximum length of a volume name */ #define AFS_MAXNSERVERS 8 /* Maximum servers in a basic volume record */ #define AFS_NMAXNSERVERS 13 /* Maximum servers in a N/U-class volume record */ diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h index a06296c8827d..b835e25a2c02 100644 --- a/fs/afs/afs_vl.h +++ b/fs/afs/afs_vl.h @@ -13,6 +13,7 @@ #define AFS_VL_PORT 7003 /* volume location service port */ #define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */ #define YFS_VL_SERVICE 2503 /* Service ID for AuriStor upgraded VL service */ +#define YFS_VL_MAXCELLNAME 256 /* Maximum length of a cell name in YFS protocol */ enum AFSVL_Operations { VLGETENTRYBYID = 503, /* AFS Get VLDB entry by ID */ diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 9f36e14f1c2d..f9e76b604f31 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -253,6 +253,7 @@ static char 
*afs_vl_get_cell_name(struct afs_cell *cell, struct key *key) static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) { struct afs_cell *master; + size_t name_len; char *cell_name; cell_name = afs_vl_get_cell_name(cell, key); @@ -264,8 +265,11 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) return 0; } - master = afs_lookup_cell(cell->net, cell_name, strlen(cell_name), - NULL, false); + name_len = strlen(cell_name); + if (!name_len || name_len > AFS_MAXCELLNAME) + master = ERR_PTR(-EOPNOTSUPP); + else + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false); kfree(cell_name); if (IS_ERR(master)) return PTR_ERR(master); diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index cac75f89b64a..55dd0fc5aad7 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -697,7 +697,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) return ret; namesz = ntohl(call->tmp); - if (namesz > AFS_MAXCELLNAME) + if (namesz > YFS_VL_MAXCELLNAME) return afs_protocol_error(call, afs_eproto_cellname_len); paddedsz = (namesz + 3) & ~3; call->count = namesz; From f8f25893a477a4da4414c3e40ddd51d77fac9cfc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 29 Nov 2024 13:15:37 +0100 Subject: [PATCH 680/807] fs: debugfs: differentiate short fops with proxy ops Geert reported that my previous short fops debugfs changes broke m68k, because it only has mandatory alignment of two, so we can't stash the "is it short" information into the pointer (as we already did with the "is it real" bit.) Instead, exploit the fact that debugfs_file_get() called on an already open file will already find that the fsdata is no longer the real fops but rather the allocated data that already distinguishes full/short ops, so only open() needs to be able to distinguish. We can achieve that by using two different open functions.
Unfortunately this requires another set of full file ops, increasing the size by 536 bytes (x86-64), but that's still a reasonable trade-off given that only converting some of the wireless stack gained over 28k. This brings the total cost of this to around 1k, for wins of 28k (all x86-64). Reported-and-tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/CAMuHMdWu_9-L2Te101w8hU7H_2yobJFPXSwwUmGHSJfaPWDKiQ@mail.gmail.com Fixes: 8dc6d81c6b2a ("debugfs: add small file operations for most files") Signed-off-by: Johannes Berg Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20241129121536.30989-2-johannes@sipsolutions.net Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 72 ++++++++++++++++++++++++++++++------------- fs/debugfs/inode.c | 11 +++---- fs/debugfs/internal.h | 6 +--- 3 files changed, 55 insertions(+), 34 deletions(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 47dc96dfe386..bdb4f2ca0506 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -64,22 +64,13 @@ const struct file_operations *debugfs_real_fops(const struct file *filp) } EXPORT_SYMBOL_GPL(debugfs_real_fops); -/** - * debugfs_file_get - mark the beginning of file data access - * @dentry: the dentry object whose data is being accessed. - * - * Up to a matching call to debugfs_file_put(), any successive call - * into the file removing functions debugfs_remove() and - * debugfs_remove_recursive() will block. Since associated private - * file data may only get freed after a successful return of any of - * the removal functions, you may safely access it after a successful - * call to debugfs_file_get() without worrying about lifetime issues. - * - * If -%EIO is returned, the file has already been removed and thus, - * it is not safe to access any of its data. If, on the other hand, - * it is allowed to access the file data, zero is returned. 
- */ -int debugfs_file_get(struct dentry *dentry) +enum dbgfs_get_mode { + DBGFS_GET_ALREADY, + DBGFS_GET_REGULAR, + DBGFS_GET_SHORT, +}; + +static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) { struct debugfs_fsdata *fsd; void *d_fsd; @@ -96,15 +87,17 @@ int debugfs_file_get(struct dentry *dentry) if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) { fsd = d_fsd; } else { + if (WARN_ON(mode == DBGFS_GET_ALREADY)) + return -EINVAL; + fsd = kmalloc(sizeof(*fsd), GFP_KERNEL); if (!fsd) return -ENOMEM; - if ((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT) { + if (mode == DBGFS_GET_SHORT) { fsd->real_fops = NULL; fsd->short_fops = (void *)((unsigned long)d_fsd & - ~(DEBUGFS_FSDATA_IS_REAL_FOPS_BIT | - DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT)); + ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); } else { fsd->real_fops = (void *)((unsigned long)d_fsd & ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); @@ -138,6 +131,26 @@ int debugfs_file_get(struct dentry *dentry) return 0; } + +/** + * debugfs_file_get - mark the beginning of file data access + * @dentry: the dentry object whose data is being accessed. + * + * Up to a matching call to debugfs_file_put(), any successive call + * into the file removing functions debugfs_remove() and + * debugfs_remove_recursive() will block. Since associated private + * file data may only get freed after a successful return of any of + * the removal functions, you may safely access it after a successful + * call to debugfs_file_get() without worrying about lifetime issues. + * + * If -%EIO is returned, the file has already been removed and thus, + * it is not safe to access any of its data. If, on the other hand, + * it is allowed to access the file data, zero is returned. 
+ */ +int debugfs_file_get(struct dentry *dentry) +{ + return __debugfs_file_get(dentry, DBGFS_GET_ALREADY); +} EXPORT_SYMBOL_GPL(debugfs_file_get); /** @@ -424,7 +437,8 @@ static void __full_proxy_fops_init(struct file_operations *proxy_fops, proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; } -static int full_proxy_open(struct inode *inode, struct file *filp) +static int full_proxy_open(struct inode *inode, struct file *filp, + enum dbgfs_get_mode mode) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops; @@ -432,7 +446,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) struct debugfs_fsdata *fsd; int r; - r = debugfs_file_get(dentry); + r = __debugfs_file_get(dentry, mode); if (r) return r == -EIO ? -ENOENT : r; @@ -491,8 +505,22 @@ out: return r; } +static int full_proxy_open_regular(struct inode *inode, struct file *filp) +{ + return full_proxy_open(inode, filp, DBGFS_GET_REGULAR); +} + const struct file_operations debugfs_full_proxy_file_operations = { - .open = full_proxy_open, + .open = full_proxy_open_regular, +}; + +static int full_proxy_open_short(struct inode *inode, struct file *filp) +{ + return full_proxy_open(inode, filp, DBGFS_GET_SHORT); +} + +const struct file_operations debugfs_full_short_proxy_file_operations = { + .open = full_proxy_open_short, }; ssize_t debugfs_attr_read(struct file *file, char __user *buf, diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 38a9c7eb97e6..65e46c7b6bf1 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -455,8 +455,7 @@ struct dentry *debugfs_create_file_full(const char *name, umode_t mode, const struct file_operations *fops) { if (WARN_ON((unsigned long)fops & - (DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT | - DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) + DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) return ERR_PTR(-EINVAL); return __debugfs_create_file(name, mode, parent, data, @@ -471,15 +470,13 @@ struct dentry *debugfs_create_file_short(const char *name, umode_t 
mode, const struct debugfs_short_fops *fops) { if (WARN_ON((unsigned long)fops & - (DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT | - DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) + DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) return ERR_PTR(-EINVAL); return __debugfs_create_file(name, mode, parent, data, - fops ? &debugfs_full_proxy_file_operations : + fops ? &debugfs_full_short_proxy_file_operations : &debugfs_noop_file_operations, - (const void *)((unsigned long)fops | - DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT)); + fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_short); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index a3edfa4f0d8e..bbae4a228ef4 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -15,6 +15,7 @@ struct file_operations; extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; extern const struct file_operations debugfs_full_proxy_file_operations; +extern const struct file_operations debugfs_full_short_proxy_file_operations; struct debugfs_fsdata { const struct file_operations *real_fops; @@ -40,11 +41,6 @@ struct debugfs_fsdata { * pointer gets its lowest bit set. */ #define DEBUGFS_FSDATA_IS_REAL_FOPS_BIT BIT(0) -/* - * A dentry's ->d_fsdata, when pointing to real fops, is with - * short fops instead of full fops. 
- */ -#define DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT BIT(1) /* Access BITS */ #define DEBUGFS_ALLOW_API BIT(0) From 24edfbdedf19998366205130cfc93158a475497e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Dec 2024 08:12:04 +0000 Subject: [PATCH 681/807] debugfs: fix missing mutex_destroy() in short_fops case we need that in ->real_fops == NULL, ->short_fops != NULL case Fixes: 8dc6d81c6b2a "debugfs: add small file operations for most files" Signed-off-by: Al Viro Link: https://lore.kernel.org/r/20241229081223.3193228-1-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 65e46c7b6bf1..e752009de929 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -229,7 +229,7 @@ static void debugfs_release_dentry(struct dentry *dentry) return; /* check it wasn't a dir (no fsdata) or automount (no real_fops) */ - if (fsd && fsd->real_fops) { + if (fsd && (fsd->real_fops || fsd->short_fops)) { WARN_ON(!list_empty(&fsd->cancellations)); mutex_destroy(&fsd->cancellations_mtx); } From dd410d784402c5775f66faf8b624e85e41c38aaf Mon Sep 17 00:00:00 2001 From: "Maciej S. 
Szmigiero" Date: Mon, 6 Jan 2025 18:40:34 +0100 Subject: [PATCH 682/807] platform/x86/amd/pmc: Only disable IRQ1 wakeup where i8042 actually enabled it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wakeup for IRQ1 should be disabled only in cases where i8042 had actually enabled it, otherwise "wake_depth" for this IRQ will try to drop below zero and there will be an unpleasant WARN() logged: kernel: atkbd serio0: Disabling IRQ1 wakeup source to avoid platform firmware bug kernel: ------------[ cut here ]------------ kernel: Unbalanced IRQ 1 wake disable kernel: WARNING: CPU: 10 PID: 6431 at kernel/irq/manage.c:920 irq_set_irq_wake+0x147/0x1a0 The PMC driver uses DEFINE_SIMPLE_DEV_PM_OPS() to define its dev_pm_ops which sets amd_pmc_suspend_handler() to the .suspend, .freeze, and .poweroff handlers. i8042_pm_suspend(), however, is only set as the .suspend handler. Fix the issue by calling the PMC suspend handler only from the same set of dev_pm_ops handlers as i8042_pm_suspend(), which currently means just the .suspend handler. To reproduce this issue try hibernating (S4) the machine after a fresh boot without putting it into s2idle first. Fixes: 8e60615e8932 ("platform/x86/amd: pmc: Disable IRQ1 wakeup for RN/CZN") Reviewed-by: Mario Limonciello Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/r/c8f28c002ca3c66fbeeb850904a1f43118e17200.1736184606.git.mail@maciej.szmigiero.name [ij: edited the commit message.]
Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c index 26b878ee5191..a254debb9256 100644 --- a/drivers/platform/x86/amd/pmc/pmc.c +++ b/drivers/platform/x86/amd/pmc/pmc.c @@ -947,6 +947,10 @@ static int amd_pmc_suspend_handler(struct device *dev) { struct amd_pmc_dev *pdev = dev_get_drvdata(dev); + /* + * Must be called only from the same set of dev_pm_ops handlers + * as i8042_pm_suspend() is called: currently just from .suspend. + */ if (pdev->disable_8042_wakeup && !disable_workarounds) { int rc = amd_pmc_wa_irq1(pdev); @@ -959,7 +963,9 @@ static int amd_pmc_suspend_handler(struct device *dev) return 0; } -static DEFINE_SIMPLE_DEV_PM_OPS(amd_pmc_pm, amd_pmc_suspend_handler, NULL); +static const struct dev_pm_ops amd_pmc_pm = { + .suspend = amd_pmc_suspend_handler, +}; static const struct pci_device_id pmc_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_PS) }, From bee9a0838fd223823e5a6d85c055ab1691dc738e Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 3 Jan 2025 07:52:53 -0800 Subject: [PATCH 683/807] platform/x86/intel: power-domains: Add Clearwater Forest support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Clearwater Forest support (INTEL_ATOM_DARKMONT_X) to tpmi_cpu_ids to support domain id mappings.
Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20250103155255.1488139-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/tpmi_power_domains.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/intel/tpmi_power_domains.c b/drivers/platform/x86/intel/tpmi_power_domains.c index 0609a8320f7e..12fb0943b5dc 100644 --- a/drivers/platform/x86/intel/tpmi_power_domains.c +++ b/drivers/platform/x86/intel/tpmi_power_domains.c @@ -81,6 +81,7 @@ static const struct x86_cpu_id tpmi_cpu_ids[] = { X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, NULL), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, NULL), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, NULL), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, NULL), X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, NULL), X86_MATCH_VFM(INTEL_PANTHERCOVE_X, NULL), {} From cc1ff7bc1bb378e7c46992c977b605e97d908801 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 3 Jan 2025 07:52:54 -0800 Subject: [PATCH 684/807] platform/x86: ISST: Add Clearwater Forest to support list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Clearwater Forest (INTEL_ATOM_DARKMONT_X) to SST support list by adding to isst_cpu_ids. 
Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20250103155255.1488139-2-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/speed_select_if/isst_if_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c index 1e46e30dae96..dbcd3087aaa4 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c @@ -804,6 +804,7 @@ EXPORT_SYMBOL_GPL(isst_if_cdev_unregister); static const struct x86_cpu_id isst_cpu_ids[] = { X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, SST_HPM_SUPPORTED), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, SST_HPM_SUPPORTED), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, SST_HPM_SUPPORTED), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0), X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, SST_HPM_SUPPORTED), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, SST_HPM_SUPPORTED), From 1d7461d0c8330689117286169106af6531a747ed Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Mon, 6 Jan 2025 09:46:52 -0800 Subject: [PATCH 685/807] platform/x86: intel/pmc: Fix ioremap() of bad address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In pmc_core_ssram_get_pmc(), the physical addresses for hidden SSRAM devices are retrieved from the MMIO region of the primary SSRAM device. If additional devices are not present, the address returned is zero. Currently, the code does not check for this condition, resulting in ioremap() incorrectly attempting to map address 0. Add a check for a zero address and return 0 if no additional devices are found, as it is not an error for the device to be absent. Fixes: a01486dc4bb1 ("platform/x86/intel/pmc: Cleanup SSRAM discovery") Signed-off-by: David E. 
Box Link: https://lore.kernel.org/r/20250106174653.1497128-1-david.e.box@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/pmc/core_ssram.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/platform/x86/intel/pmc/core_ssram.c b/drivers/platform/x86/intel/pmc/core_ssram.c index 8504154b649f..927f58dc73e3 100644 --- a/drivers/platform/x86/intel/pmc/core_ssram.c +++ b/drivers/platform/x86/intel/pmc/core_ssram.c @@ -269,8 +269,12 @@ pmc_core_ssram_get_pmc(struct pmc_dev *pmcdev, int pmc_idx, u32 offset) /* * The secondary PMC BARS (which are behind hidden PCI devices) * are read from fixed offsets in MMIO of the primary PMC BAR. + * If a device is not present, the value will be 0. */ ssram_base = get_base(tmp_ssram, offset); + if (!ssram_base) + return 0; + ssram = ioremap(ssram_base, SSRAM_HDR_SIZE); if (!ssram) return -ENOMEM; From 984aaf6161bcbec22795a61b92f627051a91b465 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 10 Dec 2024 09:39:48 +0100 Subject: [PATCH 686/807] KVM: s390: vsie: fix virtual/physical address in unpin_scb() In commit 77b533411595 ("KVM: s390: VSIE: sort out virtual/physical address in pin_guest_page"), only pin_scb() has been updated. This means that in unpin_scb() a virtual address was still used directly as physical address without conversion. The resulting physical address is obviously wrong and most of the time also invalid. Since commit d0ef8d9fbebe ("KVM: s390: Use kvm_release_page_dirty() to unpin "struct page" memory"), unpin_guest_page() will directly use kvm_release_page_dirty(), instead of kvm_release_pfn_dirty(), which has since been removed. One of the checks that were performed by kvm_release_pfn_dirty() was to verify whether the page was valid at all, and silently return successfully without doing anything if the page was invalid. When kvm_release_pfn_dirty() was still used, the invalid page was thus silently ignored. 
Now the check is gone and the result is an Oops. This also means that when running with a V!=R kernel, the page was not released, causing a leak. The solution is simply to add the missing virt_to_phys(). Fixes: 77b533411595 ("KVM: s390: VSIE: sort out virtual/physical address in pin_guest_page") Reviewed-by: Janosch Frank Reviewed-by: Nico Boehr Link: https://lore.kernel.org/r/20241210083948.23963-1-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20241210083948.23963-1-imbrenda@linux.ibm.com> --- arch/s390/kvm/vsie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 150b9387860a..a687695d8f68 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -854,7 +854,7 @@ unpin: static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, gpa_t gpa) { - hpa_t hpa = (hpa_t) vsie_page->scb_o; + hpa_t hpa = virt_to_phys(vsie_page->scb_o); if (hpa) unpin_guest_page(vcpu->kvm, gpa, hpa); From df989238fa46039393f0af5cfb93a863a233c061 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Mon, 16 Dec 2024 10:21:35 +0100 Subject: [PATCH 687/807] KVM: s390: Reject setting flic pfault attributes on ucontrol VMs Prevent null pointer dereference when processing the KVM_DEV_FLIC_APF_ENABLE and KVM_DEV_FLIC_APF_DISABLE_WAIT ioctls in the interrupt controller. 
Fixes: 3c038e6be0e2 ("KVM: async_pf: Async page fault support on s390") Reported-by: Claudio Imbrenda Signed-off-by: Christoph Schlameuss Reviewed-by: Hariharan Mari Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20241216092140.329196-2-schlameuss@linux.ibm.com Message-ID: <20241216092140.329196-2-schlameuss@linux.ibm.com> Signed-off-by: Claudio Imbrenda --- Documentation/virt/kvm/devices/s390_flic.rst | 4 ++++ arch/s390/kvm/interrupt.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/Documentation/virt/kvm/devices/s390_flic.rst b/Documentation/virt/kvm/devices/s390_flic.rst index ea96559ba501..b784f8016748 100644 --- a/Documentation/virt/kvm/devices/s390_flic.rst +++ b/Documentation/virt/kvm/devices/s390_flic.rst @@ -58,11 +58,15 @@ Groups: Enables async page faults for the guest. So in case of a major page fault the host is allowed to handle this async and continues the guest. + -EINVAL is returned when called on the FLIC of a ucontrol VM. + KVM_DEV_FLIC_APF_DISABLE_WAIT Disables async page faults for the guest and waits until already pending async page faults are done. This is necessary to trigger a completion interrupt for every init interrupt before migrating the interrupt list. + -EINVAL is returned when called on the FLIC of a ucontrol VM. + KVM_DEV_FLIC_ADAPTER_REGISTER Register an I/O adapter interrupt source. 
Takes a kvm_s390_io_adapter describing the adapter to register:: diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ea8dce299954..22d73c13e555 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2678,9 +2678,13 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) kvm_s390_clear_float_irqs(dev->kvm); break; case KVM_DEV_FLIC_APF_ENABLE: + if (kvm_is_ucontrol(dev->kvm)) + return -EINVAL; dev->kvm->arch.gmap->pfault_enabled = 1; break; case KVM_DEV_FLIC_APF_DISABLE_WAIT: + if (kvm_is_ucontrol(dev->kvm)) + return -EINVAL; dev->kvm->arch.gmap->pfault_enabled = 0; /* * Make sure no async faults are in transition when From b07f6a30c7a42661ce7c0222a642c8e91d69c8b1 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Mon, 16 Dec 2024 10:21:36 +0100 Subject: [PATCH 688/807] KVM: s390: selftests: Add ucontrol flic attr selftests Add some superficial selftests for the floating interrupt controller when using ucontrol VMs. These tests are intended to cover very basic calls only. Some of the calls may trigger null pointer dereferences on kernels not containing the fixes in this patch series. 
Signed-off-by: Christoph Schlameuss Tested-by: Hariharan Mari Reviewed-by: Hariharan Mari Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20241216092140.329196-3-schlameuss@linux.ibm.com Message-ID: <20241216092140.329196-3-schlameuss@linux.ibm.com> Signed-off-by: Claudio Imbrenda --- .../selftests/kvm/s390x/ucontrol_test.c | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c index 0c112319dab1..b003abda8495 100644 --- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c +++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c @@ -635,4 +635,152 @@ TEST_F(uc_kvm, uc_skey) uc_assert_diag44(self); } +static char uc_flic_b[PAGE_SIZE]; +static struct kvm_s390_io_adapter uc_flic_ioa = { .id = 0 }; +static struct kvm_s390_io_adapter_req uc_flic_ioam = { .id = 0 }; +static struct kvm_s390_ais_req uc_flic_asim = { .isc = 0 }; +static struct kvm_s390_ais_all uc_flic_asima = { .simm = 0 }; +static struct uc_flic_attr_test { + char *name; + struct kvm_device_attr a; + int hasrc; + int geterrno; + int seterrno; +} uc_flic_attr_tests[] = { + { + .name = "KVM_DEV_FLIC_GET_ALL_IRQS", + .seterrno = EINVAL, + .a = { + .group = KVM_DEV_FLIC_GET_ALL_IRQS, + .addr = (u64)&uc_flic_b, + .attr = PAGE_SIZE, + }, + }, + { + .name = "KVM_DEV_FLIC_ENQUEUE", + .geterrno = EINVAL, + .a = { .group = KVM_DEV_FLIC_ENQUEUE, }, + }, + { + .name = "KVM_DEV_FLIC_CLEAR_IRQS", + .geterrno = EINVAL, + .a = { .group = KVM_DEV_FLIC_CLEAR_IRQS, }, + }, + { + .name = "KVM_DEV_FLIC_ADAPTER_REGISTER", + .geterrno = EINVAL, + .a = { + .group = KVM_DEV_FLIC_ADAPTER_REGISTER, + .addr = (u64)&uc_flic_ioa, + }, + }, + { + .name = "KVM_DEV_FLIC_ADAPTER_MODIFY", + .geterrno = EINVAL, + .seterrno = EINVAL, + .a = { + .group = KVM_DEV_FLIC_ADAPTER_MODIFY, + .addr = (u64)&uc_flic_ioam, + .attr = sizeof(uc_flic_ioam), + }, + }, + { + .name = "KVM_DEV_FLIC_CLEAR_IO_IRQ", + .geterrno = 
EINVAL, + .seterrno = EINVAL, + .a = { + .group = KVM_DEV_FLIC_CLEAR_IO_IRQ, + .attr = 32, + }, + }, + { + .name = "KVM_DEV_FLIC_AISM", + .geterrno = EINVAL, + .seterrno = ENOTSUP, + .a = { + .group = KVM_DEV_FLIC_AISM, + .addr = (u64)&uc_flic_asim, + }, + }, + { + .name = "KVM_DEV_FLIC_AIRQ_INJECT", + .geterrno = EINVAL, + .a = { .group = KVM_DEV_FLIC_AIRQ_INJECT, }, + }, + { + .name = "KVM_DEV_FLIC_AISM_ALL", + .geterrno = ENOTSUP, + .seterrno = ENOTSUP, + .a = { + .group = KVM_DEV_FLIC_AISM_ALL, + .addr = (u64)&uc_flic_asima, + .attr = sizeof(uc_flic_asima), + }, + }, + { + .name = "KVM_DEV_FLIC_APF_ENABLE", + .geterrno = EINVAL, + .seterrno = EINVAL, + .a = { .group = KVM_DEV_FLIC_APF_ENABLE, }, + }, + { + .name = "KVM_DEV_FLIC_APF_DISABLE_WAIT", + .geterrno = EINVAL, + .seterrno = EINVAL, + .a = { .group = KVM_DEV_FLIC_APF_DISABLE_WAIT, }, + }, +}; + +TEST_F(uc_kvm, uc_flic_attrs) +{ + struct kvm_create_device cd = { .type = KVM_DEV_TYPE_FLIC }; + struct kvm_device_attr attr; + u64 value; + int rc, i; + + rc = ioctl(self->vm_fd, KVM_CREATE_DEVICE, &cd); + ASSERT_EQ(0, rc) TH_LOG("create device failed with err %s (%i)", + strerror(errno), errno); + + for (i = 0; i < ARRAY_SIZE(uc_flic_attr_tests); i++) { + TH_LOG("test %s", uc_flic_attr_tests[i].name); + attr = (struct kvm_device_attr) { + .group = uc_flic_attr_tests[i].a.group, + .attr = uc_flic_attr_tests[i].a.attr, + .addr = uc_flic_attr_tests[i].a.addr, + }; + if (attr.addr == 0) + attr.addr = (u64)&value; + + rc = ioctl(cd.fd, KVM_HAS_DEVICE_ATTR, &attr); + EXPECT_EQ(uc_flic_attr_tests[i].hasrc, !!rc) + TH_LOG("expected dev attr missing %s", + uc_flic_attr_tests[i].name); + + rc = ioctl(cd.fd, KVM_GET_DEVICE_ATTR, &attr); + EXPECT_EQ(!!uc_flic_attr_tests[i].geterrno, !!rc) + TH_LOG("get dev attr rc not expected on %s %s (%i)", + uc_flic_attr_tests[i].name, + strerror(errno), errno); + if (uc_flic_attr_tests[i].geterrno) + EXPECT_EQ(uc_flic_attr_tests[i].geterrno, errno) + TH_LOG("get dev attr errno not 
expected on %s %s (%i)", + uc_flic_attr_tests[i].name, + strerror(errno), errno); + + rc = ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr); + EXPECT_EQ(!!uc_flic_attr_tests[i].seterrno, !!rc) + TH_LOG("set sev attr rc not expected on %s %s (%i)", + uc_flic_attr_tests[i].name, + strerror(errno), errno); + if (uc_flic_attr_tests[i].seterrno) + EXPECT_EQ(uc_flic_attr_tests[i].seterrno, errno) + TH_LOG("set dev attr errno not expected on %s %s (%i)", + uc_flic_attr_tests[i].name, + strerror(errno), errno); + } + + close(cd.fd); +} + TEST_HARNESS_MAIN From 5021fd77d68fce28048b1af18c87bad6b7ffb282 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Mon, 16 Dec 2024 10:21:37 +0100 Subject: [PATCH 689/807] KVM: s390: Reject KVM_SET_GSI_ROUTING on ucontrol VMs Prevent null pointer dereference when processing KVM_IRQ_ROUTING_S390_ADAPTER routing entries. The ioctl cannot be processed for ucontrol VMs. Fixes: f65470661f36 ("KVM: s390/interrupt: do not pin adapter interrupt pages") Signed-off-by: Christoph Schlameuss Tested-by: Hariharan Mari Reviewed-by: Hariharan Mari Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20241216092140.329196-4-schlameuss@linux.ibm.com Message-ID: <20241216092140.329196-4-schlameuss@linux.ibm.com> Signed-off-by: Claudio Imbrenda --- Documentation/virt/kvm/api.rst | 3 +++ arch/s390/kvm/interrupt.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 454c2aaa155e..f15b61317aad 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1914,6 +1914,9 @@ No flags are specified so far, the corresponding field must be set to zero. #define KVM_IRQ_ROUTING_HV_SINT 4 #define KVM_IRQ_ROUTING_XEN_EVTCHN 5 +On s390, adding a KVM_IRQ_ROUTING_S390_ADAPTER is rejected on ucontrol VMs with +error -EINVAL. 
+ flags: - KVM_MSI_VALID_DEVID: used along with KVM_IRQ_ROUTING_MSI routing entry diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 22d73c13e555..d4f031e086fc 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2898,6 +2898,8 @@ int kvm_set_routing_entry(struct kvm *kvm, switch (ue->type) { /* we store the userspace addresses instead of the guest addresses */ case KVM_IRQ_ROUTING_S390_ADAPTER: + if (kvm_is_ucontrol(kvm)) + return -EINVAL; e->set = set_adapter_int; uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); if (uaddr == -EFAULT) From b1da33b0e3dcd0d14cf375be2fd05b54cf75df56 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Mon, 16 Dec 2024 10:21:38 +0100 Subject: [PATCH 690/807] KVM: s390: selftests: Add ucontrol gis routing test Add a selftest for the interrupt routing configuration when using ucontrol VMs. Calling the test may trigger a null pointer dereference on kernels not containing the fixes in this patch series.
Signed-off-by: Christoph Schlameuss Tested-by: Hariharan Mari Reviewed-by: Hariharan Mari Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20241216092140.329196-5-schlameuss@linux.ibm.com Message-ID: <20241216092140.329196-5-schlameuss@linux.ibm.com> Signed-off-by: Claudio Imbrenda --- .../selftests/kvm/s390x/ucontrol_test.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c index b003abda8495..8f306395696e 100644 --- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c +++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c @@ -783,4 +783,23 @@ TEST_F(uc_kvm, uc_flic_attrs) close(cd.fd); } +TEST_F(uc_kvm, uc_set_gsi_routing) +{ + struct kvm_irq_routing *routing = kvm_gsi_routing_create(); + struct kvm_irq_routing_entry ue = { + .type = KVM_IRQ_ROUTING_S390_ADAPTER, + .gsi = 1, + .u.adapter = (struct kvm_irq_routing_s390_adapter) { + .ind_addr = 0, + }, + }; + int rc; + + routing->entries[0] = ue; + routing->nr = 1; + rc = ioctl(self->vm_fd, KVM_SET_GSI_ROUTING, routing); + ASSERT_EQ(-1, rc) TH_LOG("err %s (%i)", strerror(errno), errno); + ASSERT_EQ(EINVAL, errno) TH_LOG("err %s (%i)", strerror(errno), errno); +} + TEST_HARNESS_MAIN From e376d958871c0eeb7e97cf95655015fc343d209c Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Mon, 16 Dec 2024 10:21:40 +0100 Subject: [PATCH 691/807] KVM: s390: selftests: Add has device attr check to uc_attr_mem_limit selftest Fixup the uc_attr_mem_limit test case to also cover the KVM_HAS_DEVICE_ATTR ioctl. 
Signed-off-by: Christoph Schlameuss Tested-by: Hariharan Mari Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20241216092140.329196-7-schlameuss@linux.ibm.com Message-ID: <20241216092140.329196-7-schlameuss@linux.ibm.com> Signed-off-by: Claudio Imbrenda --- tools/testing/selftests/kvm/s390x/ucontrol_test.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c index 8f306395696e..135ee22856cf 100644 --- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c +++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c @@ -210,10 +210,13 @@ TEST_F(uc_kvm, uc_attr_mem_limit) struct kvm_device_attr attr = { .group = KVM_S390_VM_MEM_CTRL, .attr = KVM_S390_VM_MEM_LIMIT_SIZE, - .addr = (unsigned long)&limit, + .addr = (u64)&limit, }; int rc; + rc = ioctl(self->vm_fd, KVM_HAS_DEVICE_ATTR, &attr); + EXPECT_EQ(0, rc); + rc = ioctl(self->vm_fd, KVM_GET_DEVICE_ATTR, &attr); EXPECT_EQ(0, rc); EXPECT_EQ(~0UL, limit); From cbd399f78e23ad4492c174fc5e6b3676dba74a52 Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Thu, 14 Nov 2024 19:01:41 +0800 Subject: [PATCH 692/807] topology: Keep the cpumask unchanged when printing cpumap During fuzz testing, the following warning was discovered: different return values (15 and 11) from vsnprintf("%*pbl ", ...) test:keyward is WARNING in kvasprintf WARNING: CPU: 55 PID: 1168477 at lib/kasprintf.c:30 kvasprintf+0x121/0x130 Call Trace: kvasprintf+0x121/0x130 kasprintf+0xa6/0xe0 bitmap_print_to_buf+0x89/0x100 core_siblings_list_read+0x7e/0xb0 kernfs_file_read_iter+0x15b/0x270 new_sync_read+0x153/0x260 vfs_read+0x215/0x290 ksys_read+0xb9/0x160 do_syscall_64+0x56/0x100 entry_SYSCALL_64_after_hwframe+0x78/0xe2 The call trace shows that kvasprintf() reported this warning during the printing of core_siblings_list. kvasprintf() has several steps: (1) First, calculate the length of the resulting formatted string. 
(2) Allocate a buffer based on the returned length. (3) Then, perform the actual string formatting. (4) Check whether the lengths of the formatted strings returned in steps (1) and (3) are consistent. If the core_cpumask is modified between steps (1) and (3), the lengths obtained in these two steps may not match. Indeed our test includes cpu hotplugging, which should modify core_cpumask while printing. To fix this issue, cache the cpumask into a temporary variable before calling cpumap_print_{list, bitmask}_to_buf(), to keep it unchanged during the printing process. Fixes: bb9ec13d156e ("topology: use bin_attribute to break the size limitation of cpumap ABI") Cc: stable Signed-off-by: Li Huafei Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20241114110141.94725-1-lihuafei1@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/topology.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/base/topology.c b/drivers/base/topology.c index cf160dd2c27b..b962da263eee 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -27,9 +27,17 @@ static ssize_t name##_read(struct file *file, struct kobject *kobj, \ loff_t off, size_t count) \ { \ struct device *dev = kobj_to_dev(kobj); \ + cpumask_var_t mask; \ + ssize_t n; \ \ - return cpumap_print_bitmask_to_buf(buf, topology_##mask(dev->id), \ - off, count); \ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) \ + return -ENOMEM; \ + \ + cpumask_copy(mask, topology_##mask(dev->id)); \ + n = cpumap_print_bitmask_to_buf(buf, mask, off, count); \ + free_cpumask_var(mask); \ + \ + return n; \ } \ \ static ssize_t name##_list_read(struct file *file, struct kobject *kobj, \ @@ -37,9 +45,17 @@ static ssize_t name##_list_read(struct file *file, struct kobject *kobj, \ loff_t off, size_t count) \ { \ struct device *dev = kobj_to_dev(kobj); \ + cpumask_var_t mask; \ + ssize_t n; \ \ - return cpumap_print_list_to_buf(buf, topology_##mask(dev->id), \ - off,
count); \ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) \ + return -ENOMEM; \ + \ + cpumask_copy(mask, topology_##mask(dev->id)); \ + n = cpumap_print_list_to_buf(buf, mask, off, count); \ + free_cpumask_var(mask); \ + \ + return n; \ } define_id_show_func(physical_package_id, "%d"); From 65104599b3a8ed42d85b3f8f27be650afe1f3a7e Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Wed, 20 Nov 2024 08:51:12 +0100 Subject: [PATCH 693/807] ice: fix max values for dpll pin phase adjust Mask admin command returned max phase adjust value for both input and output pins. Only 31 bits are relevant, last released data sheet wrongly points that 32 bits are valid - see [1] 3.2.6.4.1 Get CCU Capabilities Command for reference. Fix of the datasheet itself is in progress. Fix the min/max assignment logic, previously the value was wrongly considered as negative value due to most significant bit being set. Example of previous broken behavior: $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/dpll.yaml \ --do pin-get --json '{"id":1}'| grep phase-adjust 'phase-adjust': 0, 'phase-adjust-max': 16723, 'phase-adjust-min': -16723, Correct behavior with the fix: $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/dpll.yaml \ --do pin-get --json '{"id":1}'| grep phase-adjust 'phase-adjust': 0, 'phase-adjust-max': 2147466925, 'phase-adjust-min': -2147466925, [1] https://cdrdv2.intel.com/v1/dl/getContent/613875?explicitVersion=true Fixes: 90e1c90750d7 ("ice: dpll: implement phase related callbacks") Reviewed-by: Przemek Kitszel Signed-off-by: Arkadiusz Kubalewski Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ice/ice_adminq_cmd.h | 2 ++ drivers/net/ethernet/intel/ice/ice_dpll.c | 35 ++++++++++++------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 1489a8ceec51..ef14cff9a333 
100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -2264,6 +2264,8 @@ struct ice_aqc_get_pkg_info_resp { struct ice_aqc_get_pkg_info pkg_info[]; }; +#define ICE_AQC_GET_CGU_MAX_PHASE_ADJ GENMASK(30, 0) + /* Get CGU abilities command response data structure (indirect 0x0C61) */ struct ice_aqc_get_cgu_abilities { u8 num_inputs; diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index d5ad6d84007c..38e151c7ea23 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -2064,6 +2064,18 @@ static int ice_dpll_init_worker(struct ice_pf *pf) return 0; } +/** + * ice_dpll_phase_range_set - initialize phase adjust range helper + * @range: pointer to phase adjust range struct to be initialized + * @phase_adj: a value to be used as min(-)/max(+) boundary + */ +static void ice_dpll_phase_range_set(struct dpll_pin_phase_adjust_range *range, + u32 phase_adj) +{ + range->min = -phase_adj; + range->max = phase_adj; +} + /** * ice_dpll_init_info_pins_generic - initializes generic pins info * @pf: board private structure @@ -2105,8 +2117,8 @@ static int ice_dpll_init_info_pins_generic(struct ice_pf *pf, bool input) for (i = 0; i < pin_num; i++) { pins[i].idx = i; pins[i].prop.board_label = labels[i]; - pins[i].prop.phase_range.min = phase_adj_max; - pins[i].prop.phase_range.max = -phase_adj_max; + ice_dpll_phase_range_set(&pins[i].prop.phase_range, + phase_adj_max); pins[i].prop.capabilities = cap; pins[i].pf = pf; ret = ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL); @@ -2152,6 +2164,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, struct ice_hw *hw = &pf->hw; struct ice_dpll_pin *pins; unsigned long caps; + u32 phase_adj_max; u8 freq_supp_num; bool input; @@ -2159,11 +2172,13 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, case ICE_DPLL_PIN_TYPE_INPUT: pins = pf->dplls.inputs; num_pins = 
pf->dplls.num_inputs; + phase_adj_max = pf->dplls.input_phase_adj_max; input = true; break; case ICE_DPLL_PIN_TYPE_OUTPUT: pins = pf->dplls.outputs; num_pins = pf->dplls.num_outputs; + phase_adj_max = pf->dplls.output_phase_adj_max; input = false; break; default: @@ -2188,19 +2203,13 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, return ret; caps |= (DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE | DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE); - pins[i].prop.phase_range.min = - pf->dplls.input_phase_adj_max; - pins[i].prop.phase_range.max = - -pf->dplls.input_phase_adj_max; } else { - pins[i].prop.phase_range.min = - pf->dplls.output_phase_adj_max; - pins[i].prop.phase_range.max = - -pf->dplls.output_phase_adj_max; ret = ice_cgu_get_output_pin_state_caps(hw, i, &caps); if (ret) return ret; } + ice_dpll_phase_range_set(&pins[i].prop.phase_range, + phase_adj_max); pins[i].prop.capabilities = caps; ret = ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL); if (ret) @@ -2308,8 +2317,10 @@ static int ice_dpll_init_info(struct ice_pf *pf, bool cgu) dp->dpll_idx = abilities.pps_dpll_idx; d->num_inputs = abilities.num_inputs; d->num_outputs = abilities.num_outputs; - d->input_phase_adj_max = le32_to_cpu(abilities.max_in_phase_adj); - d->output_phase_adj_max = le32_to_cpu(abilities.max_out_phase_adj); + d->input_phase_adj_max = le32_to_cpu(abilities.max_in_phase_adj) & + ICE_AQC_GET_CGU_MAX_PHASE_ADJ; + d->output_phase_adj_max = le32_to_cpu(abilities.max_out_phase_adj) & + ICE_AQC_GET_CGU_MAX_PHASE_ADJ; alloc_size = sizeof(*d->inputs) * d->num_inputs; d->inputs = kzalloc(alloc_size, GFP_KERNEL); From 6c5b989116083a98f45aada548ff54e7a83a9c2d Mon Sep 17 00:00:00 2001 From: Przemyslaw Korba Date: Wed, 4 Dec 2024 14:22:18 +0100 Subject: [PATCH 694/807] ice: fix incorrect PHY settings for 100 GB/s ptp4l application reports too high offset when ran on E823 device with a 100GB/s link. Those values cannot go under 100ns, like in a working case when using 100 GB/s cable. 
This is due to incorrect frequency settings on the PHY clocks for 100 GB/s speed. Changes are introduced to align with the internal hardware documentation, and correctly initialize frequency in PHY clocks with the frequency values that are in our HW spec. To reproduce the issue run ptp4l as a Time Receiver on E823 device, and observe the offset, which will never approach values seen in the PTP working case. Reproduction output: ptp4l -i enp137s0f3 -m -2 -s -f /etc/ptp4l_8275.conf ptp4l[5278.775]: master offset 12470 s2 freq +41288 path delay -3002 ptp4l[5278.837]: master offset 10525 s2 freq +39202 path delay -3002 ptp4l[5278.900]: master offset -24840 s2 freq -20130 path delay -3002 ptp4l[5278.963]: master offset 10597 s2 freq +37908 path delay -3002 ptp4l[5279.025]: master offset 8883 s2 freq +36031 path delay -3002 ptp4l[5279.088]: master offset 7267 s2 freq +34151 path delay -3002 ptp4l[5279.150]: master offset 5771 s2 freq +32316 path delay -3002 ptp4l[5279.213]: master offset 4388 s2 freq +30526 path delay -3002 ptp4l[5279.275]: master offset -30434 s2 freq -28485 path delay -3002 ptp4l[5279.338]: master offset -28041 s2 freq -27412 path delay -3002 ptp4l[5279.400]: master offset 7870 s2 freq +31118 path delay -3002 Fixes: 3a7496234d17 ("ice: implement basic E822 PTP support") Reviewed-by: Milena Olech Signed-off-by: Przemyslaw Korba Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ptp_consts.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h index 585ce200c60f..d75f0eddd631 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h @@ -761,9 +761,9 @@ const struct ice_vernier_info_e82x e822_vernier[NUM_ICE_PTP_LNK_SPD] = { /* rx_desk_rsgb_par */ 644531250, /* 644.53125 MHz Reed Solomon gearbox */ /* tx_desk_rsgb_pcs */ 
- 644531250, /* 644.53125 MHz Reed Solomon gearbox */ + 390625000, /* 390.625 MHz Reed Solomon gearbox */ /* rx_desk_rsgb_pcs */ - 644531250, /* 644.53125 MHz Reed Solomon gearbox */ + 390625000, /* 390.625 MHz Reed Solomon gearbox */ /* tx_fixed_delay */ 1620, /* pmd_adj_divisor */ From bd2776e39c2a82ef4681d02678bb77b3d41e79be Mon Sep 17 00:00:00 2001 From: En-Wei Wu Date: Wed, 18 Dec 2024 10:37:42 +0800 Subject: [PATCH 695/807] igc: return early when failing to read EECD register When booting with a dock connected, the igc driver may get stuck for ~40 seconds if the PCIe link is lost during initialization. This happens because the driver keeps accessing the device after the EECD register read returns all F's, indicating a failed read. Consequently, hw->hw_addr is set to NULL, which impacts subsequent rd32() reads. This leads to the driver hanging in igc_get_hw_semaphore_i225(), as the invalid hw->hw_addr prevents retrieving the expected value. To address this, a validation check and a corresponding return value catch are added for the EECD register read result. If all F's are returned, indicating PCIe link loss, the driver will return -ENXIO immediately. This avoids the 40-second hang and significantly improves boot time when using a dock with an igc NIC. Log before the patch: [ 0.911913] igc 0000:70:00.0: enabling device (0000 -> 0002) [ 0.912386] igc 0000:70:00.0: PTM enabled, 4ns granularity [ 1.571098] igc 0000:70:00.0 (unnamed net_device) (uninitialized): PCIe link lost, device now detached [ 43.449095] igc_get_hw_semaphore_i225: igc 0000:70:00.0 (unnamed net_device) (uninitialized): Driver can't access device - SMBI bit is set.
[ 43.449186] igc 0000:70:00.0: probe with driver igc failed with error -13 [ 46.345701] igc 0000:70:00.0: enabling device (0000 -> 0002) [ 46.345777] igc 0000:70:00.0: PTM enabled, 4ns granularity Log after the patch: [ 1.031000] igc 0000:70:00.0: enabling device (0000 -> 0002) [ 1.032097] igc 0000:70:00.0: PTM enabled, 4ns granularity [ 1.642291] igc 0000:70:00.0 (unnamed net_device) (uninitialized): PCIe link lost, device now detached [ 5.480490] igc 0000:70:00.0: enabling device (0000 -> 0002) [ 5.480516] igc 0000:70:00.0: PTM enabled, 4ns granularity Fixes: ab4056126813 ("igc: Add NVM support") Cc: Chia-Lin Kao (AceLan) Signed-off-by: En-Wei Wu Reviewed-by: Vitaly Lifshits Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_base.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_base.c b/drivers/net/ethernet/intel/igc/igc_base.c index 9fae8bdec2a7..1613b562d17c 100644 --- a/drivers/net/ethernet/intel/igc/igc_base.c +++ b/drivers/net/ethernet/intel/igc/igc_base.c @@ -68,6 +68,10 @@ static s32 igc_init_nvm_params_base(struct igc_hw *hw) u32 eecd = rd32(IGC_EECD); u16 size; + /* failed to read reg and got all F's */ + if (!(~eecd)) + return -ENXIO; + size = FIELD_GET(IGC_EECD_SIZE_EX_MASK, eecd); /* Added to a constant, "size" becomes the left-shift value @@ -221,6 +225,8 @@ static s32 igc_get_invariants_base(struct igc_hw *hw) /* NVM initialization */ ret_val = igc_init_nvm_params_base(hw); + if (ret_val) + goto out; switch (hw->mac.type) { case igc_i225: ret_val = igc_init_nvm_params_i225(hw); From b4aee757f1baf20fa2650fc23a7b0335696e005c Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Wed, 4 Dec 2024 16:22:47 +0100 Subject: [PATCH 696/807] MAINTAINERS: align Danilo's maintainer entries Some entries use my kernel.org address, while others use my Red Hat one. Since this is a bit of an inconvenience for me, align them to all use the same (kernel.org) address.
Signed-off-by: Danilo Krummrich Link: https://lore.kernel.org/r/20241204152248.8644-1-dakr@kernel.org Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index baf0eeb9a355..03a21163d952 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7347,7 +7347,7 @@ F: drivers/gpu/drm/panel/panel-novatek-nt36672a.c DRM DRIVER FOR NVIDIA GEFORCE/QUADRO GPUS M: Karol Herbst M: Lyude Paul -M: Danilo Krummrich +M: Danilo Krummrich L: dri-devel@lists.freedesktop.org L: nouveau@lists.freedesktop.org S: Supported @@ -8924,7 +8924,7 @@ F: include/linux/arm_ffa.h FIRMWARE LOADER (request_firmware) M: Luis Chamberlain M: Russ Weight -M: Danilo Krummrich +M: Danilo Krummrich L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/firmware_class/ From 66d337fede44dcbab4107d37684af8fcab3d648e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 20 Dec 2024 19:13:52 +0100 Subject: [PATCH 697/807] ACPI: resource: Add Asus Vivobook X1504VAP to irq1_level_low_skip_override[] Like the Vivobook X1704VAP the X1504VAP has its keyboard IRQ (1) described as ActiveLow in the DSDT, which the kernel overrides to EdgeHigh which breaks the keyboard. Add the X1504VAP to the irq1_level_low_skip_override[] quirk table to fix this. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219224 Cc: All applicable Signed-off-by: Hans de Goede Link: https://patch.msgid.link/20241220181352.25974-1-hdegoede@redhat.com Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/resource.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 821867de43be..ab4c0e0b6b8e 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -440,6 +440,13 @@ static const struct dmi_system_id irq1_level_low_skip_override[] = { DMI_MATCH(DMI_BOARD_NAME, "S5602ZA"), }, }, + { + /* Asus Vivobook X1504VAP */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_BOARD_NAME, "X1504VAP"), + }, + }, { /* Asus Vivobook X1704VAP */ .matches = { From 7ed4e4a659d99499dc6968c61970d41b64feeac0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 28 Dec 2024 17:48:45 +0100 Subject: [PATCH 698/807] ACPI: resource: Add TongFang GM5HG0A to irq1_edge_low_force_override[] The TongFang GM5HG0A is a TongFang barebone design which is sold under various brand names. The ACPI IRQ override for the keyboard IRQ must be used on these AMD Zen laptops in order for the IRQ to work. At least on the SKIKK Vanaheim variant the DMI product- and board-name strings have been replaced by the OEM with "Vanaheim" so checking that board-name contains "GM5HG0A" as is usually done for TongFang barebones quirks does not work. The DMI OEM strings do contain "GM5HG0A". I have looked at the dmidecode for a few other TongFang devices and the TongFang code-name string being in the OEM strings seems to be something which is consistently true. Add a quirk checking one of the DMI_OEM_STRING(s) is "GM5HG0A" in the hope that this will work for other OEM versions of the "GM5HG0A" too. Link: https://www.skikk.eu/en/laptops/vanaheim-15-rtx-4060 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219614 Cc: All applicable Signed-off-by: Hans de Goede Link: https://patch.msgid.link/20241228164845.42381-1-hdegoede@redhat.com Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/resource.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index ab4c0e0b6b8e..d27a3bf96f80 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -653,6 +653,17 @@ static const struct dmi_system_id irq1_edge_low_force_override[] = { DMI_MATCH(DMI_BOARD_NAME, "GMxHGxx"), }, }, + { + /* + * TongFang GM5HG0A in case of the SKIKK Vanaheim relabel the + * board-name is changed, so check OEM strings instead. Note + * OEM string matches are always exact matches. + * https://bugzilla.kernel.org/show_bug.cgi?id=219614 + */ + .matches = { + DMI_EXACT_MATCH(DMI_OEM_STRING, "GM5HG0A"), + }, + }, { } }; From cd4a7b2e6a2437a5502910c08128ea3bad55a80b Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 28 Dec 2024 17:52:53 +0100 Subject: [PATCH 699/807] ACPI: resource: acpi_dev_irq_override(): Check DMI match last acpi_dev_irq_override() gets called approx. 30 times during boot (15 legacy IRQs * 2 override_table entries). Of these 30 calls at max 1 will match the non DMI checks done by acpi_dev_irq_override(). The dmi_check_system() check is by far the most expensive check done by acpi_dev_irq_override(), make this call the last check done by acpi_dev_irq_override() so that it will be called at max 1 time instead of 30 times. Signed-off-by: Hans de Goede Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/20241228165253.42584-1-hdegoede@redhat.com [ rjw: Subject edit ] Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/resource.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index d27a3bf96f80..90aaec923889 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -689,11 +689,11 @@ static bool acpi_dev_irq_override(u32 gsi, u8 triggering, u8 polarity, for (i = 0; i < ARRAY_SIZE(override_table); i++) { const struct irq_override_cmp *entry = &override_table[i]; - if (dmi_check_system(entry->system) && - entry->irq == gsi && + if (entry->irq == gsi && entry->triggering == triggering && entry->polarity == polarity && - entry->shareable == shareable) + entry->shareable == shareable && + dmi_check_system(entry->system)) return entry->override; } From 9164e0912af206a72ddac4915f7784e470a04ace Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Tue, 24 Dec 2024 12:18:09 +0900 Subject: [PATCH 700/807] thermal: of: fix OF node leak in of_thermal_zone_find() of_thermal_zone_find() calls of_parse_phandle_with_args(), but does not release the OF node reference obtained by it. Add a of_node_put() call when the call is successful. Fixes: 3fd6d6e2b4e8 ("thermal/of: Rework the thermal device tree initialization") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241224031809.950461-1-joe@pf.is.s.u-tokyo.ac.jp [ rjw: Changelog edit ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_of.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index fab11b98ca49..5ab4ce4daaeb 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -160,6 +160,7 @@ static struct device_node *of_thermal_zone_find(struct device_node *sensor, int return ERR_PTR(ret); } + of_node_put(sensor_specs.np); if ((sensor == sensor_specs.np) && id == (sensor_specs.args_count ? 
sensor_specs.args[0] : 0)) { pr_debug("sensor %pOFn id=%d belongs to %pOFn\n", sensor, id, child); From 95978931d55fb7685f8c0b2598d6c12a9b6bc82a Mon Sep 17 00:00:00 2001 From: Su Hui Date: Mon, 6 Jan 2025 10:36:48 +0800 Subject: [PATCH 701/807] eth: fbnic: Revert "eth: fbnic: Add hardware monitoring support via HWMON interface" There is a garbage value problem in fbnic_mac_get_sensor_asic(). 'fw_cmpl' is uninitialized which makes 'sensor' and '*val' to be stored garbage value. Revert commit d85ebade02e8 ("eth: fbnic: Add hardware monitoring support via HWMON interface") to avoid this problem. Fixes: d85ebade02e8 ("eth: fbnic: Add hardware monitoring support via HWMON interface") Signed-off-by: Su Hui Suggested-by: Jakub Kicinski Suggested-by: Michal Swiatkowski Link: https://patch.msgid.link/20250106023647.47756-1-suhui@nfschina.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/Makefile | 1 - drivers/net/ethernet/meta/fbnic/fbnic.h | 5 -- drivers/net/ethernet/meta/fbnic/fbnic_fw.h | 7 -- drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c | 81 ------------------- drivers/net/ethernet/meta/fbnic/fbnic_mac.c | 22 ----- drivers/net/ethernet/meta/fbnic/fbnic_mac.h | 7 -- drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 3 - 7 files changed, 126 deletions(-) delete mode 100644 drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c diff --git a/drivers/net/ethernet/meta/fbnic/Makefile b/drivers/net/ethernet/meta/fbnic/Makefile index 239b2258ec65..ea6214ca48e7 100644 --- a/drivers/net/ethernet/meta/fbnic/Makefile +++ b/drivers/net/ethernet/meta/fbnic/Makefile @@ -13,7 +13,6 @@ fbnic-y := fbnic_csr.o \ fbnic_ethtool.o \ fbnic_fw.o \ fbnic_hw_stats.o \ - fbnic_hwmon.o \ fbnic_irq.o \ fbnic_mac.o \ fbnic_netdev.o \ diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index 706ae6104c8e..744eb0d95449 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -20,7 +20,6 @@ struct fbnic_dev { 
struct device *dev; struct net_device *netdev; struct dentry *dbg_fbd; - struct device *hwmon; u32 __iomem *uc_addr0; u32 __iomem *uc_addr4; @@ -33,7 +32,6 @@ struct fbnic_dev { struct fbnic_fw_mbx mbx[FBNIC_IPC_MBX_INDICES]; struct fbnic_fw_cap fw_cap; - struct fbnic_fw_completion *cmpl_data; /* Lock protecting Tx Mailbox queue to prevent possible races */ spinlock_t fw_tx_lock; @@ -142,9 +140,6 @@ void fbnic_devlink_unregister(struct fbnic_dev *fbd); int fbnic_fw_enable_mbx(struct fbnic_dev *fbd); void fbnic_fw_disable_mbx(struct fbnic_dev *fbd); -void fbnic_hwmon_register(struct fbnic_dev *fbd); -void fbnic_hwmon_unregister(struct fbnic_dev *fbd); - int fbnic_pcs_irq_enable(struct fbnic_dev *fbd); void fbnic_pcs_irq_disable(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h index 7cd8841920e4..221faf8c6756 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h @@ -44,13 +44,6 @@ struct fbnic_fw_cap { u8 link_fec; }; -struct fbnic_fw_completion { - struct { - s32 millivolts; - s32 millidegrees; - } tsene; -}; - void fbnic_mbx_init(struct fbnic_dev *fbd); void fbnic_mbx_clean(struct fbnic_dev *fbd); void fbnic_mbx_poll(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c b/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c deleted file mode 100644 index bcd1086e3768..000000000000 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) Meta Platforms, Inc. and affiliates. 
*/ - -#include - -#include "fbnic.h" -#include "fbnic_mac.h" - -static int fbnic_hwmon_sensor_id(enum hwmon_sensor_types type) -{ - if (type == hwmon_temp) - return FBNIC_SENSOR_TEMP; - if (type == hwmon_in) - return FBNIC_SENSOR_VOLTAGE; - - return -EOPNOTSUPP; -} - -static umode_t fbnic_hwmon_is_visible(const void *drvdata, - enum hwmon_sensor_types type, - u32 attr, int channel) -{ - if (type == hwmon_temp && attr == hwmon_temp_input) - return 0444; - if (type == hwmon_in && attr == hwmon_in_input) - return 0444; - - return 0; -} - -static int fbnic_hwmon_read(struct device *dev, enum hwmon_sensor_types type, - u32 attr, int channel, long *val) -{ - struct fbnic_dev *fbd = dev_get_drvdata(dev); - const struct fbnic_mac *mac = fbd->mac; - int id; - - id = fbnic_hwmon_sensor_id(type); - return id < 0 ? id : mac->get_sensor(fbd, id, val); -} - -static const struct hwmon_ops fbnic_hwmon_ops = { - .is_visible = fbnic_hwmon_is_visible, - .read = fbnic_hwmon_read, -}; - -static const struct hwmon_channel_info *fbnic_hwmon_info[] = { - HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT), - HWMON_CHANNEL_INFO(in, HWMON_I_INPUT), - NULL -}; - -static const struct hwmon_chip_info fbnic_chip_info = { - .ops = &fbnic_hwmon_ops, - .info = fbnic_hwmon_info, -}; - -void fbnic_hwmon_register(struct fbnic_dev *fbd) -{ - if (!IS_REACHABLE(CONFIG_HWMON)) - return; - - fbd->hwmon = hwmon_device_register_with_info(fbd->dev, "fbnic", - fbd, &fbnic_chip_info, - NULL); - if (IS_ERR(fbd->hwmon)) { - dev_notice(fbd->dev, - "Failed to register hwmon device %pe\n", - fbd->hwmon); - fbd->hwmon = NULL; - } -} - -void fbnic_hwmon_unregister(struct fbnic_dev *fbd) -{ - if (!IS_REACHABLE(CONFIG_HWMON) || !fbd->hwmon) - return; - - hwmon_device_unregister(fbd->hwmon); - fbd->hwmon = NULL; -} diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c index 80b82ff12c4d..7b654d0a6dac 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c +++ 
b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c @@ -686,27 +686,6 @@ fbnic_mac_get_eth_mac_stats(struct fbnic_dev *fbd, bool reset, MAC_STAT_TX_BROADCAST); } -static int fbnic_mac_get_sensor_asic(struct fbnic_dev *fbd, int id, long *val) -{ - struct fbnic_fw_completion fw_cmpl; - s32 *sensor; - - switch (id) { - case FBNIC_SENSOR_TEMP: - sensor = &fw_cmpl.tsene.millidegrees; - break; - case FBNIC_SENSOR_VOLTAGE: - sensor = &fw_cmpl.tsene.millivolts; - break; - default: - return -EINVAL; - } - - *val = *sensor; - - return 0; -} - static const struct fbnic_mac fbnic_mac_asic = { .init_regs = fbnic_mac_init_regs, .pcs_enable = fbnic_pcs_enable_asic, @@ -716,7 +695,6 @@ static const struct fbnic_mac fbnic_mac_asic = { .get_eth_mac_stats = fbnic_mac_get_eth_mac_stats, .link_down = fbnic_mac_link_down_asic, .link_up = fbnic_mac_link_up_asic, - .get_sensor = fbnic_mac_get_sensor_asic, }; /** diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_mac.h b/drivers/net/ethernet/meta/fbnic/fbnic_mac.h index 05a591653e09..476239a9d381 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_mac.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_mac.h @@ -47,11 +47,6 @@ enum { #define FBNIC_LINK_MODE_PAM4 (FBNIC_LINK_50R1) #define FBNIC_LINK_MODE_MASK (FBNIC_LINK_AUTO - 1) -enum fbnic_sensor_id { - FBNIC_SENSOR_TEMP, /* Temp in millidegrees Centigrade */ - FBNIC_SENSOR_VOLTAGE, /* Voltage in millivolts */ -}; - /* This structure defines the interface hooks for the MAC. The MAC hooks * will be configured as a const struct provided with a set of function * pointers. 
@@ -88,8 +83,6 @@ struct fbnic_mac { void (*link_down)(struct fbnic_dev *fbd); void (*link_up)(struct fbnic_dev *fbd, bool tx_pause, bool rx_pause); - - int (*get_sensor)(struct fbnic_dev *fbd, int id, long *val); }; int fbnic_mac_init(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 32702dc4a066..7ccf192f13d5 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -296,8 +296,6 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Capture snapshot of hardware stats so netdev can calculate delta */ fbnic_reset_hw_stats(fbd); - fbnic_hwmon_register(fbd); - if (!fbd->dsn) { dev_warn(&pdev->dev, "Reading serial number failed\n"); goto init_failure_mode; @@ -360,7 +358,6 @@ static void fbnic_remove(struct pci_dev *pdev) fbnic_netdev_free(fbd); } - fbnic_hwmon_unregister(fbd); fbnic_dbg_fbd_exit(fbd); fbnic_devlink_unregister(fbd); fbnic_fw_disable_mbx(fbd); From 2ac538e40278a2c0c051cca81bcaafc547d61372 Mon Sep 17 00:00:00 2001 From: He Wang Date: Mon, 6 Jan 2025 03:39:54 +0000 Subject: [PATCH 702/807] ksmbd: fix unexpectedly changed path in ksmbd_vfs_kern_path_locked When `ksmbd_vfs_kern_path_locked` meets an error and it is not the last entry, it will exit without restoring the changed path buffer. But later this buffer may be used as the filename for creation. 
Fixes: c5a709f08d40 ("ksmbd: handle caseless file creation") Signed-off-by: He Wang Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/vfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 88d167a5f8b7..40f08eac519c 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -1264,6 +1264,8 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, filepath, flags, path); + if (!is_last) + next[0] = '/'; if (err) goto out2; else if (is_last) @@ -1271,7 +1273,6 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, path_put(parent_path); *parent_path = *path; - next[0] = '/'; remain_len -= filename_len + 1; } From a9d9c33132d49329ada647e4514d210d15e31d81 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Tue, 7 Jan 2025 15:30:56 -0800 Subject: [PATCH 703/807] x86/fpu: Ensure shadow stack is active before "getting" registers The x86 shadow stack support has its own set of registers. Those registers are XSAVE-managed, but they are "supervisor state components" which means that userspace can not touch them with XSAVE/XRSTOR. It also means that they are not accessible from the existing ptrace ABI for XSAVE state. Thus, there is a new ptrace get/set interface for it. The regset code that ptrace uses provides an ->active() handler in addition to the get/set ones. For shadow stack this ->active() handler verifies that shadow stack is enabled via the ARCH_SHSTK_SHSTK bit in the thread struct. The ->active() handler is checked from some call sites of the regset get/set handlers, but not the ptrace ones. This was not understood when shadow stack support was put in place. As a result, both the set/get handlers can be called with XFEATURE_CET_USER in its init state, which would cause get_xsave_addr() to return NULL and trigger a WARN_ON(). 
The ssp_set() handler luckily has an ssp_active() check to avoid surprising the kernel with shadow stack behavior when the kernel is not ready for it (ARCH_SHSTK_SHSTK==0). That check just happened to avoid the warning. But the ->get() side wasn't so lucky. It can be called with shadow stacks disabled, triggering the warning in practice, as reported by Christina Schimpe: WARNING: CPU: 5 PID: 1773 at arch/x86/kernel/fpu/regset.c:198 ssp_get+0x89/0xa0 [...] Call Trace: ? show_regs+0x6e/0x80 ? ssp_get+0x89/0xa0 ? __warn+0x91/0x150 ? ssp_get+0x89/0xa0 ? report_bug+0x19d/0x1b0 ? handle_bug+0x46/0x80 ? exc_invalid_op+0x1d/0x80 ? asm_exc_invalid_op+0x1f/0x30 ? __pfx_ssp_get+0x10/0x10 ? ssp_get+0x89/0xa0 ? ssp_get+0x52/0xa0 __regset_get+0xad/0xf0 copy_regset_to_user+0x52/0xc0 ptrace_regset+0x119/0x140 ptrace_request+0x13c/0x850 ? wait_task_inactive+0x142/0x1d0 ? do_syscall_64+0x6d/0x90 arch_ptrace+0x102/0x300 [...] Ensure that shadow stacks are active in a thread before looking them up in the XSAVE buffer. Since ARCH_SHSTK_SHSTK and user_ssp[SHSTK_EN] are set at the same time, the active check ensures that there will be something to find in the XSAVE buffer. 
[ dhansen: changelog/subject tweaks ] Fixes: 2fab02b25ae7 ("x86: Add PTRACE interface for shadow stack") Reported-by: Christina Schimpe Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Tested-by: Christina Schimpe Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20250107233056.235536-1-rick.p.edgecombe%40intel.com --- arch/x86/kernel/fpu/regset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 6bc1eb2a21bd..887b0b8e21e3 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -190,7 +190,8 @@ int ssp_get(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; struct cet_user_state *cetregs; - if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !ssp_active(target, regset)) return -ENODEV; sync_fpstate(fpu); From 82163d63ae7a4c36142cd252388737205bb7e4b9 Mon Sep 17 00:00:00 2001 From: Daniil Stas Date: Sun, 5 Jan 2025 21:36:18 +0000 Subject: [PATCH 704/807] hwmon: (drivetemp) Fix driver producing garbage data when SCSI errors occur scsi_execute_cmd() function can return both negative (linux codes) and positive (scsi_cmnd result field) error codes. Currently the driver just passes error codes of scsi_execute_cmd() to hwmon core, which is incorrect because hwmon only checks for negative error codes. This leads to hwmon reporting uninitialized data to userspace in case of SCSI errors (for example if the disk drive was disconnected). This patch checks scsi_execute_cmd() output and returns -EIO if its error code is positive. Fixes: 5b46903d8bf37 ("hwmon: Driver for disk and solid state drives with temperature sensors") Signed-off-by: Daniil Stas Cc: Guenter Roeck Cc: Chris Healy Cc: Linus Walleij Cc: Martin K. 
Petersen Cc: Bart Van Assche Cc: linux-kernel@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: linux-ide@vger.kernel.org Cc: linux-hwmon@vger.kernel.org Link: https://lore.kernel.org/r/20250105213618.531691-1-daniil.stas@posteo.net [groeck: Avoid inline variable declaration for portability] Signed-off-by: Guenter Roeck --- drivers/hwmon/drivetemp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/drivetemp.c b/drivers/hwmon/drivetemp.c index 6bdd21aa005a..2a4ec55ddb47 100644 --- a/drivers/hwmon/drivetemp.c +++ b/drivers/hwmon/drivetemp.c @@ -165,6 +165,7 @@ static int drivetemp_scsi_command(struct drivetemp_data *st, { u8 scsi_cmd[MAX_COMMAND_SIZE]; enum req_op op; + int err; memset(scsi_cmd, 0, sizeof(scsi_cmd)); scsi_cmd[0] = ATA_16; @@ -192,8 +193,11 @@ static int drivetemp_scsi_command(struct drivetemp_data *st, scsi_cmd[12] = lba_high; scsi_cmd[14] = ata_command; - return scsi_execute_cmd(st->sdev, scsi_cmd, op, st->smartdata, - ATA_SECT_SIZE, HZ, 5, NULL); + err = scsi_execute_cmd(st->sdev, scsi_cmd, op, st->smartdata, + ATA_SECT_SIZE, HZ, 5, NULL); + if (err > 0) + err = -EIO; + return err; } static int drivetemp_ata_command(struct drivetemp_data *st, u8 feature, From e8580b4c600e085b3c8e6404392de2f822d4c132 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 7 Jan 2025 17:41:21 +0900 Subject: [PATCH 705/807] ksmbd: Implement new SMB3 POSIX type As per the SMB3 POSIX extension specification, give the POSIX file type to the POSIX mode. 
https://www.samba.org/~slow/SMB3_POSIX/fscc_posix_extensions.html#posix-file-type-definition Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 40 ++++++++++++++++++++++++++++++++++++++++ fs/smb/server/smb2pdu.h | 10 ++++++++++ 2 files changed, 50 insertions(+) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 433e33c04039..772deec5b90f 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -3994,6 +3994,26 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, posix_info->DeviceId = cpu_to_le32(ksmbd_kstat->kstat->rdev); posix_info->HardLinks = cpu_to_le32(ksmbd_kstat->kstat->nlink); posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode & 0777); + switch (ksmbd_kstat->kstat->mode & S_IFMT) { + case S_IFDIR: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_DIR << POSIX_FILETYPE_SHIFT); + break; + case S_IFLNK: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_SYMLINK << POSIX_FILETYPE_SHIFT); + break; + case S_IFCHR: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_CHARDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFBLK: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_BLKDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFIFO: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_FIFO << POSIX_FILETYPE_SHIFT); + break; + case S_IFSOCK: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_SOCKET << POSIX_FILETYPE_SHIFT); + } + posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino); posix_info->DosAttributes = S_ISDIR(ksmbd_kstat->kstat->mode) ? 
@@ -5184,6 +5204,26 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); file_info->HardLinks = cpu_to_le32(stat.nlink); file_info->Mode = cpu_to_le32(stat.mode & 0777); + switch (stat.mode & S_IFMT) { + case S_IFDIR: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_DIR << POSIX_FILETYPE_SHIFT); + break; + case S_IFLNK: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_SYMLINK << POSIX_FILETYPE_SHIFT); + break; + case S_IFCHR: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_CHARDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFBLK: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_BLKDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFIFO: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_FIFO << POSIX_FILETYPE_SHIFT); + break; + case S_IFSOCK: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_SOCKET << POSIX_FILETYPE_SHIFT); + } + file_info->DeviceId = cpu_to_le32(stat.rdev); /* diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 649dacf7e8c4..17a0b18a8406 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -502,4 +502,14 @@ static inline void *smb2_get_msg(void *buf) return buf + 4; } +#define POSIX_TYPE_FILE 0 +#define POSIX_TYPE_DIR 1 +#define POSIX_TYPE_SYMLINK 2 +#define POSIX_TYPE_CHARDEV 3 +#define POSIX_TYPE_BLKDEV 4 +#define POSIX_TYPE_FIFO 5 +#define POSIX_TYPE_SOCKET 6 + +#define POSIX_FILETYPE_SHIFT 12 + #endif /* _SMB2PDU_H */ From b341ca51d2679829d26a3f6a4aa9aee9abd94f92 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Sat, 4 Jan 2025 10:29:45 -0500 Subject: [PATCH 706/807] tls: Fix tls_sw_sendmsg error handling We've noticed that NFS can hang when using RPC over TLS on an unstable connection, and investigation shows that the RPC layer is stuck in a tight loop attempting to transmit, but forever getting -EBADMSG back from the underlying network. 
The loop begins when tcp_sendmsg_locked() returns -EPIPE to tls_tx_records(), but that error is converted to -EBADMSG when calling the socket's error reporting handler. Instead of converting errors from tcp_sendmsg_locked(), let's pass them along in this path. The RPC layer handles -EPIPE by reconnecting the transport, which prevents the endless attempts to transmit on a broken connection. Signed-off-by: Benjamin Coddington Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Link: https://patch.msgid.link/9594185559881679d81f071b181a10eb07cd079f.1736004079.git.bcodding@redhat.com Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bbf26cc4f6ee..7bcc9b4408a2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -458,7 +458,7 @@ int tls_tx_records(struct sock *sk, int flags) tx_err: if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk, -EBADMSG); + tls_err_abort(sk, rc); return rc; } From cb358ff94154774d031159b018adf45e17673941 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 6 Jan 2025 16:19:11 +0900 Subject: [PATCH 707/807] ipvlan: Fix use-after-free in ipvlan_get_iflink(). syzbot presented a use-after-free report [0] regarding ipvlan and linkwatch. ipvlan does not hold a refcnt of the lower device unlike vlan and macvlan. If the linkwatch work is triggered for the ipvlan dev, the lower dev might have already been freed, resulting in UAF of ipvlan->phy_dev in ipvlan_get_iflink(). We can delay the lower dev unregistration like vlan and macvlan by holding the lower dev's refcnt in dev->netdev_ops->ndo_init() and releasing it in dev->priv_destructor(). Jakub pointed out calling .ndo_XXX after unregister_netdevice() has returned is error-prone and suggested [1] addressing this UAF in the core by taking commit 750e51603395 ("net: avoid potential UAF in default_operstate()") further. 
Let's assume unregistering devices DOWN and use RCU protection in default_operstate() not to race with the device unregistration. [0]: BUG: KASAN: slab-use-after-free in ipvlan_get_iflink+0x84/0x88 drivers/net/ipvlan/ipvlan_main.c:353 Read of size 4 at addr ffff0000d768c0e0 by task kworker/u8:35/6944 CPU: 0 UID: 0 PID: 6944 Comm: kworker/u8:35 Not tainted 6.13.0-rc2-g9bc5c9515b48 #12 4c3cb9e8b4565456f6a355f312ff91f4f29b3c47 Hardware name: linux,dummy-virt (DT) Workqueue: events_unbound linkwatch_event Call trace: show_stack+0x38/0x50 arch/arm64/kernel/stacktrace.c:484 (C) __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0xbc/0x108 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x16c/0x6f0 mm/kasan/report.c:489 kasan_report+0xc0/0x120 mm/kasan/report.c:602 __asan_report_load4_noabort+0x20/0x30 mm/kasan/report_generic.c:380 ipvlan_get_iflink+0x84/0x88 drivers/net/ipvlan/ipvlan_main.c:353 dev_get_iflink+0x7c/0xd8 net/core/dev.c:674 default_operstate net/core/link_watch.c:45 [inline] rfc2863_policy+0x144/0x360 net/core/link_watch.c:72 linkwatch_do_dev+0x60/0x228 net/core/link_watch.c:175 __linkwatch_run_queue+0x2f4/0x5b8 net/core/link_watch.c:239 linkwatch_event+0x64/0xa8 net/core/link_watch.c:282 process_one_work+0x700/0x1398 kernel/workqueue.c:3229 process_scheduled_works kernel/workqueue.c:3310 [inline] worker_thread+0x8c4/0xe10 kernel/workqueue.c:3391 kthread+0x2b0/0x360 kernel/kthread.c:389 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:862 Allocated by task 9303: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_alloc_info+0x44/0x58 mm/kasan/generic.c:568 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x84/0xa0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4283 [inline] __kmalloc_node_noprof+0x2a0/0x560 mm/slub.c:4289 __kvmalloc_node_noprof+0x9c/0x230 mm/util.c:650 
alloc_netdev_mqs+0xb4/0x1118 net/core/dev.c:11209 rtnl_create_link+0x2b8/0xb60 net/core/rtnetlink.c:3595 rtnl_newlink_create+0x19c/0x868 net/core/rtnetlink.c:3771 __rtnl_newlink net/core/rtnetlink.c:3896 [inline] rtnl_newlink+0x122c/0x15c0 net/core/rtnetlink.c:4011 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6901 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2542 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6928 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg net/socket.c:726 [inline] __sys_sendto+0x2ec/0x438 net/socket.c:2197 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __arm64_sys_sendto+0xe4/0x110 net/socket.c:2200 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 Freed by task 10200: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_free_info+0x58/0x70 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x48/0x68 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2338 [inline] slab_free mm/slub.c:4598 [inline] kfree+0x140/0x420 mm/slub.c:4746 kvfree+0x4c/0x68 mm/util.c:693 netdev_release+0x94/0xc8 net/core/net-sysfs.c:2034 device_release+0x98/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x2b0/0x438 lib/kobject.c:737 
netdev_run_todo+0xdd8/0xf48 net/core/dev.c:10924 rtnl_unlock net/core/rtnetlink.c:152 [inline] rtnl_net_unlock net/core/rtnetlink.c:209 [inline] rtnl_dellink+0x484/0x680 net/core/rtnetlink.c:3526 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6901 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2542 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6928 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg net/socket.c:726 [inline] ____sys_sendmsg+0x410/0x708 net/socket.c:2583 ___sys_sendmsg+0x178/0x1d8 net/socket.c:2637 __sys_sendmsg net/socket.c:2669 [inline] __do_sys_sendmsg net/socket.c:2674 [inline] __se_sys_sendmsg net/socket.c:2672 [inline] __arm64_sys_sendmsg+0x12c/0x1c8 net/socket.c:2672 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 The buggy address belongs to the object at ffff0000d768c000 which belongs to the cache kmalloc-cg-4k of size 4096 The buggy address is located 224 bytes inside of freed 4096-byte region [ffff0000d768c000, ffff0000d768d000) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x117688 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 memcg:ffff0000c77ef981 flags: 0xbfffe0000000040(head|node=0|zone=2|lastcpupid=0x1ffff) page_type: f5(slab) raw: 0bfffe0000000040 ffff0000c000f500 dead000000000100 dead000000000122 raw: 0000000000000000 0000000000040004 00000001f5000000 ffff0000c77ef981 head: 0bfffe0000000040 
ffff0000c000f500 dead000000000100 dead000000000122 head: 0000000000000000 0000000000040004 00000001f5000000 ffff0000c77ef981 head: 0bfffe0000000003 fffffdffc35da201 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff0000d768bf80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff0000d768c000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff0000d768c080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff0000d768c100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff0000d768c180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 8c55facecd7a ("net: linkwatch: only report IF_OPER_LOWERLAYERDOWN if iflink is actually down") Reported-by: syzkaller Suggested-by: Jakub Kicinski Link: https://lore.kernel.org/netdev/20250102174400.085fd8ac@kernel.org/ [1] Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250106071911.64355-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/link_watch.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 1b4d39e38084..cb04ef2b9807 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -42,14 +42,18 @@ static unsigned int default_operstate(const struct net_device *dev) * first check whether lower is indeed the source of its down state. */ if (!netif_carrier_ok(dev)) { - int iflink = dev_get_iflink(dev); struct net_device *peer; + int iflink; /* If called from netdev_run_todo()/linkwatch_sync_dev(), * dev_net(dev) can be already freed, and RTNL is not held. 
*/ - if (dev->reg_state == NETREG_UNREGISTERED || - iflink == dev->ifindex) + if (dev->reg_state <= NETREG_REGISTERED) + iflink = dev_get_iflink(dev); + else + iflink = dev->ifindex; + + if (iflink == dev->ifindex) return IF_OPER_DOWN; ASSERT_RTNL(); From db78475ba0d3c66d430f7ded2388cc041078a542 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 6 Jan 2025 10:02:10 -0800 Subject: [PATCH 708/807] eth: gve: use appropriate helper to set xdp_features Commit f85949f98206 ("xdp: add xdp_set_features_flag utility routine") added routines to inform the core about XDP flag changes. GVE support was added around the same time and missed using them. GVE only changes the flags on error recovery or resume. Presumably the flags may change during resume if the VM migrated. The user would not get the notification and upper devices would not get a chance to recalculate their flags. Fixes: 75eaae158b1b ("gve: Add XDP DROP and TX support for GQI-QPL format") Reviewed-By: Jeroen de Borst Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250106180210.1861784-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 8a8f6ab12a98..533e659b15b3 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2241,14 +2241,18 @@ static void gve_service_task(struct work_struct *work) static void gve_set_netdev_xdp_features(struct gve_priv *priv) { + xdp_features_t xdp_features; + if (priv->queue_format == GVE_GQI_QPL_FORMAT) { - priv->dev->xdp_features = NETDEV_XDP_ACT_BASIC; - priv->dev->xdp_features |= NETDEV_XDP_ACT_REDIRECT; - priv->dev->xdp_features |= NETDEV_XDP_ACT_NDO_XMIT; - priv->dev->xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY; + xdp_features = NETDEV_XDP_ACT_BASIC; + xdp_features |= NETDEV_XDP_ACT_REDIRECT; + 
xdp_features |= NETDEV_XDP_ACT_NDO_XMIT; + xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY; } else { - priv->dev->xdp_features = 0; + xdp_features = 0; } + + xdp_set_features_flag(priv->dev, xdp_features); } static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) From 77bf21a03a2ad45cf66f73f13154b1669d9cf52a Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Fri, 3 Jan 2025 14:15:17 +0530 Subject: [PATCH 709/807] Revert "drm/i915/hdcp: Don't enable HDCP1.4 directly from check_link" This reverts commit 483f7d94a0453564ad9295288c0242136c5f36a0. This needs to be reverted since, even after updating the connector state HDCP property, we don't re-enable HDCP until the next commit in which the CP property is set, causing compliance to fail. --v2 -Fix build issue [Dnyaneshwar] Signed-off-by: Suraj Kandpal Reviewed-by: Dnyaneshwar Bhadane Link: https://patchwork.freedesktop.org/patch/msgid/20250103084517.239998-1-suraj.kandpal@intel.com (cherry picked from commit fcf73e20cd1fe60c3ba5f9626f1e8f9cd4511edf) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_hdcp.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_hdcp.c b/drivers/gpu/drm/i915/display/intel_hdcp.c index f57e4dba2873..8fee26d791f4 100644 --- a/drivers/gpu/drm/i915/display/intel_hdcp.c +++ b/drivers/gpu/drm/i915/display/intel_hdcp.c @@ -1158,9 +1158,15 @@ static int intel_hdcp_check_link(struct intel_connector *connector) goto out; } - intel_hdcp_update_value(connector, - DRM_MODE_CONTENT_PROTECTION_DESIRED, - true); + ret = intel_hdcp1_enable(connector); + if (ret) { + drm_err(display->drm, "Failed to enable hdcp (%d)\n", ret); + intel_hdcp_update_value(connector, + DRM_MODE_CONTENT_PROTECTION_DESIRED, + true); + goto out; + } + out: mutex_unlock(&dig_port->hdcp_mutex); mutex_unlock(&hdcp->mutex); From e59f4c97172de0c302894cfd5616161c1f0c4d85 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Tue, 7 Jan 2025 18:38:56 
+0800 Subject: [PATCH 710/807] gpio: loongson: Fix Loongson-2K2000 ACPI GPIO register offset Since commit 3feb70a61740 ("gpio: loongson: add more gpio chip support"), the Loongson-2K2000 GPIO is supported. However, according to the firmware development specification, the Loongson-2K2000 ACPI GPIO register offsets in the driver do not match the register base addresses in the firmware, resulting in the registers not being accessed properly. Now, we fix it to ensure the GPIO function works properly. Cc: stable@vger.kernel.org Cc: Yinbo Zhu Fixes: 3feb70a61740 ("gpio: loongson: add more gpio chip support") Co-developed-by: Hongliang Wang Signed-off-by: Hongliang Wang Signed-off-by: Binbin Zhou Link: https://lore.kernel.org/r/20250107103856.1037222-1-zhoubinbin@loongson.cn Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-loongson-64bit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-loongson-64bit.c b/drivers/gpio/gpio-loongson-64bit.c index 6749d4dd6d64..7f4d78fd800e 100644 --- a/drivers/gpio/gpio-loongson-64bit.c +++ b/drivers/gpio/gpio-loongson-64bit.c @@ -237,9 +237,9 @@ static const struct loongson_gpio_chip_data loongson_gpio_ls2k2000_data1 = { static const struct loongson_gpio_chip_data loongson_gpio_ls2k2000_data2 = { .label = "ls2k2000_gpio", .mode = BIT_CTRL_MODE, - .conf_offset = 0x84, - .in_offset = 0x88, - .out_offset = 0x80, + .conf_offset = 0x4, + .in_offset = 0x8, + .out_offset = 0x0, }; static const struct loongson_gpio_chip_data loongson_gpio_ls3a5000_data = { From 854eee93bd6e3dca619d47087af4d65b2045828e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 8 Jan 2025 11:24:36 +0100 Subject: [PATCH 711/807] USB: serial: cp210x: add Phoenix Contact UPS Device Phoenix Contact sells UPS Quint devices [1] with a custom datacable [2] that embeds a Silicon Labs converter: Bus 001 Device 003: ID 1b93:1013 Silicon Labs Phoenix Contact UPS Device Device Descriptor: bLength 18 bDescriptorType 1 bcdUSB 2.00 
bDeviceClass 0 bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize0 64 idVendor 0x1b93 idProduct 0x1013 bcdDevice 1.00 iManufacturer 1 Silicon Labs iProduct 2 Phoenix Contact UPS Device iSerial 3 bNumConfigurations 1 Configuration Descriptor: bLength 9 bDescriptorType 2 wTotalLength 0x0020 bNumInterfaces 1 bConfigurationValue 1 iConfiguration 0 bmAttributes 0x80 (Bus Powered) MaxPower 100mA Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 0 bAlternateSetting 0 bNumEndpoints 2 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 0 bInterfaceProtocol 0 iInterface 2 Phoenix Contact UPS Device Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x01 EP 1 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0040 1x 64 bytes bInterval 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x82 EP 2 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0040 1x 64 bytes bInterval 0 [1] https://www.phoenixcontact.com/en-pc/products/power-supply-unit-quint-ps-1ac-24dc-10-2866763 [2] https://www.phoenixcontact.com/en-il/products/data-cable-preassembled-ifs-usb-datacable-2320500 Reported-by: Giuseppe Corbelli Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index c24101f0a07a..9960ac2b10b7 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -223,6 +223,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x19CF, 0x3000) }, /* Parrot NMEA GPS Flight Recorder */ { USB_DEVICE(0x1ADB, 0x0001) }, /* Schweitzer Engineering C662 Cable */ { USB_DEVICE(0x1B1C, 0x1C00) }, /* Corsair USB Dongle */ + { USB_DEVICE(0x1B93, 0x1013) }, /* Phoenix Contact UPS Device */ { USB_DEVICE(0x1BA4, 0x0002) }, /* Silicon Labs 358x factory default */ { USB_DEVICE(0x1BE3, 0x07A6) }, /* WAGO 750-923 USB 
Service Cable */ { USB_DEVICE(0x1D6F, 0x0010) }, /* Seluxit ApS RF Dongle */ From c1947d244f807b1f95605b75a4059e7b37b5dcc3 Mon Sep 17 00:00:00 2001 From: Chukun Pan Date: Sun, 15 Dec 2024 18:00:27 +0800 Subject: [PATCH 712/807] USB: serial: option: add MeiG Smart SRM815 It looks like SRM815 shares ID with SRM825L. T: Bus=03 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2dee ProdID=4d22 Rev= 4.14 S: Manufacturer=MEIG S: Product=LTE-A Module S: SerialNumber=123456 C:* #Ifs= 5 Cfg#= 1 Atr=80 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=42 Prot=01 Driver=(none) E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Chukun Pan Link: https://lore.kernel.org/lkml/20241215100027.1970930-1-amadeus@jmu.edu.cn/ Link: https://lore.kernel.org/all/4333b4d0-281f-439d-9944-5570cbc4971d@gmail.com/ Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 64317b390d22..845de1544345 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -621,7 +621,7 @@ static void option_instat_callback(struct urb *urb); /* MeiG Smart Technology products */ #define MEIGSMART_VENDOR_ID 0x2dee -/* MeiG Smart SRM825L based on Qualcomm 315 */ +/* MeiG Smart SRM815/SRM825L based on Qualcomm 315 */ #define MEIGSMART_PRODUCT_SRM825L 0x4d22 /* MeiG Smart SLM320 based on UNISOC UIS8910 */ #define MEIGSMART_PRODUCT_SLM320 0x4d41 @@ -2405,6 +2405,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM320, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM770A, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, From f5b435be70cb126866fa92ffc6f89cda9e112c75 Mon Sep 17 00:00:00 2001 From: Michal Hrusecky Date: Tue, 7 Jan 2025 17:08:29 +0100 Subject: [PATCH 713/807] USB: serial: option: add Neoway N723-EA support Update the USB serial option driver to support 
Neoway N723-EA. ID 2949:8700 Marvell Mobile Composite Device Bus T: Bus=02 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2949 ProdID=8700 Rev= 1.00 S: Manufacturer=Marvell S: Product=Mobile Composite Device Bus S: SerialNumber=200806006809080000 C:* #Ifs= 5 Cfg#= 1 Atr=c0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=e0(wlcon) Sub=01 Prot=03 I:* If#= 0 Alt= 0 #EPs= 1 Cls=e0(wlcon) Sub=01 Prot=03 Driver=rndis_host E: Ad=87(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0c(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=89(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0b(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=86(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0e(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=88(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0a(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Tested successfully connecting to the Internet via rndis interface after dialing via AT commands on If#=4 or If#=6. Not sure of the purpose of the other serial interface. 
Signed-off-by: Michal Hrusecky Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 845de1544345..1e2ae0c6c41c 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2413,6 +2413,7 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(1) }, { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0640, 0xff), /* TCL IK512 ECM */ .driver_info = NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(0x2949, 0x8700, 0xff) }, /* Neoway N723-EA */ { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, option_ids); From 6f79db028e827b023623a6ff825952e0d5fb619f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 8 Jan 2025 10:10:10 +0100 Subject: [PATCH 714/807] staging: gpib: mite: remove unused global functions The mite.c file was originally copied from the COMEDI code, and now that it is in the kernel tree, along with the comedi code, on some build configurations there are errors due to duplicate symbols (specifically mite_dma_disarm). Remove all of the unused functions in the gpib mite.c and .h files as they aren't needed and cause the compiler to be confused. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202501081239.BAPhfAHJ-lkp@intel.com/ Reported-by: Stephen Rothwell Link: https://lore.kernel.org/r/2025010809-padding-survive-91b3@gregkh Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gpib/tnt4882/mite.c | 69 ----------------------------- drivers/staging/gpib/tnt4882/mite.h | 9 ---- 2 files changed, 78 deletions(-) diff --git a/drivers/staging/gpib/tnt4882/mite.c b/drivers/staging/gpib/tnt4882/mite.c index 0edf34d243e9..4bd352967616 100644 --- a/drivers/staging/gpib/tnt4882/mite.c +++ b/drivers/staging/gpib/tnt4882/mite.c @@ -148,72 +148,3 @@ void mite_list_devices(void) } pr_info("\n"); } - -int mite_bytes_transferred(struct mite_struct *mite, int chan) -{ - int dar, fcr; - - dar = readl(mite->mite_io_addr + MITE_DAR + CHAN_OFFSET(chan)); - fcr = readl(mite->mite_io_addr + MITE_FCR + CHAN_OFFSET(chan)) & 0x000000FF; - return dar - fcr; -} - -int mite_dma_tcr(struct mite_struct *mite) -{ - int tcr; - int lkar; - - lkar = readl(mite->mite_io_addr + CHAN_OFFSET(0) + MITE_LKAR); - tcr = readl(mite->mite_io_addr + CHAN_OFFSET(0) + MITE_TCR); - MDPRINTK("lkar=0x%08x tcr=%d\n", lkar, tcr); - - return tcr; -} - -void mite_dma_disarm(struct mite_struct *mite) -{ - int chor; - - /* disarm */ - chor = CHOR_ABORT; - writel(chor, mite->mite_io_addr + CHAN_OFFSET(0) + MITE_CHOR); -} - -void mite_dump_regs(struct mite_struct *mite) -{ - void *addr = 0; - unsigned long temp = 0; - - pr_info("mite address is =0x%p\n", mite->mite_io_addr); - - addr = mite->mite_io_addr + MITE_CHOR + CHAN_OFFSET(0); - pr_info("mite status[CHOR]at 0x%p =0x%08lx\n", addr, temp = readl(addr)); - //mite_decode(mite_CHOR_strings,temp); - addr = mite->mite_io_addr + MITE_CHCR + CHAN_OFFSET(0); - pr_info("mite status[CHCR]at 0x%p =0x%08lx\n", addr, temp = readl(addr)); - //mite_decode(mite_CHCR_strings,temp); - addr = mite->mite_io_addr + MITE_TCR + CHAN_OFFSET(0); - pr_info("mite status[TCR] at 0x%p =0x%08x\n", addr, readl(addr)); - 
addr = mite->mite_io_addr + MITE_MCR + CHAN_OFFSET(0); - pr_info("mite status[MCR] at 0x%p =0x%08lx\n", addr, temp = readl(addr)); - //mite_decode(mite_MCR_strings,temp); - addr = mite->mite_io_addr + MITE_MAR + CHAN_OFFSET(0); - pr_info("mite status[MAR] at 0x%p =0x%08x\n", addr, readl(addr)); - addr = mite->mite_io_addr + MITE_DCR + CHAN_OFFSET(0); - pr_info("mite status[DCR] at 0x%p =0x%08lx\n", addr, temp = readl(addr)); - //mite_decode(mite_CR_strings,temp); - addr = mite->mite_io_addr + MITE_DAR + CHAN_OFFSET(0); - pr_info("mite status[DAR] at 0x%p =0x%08x\n", addr, readl(addr)); - addr = mite->mite_io_addr + MITE_LKCR + CHAN_OFFSET(0); - pr_info("mite status[LKCR]at 0x%p =0x%08lx\n", addr, temp = readl(addr)); - //mite_decode(mite_CR_strings,temp); - addr = mite->mite_io_addr + MITE_LKAR + CHAN_OFFSET(0); - pr_info("mite status[LKAR]at 0x%p =0x%08x\n", addr, readl(addr)); - - addr = mite->mite_io_addr + MITE_CHSR + CHAN_OFFSET(0); - pr_info("mite status[CHSR]at 0x%p =0x%08lx\n", addr, temp = readl(addr)); - //mite_decode(mite_CHSR_strings,temp); - addr = mite->mite_io_addr + MITE_FCR + CHAN_OFFSET(0); - pr_info("mite status[FCR] at 0x%p =0x%08x\n\n", addr, readl(addr)); -} - diff --git a/drivers/staging/gpib/tnt4882/mite.h b/drivers/staging/gpib/tnt4882/mite.h index 7a475279b2fb..edb873435b51 100644 --- a/drivers/staging/gpib/tnt4882/mite.h +++ b/drivers/staging/gpib/tnt4882/mite.h @@ -61,15 +61,6 @@ int mite_setup(struct mite_struct *mite); void mite_unsetup(struct mite_struct *mite); void mite_list_devices(void); -int mite_dma_tcr(struct mite_struct *mite); - -void mite_dma_arm(struct mite_struct *mite); -void mite_dma_disarm(struct mite_struct *mite); - -void mite_dump_regs(struct mite_struct *mite); -void mite_setregs(struct mite_struct *mite, unsigned long ll_start, int chan, int dir); -int mite_bytes_transferred(struct mite_struct *mite, int chan); - #define CHAN_OFFSET(x) (0x100 * (x)) /* DMA base for chan 0 is 0x500, chan 1 is 0x600 */ From 
95147bb42bc163866fc103c957820345fefa96cd Mon Sep 17 00:00:00 2001 From: Anton Kirilov Date: Thu, 19 Dec 2024 11:31:45 +0000 Subject: [PATCH 715/807] arm64: dts: rockchip: Fix the SD card detection on NanoPi R6C/R6S Fix the SD card detection on FriendlyElec NanoPi R6C/R6S boards. Signed-off-by: Anton Kirilov Link: https://lore.kernel.org/r/20241219113145.483205-1-anton.kirilov@arm.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588s-nanopi-r6.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-nanopi-r6.dtsi b/arch/arm64/boot/dts/rockchip/rk3588s-nanopi-r6.dtsi index 76a6e8e517e9..c9749cb50076 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s-nanopi-r6.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588s-nanopi-r6.dtsi @@ -434,6 +434,7 @@ &sdmmc { bus-width = <4>; cap-sd-highspeed; + cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>; disable-wp; max-frequency = <150000000>; no-mmc; From 7ee7c9b39ed36caf983706f5b893cc5c37a79071 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Dec 2024 05:27:35 +0100 Subject: [PATCH 716/807] xfs: don't return an error from xfs_update_last_rtgroup_size for !XFS_RT Non-rtg file systems have a fake RT group even if they do not have a RT device, and thus an rgcount of 1. Ensure xfs_update_last_rtgroup_size doesn't fail when called for !XFS_RT to handle this case. Fixes: 87fe4c34a383 ("xfs: create incore realtime group structures") Reported-by: Brian Foster Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_rtgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 7e7e491ff06f..2d7822644eff 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -272,7 +272,7 @@ static inline int xfs_initialize_rtgroups(struct xfs_mount *mp, } # define xfs_rtgroup_extents(mp, rgno) (0) -# define xfs_update_last_rtgroup_size(mp, rgno) (-EOPNOTSUPP) +# define xfs_update_last_rtgroup_size(mp, rgno) (0) # define xfs_rtgroup_lock(rtg, gf) ((void)0) # define xfs_rtgroup_unlock(rtg, gf) ((void)0) # define xfs_rtgroup_trans_join(tp, rtg, gf) ((void)0) From 47f33c27fc9565fb0bc7dfb76be08d445cd3d236 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 7 Jan 2025 17:47:01 +0100 Subject: [PATCH 717/807] dm-ebs: don't set the flag DM_TARGET_PASSES_INTEGRITY dm-ebs uses dm-bufio to process requests that are not aligned on logical sector size. dm-bufio doesn't support passing integrity data (and it is unclear how should it do it), so we shouldn't set the DM_TARGET_PASSES_INTEGRITY flag. 
Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Fixes: d3c7b35c20d6 ("dm: add emulated block size target") --- drivers/md/dm-ebs-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index ec5db1478b2f..18ae45dcbfb2 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -442,7 +442,7 @@ static int ebs_iterate_devices(struct dm_target *ti, static struct target_type ebs_target = { .name = "ebs", .version = {1, 0, 1}, - .features = DM_TARGET_PASSES_INTEGRITY, + .features = 0, .module = THIS_MODULE, .ctr = ebs_ctr, .dtr = ebs_dtr, From 80f130bfad1dab93b95683fc39b87235682b8f72 Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Tue, 7 Jan 2025 15:24:58 -0800 Subject: [PATCH 718/807] dm thin: make get_first_thin use rcu-safe list first function The documentation in rculist.h explains the absence of list_empty_rcu() and cautions programmers against relying on a list_empty() -> list_first() sequence in RCU safe code. This is because each of these functions performs its own READ_ONCE() of the list head. This can lead to a situation where the list_empty() sees a valid list entry, but the subsequent list_first() sees a different view of list head state after a modification. In the case of dm-thin, this author had a production box crash from a GP fault in the process_deferred_bios path. This function saw a valid list head in get_first_thin() but when it subsequently dereferenced that and turned it into a thin_c, it got the inside of the struct pool, since the list was now empty and referring to itself. The kernel on which this occurred printed both a warning about a refcount_t being saturated, and a UBSAN error for an out-of-bounds cpuid access in the queued spinlock, prior to the fault itself. When the resulting kdump was examined, it was possible to see another thread patiently waiting in thin_dtr's synchronize_rcu. 
The thin_dtr call managed to pull the thin_c out of the active thins list (and have it be the last entry in the active_thins list) at just the wrong moment which led to this crash. Fortunately, the fix here is straightforward. Switch get_first_thin() function to use list_first_or_null_rcu() which performs just a single READ_ONCE() and returns NULL if the list is already empty. This was run against the devicemapper test suite's thin-provisioning suites for delete and suspend and no regressions were observed. Signed-off-by: Krister Johansen Fixes: b10ebd34ccca ("dm thin: fix rcu_read_lock being held in code that can sleep") Cc: stable@vger.kernel.org Acked-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-thin.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c9f47d0cccf9..872bb59f5470 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2332,10 +2332,9 @@ static struct thin_c *get_first_thin(struct pool *pool) struct thin_c *tc = NULL; rcu_read_lock(); - if (!list_empty(&pool->active_thins)) { - tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list); + tc = list_first_or_null_rcu(&pool->active_thins, struct thin_c, list); + if (tc) thin_get(tc); - } rcu_read_unlock(); return tc; From 194f9f94a5169547d682e9bbcc5ae6d18a564735 Mon Sep 17 00:00:00 2001 From: Rengarajan S Date: Thu, 5 Dec 2024 19:06:25 +0530 Subject: [PATCH 719/807] misc: microchip: pci1xxxx: Resolve kernel panic during GPIO IRQ handling Resolve kernel panic caused by improper handling of IRQs while accessing GPIO values. This is done by replacing generic_handle_irq with handle_nested_irq.
Fixes: 1f4d8ae231f4 ("misc: microchip: pci1xxxx: Add gpio irq handler and irq helper functions irq_ack, irq_mask, irq_unmask and irq_set_type of irq_chip.") Cc: stable Signed-off-by: Rengarajan S Link: https://lore.kernel.org/r/20241205133626.1483499-2-rengarajan.s@microchip.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c index e616e3ec2b42..558290bdb938 100644 --- a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c +++ b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c @@ -277,7 +277,7 @@ static irqreturn_t pci1xxxx_gpio_irq_handler(int irq, void *dev_id) writel(BIT(bit), priv->reg_base + INTR_STATUS_OFFSET(gpiobank)); spin_unlock_irqrestore(&priv->lock, flags); irq = irq_find_mapping(gc->irq.domain, (bit + (gpiobank * 32))); - generic_handle_irq(irq); + handle_nested_irq(irq); } } spin_lock_irqsave(&priv->lock, flags); From c7a5378a0f707686de3ddb489f1653c523bb7dcc Mon Sep 17 00:00:00 2001 From: Rengarajan S Date: Thu, 5 Dec 2024 19:06:26 +0530 Subject: [PATCH 720/807] misc: microchip: pci1xxxx: Resolve return code mismatch during GPIO set config Driver returns -EOPNOTSUPP on unsupported parameters case in set config. Upper level driver checks for -ENOTSUPP. Because of the return code mismatch, the ioctls from userspace fail. Resolve the issue by passing -ENOTSUPP during unsupported case.
Fixes: 7d3e4d807df2 ("misc: microchip: pci1xxxx: load gpio driver for the gpio controller auxiliary device enumerated by the auxiliary bus driver.") Cc: stable Signed-off-by: Rengarajan S Link: https://lore.kernel.org/r/20241205133626.1483499-3-rengarajan.s@microchip.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c index 558290bdb938..3c1359d8d4e6 100644 --- a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c +++ b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c @@ -148,7 +148,7 @@ static int pci1xxxx_gpio_set_config(struct gpio_chip *gpio, unsigned int offset, pci1xxx_assign_bit(priv->reg_base, OPENDRAIN_OFFSET(offset), (offset % 32), true); break; default: - ret = -EOPNOTSUPP; + ret = -ENOTSUPP; break; } spin_unlock_irqrestore(&priv->lock, flags); From c2994b008492db033d40bd767be1620229a3035e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Nov 2024 15:42:09 -0500 Subject: [PATCH 721/807] Bluetooth: hci_sync: Fix not setting Random Address when required This fixes errors such as the following when Own address type is set to Random Address but it has not been programmed yet due to either be advertising or connecting: < HCI Command: LE Set Exte.. (0x08|0x0041) plen 13 Own address type: Random (0x03) Filter policy: Ignore not in accept list (0x01) PHYs: 0x05 Entry 0: LE 1M Type: Passive (0x00) Interval: 60.000 msec (0x0060) Window: 30.000 msec (0x0030) Entry 1: LE Coded Type: Passive (0x00) Interval: 180.000 msec (0x0120) Window: 90.000 msec (0x0090) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Exten.. 
(0x08|0x0042) plen 6 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Enable (0x08|0x0042) ncmd 1 Status: Invalid HCI Command Parameters (0x12) Fixes: c45074d68a9b ("Bluetooth: Fix not generating RPA when required") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index c86f4e42e69c..7b2b04d6b856 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1031,9 +1031,9 @@ static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) { - /* If we're advertising or initiating an LE connection we can't - * go ahead and change the random address at this time. This is - * because the eventual initiator address used for the + /* If a random_addr has been set we're advertising or initiating an LE + * connection we can't go ahead and change the random address at this + * time. This is because the eventual initiator address used for the * subsequently created connection will be undefined (some * controllers use the new address and others the one we had * when the operation started). @@ -1041,8 +1041,9 @@ static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) * In this kind of scenario skip the update and let the random * address be updated at the next cycle. 
*/ - if (hci_dev_test_flag(hdev, HCI_LE_ADV) || - hci_lookup_le_connect(hdev)) { + if (bacmp(&hdev->random_addr, BDADDR_ANY) && + (hci_dev_test_flag(hdev, HCI_LE_ADV) || + hci_lookup_le_connect(hdev))) { bt_dev_dbg(hdev, "Deferring random address update"); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return 0; From a182d9c84f9c52fb5db895ecceeee8b3a1bf661e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Nov 2024 15:42:10 -0500 Subject: [PATCH 722/807] Bluetooth: MGMT: Fix Add Device to responding before completing Add Device with LE type requires updating resolving/accept list which requires quite a number of commands to complete and each of them may fail, so instead of pretending it would always work this checks the return of hci_update_passive_scan_sync which indicates if everything worked as intended. Fixes: e8907f76544f ("Bluetooth: hci_sync: Make use of hci_cmd_sync_queue set 3") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index b31192d473d0..de47ad999d7b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7655,6 +7655,24 @@ static void device_added(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk); } +static void add_device_complete(struct hci_dev *hdev, void *data, int err) +{ + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_add_device *cp = cmd->param; + + if (!err) { + device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type, + cp->action); + device_flags_changed(NULL, hdev, &cp->addr.bdaddr, + cp->addr.type, hdev->conn_flags, + PTR_UINT(cmd->user_data)); + } + + mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE, + mgmt_status(err), &cp->addr, sizeof(cp->addr)); + mgmt_pending_free(cmd); +} + static int add_device_sync(struct hci_dev *hdev, void *data) { return hci_update_passive_scan_sync(hdev); @@ 
-7663,6 +7681,7 @@ static int add_device_sync(struct hci_dev *hdev, void *data) static int add_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { + struct mgmt_pending_cmd *cmd; struct mgmt_cp_add_device *cp = data; u8 auto_conn, addr_type; struct hci_conn_params *params; @@ -7743,9 +7762,24 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, current_flags = params->flags; } - err = hci_cmd_sync_queue(hdev, add_device_sync, NULL, NULL); - if (err < 0) + cmd = mgmt_pending_new(sk, MGMT_OP_ADD_DEVICE, hdev, data, len); + if (!cmd) { + err = -ENOMEM; goto unlock; + } + + cmd->user_data = UINT_PTR(current_flags); + + err = hci_cmd_sync_queue(hdev, add_device_sync, cmd, + add_device_complete); + if (err < 0) { + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_FAILED, &cp->addr, + sizeof(cp->addr)); + mgmt_pending_free(cmd); + } + + goto unlock; added: device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); From 8023dd2204254a70887f5ee58d914bf70a060b9d Mon Sep 17 00:00:00 2001 From: Neeraj Sanjay Kale Date: Fri, 20 Dec 2024 18:32:52 +0530 Subject: [PATCH 723/807] Bluetooth: btnxpuart: Fix driver sending truncated data This fixes the apparent controller hang issue seen during stress test where the host sends a truncated payload, followed by HCI commands. The controller treats these HCI commands as a part of previously truncated payload, leading to command timeouts. Adding a serdev_device_wait_until_sent() call after serdev_device_write_buf() fixed the issue. 
Fixes: 689ca16e5232 ("Bluetooth: NXP: Add protocol support for NXP Bluetooth chipsets") Signed-off-by: Neeraj Sanjay Kale Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btnxpuart.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index 569f5b7d6e46..1230045d78a5 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -1381,6 +1381,7 @@ static void btnxpuart_tx_work(struct work_struct *work) while ((skb = nxp_dequeue(nxpdev))) { len = serdev_device_write_buf(serdev, skb->data, skb->len); + serdev_device_wait_until_sent(serdev, 0); hdev->stat.byte_tx += len; skb_pull(skb, len); From 67dba2c28fe0af7e25ea1aeade677162ed05310a Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Wed, 8 Jan 2025 17:50:28 +0800 Subject: [PATCH 724/807] Bluetooth: btmtk: Fix failed to send func ctrl for MediaTek devices. Use usb_autopm_get_interface() and usb_autopm_put_interface() in btmtk_usb_shutdown(), it could send func ctrl after enabling autosuspend. 
Bluetooth: btmtk_usb_hci_wmt_sync() hci0: Execution of wmt command timed out Bluetooth: btmtk_usb_shutdown() hci0: Failed to send wmt func ctrl (-110) Fixes: 5c5e8c52e3ca ("Bluetooth: btmtk: move btusb_mtk_[setup, shutdown] to btmtk.c") Signed-off-by: Chris Lu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.c | 7 +++++++ net/bluetooth/rfcomm/tty.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index 7fd9d5ddce02..224eafc27dbe 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -1472,10 +1472,15 @@ EXPORT_SYMBOL_GPL(btmtk_usb_setup); int btmtk_usb_shutdown(struct hci_dev *hdev) { + struct btmtk_data *data = hci_get_priv(hdev); struct btmtk_hci_wmt_params wmt_params; u8 param = 0; int err; + err = usb_autopm_get_interface(data->intf); + if (err < 0) + return err; + /* Disable the device */ wmt_params.op = BTMTK_WMT_FUNC_CTRL; wmt_params.flag = 0; @@ -1486,9 +1491,11 @@ int btmtk_usb_shutdown(struct hci_dev *hdev) err = btmtk_usb_hci_wmt_sync(hdev, &wmt_params); if (err < 0) { bt_dev_err(hdev, "Failed to send wmt func ctrl (%d)", err); + usb_autopm_put_interface(data->intf); return err; } + usb_autopm_put_interface(data->intf); return 0; } EXPORT_SYMBOL_GPL(btmtk_usb_shutdown); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index af80d599c337..21a5b5535ebc 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -201,14 +201,14 @@ static ssize_t address_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); - return sprintf(buf, "%pMR\n", &dev->dst); + return sysfs_emit(buf, "%pMR\n", &dev->dst); } static ssize_t channel_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); - return sprintf(buf, "%d\n", dev->channel); + return sysfs_emit(buf, "%d\n", dev->channel); } static 
DEVICE_ATTR_RO(address); From 30dd3b13f9de612ef7328ccffcf1a07d0d40ab51 Mon Sep 17 00:00:00 2001 From: Henry Huang Date: Wed, 8 Jan 2025 16:47:10 +0800 Subject: [PATCH 725/807] sched_ext: keep running prev when prev->scx.slice != 0 When %SCX_OPS_ENQ_LAST is set and prev->scx.slice != 0, @prev will be dispatched into the local DSQ in put_prev_task_scx(). However, pick_task_scx() is executed before put_prev_task_scx(), so it will not pick @prev. Set %SCX_RQ_BAL_KEEP in balance_one() to ensure that pick_task_scx() can pick @prev. Signed-off-by: Henry Huang Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 19d2699cf638..335371cc2cbd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2747,6 +2747,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; + bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; int nr_loops = SCX_DSP_MAX_LOOPS; lockdep_assert_rq_held(rq); @@ -2779,8 +2780,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * See scx_ops_disable_workfn() for the explanation on the * bypassing test. */ - if ((prev->scx.flags & SCX_TASK_QUEUED) && - prev->scx.slice && !scx_rq_bypassing(rq)) { + if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { rq->scx.flags |= SCX_RQ_BAL_KEEP; goto has_tasks; } @@ -2813,6 +2813,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) flush_dispatch_buf(rq); + if (prev_on_rq && prev->scx.slice) { + rq->scx.flags |= SCX_RQ_BAL_KEEP; + goto has_tasks; + } if (rq->scx.local_dsq.nr) goto has_tasks; if (consume_global_dsq(rq)) @@ -2838,8 +2842,7 @@ no_tasks: * Didn't find another task to run. Keep running @prev unless * %SCX_OPS_ENQ_LAST is in effect.
*/ - if ((prev->scx.flags & SCX_TASK_QUEUED) && - (!static_branch_unlikely(&scx_ops_enq_last) || + if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || scx_rq_bypassing(rq))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; goto has_tasks; From 6268d5bc10354fc2ab8d44a0cd3b042d49a0417e Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Thu, 9 Jan 2025 00:08:06 +0900 Subject: [PATCH 726/807] sched_ext: Replace rq_lock() to raw_spin_rq_lock() in scx_ops_bypass() scx_ops_bypass() iterates all CPUs to re-enqueue all the scx tasks. For each CPU, it acquires a lock using rq_lock() regardless of whether a CPU is offline or the CPU is currently running a task in a higher scheduler class (e.g., deadline). The rq_lock() is supposed to be used for online CPUs, and the use of rq_lock() may trigger an unnecessary warning in rq_pin_lock(). Therefore, replace rq_lock() to raw_spin_rq_lock() in scx_ops_bypass(). Without this change, we observe the following warning: ===== START ===== [ 6.615205] rq->balance_callback && rq->balance_callback != &balance_push_callback [ 6.615208] WARNING: CPU: 2 PID: 0 at kernel/sched/sched.h:1730 __schedule+0x1130/0x1c90 ===== END ===== Fixes: 0e7ffff1b811 ("scx: Fix raciness in scx_ops_bypass()") Signed-off-by: Changwoo Min Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 335371cc2cbd..11a0e1a9d86e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4747,10 +4747,9 @@ static void scx_ops_bypass(bool bypass) */ for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; struct task_struct *p, *n; - rq_lock(rq, &rf); + raw_spin_rq_lock(rq); if (bypass) { WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); @@ -4766,7 +4765,7 @@ static void scx_ops_bypass(bool bypass) * sees scx_rq_bypassing() before moving tasks to SCX. 
*/ if (!scx_enabled()) { - rq_unlock(rq, &rf); + raw_spin_rq_unlock(rq); continue; } @@ -4786,10 +4785,11 @@ static void scx_ops_bypass(bool bypass) sched_enq_and_set_task(&ctx); } - rq_unlock(rq, &rf); - /* resched to restore ticks and idle state */ - resched_cpu(cpu); + if (cpu_online(cpu) || cpu == smp_processor_id()) + resched_curr(rq); + + raw_spin_rq_unlock(rq); } atomic_dec(&scx_ops_breather_depth); From 68e449d849fd50bd5e61d8bd32b3458dbd3a3df6 Mon Sep 17 00:00:00 2001 From: Honglei Wang Date: Wed, 8 Jan 2025 10:33:28 +0800 Subject: [PATCH 727/807] sched_ext: switch class when preempted by higher priority scheduler ops.cpu_release() function, if defined, must be invoked when preempted by a higher priority scheduler class task. This scenario was skipped in commit f422316d7466 ("sched_ext: Remove switch_class_scx()"). Let's fix it. Fixes: f422316d7466 ("sched_ext: Remove switch_class_scx()") Signed-off-by: Honglei Wang Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 11a0e1a9d86e..68150e110451 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3037,7 +3037,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, */ if (p->scx.slice && !scx_rq_bypassing(rq)) { dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); - return; + goto switch_class; } /* @@ -3054,6 +3054,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, } } +switch_class: if (next && next->sched_class != &ext_sched_class) switch_class(rq, next); } From d1cacd74776895f6435941f86a1130e58f6dd226 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 6 Jan 2025 10:01:36 -0800 Subject: [PATCH 728/807] netdev: prevent accessing NAPI instances from another namespace The NAPI IDs were not fully exposed to user space prior to the netlink API, so they were never namespaced. 
The netlink API must ensure that at the very least NAPI instance belongs to the same netns as the owner of the genl sock. napi_by_id() can become static now, but it needs to move because of dev_get_by_napi_id(). Cc: stable@vger.kernel.org Fixes: 1287c1ae0fc2 ("netdev-genl: Support setting per-NAPI config values") Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Reviewed-by: Sridhar Samudrala Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250106180137.1861472-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 43 +++++++++++++++++++++++++++++------------- net/core/dev.h | 3 ++- net/core/netdev-genl.c | 6 ++---- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index faa23042df38..a9f62f5aeb84 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -753,6 +753,36 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, } EXPORT_SYMBOL_GPL(dev_fill_forward_path); +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} + +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id) +{ + struct napi_struct *napi; + + napi = napi_by_id(napi_id); + if (!napi) + return NULL; + + if (WARN_ON_ONCE(!napi->dev)) + return NULL; + if (!net_eq(net, dev_net(napi->dev))) + return NULL; + + return napi; +} + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -6293,19 +6323,6 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -/* must be called under rcu_read_lock(), as we dont take a reference */ 
-struct napi_struct *napi_by_id(unsigned int napi_id) -{ - unsigned int hash = napi_id % HASH_SIZE(napi_hash); - struct napi_struct *napi; - - hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) - if (napi->napi_id == napi_id) - return napi; - - return NULL; -} - static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; diff --git a/net/core/dev.h b/net/core/dev.h index d043dee25a68..deb5eae5749f 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -22,6 +22,8 @@ struct sd_flow_limit { extern int netdev_flow_limit_table_len; +struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id); + #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else @@ -269,7 +271,6 @@ void xdp_do_check_flushed(struct napi_struct *napi); static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif -struct napi_struct *napi_by_id(unsigned int napi_id); void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 125b660004d3..a3bdaf075b6b 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -167,8 +167,6 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, void *hdr; pid_t pid; - if (WARN_ON_ONCE(!napi->dev)) - return -EINVAL; if (!(napi->dev->flags & IFF_UP)) return 0; @@ -234,7 +232,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); rcu_read_lock(); - napi = napi_by_id(napi_id); + napi = netdev_napi_by_id(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); } else { @@ -355,7 +353,7 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); rcu_read_lock(); - napi = napi_by_id(napi_id); + napi = netdev_napi_by_id(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_set_config(napi, info); } else { From 80fb40baba19e25a1b6f3ecff6fc5c0171806bde Mon Sep 
17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 7 Jan 2025 11:14:39 +0100 Subject: [PATCH 729/807] tcp: Annotate data-race around sk->sk_mark in tcp_v4_send_reset This is a follow-up to 3c5b4d69c358 ("net: annotate data-races around sk->sk_mark"). sk->sk_mark can be read and written without holding the socket lock. IPv6 equivalent is already covered with READ_ONCE() annotation in tcp_v6_send_response(). Fixes: 3c5b4d69c358 ("net: annotate data-races around sk->sk_mark") Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/f459d1fc44f205e13f6d8bdca2c8bfb9902ffac9.1736244569.git.daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a38c8b1f44db..c26f6c4b7bb4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -896,7 +896,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); From 03f0b548537f758830bdb2dc3f2aba713069cef2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Thu, 28 Nov 2024 09:16:34 +0100 Subject: [PATCH 730/807] riscv: module: remove relocation_head rel_entry member allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit relocation_head's list_head member, rel_entry, doesn't need to be allocated, its storage can just be part of the allocated relocation_head. Remove the pointer which allows to get rid of the allocation as well as an existing memory leak found by Kai Zhang using kmemleak. 
Fixes: 8fd6c5142395 ("riscv: Add remaining module relocations") Reported-by: Kai Zhang Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Reviewed-by: Charlie Jenkins Tested-by: Charlie Jenkins Link: https://lore.kernel.org/r/20241128081636.3620468-1-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/module.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 1cd461f3d872..47d0ebeec93c 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -23,7 +23,7 @@ struct used_bucket { struct relocation_head { struct hlist_node node; - struct list_head *rel_entry; + struct list_head rel_entry; void *location; }; @@ -634,7 +634,7 @@ process_accumulated_relocations(struct module *me, location = rel_head_iter->location; list_for_each_entry_safe(rel_entry_iter, rel_entry_iter_tmp, - rel_head_iter->rel_entry, + &rel_head_iter->rel_entry, head) { curr_type = rel_entry_iter->type; reloc_handlers[curr_type].reloc_handler( @@ -704,16 +704,7 @@ static int add_relocation_to_accumulate(struct module *me, int type, return -ENOMEM; } - rel_head->rel_entry = - kmalloc(sizeof(struct list_head), GFP_KERNEL); - - if (!rel_head->rel_entry) { - kfree(entry); - kfree(rel_head); - return -ENOMEM; - } - - INIT_LIST_HEAD(rel_head->rel_entry); + INIT_LIST_HEAD(&rel_head->rel_entry); rel_head->location = location; INIT_HLIST_NODE(&rel_head->node); if (!current_head->first) { @@ -722,7 +713,6 @@ static int add_relocation_to_accumulate(struct module *me, int type, if (!bucket) { kfree(entry); - kfree(rel_head->rel_entry); kfree(rel_head); return -ENOMEM; } @@ -735,7 +725,7 @@ static int add_relocation_to_accumulate(struct module *me, int type, } /* Add relocation to head of discovered rel_head */ - list_add_tail(&entry->head, rel_head->rel_entry); + list_add_tail(&entry->head, &rel_head->rel_entry); return 0; } From 6a97f4118ac07cfdc316433f385dbdc12af5025e Mon 
Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 18 Nov 2024 10:13:33 +0100 Subject: [PATCH 731/807] riscv: Fix sleeping in invalid context in die() die() can be called in exception handler, and therefore cannot sleep. However, die() takes spinlock_t which can sleep with PREEMPT_RT enabled. That causes the following warning: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 285, name: mutex preempt_count: 110001, expected: 0 RCU nest depth: 0, expected: 0 CPU: 0 UID: 0 PID: 285 Comm: mutex Not tainted 6.12.0-rc7-00022-ge19049cf7d56-dirty #234 Hardware name: riscv-virtio,qemu (DT) Call Trace: dump_backtrace+0x1c/0x24 show_stack+0x2c/0x38 dump_stack_lvl+0x5a/0x72 dump_stack+0x14/0x1c __might_resched+0x130/0x13a rt_spin_lock+0x2a/0x5c die+0x24/0x112 do_trap_insn_illegal+0xa0/0xea _new_vmalloc_restore_context_a0+0xcc/0xd8 Oops - illegal instruction [#1] Switch to use raw_spinlock_t, which does not sleep even with PREEMPT_RT enabled. 
Fixes: 76d2a0493a17 ("RISC-V: Init and Halt Code") Signed-off-by: Nam Cao Cc: stable@vger.kernel.org Reviewed-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20241118091333.1185288-1-namcao@linutronix.de Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 51ebfd23e007..8ff8e8b36524 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -35,7 +35,7 @@ int show_unhandled_signals = 1; -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); static int copy_code(struct pt_regs *regs, u16 *val, const u16 *insns) { @@ -81,7 +81,7 @@ void die(struct pt_regs *regs, const char *str) oops_enter(); - spin_lock_irqsave(&die_lock, flags); + raw_spin_lock_irqsave(&die_lock, flags); console_verbose(); bust_spinlocks(1); @@ -100,7 +100,7 @@ void die(struct pt_regs *regs, const char *str) bust_spinlocks(0); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); - spin_unlock_irqrestore(&die_lock, flags); + raw_spin_unlock_irqrestore(&die_lock, flags); oops_exit(); if (in_interrupt()) From 5a4b584c67699a69981f0740618a144965a63237 Mon Sep 17 00:00:00 2001 From: Hao Lan Date: Mon, 6 Jan 2025 22:36:36 +0800 Subject: [PATCH 732/807] net: hns3: fixed reset failure issues caused by the incorrect reset type When a reset type that is not supported by the driver is input, a reset pending flag bit of the HNAE3_NONE_RESET type is generated in reset_pending. The driver does not have a mechanism to clear this type of error. As a result, the driver considers that the reset is not complete. This patch provides a mechanism to clear the HNAE3_NONE_RESET flag and the parameter of hnae3_ae_ops.set_default_reset_request is verified. The error message: hns3 0000:39:01.0: cmd failed -16 hns3 0000:39:01.0: hclge device re-init failed, VF is disabled! 
hns3 0000:39:01.0: failed to reset VF stack hns3 0000:39:01.0: failed to reset VF(4) hns3 0000:39:01.0: prepare reset(2) wait done hns3 0000:39:01.0 eth4: already uninitialized Use the crash tool to view struct hclgevf_dev: struct hclgevf_dev { ... default_reset_request = 0x20, reset_level = HNAE3_NONE_RESET, reset_pending = 0x100, reset_type = HNAE3_NONE_RESET, ... }; Fixes: 720bd5837e37 ("net: hns3: add set_default_reset_request in the hnae3_ae_ops") Signed-off-by: Hao Lan Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20250106143642.539698-2-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../hisilicon/hns3/hns3pf/hclge_main.c | 33 ++++++++++++++-- .../hisilicon/hns3/hns3vf/hclgevf_main.c | 38 ++++++++++++++++--- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 05942fa78b11..7d44dc777dc5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3574,6 +3574,17 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf, return ret; } +static void hclge_set_reset_pending(struct hclge_dev *hdev, + enum hnae3_reset_type reset_type) +{ + /* When an incorrect reset type is executed, the get_reset_level + * function generates the HNAE3_NONE_RESET flag. As a result, this + * type do not need to pending. 
+ */ + if (reset_type != HNAE3_NONE_RESET) + set_bit(reset_type, &hdev->reset_pending); +} + static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) { u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg; @@ -3594,7 +3605,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) */ if (BIT(HCLGE_VECTOR0_IMPRESET_INT_B) & msix_src_reg) { dev_info(&hdev->pdev->dev, "IMP reset interrupt\n"); - set_bit(HNAE3_IMP_RESET, &hdev->reset_pending); + hclge_set_reset_pending(hdev, HNAE3_IMP_RESET); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); *clearval = BIT(HCLGE_VECTOR0_IMPRESET_INT_B); hdev->rst_stats.imp_rst_cnt++; @@ -3604,7 +3615,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) if (BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B) & msix_src_reg) { dev_info(&hdev->pdev->dev, "global reset interrupt\n"); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); - set_bit(HNAE3_GLOBAL_RESET, &hdev->reset_pending); + hclge_set_reset_pending(hdev, HNAE3_GLOBAL_RESET); *clearval = BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B); hdev->rst_stats.global_rst_cnt++; return HCLGE_VECTOR0_EVENT_RST; @@ -4052,7 +4063,7 @@ static void hclge_do_reset(struct hclge_dev *hdev) case HNAE3_FUNC_RESET: dev_info(&pdev->dev, "PF reset requested\n"); /* schedule again to check later */ - set_bit(HNAE3_FUNC_RESET, &hdev->reset_pending); + hclge_set_reset_pending(hdev, HNAE3_FUNC_RESET); hclge_reset_task_schedule(hdev); break; default: @@ -4086,6 +4097,8 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hnae3_ae_dev *ae_dev, clear_bit(HNAE3_FLR_RESET, addr); } + clear_bit(HNAE3_NONE_RESET, addr); + if (hdev->reset_type != HNAE3_NONE_RESET && rst_level < hdev->reset_type) return HNAE3_NONE_RESET; @@ -4227,7 +4240,7 @@ static bool hclge_reset_err_handle(struct hclge_dev *hdev) return false; } else if (hdev->rst_stats.reset_fail_cnt < MAX_RESET_FAIL_CNT) { hdev->rst_stats.reset_fail_cnt++; - set_bit(hdev->reset_type, 
&hdev->reset_pending); + hclge_set_reset_pending(hdev, hdev->reset_type); dev_info(&hdev->pdev->dev, "re-schedule reset task(%u)\n", hdev->rst_stats.reset_fail_cnt); @@ -4470,8 +4483,20 @@ static void hclge_reset_event(struct pci_dev *pdev, struct hnae3_handle *handle) static void hclge_set_def_reset_request(struct hnae3_ae_dev *ae_dev, enum hnae3_reset_type rst_type) { +#define HCLGE_SUPPORT_RESET_TYPE \ + (BIT(HNAE3_FLR_RESET) | BIT(HNAE3_FUNC_RESET) | \ + BIT(HNAE3_GLOBAL_RESET) | BIT(HNAE3_IMP_RESET)) + struct hclge_dev *hdev = ae_dev->priv; + if (!(BIT(rst_type) & HCLGE_SUPPORT_RESET_TYPE)) { + /* To prevent reset triggered by hclge_reset_event */ + set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request); + dev_warn(&hdev->pdev->dev, "unsupported reset type %d\n", + rst_type); + return; + } + set_bit(rst_type, &hdev->default_reset_request); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 2f6ffb88e700..fd0abe37fdd7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1393,6 +1393,17 @@ static int hclgevf_notify_roce_client(struct hclgevf_dev *hdev, return ret; } +static void hclgevf_set_reset_pending(struct hclgevf_dev *hdev, + enum hnae3_reset_type reset_type) +{ + /* When an incorrect reset type is executed, the get_reset_level + * function generates the HNAE3_NONE_RESET flag. As a result, this + * type do not need to pending. 
+ */ + if (reset_type != HNAE3_NONE_RESET) + set_bit(reset_type, &hdev->reset_pending); +} + static int hclgevf_reset_wait(struct hclgevf_dev *hdev) { #define HCLGEVF_RESET_WAIT_US 20000 @@ -1542,7 +1553,7 @@ static void hclgevf_reset_err_handle(struct hclgevf_dev *hdev) hdev->rst_stats.rst_fail_cnt); if (hdev->rst_stats.rst_fail_cnt < HCLGEVF_RESET_MAX_FAIL_CNT) - set_bit(hdev->reset_type, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, hdev->reset_type); if (hclgevf_is_reset_pending(hdev)) { set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); @@ -1662,6 +1673,8 @@ static enum hnae3_reset_type hclgevf_get_reset_level(unsigned long *addr) clear_bit(HNAE3_FLR_RESET, addr); } + clear_bit(HNAE3_NONE_RESET, addr); + return rst_level; } @@ -1671,14 +1684,15 @@ static void hclgevf_reset_event(struct pci_dev *pdev, struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); struct hclgevf_dev *hdev = ae_dev->priv; - dev_info(&hdev->pdev->dev, "received reset request from VF enet\n"); - if (hdev->default_reset_request) hdev->reset_level = hclgevf_get_reset_level(&hdev->default_reset_request); else hdev->reset_level = HNAE3_VF_FUNC_RESET; + dev_info(&hdev->pdev->dev, "received reset request from VF enet, reset level is %d\n", + hdev->reset_level); + /* reset of this VF requested */ set_bit(HCLGEVF_RESET_REQUESTED, &hdev->reset_state); hclgevf_reset_task_schedule(hdev); @@ -1689,8 +1703,20 @@ static void hclgevf_reset_event(struct pci_dev *pdev, static void hclgevf_set_def_reset_request(struct hnae3_ae_dev *ae_dev, enum hnae3_reset_type rst_type) { +#define HCLGEVF_SUPPORT_RESET_TYPE \ + (BIT(HNAE3_VF_RESET) | BIT(HNAE3_VF_FUNC_RESET) | \ + BIT(HNAE3_VF_PF_FUNC_RESET) | BIT(HNAE3_VF_FULL_RESET) | \ + BIT(HNAE3_FLR_RESET) | BIT(HNAE3_VF_EXP_RESET)) + struct hclgevf_dev *hdev = ae_dev->priv; + if (!(BIT(rst_type) & HCLGEVF_SUPPORT_RESET_TYPE)) { + /* To prevent reset triggered by hclge_reset_event */ + set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request); + 
dev_info(&hdev->pdev->dev, "unsupported reset type %d\n", + rst_type); + return; + } set_bit(rst_type, &hdev->default_reset_request); } @@ -1847,14 +1873,14 @@ static void hclgevf_reset_service_task(struct hclgevf_dev *hdev) */ if (hdev->reset_attempts > HCLGEVF_MAX_RESET_ATTEMPTS_CNT) { /* prepare for full reset of stack + pcie interface */ - set_bit(HNAE3_VF_FULL_RESET, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, HNAE3_VF_FULL_RESET); /* "defer" schedule the reset task again */ set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); } else { hdev->reset_attempts++; - set_bit(hdev->reset_level, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, hdev->reset_level); set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); } hclgevf_reset_task_schedule(hdev); @@ -1977,7 +2003,7 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev, rst_ing_reg = hclgevf_read_dev(&hdev->hw, HCLGEVF_RST_ING); dev_info(&hdev->pdev->dev, "receive reset interrupt 0x%x!\n", rst_ing_reg); - set_bit(HNAE3_VF_RESET, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, HNAE3_VF_RESET); set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); *clearval = ~(1U << HCLGEVF_VECTOR0_RST_INT_B); From ac1e2836fe294c2007ca81cf7006862c3bdf0510 Mon Sep 17 00:00:00 2001 From: Hao Lan Date: Mon, 6 Jan 2025 22:36:37 +0800 Subject: [PATCH 733/807] net: hns3: fix missing features due to dev->features configuration too early Currently, the netdev->features is configured in hns3_nic_set_features. As a result, __netdev_update_features considers that there is no feature difference, and the procedures of the real features are missing. 
Fixes: 2a7556bb2b73 ("net: hns3: implement ndo_features_check ops for hns3 driver") Signed-off-by: Hao Lan Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250106143642.539698-3-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 43377a7b2426..a7e3b22f641c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2452,7 +2452,6 @@ static int hns3_nic_set_features(struct net_device *netdev, return ret; } - netdev->features = features; return 0; } From 5191a8d3c2ab5bc01930ea3425e06a739af5b0e9 Mon Sep 17 00:00:00 2001 From: Hao Lan Date: Mon, 6 Jan 2025 22:36:38 +0800 Subject: [PATCH 734/807] net: hns3: Resolved the issue that the debugfs query result is inconsistent. This patch modifies the implementation of debugfs: When the user process stops unexpectedly, not all data of the file system is read. In this case, the save_buf pointer is not released. When the user process is called next time, save_buf is used to copy the cached data to the user space. As a result, the queried data is stale. To solve this problem, this patch implements .open() and .release() handler for debugfs file_operations. moving allocation buffer and execution of the cmd to the .open() handler and freeing in to the .release() handler. Allocate separate buffer for each reader and associate the buffer with the file pointer. When different user read processes no longer share the buffer, the stale data problem is fixed. 
Fixes: 5e69ea7ee2a6 ("net: hns3: refactor the debugfs process") Signed-off-by: Hao Lan Signed-off-by: Guangwei Zhang Signed-off-by: Jijie Shao Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250106143642.539698-4-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 3 - .../ethernet/hisilicon/hns3/hns3_debugfs.c | 96 ++++++------------- 2 files changed, 31 insertions(+), 68 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 710a8f9f2248..12ba380eb701 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -916,9 +916,6 @@ struct hnae3_handle { u8 netdev_flags; struct dentry *hnae3_dbgfs; - /* protects concurrent contention between debugfs commands */ - struct mutex dbgfs_lock; - char **dbgfs_buf; /* Network interface message level enabled bits */ u32 msg_enable; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 807eb3bbb11c..9bbece25552b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -1260,69 +1260,55 @@ static int hns3_dbg_read_cmd(struct hns3_dbg_data *dbg_data, static ssize_t hns3_dbg_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { - struct hns3_dbg_data *dbg_data = filp->private_data; + char *buf = filp->private_data; + + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + +static int hns3_dbg_open(struct inode *inode, struct file *filp) +{ + struct hns3_dbg_data *dbg_data = inode->i_private; struct hnae3_handle *handle = dbg_data->handle; struct hns3_nic_priv *priv = handle->priv; - ssize_t size = 0; - char **save_buf; - char *read_buf; u32 index; + char *buf; int ret; + if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) || + test_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) + 
return -EBUSY; + ret = hns3_dbg_get_cmd_index(dbg_data, &index); if (ret) return ret; - mutex_lock(&handle->dbgfs_lock); - save_buf = &handle->dbgfs_buf[index]; + buf = kvzalloc(hns3_dbg_cmd[index].buf_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; - if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) || - test_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) { - ret = -EBUSY; - goto out; + ret = hns3_dbg_read_cmd(dbg_data, hns3_dbg_cmd[index].cmd, + buf, hns3_dbg_cmd[index].buf_len); + if (ret) { + kvfree(buf); + return ret; } - if (*save_buf) { - read_buf = *save_buf; - } else { - read_buf = kvzalloc(hns3_dbg_cmd[index].buf_len, GFP_KERNEL); - if (!read_buf) { - ret = -ENOMEM; - goto out; - } + filp->private_data = buf; + return 0; +} - /* save the buffer addr until the last read operation */ - *save_buf = read_buf; - - /* get data ready for the first time to read */ - ret = hns3_dbg_read_cmd(dbg_data, hns3_dbg_cmd[index].cmd, - read_buf, hns3_dbg_cmd[index].buf_len); - if (ret) - goto out; - } - - size = simple_read_from_buffer(buffer, count, ppos, read_buf, - strlen(read_buf)); - if (size > 0) { - mutex_unlock(&handle->dbgfs_lock); - return size; - } - -out: - /* free the buffer for the last read operation */ - if (*save_buf) { - kvfree(*save_buf); - *save_buf = NULL; - } - - mutex_unlock(&handle->dbgfs_lock); - return ret; +static int hns3_dbg_release(struct inode *inode, struct file *filp) +{ + kvfree(filp->private_data); + filp->private_data = NULL; + return 0; } static const struct file_operations hns3_dbg_fops = { .owner = THIS_MODULE, - .open = simple_open, + .open = hns3_dbg_open, .read = hns3_dbg_read, + .release = hns3_dbg_release, }; static int hns3_dbg_bd_file_init(struct hnae3_handle *handle, u32 cmd) @@ -1379,13 +1365,6 @@ int hns3_dbg_init(struct hnae3_handle *handle) int ret; u32 i; - handle->dbgfs_buf = devm_kcalloc(&handle->pdev->dev, - ARRAY_SIZE(hns3_dbg_cmd), - sizeof(*handle->dbgfs_buf), - GFP_KERNEL); - if (!handle->dbgfs_buf) - return 
-ENOMEM; - hns3_dbg_dentry[HNS3_DBG_DENTRY_COMMON].dentry = debugfs_create_dir(name, hns3_dbgfs_root); handle->hnae3_dbgfs = hns3_dbg_dentry[HNS3_DBG_DENTRY_COMMON].dentry; @@ -1395,8 +1374,6 @@ int hns3_dbg_init(struct hnae3_handle *handle) debugfs_create_dir(hns3_dbg_dentry[i].name, handle->hnae3_dbgfs); - mutex_init(&handle->dbgfs_lock); - for (i = 0; i < ARRAY_SIZE(hns3_dbg_cmd); i++) { if ((hns3_dbg_cmd[i].cmd == HNAE3_DBG_CMD_TM_NODES && ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2) || @@ -1425,24 +1402,13 @@ int hns3_dbg_init(struct hnae3_handle *handle) out: debugfs_remove_recursive(handle->hnae3_dbgfs); handle->hnae3_dbgfs = NULL; - mutex_destroy(&handle->dbgfs_lock); return ret; } void hns3_dbg_uninit(struct hnae3_handle *handle) { - u32 i; - debugfs_remove_recursive(handle->hnae3_dbgfs); handle->hnae3_dbgfs = NULL; - - for (i = 0; i < ARRAY_SIZE(hns3_dbg_cmd); i++) - if (handle->dbgfs_buf[i]) { - kvfree(handle->dbgfs_buf[i]); - handle->dbgfs_buf[i] = NULL; - } - - mutex_destroy(&handle->dbgfs_lock); } void hns3_dbg_register_debugfs(const char *debugfs_dir_name) From 98b1e3b27734139c76295754b6c317aa4df6d32e Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 6 Jan 2025 22:36:39 +0800 Subject: [PATCH 735/807] net: hns3: don't auto enable misc vector Currently, there is a time window between misc irq enabled and service task inited. 
If an interrupt is reported at this time, it will cause warning like below: [ 16.324639] Call trace: [ 16.324641] __queue_delayed_work+0xb8/0xe0 [ 16.324643] mod_delayed_work_on+0x78/0xd0 [ 16.324655] hclge_errhand_task_schedule+0x58/0x90 [hclge] [ 16.324662] hclge_misc_irq_handle+0x168/0x240 [hclge] [ 16.324666] __handle_irq_event_percpu+0x64/0x1e0 [ 16.324667] handle_irq_event+0x80/0x170 [ 16.324670] handle_fasteoi_edge_irq+0x110/0x2bc [ 16.324671] __handle_domain_irq+0x84/0xfc [ 16.324673] gic_handle_irq+0x88/0x2c0 [ 16.324674] el1_irq+0xb8/0x140 [ 16.324677] arch_cpu_idle+0x18/0x40 [ 16.324679] default_idle_call+0x5c/0x1bc [ 16.324682] cpuidle_idle_call+0x18c/0x1c4 [ 16.324684] do_idle+0x174/0x17c [ 16.324685] cpu_startup_entry+0x30/0x6c [ 16.324687] secondary_start_kernel+0x1a4/0x280 [ 16.324688] ---[ end trace 6aa0bff672a964aa ]--- So don't auto enable misc vector when request irq. Fixes: 7be1b9f3e99f ("net: hns3: make hclge_service use delayed workqueue") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20250106143642.539698-5-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 7d44dc777dc5..db7845009252 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -3770,7 +3771,7 @@ static int hclge_misc_irq_init(struct hclge_dev *hdev) snprintf(hdev->misc_vector.name, HNAE3_INT_NAME_LEN, "%s-misc-%s", HCLGE_NAME, pci_name(hdev->pdev)); ret = request_irq(hdev->misc_vector.vector_irq, hclge_misc_irq_handle, - 0, hdev->misc_vector.name, hdev); + IRQF_NO_AUTOEN, hdev->misc_vector.name, hdev); if (ret) { hclge_free_vector(hdev,
0); dev_err(&hdev->pdev->dev, "request misc irq(%d) fail\n", @@ -11906,9 +11907,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_init_rxd_adv_layout(hdev); - /* Enable MISC vector(vector0) */ - hclge_enable_vector(&hdev->misc_vector, true); - ret = hclge_init_wol(hdev); if (ret) dev_warn(&pdev->dev, @@ -11921,6 +11919,10 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_state_init(hdev); hdev->last_reset_time = jiffies; + /* Enable MISC vector(vector0) */ + enable_irq(hdev->misc_vector.vector_irq); + hclge_enable_vector(&hdev->misc_vector, true); + dev_info(&hdev->pdev->dev, "%s driver initialization finished.\n", HCLGE_DRIVER_NAME); @@ -12326,7 +12328,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) /* Disable MISC vector(vector0) */ hclge_enable_vector(&hdev->misc_vector, false); - synchronize_irq(hdev->misc_vector.vector_irq); + disable_irq(hdev->misc_vector.vector_irq); /* Disable all hw interrupts */ hclge_config_mac_tnl_int(hdev, false); From 247fd1e33e1cd156aabe444e932d2648d33f1245 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 6 Jan 2025 22:36:40 +0800 Subject: [PATCH 736/807] net: hns3: initialize reset_timer before hclgevf_misc_irq_init() Currently the misc irq is initialized before reset_timer setup. But it will access the reset_timer in the irq handler. So initialize the reset_timer earlier. 
Fixes: ff200099d271 ("net: hns3: remove unnecessary work in hclgevf_main") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20250106143642.539698-6-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index fd0abe37fdd7..163c6e59ea4c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2313,6 +2313,8 @@ static void hclgevf_state_init(struct hclgevf_dev *hdev) clear_bit(HCLGEVF_STATE_RST_FAIL, &hdev->state); INIT_DELAYED_WORK(&hdev->service_task, hclgevf_service_task); + /* timer needs to be initialized before misc irq */ + timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0); mutex_init(&hdev->mbx_resp.mbx_mutex); sema_init(&hdev->reset_sem, 1); @@ -3012,7 +3014,6 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) HCLGEVF_DRIVER_NAME); hclgevf_task_schedule(hdev, round_jiffies_relative(HZ)); - timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0); return 0; From 7997ddd46c54408bcba5e37fe18b4d832e45d4d4 Mon Sep 17 00:00:00 2001 From: Hao Lan Date: Mon, 6 Jan 2025 22:36:41 +0800 Subject: [PATCH 737/807] net: hns3: fixed hclge_fetch_pf_reg accesses bar space out of bounds issue The TQP BAR space is divided into two segments. TQPs 0-1023 and TQPs 1024-1279 are in different BAR space addresses. However, hclge_fetch_pf_reg does not distinguish the tqp space information when reading the tqp space information. When the number of TQPs is greater than 1024, access bar space overwriting occurs. The problem of different segments has been considered during the initialization of tqp.io_base. Therefore, tqp.io_base is directly used when the queue is read in hclge_fetch_pf_reg. 
The error message: Unable to handle kernel paging request at virtual address ffff800037200000 pc : hclge_fetch_pf_reg+0x138/0x250 [hclge] lr : hclge_get_regs+0x84/0x1d0 [hclge] Call trace: hclge_fetch_pf_reg+0x138/0x250 [hclge] hclge_get_regs+0x84/0x1d0 [hclge] hns3_get_regs+0x2c/0x50 [hns3] ethtool_get_regs+0xf4/0x270 dev_ethtool+0x674/0x8a0 dev_ioctl+0x270/0x36c sock_do_ioctl+0x110/0x2a0 sock_ioctl+0x2ac/0x530 __arm64_sys_ioctl+0xa8/0x100 invoke_syscall+0x4c/0x124 el0_svc_common.constprop.0+0x140/0x15c do_el0_svc+0x30/0xd0 el0_svc+0x1c/0x2c el0_sync_handler+0xb0/0xb4 el0_sync+0x168/0x180 Fixes: 939ccd107ffc ("net: hns3: move dump regs function to a separate file") Signed-off-by: Hao Lan Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20250106143642.539698-7-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c | 9 +++++---- .../net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c index 43c1c18fa81f..8c057192aae6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c @@ -510,9 +510,9 @@ out: static int hclge_fetch_pf_reg(struct hclge_dev *hdev, void *data, struct hnae3_knic_private_info *kinfo) { -#define HCLGE_RING_REG_OFFSET 0x200 #define HCLGE_RING_INT_REG_OFFSET 0x4 + struct hnae3_queue *tqp; int i, j, reg_num; int data_num_sum; u32 *reg = data; @@ -533,10 +533,11 @@ static int hclge_fetch_pf_reg(struct hclge_dev *hdev, void *data, reg_num = ARRAY_SIZE(ring_reg_addr_list); for (j = 0; j < kinfo->num_tqps; j++) { reg += hclge_reg_get_tlv(HCLGE_REG_TAG_RING, reg_num, reg); + tqp = kinfo->tqp[j]; for (i = 0; i < reg_num; i++) - *reg++ = hclge_read_dev(&hdev->hw, - ring_reg_addr_list[i] + - HCLGE_RING_REG_OFFSET * j); + *reg++ = 
readl_relaxed(tqp->io_base - + HCLGE_TQP_REG_OFFSET + + ring_reg_addr_list[i]); } data_num_sum += (reg_num + HCLGE_REG_TLV_SPACE) * kinfo->num_tqps; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c index 6db415d8b917..7d9d9dbc7560 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c @@ -123,10 +123,10 @@ int hclgevf_get_regs_len(struct hnae3_handle *handle) void hclgevf_get_regs(struct hnae3_handle *handle, u32 *version, void *data) { -#define HCLGEVF_RING_REG_OFFSET 0x200 #define HCLGEVF_RING_INT_REG_OFFSET 0x4 struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); + struct hnae3_queue *tqp; int i, j, reg_um; u32 *reg = data; @@ -147,10 +147,11 @@ void hclgevf_get_regs(struct hnae3_handle *handle, u32 *version, reg_um = ARRAY_SIZE(ring_reg_addr_list); for (j = 0; j < hdev->num_tqps; j++) { reg += hclgevf_reg_get_tlv(HCLGEVF_REG_TAG_RING, reg_um, reg); + tqp = &hdev->htqp[j].q; for (i = 0; i < reg_um; i++) - *reg++ = hclgevf_read_dev(&hdev->hw, - ring_reg_addr_list[i] + - HCLGEVF_RING_REG_OFFSET * j); + *reg++ = readl_relaxed(tqp->io_base - + HCLGEVF_TQP_REG_OFFSET + + ring_reg_addr_list[i]); } reg_um = ARRAY_SIZE(tqp_intr_reg_addr_list); From 9741e72b2286de8b38de9db685588ac421a95c87 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Mon, 6 Jan 2025 22:36:42 +0800 Subject: [PATCH 738/807] net: hns3: fix kernel crash when 1588 is sent on HIP08 devices Currently, HIP08 devices does not register the ptp devices, so the hdev->ptp is NULL. But the tx process would still try to set hardware time stamp info with SKBTX_HW_TSTAMP flag and cause a kernel crash. [ 128.087798] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000018 ... 
[ 128.280251] pc : hclge_ptp_set_tx_info+0x2c/0x140 [hclge] [ 128.286600] lr : hclge_ptp_set_tx_info+0x20/0x140 [hclge] [ 128.292938] sp : ffff800059b93140 [ 128.297200] x29: ffff800059b93140 x28: 0000000000003280 [ 128.303455] x27: ffff800020d48280 x26: ffff0cb9dc814080 [ 128.309715] x25: ffff0cb9cde93fa0 x24: 0000000000000001 [ 128.315969] x23: 0000000000000000 x22: 0000000000000194 [ 128.322219] x21: ffff0cd94f986000 x20: 0000000000000000 [ 128.328462] x19: ffff0cb9d2a166c0 x18: 0000000000000000 [ 128.334698] x17: 0000000000000000 x16: ffffcf1fc523ed24 [ 128.340934] x15: 0000ffffd530a518 x14: 0000000000000000 [ 128.347162] x13: ffff0cd6bdb31310 x12: 0000000000000368 [ 128.353388] x11: ffff0cb9cfbc7070 x10: ffff2cf55dd11e02 [ 128.359606] x9 : ffffcf1f85a212b4 x8 : ffff0cd7cf27dab0 [ 128.365831] x7 : 0000000000000a20 x6 : ffff0cd7cf27d000 [ 128.372040] x5 : 0000000000000000 x4 : 000000000000ffff [ 128.378243] x3 : 0000000000000400 x2 : ffffcf1f85a21294 [ 128.384437] x1 : ffff0cb9db520080 x0 : ffff0cb9db500080 [ 128.390626] Call trace: [ 128.393964] hclge_ptp_set_tx_info+0x2c/0x140 [hclge] [ 128.399893] hns3_nic_net_xmit+0x39c/0x4c4 [hns3] [ 128.405468] xmit_one.constprop.0+0xc4/0x200 [ 128.410600] dev_hard_start_xmit+0x54/0xf0 [ 128.415556] sch_direct_xmit+0xe8/0x634 [ 128.420246] __dev_queue_xmit+0x224/0xc70 [ 128.425101] dev_queue_xmit+0x1c/0x40 [ 128.429608] ovs_vport_send+0xac/0x1a0 [openvswitch] [ 128.435409] do_output+0x60/0x17c [openvswitch] [ 128.440770] do_execute_actions+0x898/0x8c4 [openvswitch] [ 128.446993] ovs_execute_actions+0x64/0xf0 [openvswitch] [ 128.453129] ovs_dp_process_packet+0xa0/0x224 [openvswitch] [ 128.459530] ovs_vport_receive+0x7c/0xfc [openvswitch] [ 128.465497] internal_dev_xmit+0x34/0xb0 [openvswitch] [ 128.471460] xmit_one.constprop.0+0xc4/0x200 [ 128.476561] dev_hard_start_xmit+0x54/0xf0 [ 128.481489] __dev_queue_xmit+0x968/0xc70 [ 128.486330] dev_queue_xmit+0x1c/0x40 [ 128.490856] ip_finish_output2+0x250/0x570 [ 128.495810] 
__ip_finish_output+0x170/0x1e0 [ 128.500832] ip_finish_output+0x3c/0xf0 [ 128.505504] ip_output+0xbc/0x160 [ 128.509654] ip_send_skb+0x58/0xd4 [ 128.513892] udp_send_skb+0x12c/0x354 [ 128.518387] udp_sendmsg+0x7a8/0x9c0 [ 128.522793] inet_sendmsg+0x4c/0x8c [ 128.527116] __sock_sendmsg+0x48/0x80 [ 128.531609] __sys_sendto+0x124/0x164 [ 128.536099] __arm64_sys_sendto+0x30/0x5c [ 128.540935] invoke_syscall+0x50/0x130 [ 128.545508] el0_svc_common.constprop.0+0x10c/0x124 [ 128.551205] do_el0_svc+0x34/0xdc [ 128.555347] el0_svc+0x20/0x30 [ 128.559227] el0_sync_handler+0xb8/0xc0 [ 128.563883] el0_sync+0x160/0x180 Fixes: 0bf5eb788512 ("net: hns3: add support for PTP") Signed-off-by: Jie Wang Signed-off-by: Jijie Shao Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250106143642.539698-8-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c index 5505caea88e9..bab16c2191b2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c @@ -58,6 +58,9 @@ bool hclge_ptp_set_tx_info(struct hnae3_handle *handle, struct sk_buff *skb) struct hclge_dev *hdev = vport->back; struct hclge_ptp *ptp = hdev->ptp; + if (!ptp) + return false; + if (!test_bit(HCLGE_PTP_FLAG_TX_EN, &ptp->flags) || test_and_set_bit(HCLGE_STATE_PTP_TX_HANDLING, &hdev->state)) { ptp->tx_skipped++; From 13134cc949148e1dfa540a0fe5dc73569bc62155 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Tue, 19 Nov 2024 12:10:56 +0100 Subject: [PATCH 739/807] riscv: kprobes: Fix incorrect address calculation p->ainsn.api.insn is a pointer to u32, therefore arithmetic operations are multiplied by four. This is clearly undesirable for this case. Cast it to (void *) first before any calculation. Below is a sample before/after. 
The dumped memory is two kprobe slots, the first slot has - c.addiw a0, 0x1c (0x7125) - ebreak (0x00100073) and the second slot has: - c.addiw a0, -4 (0x7135) - ebreak (0x00100073) Before this patch: (gdb) x/16xh 0xff20000000135000 0xff20000000135000: 0x7125 0x0000 0x0000 0x0000 0x7135 0x0010 0x0000 0x0000 0xff20000000135010: 0x0073 0x0010 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 After this patch: (gdb) x/16xh 0xff20000000125000 0xff20000000125000: 0x7125 0x0073 0x0010 0x0000 0x7135 0x0073 0x0010 0x0000 0xff20000000125010: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 Fixes: b1756750a397 ("riscv: kprobes: Use patch_text_nosync() for insn slots") Signed-off-by: Nam Cao Cc: stable@vger.kernel.org Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20241119111056.2554419-1-namcao@linutronix.de Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/probes/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 380a0e8cecc0..c0738d6c6498 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -30,7 +30,7 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p) p->ainsn.api.restore = (unsigned long)p->addr + len; patch_text_nosync(p->ainsn.api.insn, &p->opcode, len); - patch_text_nosync(p->ainsn.api.insn + len, &insn, GET_INSN_LENGTH(insn)); + patch_text_nosync((void *)p->ainsn.api.insn + len, &insn, GET_INSN_LENGTH(insn)); } static void __kprobes arch_prepare_simulate(struct kprobe *p) From 7e25044b804581b9c029d5a28d8800aebde18043 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Sat, 16 Nov 2024 00:32:39 +0100 Subject: [PATCH 740/807] cpuidle: riscv-sbi: fix device node release in early exit of for_each_possible_cpu The 'np' device_node is initialized via of_cpu_device_node_get(), which requires explicit calls to of_node_put() when it is no longer required to avoid leaking the resource. 
Instead of adding the missing calls to of_node_put() in all execution paths, use the cleanup attribute for 'np' by means of the __free() macro, which automatically calls of_node_put() when the variable goes out of scope. Given that 'np' is only used within the for_each_possible_cpu(), reduce its scope to release the node after every iteration of the loop. Fixes: 6abf32f1d9c5 ("cpuidle: Add RISC-V SBI CPU idle driver") Reviewed-by: Andrew Jones Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20241116-cpuidle-riscv-sbi-cleanup-v3-1-a3a46372ce08@gmail.com Signed-off-by: Palmer Dabbelt --- drivers/cpuidle/cpuidle-riscv-sbi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c index 14462c092039..0c92a628bbd4 100644 --- a/drivers/cpuidle/cpuidle-riscv-sbi.c +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -504,12 +504,12 @@ static int sbi_cpuidle_probe(struct platform_device *pdev) int cpu, ret; struct cpuidle_driver *drv; struct cpuidle_device *dev; - struct device_node *np, *pds_node; + struct device_node *pds_node; /* Detect OSI support based on CPU DT nodes */ sbi_cpuidle_use_osi = true; for_each_possible_cpu(cpu) { - np = of_cpu_device_node_get(cpu); + struct device_node *np __free(device_node) = of_cpu_device_node_get(cpu); if (np && of_property_present(np, "power-domains") && of_property_present(np, "power-domain-names")) { From f754f27e98f88428aaf6be6e00f5cbce97f62d4b Mon Sep 17 00:00:00 2001 From: Xu Lu Date: Mon, 9 Dec 2024 20:26:17 +0800 Subject: [PATCH 741/807] riscv: mm: Fix the out of bound issue of vmemmap address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In sparse vmemmap model, the virtual address of vmemmap is calculated as: ((struct page *)VMEMMAP_START - (phys_ram_base >> PAGE_SHIFT)). And the struct page's va can be calculated with an offset: (vmemmap + (pfn)).
However, when initializing struct pages, kernel actually starts from the first page from the same section that phys_ram_base belongs to. If the first page's physical address is not (phys_ram_base >> PAGE_SHIFT), then we get a va below VMEMMAP_START when calculating va for its struct page. For example, if phys_ram_base starts from 0x82000000 with pfn 0x82000, the first page in the same section is actually pfn 0x80000. During init_unavailable_range(), we will initialize struct page for pfn 0x80000 with virtual address ((struct page *)VMEMMAP_START - 0x2000), which is below VMEMMAP_START as well as PCI_IO_END. This commit fixes this bug by introducing a new variable 'vmemmap_start_pfn' which is aligned with memory section size and using it to calculate vmemmap address instead of phys_ram_base. Fixes: a11dd49dcb93 ("riscv: Sparse-Memory/vmemmap out-of-bounds fix") Signed-off-by: Xu Lu Reviewed-by: Alexandre Ghiti Tested-by: Björn Töpel Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20241209122617.53341-1-luxu.kernel@bytedance.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/page.h | 1 + arch/riscv/include/asm/pgtable.h | 2 +- arch/riscv/mm/init.c | 17 ++++++++++++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 71aabc5c6713..125f5ecd9565 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -122,6 +122,7 @@ struct kernel_mapping { extern struct kernel_mapping kernel_map; extern phys_addr_t phys_ram_base; +extern unsigned long vmemmap_start_pfn; #define is_kernel_mapping(x) \ ((x) >= kernel_map.virt_addr && (x) < (kernel_map.virt_addr + kernel_map.size)) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index d4e99eef90ac..050fdc49b5ad 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -87,7 +87,7 @@ * Define vmemmap for pfn_to_page & page_to_pfn calls.
Needed if kernel * is configured with CONFIG_SPARSEMEM_VMEMMAP enabled. */ -#define vmemmap ((struct page *)VMEMMAP_START - (phys_ram_base >> PAGE_SHIFT)) +#define vmemmap ((struct page *)VMEMMAP_START - vmemmap_start_pfn) #define PCI_IO_SIZE SZ_16M #define PCI_IO_END VMEMMAP_START diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index fc53ce748c80..8d167e09f1fe 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "../kernel/head.h" @@ -62,6 +63,13 @@ EXPORT_SYMBOL(pgtable_l5_enabled); phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); +#ifdef CONFIG_SPARSEMEM_VMEMMAP +#define VMEMMAP_ADDR_ALIGN (1ULL << SECTION_SIZE_BITS) + +unsigned long vmemmap_start_pfn __ro_after_init; +EXPORT_SYMBOL(vmemmap_start_pfn); +#endif + unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); @@ -240,8 +248,12 @@ static void __init setup_bootmem(void) * Make sure we align the start of the memory on a PMD boundary so that * at worst, we map the linear mapping with PMD mappings. 
*/ - if (!IS_ENABLED(CONFIG_XIP_KERNEL)) + if (!IS_ENABLED(CONFIG_XIP_KERNEL)) { phys_ram_base = memblock_start_of_DRAM() & PMD_MASK; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_start_pfn = round_down(phys_ram_base, VMEMMAP_ADDR_ALIGN) >> PAGE_SHIFT; +#endif + } /* * In 64-bit, any use of __va/__pa before this point is wrong as we @@ -1101,6 +1113,9 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) kernel_map.xiprom_sz = (uintptr_t)(&_exiprom) - (uintptr_t)(&_xiprom); phys_ram_base = CONFIG_PHYS_RAM_BASE; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_start_pfn = round_down(phys_ram_base, VMEMMAP_ADDR_ALIGN) >> PAGE_SHIFT; +#endif kernel_map.phys_addr = (uintptr_t)CONFIG_PHYS_RAM_BASE; kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_start); From 51356ce60e5915a6bd812873186ed54e45c2699d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Mon, 9 Dec 2024 16:57:12 +0100 Subject: [PATCH 742/807] riscv: stacktrace: fix backtracing through exceptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prior to commit 5d5fc33ce58e ("riscv: Improve exception and system call latency"), backtrace through exception worked since ra was filled with ret_from_exception symbol address and the stacktrace code checked 'pc' to be equal to that symbol. Now that handle_exception uses regular 'call' instructions, this isn't working anymore and backtrace stops at handle_exception(). Since there are multiple call site to C code in the exception handling path, rather than checking multiple potential return addresses, add a new symbol at the end of exception handling and check pc to be in that range. 
Fixes: 5d5fc33ce58e ("riscv: Improve exception and system call latency") Signed-off-by: Clément Léger Tested-by: Alexandre Ghiti Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20241209155714.1239665-1-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/entry.S | 1 + arch/riscv/kernel/stacktrace.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index c200d329d4bd..7a6c48e6d211 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -278,6 +278,7 @@ SYM_CODE_START_NOALIGN(ret_from_exception) #else sret #endif +SYM_INNER_LABEL(ret_from_exception_end, SYM_L_GLOBAL) SYM_CODE_END(ret_from_exception) ASM_NOKPROBE(ret_from_exception) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 153a2db4c5fa..d4355c770c36 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -17,6 +17,7 @@ #ifdef CONFIG_FRAME_POINTER extern asmlinkage void handle_exception(void); +extern unsigned long ret_from_exception_end; static inline int fp_is_valid(unsigned long fp, unsigned long sp) { @@ -71,7 +72,8 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, fp = frame->fp; pc = ftrace_graph_ret_addr(current, &graph_idx, frame->ra, &frame->ra); - if (pc == (unsigned long)handle_exception) { + if (pc >= (unsigned long)handle_exception && + pc < (unsigned long)&ret_from_exception_end) { if (unlikely(!__kernel_text_address(pc) || !fn(arg, pc))) break; From 40e6073e764870da39d0203fc4326adc4c37e690 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sun, 15 Dec 2024 08:52:52 -0500 Subject: [PATCH 743/807] riscv: qspinlock: Fixup _Q_PENDING_LOOPS definition When CONFIG_RISCV_QUEUED_SPINLOCKS=y, the _Q_PENDING_LOOPS definition is missing. Add the _Q_PENDING_LOOPS definition for pure qspinlock usage. 
Fixes: ab83647fadae ("riscv: Add qspinlock support") Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20241215135252.201983-1-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/spinlock.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h index e5121b89acea..52f11bfd0079 100644 --- a/arch/riscv/include/asm/spinlock.h +++ b/arch/riscv/include/asm/spinlock.h @@ -3,8 +3,11 @@ #ifndef __ASM_RISCV_SPINLOCK_H #define __ASM_RISCV_SPINLOCK_H -#ifdef CONFIG_RISCV_COMBO_SPINLOCKS +#ifdef CONFIG_QUEUED_SPINLOCKS #define _Q_PENDING_LOOPS (1 << 9) +#endif + +#ifdef CONFIG_RISCV_COMBO_SPINLOCKS #define __no_arch_spinlock_redefine #include From 5cd900b8b7e42c492431eb4261c18927768db1f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Fri, 3 Jan 2025 15:17:58 +0100 Subject: [PATCH 744/807] riscv: use local label names instead of global ones in assembly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Local labels should be prefixed by '.L' or they'll be exported in the symbol table. Additionally, this messes up the backtrace by displaying an incorrect symbol: ... [ 12.751810] [] _copy_from_user+0x28/0xc2 [ 12.752035] [] handle_misaligned_load+0x1ca/0x2fc [ 12.752310] [] do_trap_load_misaligned+0x24/0xee [ 12.752596] [] _new_vmalloc_restore_context_a0+0xc2/0xce After: ...
[ 10.243916] [] _copy_from_user+0x28/0xc2 [ 10.244026] [] handle_misaligned_load+0x1ca/0x2fc [ 10.244150] [] do_trap_load_misaligned+0x24/0xee [ 10.244268] [] handle_exception+0x146/0x152 Signed-off-by: Clément Léger Reviewed-by: Alexandre Ghiti Fixes: 503638e0babf3 ("riscv: Stop emitting preventive sfence.vma for new vmalloc mappings") Link: https://lore.kernel.org/r/20250103141814.508865-1-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/entry.S | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 7a6c48e6d211..33a5a9f2a0d4 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -23,21 +23,21 @@ REG_S a0, TASK_TI_A0(tp) csrr a0, CSR_CAUSE /* Exclude IRQs */ - blt a0, zero, _new_vmalloc_restore_context_a0 + blt a0, zero, .Lnew_vmalloc_restore_context_a0 REG_S a1, TASK_TI_A1(tp) /* Only check new_vmalloc if we are in page/protection fault */ li a1, EXC_LOAD_PAGE_FAULT - beq a0, a1, _new_vmalloc_kernel_address + beq a0, a1, .Lnew_vmalloc_kernel_address li a1, EXC_STORE_PAGE_FAULT - beq a0, a1, _new_vmalloc_kernel_address + beq a0, a1, .Lnew_vmalloc_kernel_address li a1, EXC_INST_PAGE_FAULT - bne a0, a1, _new_vmalloc_restore_context_a1 + bne a0, a1, .Lnew_vmalloc_restore_context_a1 -_new_vmalloc_kernel_address: +.Lnew_vmalloc_kernel_address: /* Is it a kernel address? 
*/ csrr a0, CSR_TVAL - bge a0, zero, _new_vmalloc_restore_context_a1 + bge a0, zero, .Lnew_vmalloc_restore_context_a1 /* Check if a new vmalloc mapping appeared that could explain the trap */ REG_S a2, TASK_TI_A2(tp) @@ -69,7 +69,7 @@ _new_vmalloc_kernel_address: /* Check the value of new_vmalloc for this cpu */ REG_L a2, 0(a0) and a2, a2, a1 - beq a2, zero, _new_vmalloc_restore_context + beq a2, zero, .Lnew_vmalloc_restore_context /* Atomically reset the current cpu bit in new_vmalloc */ amoxor.d a0, a1, (a0) @@ -83,11 +83,11 @@ _new_vmalloc_kernel_address: csrw CSR_SCRATCH, x0 sret -_new_vmalloc_restore_context: +.Lnew_vmalloc_restore_context: REG_L a2, TASK_TI_A2(tp) -_new_vmalloc_restore_context_a1: +.Lnew_vmalloc_restore_context_a1: REG_L a1, TASK_TI_A1(tp) -_new_vmalloc_restore_context_a0: +.Lnew_vmalloc_restore_context_a0: REG_L a0, TASK_TI_A0(tp) .endm From 3cb97a927fffe443e1e7e8eddbfebfdb062e86ed Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Mon, 6 Jan 2025 08:19:04 +0000 Subject: [PATCH 745/807] cgroup/cpuset: remove kernfs active break A warning was found: WARNING: CPU: 10 PID: 3486953 at fs/kernfs/file.c:828 CPU: 10 PID: 3486953 Comm: rmdir Kdump: loaded Tainted: G RIP: 0010:kernfs_should_drain_open_files+0x1a1/0x1b0 RSP: 0018:ffff8881107ef9e0 EFLAGS: 00010202 RAX: 0000000080000002 RBX: ffff888154738c00 RCX: dffffc0000000000 RDX: 0000000000000007 RSI: 0000000000000004 RDI: ffff888154738c04 RBP: ffff888154738c04 R08: ffffffffaf27fa15 R09: ffffed102a8e7180 R10: ffff888154738c07 R11: 0000000000000000 R12: ffff888154738c08 R13: ffff888750f8c000 R14: ffff888750f8c0e8 R15: ffff888154738ca0 FS: 00007f84cd0be740(0000) GS:ffff8887ddc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000555f9fbe00c8 CR3: 0000000153eec001 CR4: 0000000000370ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: kernfs_drain+0x15e/0x2f0 
__kernfs_remove+0x165/0x300 kernfs_remove_by_name_ns+0x7b/0xc0 cgroup_rm_file+0x154/0x1c0 cgroup_addrm_files+0x1c2/0x1f0 css_clear_dir+0x77/0x110 kill_css+0x4c/0x1b0 cgroup_destroy_locked+0x194/0x380 cgroup_rmdir+0x2a/0x140 It can be explained by: rmdir echo 1 > cpuset.cpus kernfs_fop_write_iter // active=0 cgroup_rm_file kernfs_remove_by_name_ns kernfs_get_active // active=1 __kernfs_remove // active=0x80000002 kernfs_drain cpuset_write_resmask wait_event //waiting (active == 0x80000001) kernfs_break_active_protection // active = 0x80000001 // continue kernfs_unbreak_active_protection // active = 0x80000002 ... kernfs_should_drain_open_files // warning occurs kernfs_put_active This warning is caused by 'kernfs_break_active_protection' when it is writing to cpuset.cpus, and the cgroup is removed concurrently. The commit 3a5a6d0c2b03 ("cpuset: don't nest cgroup_mutex inside get_online_cpus()") made cpuset_hotplug_workfn asynchronous. This change involves calling flush_work(), which can create a multi-process circular locking dependency that involves cgroup_mutex, potentially leading to a deadlock. To avoid the deadlock, commit 76bb5ab8f6e3 ("cpuset: break kernfs active protection in cpuset_write_resmask()") added 'kernfs_break_active_protection' in the cpuset_write_resmask. This could lead to this warning. After the commit 2125c0034c5d ("cgroup/cpuset: Make cpuset hotplug processing synchronous"), the cpuset_write_resmask no longer needs to wait for the hotplug to finish, which means that concurrent hotplug and cpuset operations are no longer possible. Therefore, the deadlock doesn't exist anymore and it does not have to 'break active protection' now. To fix this warning, just remove kernfs_break_active_protection operation in the 'cpuset_write_resmask'.
Fixes: bdb2fd7fc56e ("kernfs: Skip kernfs_drain_open_files() more aggressively") Fixes: 76bb5ab8f6e3 ("cpuset: break kernfs active protection in cpuset_write_resmask()") Reported-by: Ji Fa Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 7ea559fb0cbf..0f910c828973 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3124,29 +3124,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, int retval = -ENODEV; buf = strstrip(buf); - - /* - * CPU or memory hotunplug may leave @cs w/o any execution - * resources, in which case the hotplug code asynchronously updates - * configuration and transfers all tasks to the nearest ancestor - * which can execute. - * - * As writes to "cpus" or "mems" may restore @cs's execution - * resources, wait for the previously scheduled operations before - * proceeding, so that we don't end up keep removing tasks added - * after execution capability is restored. - * - * cpuset_handle_hotplug may call back into cgroup core asynchronously - * via cgroup_transfer_tasks() and waiting for it from a cgroupfs - * operation like this one can lead to a deadlock through kernfs - * active_ref protection. Let's break the protection. Losing the - * protection is okay as we check whether @cs is online after - * grabbing cpuset_mutex anyway. This only happens on the legacy - * hierarchies. 
- */ - css_get(&cs->css); - kernfs_break_active_protection(of->kn); - cpus_read_lock(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) @@ -3179,8 +3156,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, out_unlock: mutex_unlock(&cpuset_mutex); cpus_read_unlock(); - kernfs_unbreak_active_protection(of->kn); - css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } From d1bf27c4e1768d4733143f26962a5c68ea8bd03c Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 7 Jan 2025 15:26:59 +0100 Subject: [PATCH 746/807] dt-bindings: net: pse-pd: Fix unusual character in documentation The documentation contained an unusual character due to an issue in my personal b4 setup. Fix the problem by providing the correct PSE Pinout Alternatives table number description. Signed-off-by: Kory Maincent Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250107142659.425877-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- .../devicetree/bindings/net/pse-pd/pse-controller.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml b/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml index a12cda8aa764..cd09560e0aea 100644 --- a/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml +++ b/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml @@ -81,7 +81,7 @@ properties: List of phandles, each pointing to the power supply for the corresponding pairset named in 'pairset-names'. This property aligns with IEEE 802.3-2022, Section 33.2.3 and 145.2.4. 
- PSE Pinout Alternatives (as per IEEE 802.3-2022 Table 145\u20133) + PSE Pinout Alternatives (as per IEEE 802.3-2022 Table 145-3) |-----------|---------------|---------------|---------------|---------------| | Conductor | Alternative A | Alternative A | Alternative B | Alternative B | | | (MDI-X) | (MDI) | (X) | (S) | From 9ab4981552930a9c45682d62424ba610edc3992d Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Thu, 2 Jan 2025 16:11:10 -0800 Subject: [PATCH 747/807] drm/xe: Fix tlb invalidation when wedging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If GuC fails to load, the driver wedges, but in the process it tries to do stuff that may not be initialized yet. This moves the xe_gt_tlb_invalidation_init() to be done earlier: as its own doc says, it's a software-only initialization and should have been named with the _early() suffix. Move it to be called by xe_gt_init_early(), so the locks and seqno are initialized, avoiding a NULL ptr deref when wedging: xe 0000:03:00.0: [drm] *ERROR* GT0: load failed: status: Reset = 0, BootROM = 0x50, UKernel = 0x00, MIA = 0x00, Auth = 0x01 xe 0000:03:00.0: [drm] *ERROR* GT0: firmware signature verification failed xe 0000:03:00.0: [drm] *ERROR* CRITICAL: Xe has declared device 0000:03:00.0 as wedged. ... BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 9 UID: 0 PID: 3908 Comm: modprobe Tainted: G U W 6.13.0-rc4-xe+ #3 Tainted: [U]=USER, [W]=WARN Hardware name: Intel Corporation Alder Lake Client Platform/AlderLake-S ADP-S DDR5 UDIMM CRB, BIOS ADLSFWI1.R00.3275.A00.2207010640 07/01/2022 RIP: 0010:xe_gt_tlb_invalidation_reset+0x75/0x110 [xe] This can be easily triggered by poking the GuC binary to force a signature failure.
There will still be an extra message, xe 0000:03:00.0: [drm] *ERROR* GT0: GuC mmio request 0x4100: no reply 0x4100 but that's better than a NULL ptr deref. Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3956 Fixes: c9474b726b93 ("drm/xe: Wedge the entire device") Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20250103001111.331684-2-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 5001ef3af8f2c972d6fd9c5221a8457556f8bea6) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt.c | 8 ++++---- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 4 ++-- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index d6744be01a68..94d468d01253 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -387,6 +387,10 @@ int xe_gt_init_early(struct xe_gt *gt) xe_force_wake_init_gt(gt, gt_to_fw(gt)); spin_lock_init(&gt->global_invl_lock); + err = xe_gt_tlb_invalidation_init_early(gt); + if (err) + return err; + return 0; } @@ -588,10 +592,6 @@ int xe_gt_init(struct xe_gt *gt) xe_hw_fence_irq_init(&gt->fence_irq[i]); } - err = xe_gt_tlb_invalidation_init(gt); - if (err) - return err; - err = xe_gt_pagefault_init(gt); if (err) return err; diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c index 6146d1776bda..0a0af413770e 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -106,7 +106,7 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work) } /** - * xe_gt_tlb_invalidation_init - Initialize GT TLB invalidation state + * xe_gt_tlb_invalidation_init_early - Initialize GT TLB invalidation state * @gt: graphics tile * * Initialize GT TLB invalidation state, purely software initialization, should @@ -114,7 +114,7 @@ static void xe_gt_tlb_fence_timeout(struct
work_struct *work) * * Return: 0 on success, negative error code on error. */ -int xe_gt_tlb_invalidation_init(struct xe_gt *gt) +int xe_gt_tlb_invalidation_init_early(struct xe_gt *gt) { gt->tlb_invalidation.seqno = 1; INIT_LIST_HEAD(&gt->tlb_invalidation.pending_fences); diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h index 00b1c6c01e8d..672acfcdf0d7 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h @@ -14,7 +14,8 @@ struct xe_gt; struct xe_guc; struct xe_vma; -int xe_gt_tlb_invalidation_init(struct xe_gt *gt); +int xe_gt_tlb_invalidation_init_early(struct xe_gt *gt); + void xe_gt_tlb_invalidation_reset(struct xe_gt *gt); int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt); int xe_gt_tlb_invalidation_vma(struct xe_gt *gt, From b84e1cd22f8a8c03b7b1051372560c7017c8be92 Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Thu, 19 Dec 2024 18:55:36 -0500 Subject: [PATCH 748/807] drm/xe/dg1: Fix power gate sequence. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sub-pipe PG is not present on DG1. Setting these bits can disable other power gates and cause GPU hangs on video playbacks.
VLK: 16314, 4304 Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13381 Cc: Vinay Belgaumkar Cc: Himal Prasad Ghimiray Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20241219235536.454270-1-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 2f12e9c029315c1400059b2e7fdf53117c09c3a9) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_idle.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_idle.c b/drivers/gpu/drm/xe/xe_gt_idle.c index fd80afeef56a..ffd3ba7f6656 100644 --- a/drivers/gpu/drm/xe/xe_gt_idle.c +++ b/drivers/gpu/drm/xe/xe_gt_idle.c @@ -122,10 +122,12 @@ void xe_gt_idle_enable_pg(struct xe_gt *gt) if (!xe_gt_is_media_type(gt)) gtidle->powergate_enable |= RENDER_POWERGATE_ENABLE; - for (i = XE_HW_ENGINE_VCS0, j = 0; i <= XE_HW_ENGINE_VCS7; ++i, ++j) { - if ((gt->info.engine_mask & BIT(i))) - gtidle->powergate_enable |= (VDN_HCP_POWERGATE_ENABLE(j) | - VDN_MFXVDENC_POWERGATE_ENABLE(j)); + if (xe->info.platform != XE_DG1) { + for (i = XE_HW_ENGINE_VCS0, j = 0; i <= XE_HW_ENGINE_VCS7; ++i, ++j) { + if ((gt->info.engine_mask & BIT(i))) + gtidle->powergate_enable |= (VDN_HCP_POWERGATE_ENABLE(j) | + VDN_MFXVDENC_POWERGATE_ENABLE(j)); + } } fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); From 2d2d4f60ed266a8f340a721102d035252606980b Mon Sep 17 00:00:00 2001 From: Leo Yang Date: Tue, 7 Jan 2025 11:15:30 +0800 Subject: [PATCH 749/807] mctp i3c: fix MCTP I3C driver multi-thread issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We found a timeout problem with the pldm command on our system. The reason is that the MCTP-I3C driver has a race condition when receiving multiple-packet messages in multi-thread, resulting in a wrong packet order problem. We identified this problem by adding a debug message to the mctp_i3c_read function. 
According to the MCTP spec, a multiple-packet message must be composed in sequence, and if there is a wrong sequence, the whole message will be discarded and wait for the next SOM. For example, SOM → Pkt Seq #2 → Pkt Seq #1 → Pkt Seq #3 → EOM. Therefore, we try to solve this problem by adding a mutex to the mctp_i3c_read function. Before the modification, when a command requesting a multiple-packet message response is sent consecutively, an error usually occurs within 100 loops. After the mutex, it can go through 40000 loops without any error, and it seems to run well. Fixes: c8755b29b58e ("mctp i3c: MCTP I3C driver") Signed-off-by: Leo Yang Link: https://patch.msgid.link/20250107031529.3296094-1-Leo-Yang@quantatw.com [pabeni@redhat.com: dropped already answered question from changelog] Signed-off-by: Paolo Abeni --- drivers/net/mctp/mctp-i3c.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/mctp/mctp-i3c.c b/drivers/net/mctp/mctp-i3c.c index 9adad59b8676..d247fe483c58 100644 --- a/drivers/net/mctp/mctp-i3c.c +++ b/drivers/net/mctp/mctp-i3c.c @@ -125,6 +125,8 @@ static int mctp_i3c_read(struct mctp_i3c_device *mi) xfer.data.in = skb_put(skb, mi->mrl); + /* Make sure netif_rx() is read in the same order as i3c. 
*/ + mutex_lock(&mi->lock); rc = i3c_device_do_priv_xfers(mi->i3c, &xfer, 1); if (rc < 0) goto err; @@ -166,8 +168,10 @@ static int mctp_i3c_read(struct mctp_i3c_device *mi) stats->rx_dropped++; } + mutex_unlock(&mi->lock); return 0; err: + mutex_unlock(&mi->lock); kfree_skb(skb); return rc; } From 13210fc63f353fe78584048079343413a3cdf819 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 2 Jan 2025 13:01:13 +0100 Subject: [PATCH 750/807] netfilter: nf_tables: imbalance in flowtable binding All these cases cause imbalance between BIND and UNBIND calls: - Delete an interface from a flowtable with multiple interfaces - Add a (device to a) flowtable with --check flag - Delete a netns containing a flowtable - In an interactive nft session, create a table with owner flag and flowtable inside, then quit. Fix it by calling FLOW_BLOCK_UNBIND when unregistering hooks, then remove late FLOW_BLOCK_UNBIND call when destroying flowtable. Fixes: ff4bf2f42a40 ("netfilter: nf_tables: add nft_unregister_flowtable_hook()") Reported-by: Phil Sutter Tested-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0b9f1e8dfe49..c4af283356e7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8822,6 +8822,7 @@ static void nft_unregister_flowtable_hook(struct net *net, } static void __nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list, bool release_netdev) { @@ -8829,6 +8830,8 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, list_for_each_entry_safe(hook, next, hook_list, list) { nf_unregister_net_hook(net, &hook->ops); + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); if (release_netdev) { list_del(&hook->list); kfree_rcu(hook, rcu); @@ -8837,9 +8840,10 @@ 
static void __nft_unregister_flowtable_net_hooks(struct net *net, } static void nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list) { - __nft_unregister_flowtable_net_hooks(net, hook_list, false); + __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); } static int nft_register_flowtable_net_hooks(struct net *net, @@ -9481,8 +9485,6 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } @@ -10870,6 +10872,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) &nft_trans_flowtable_hooks(trans), trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { list_del_rcu(&nft_trans_flowtable(trans)->list); @@ -10878,6 +10881,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NULL, trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -11140,11 +11144,13 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { nft_use_dec_restore(&table->use); list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -11737,7 +11743,8 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) list_for_each_entry(chain, &table->chains, list) __nf_tables_unregister_hook(net, table, chain, true); 
list_for_each_entry(flowtable, &table->flowtables, list) - __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list, + __nft_unregister_flowtable_net_hooks(net, flowtable, + &flowtable->hook_list, true); } From b541ba7d1f5a5b7b3e2e22dc9e40e18a7d6dbc13 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 8 Jan 2025 22:56:33 +0100 Subject: [PATCH 751/807] netfilter: conntrack: clamp maximum hashtable size to INT_MAX Use INT_MAX as maximum size for the conntrack hashtable. Otherwise, it is possible to hit WARN_ON_ONCE in __kvmalloc_node_noprof() when resizing hashtable because __GFP_NOWARN is unset. See: 0708a0afe291 ("mm: Consider __GFP_NOWARN flag for oversized kvmalloc() calls") Note: hashtable resize is only possible from init_netns. Fixes: 9cc1c73ad666 ("netfilter: conntrack: avoid integer overflow when resizing") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9db3e2b0b1c3..456446d7af20 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2517,12 +2517,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) struct hlist_nulls_head *hash; unsigned int nr_slots, i; - if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head))) return NULL; BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) From fcede1f0a043ccefe9bc6ad57f12718e42f63f1d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 8 Jan 2025 16:41:48 +0800 Subject: [PATCH 752/807] block, bfq: fix waker_bfqq UAF after bfq_split_bfqq() Our syzkaller report a 
following UAF for v6.6: BUG: KASAN: slab-use-after-free in bfq_init_rq+0x175d/0x17a0 block/bfq-iosched.c:6958 Read of size 8 at addr ffff8881b57147d8 by task fsstress/232726 CPU: 2 PID: 232726 Comm: fsstress Not tainted 6.6.0-g3629d1885222 #39 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x91/0xf0 lib/dump_stack.c:106 print_address_description.constprop.0+0x66/0x300 mm/kasan/report.c:364 print_report+0x3e/0x70 mm/kasan/report.c:475 kasan_report+0xb8/0xf0 mm/kasan/report.c:588 hlist_add_head include/linux/list.h:1023 [inline] bfq_init_rq+0x175d/0x17a0 block/bfq-iosched.c:6958 bfq_insert_request.isra.0+0xe8/0xa20 block/bfq-iosched.c:6271 bfq_insert_requests+0x27f/0x390 block/bfq-iosched.c:6323 blk_mq_insert_request+0x290/0x8f0 block/blk-mq.c:2660 blk_mq_submit_bio+0x1021/0x15e0 block/blk-mq.c:3143 __submit_bio+0xa0/0x6b0 block/blk-core.c:639 __submit_bio_noacct_mq block/blk-core.c:718 [inline] submit_bio_noacct_nocheck+0x5b7/0x810 block/blk-core.c:747 submit_bio_noacct+0xca0/0x1990 block/blk-core.c:847 __ext4_read_bh fs/ext4/super.c:205 [inline] ext4_read_bh+0x15e/0x2e0 fs/ext4/super.c:230 __read_extent_tree_block+0x304/0x6f0 fs/ext4/extents.c:567 ext4_find_extent+0x479/0xd20 fs/ext4/extents.c:947 ext4_ext_map_blocks+0x1a3/0x2680 fs/ext4/extents.c:4182 ext4_map_blocks+0x929/0x15a0 fs/ext4/inode.c:660 ext4_iomap_begin_report+0x298/0x480 fs/ext4/inode.c:3569 iomap_iter+0x3dd/0x1010 fs/iomap/iter.c:91 iomap_fiemap+0x1f4/0x360 fs/iomap/fiemap.c:80 ext4_fiemap+0x181/0x210 fs/ext4/extents.c:5051 ioctl_fiemap.isra.0+0x1b4/0x290 fs/ioctl.c:220 do_vfs_ioctl+0x31c/0x11a0 fs/ioctl.c:811 __do_sys_ioctl fs/ioctl.c:869 [inline] __se_sys_ioctl+0xae/0x190 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x70/0x120 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x78/0xe2 Allocated by task 232719: kasan_save_stack+0x22/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 __kasan_slab_alloc+0x87/0x90 
mm/kasan/common.c:328 kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook mm/slab.h:768 [inline] slab_alloc_node mm/slub.c:3492 [inline] kmem_cache_alloc_node+0x1b8/0x6f0 mm/slub.c:3537 bfq_get_queue+0x215/0x1f00 block/bfq-iosched.c:5869 bfq_get_bfqq_handle_split+0x167/0x5f0 block/bfq-iosched.c:6776 bfq_init_rq+0x13a4/0x17a0 block/bfq-iosched.c:6938 bfq_insert_request.isra.0+0xe8/0xa20 block/bfq-iosched.c:6271 bfq_insert_requests+0x27f/0x390 block/bfq-iosched.c:6323 blk_mq_insert_request+0x290/0x8f0 block/blk-mq.c:2660 blk_mq_submit_bio+0x1021/0x15e0 block/blk-mq.c:3143 __submit_bio+0xa0/0x6b0 block/blk-core.c:639 __submit_bio_noacct_mq block/blk-core.c:718 [inline] submit_bio_noacct_nocheck+0x5b7/0x810 block/blk-core.c:747 submit_bio_noacct+0xca0/0x1990 block/blk-core.c:847 __ext4_read_bh fs/ext4/super.c:205 [inline] ext4_read_bh_nowait+0x15a/0x240 fs/ext4/super.c:217 ext4_read_bh_lock+0xac/0xd0 fs/ext4/super.c:242 ext4_bread_batch+0x268/0x500 fs/ext4/inode.c:958 __ext4_find_entry+0x448/0x10f0 fs/ext4/namei.c:1671 ext4_lookup_entry fs/ext4/namei.c:1774 [inline] ext4_lookup.part.0+0x359/0x6f0 fs/ext4/namei.c:1842 ext4_lookup+0x72/0x90 fs/ext4/namei.c:1839 __lookup_slow+0x257/0x480 fs/namei.c:1696 lookup_slow fs/namei.c:1713 [inline] walk_component+0x454/0x5c0 fs/namei.c:2004 link_path_walk.part.0+0x773/0xda0 fs/namei.c:2331 link_path_walk fs/namei.c:3826 [inline] path_openat+0x1b9/0x520 fs/namei.c:3826 do_filp_open+0x1b7/0x400 fs/namei.c:3857 do_sys_openat2+0x5dc/0x6e0 fs/open.c:1428 do_sys_open fs/open.c:1443 [inline] __do_sys_openat fs/open.c:1459 [inline] __se_sys_openat fs/open.c:1454 [inline] __x64_sys_openat+0x148/0x200 fs/open.c:1454 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x70/0x120 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x78/0xe2 Freed by task 232726: kasan_save_stack+0x22/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 kasan_save_free_info+0x2b/0x50 
mm/kasan/generic.c:522 ____kasan_slab_free mm/kasan/common.c:236 [inline] __kasan_slab_free+0x12a/0x1b0 mm/kasan/common.c:244 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook mm/slub.c:1827 [inline] slab_free_freelist_hook mm/slub.c:1853 [inline] slab_free mm/slub.c:3820 [inline] kmem_cache_free+0x110/0x760 mm/slub.c:3842 bfq_put_queue+0x6a7/0xfb0 block/bfq-iosched.c:5428 bfq_forget_entity block/bfq-wf2q.c:634 [inline] bfq_put_idle_entity+0x142/0x240 block/bfq-wf2q.c:645 bfq_forget_idle+0x189/0x1e0 block/bfq-wf2q.c:671 bfq_update_vtime block/bfq-wf2q.c:1280 [inline] __bfq_lookup_next_entity block/bfq-wf2q.c:1374 [inline] bfq_lookup_next_entity+0x350/0x480 block/bfq-wf2q.c:1433 bfq_update_next_in_service+0x1c0/0x4f0 block/bfq-wf2q.c:128 bfq_deactivate_entity+0x10a/0x240 block/bfq-wf2q.c:1188 bfq_deactivate_bfqq block/bfq-wf2q.c:1592 [inline] bfq_del_bfqq_busy+0x2e8/0xad0 block/bfq-wf2q.c:1659 bfq_release_process_ref+0x1cc/0x220 block/bfq-iosched.c:3139 bfq_split_bfqq+0x481/0xdf0 block/bfq-iosched.c:6754 bfq_init_rq+0xf29/0x17a0 block/bfq-iosched.c:6934 bfq_insert_request.isra.0+0xe8/0xa20 block/bfq-iosched.c:6271 bfq_insert_requests+0x27f/0x390 block/bfq-iosched.c:6323 blk_mq_insert_request+0x290/0x8f0 block/blk-mq.c:2660 blk_mq_submit_bio+0x1021/0x15e0 block/blk-mq.c:3143 __submit_bio+0xa0/0x6b0 block/blk-core.c:639 __submit_bio_noacct_mq block/blk-core.c:718 [inline] submit_bio_noacct_nocheck+0x5b7/0x810 block/blk-core.c:747 submit_bio_noacct+0xca0/0x1990 block/blk-core.c:847 __ext4_read_bh fs/ext4/super.c:205 [inline] ext4_read_bh+0x15e/0x2e0 fs/ext4/super.c:230 __read_extent_tree_block+0x304/0x6f0 fs/ext4/extents.c:567 ext4_find_extent+0x479/0xd20 fs/ext4/extents.c:947 ext4_ext_map_blocks+0x1a3/0x2680 fs/ext4/extents.c:4182 ext4_map_blocks+0x929/0x15a0 fs/ext4/inode.c:660 ext4_iomap_begin_report+0x298/0x480 fs/ext4/inode.c:3569 iomap_iter+0x3dd/0x1010 fs/iomap/iter.c:91 iomap_fiemap+0x1f4/0x360 fs/iomap/fiemap.c:80 ext4_fiemap+0x181/0x210 
fs/ext4/extents.c:5051 ioctl_fiemap.isra.0+0x1b4/0x290 fs/ioctl.c:220 do_vfs_ioctl+0x31c/0x11a0 fs/ioctl.c:811 __do_sys_ioctl fs/ioctl.c:869 [inline] __se_sys_ioctl+0xae/0x190 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x70/0x120 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x78/0xe2 commit 1ba0403ac644 ("block, bfq: fix uaf for accessing waker_bfqq after splitting") fixed the problem that if waker_bfqq is in the merge chain, and current is the only process, waker_bfqq can be freed from bfq_split_bfqq(). However, the case that waker_bfqq is not in the merge chain is missed, and if the process reference of waker_bfqq is 0, waker_bfqq can be freed as well. Fix the problem by checking process reference if waker_bfqq is not in the merge chain. Fixes: 1ba0403ac644 ("block, bfq: fix uaf for accessing waker_bfqq after splitting") Signed-off-by: Hou Tao Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20250108084148.1549973-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 95dd7b795935..cad16c163611 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6844,16 +6844,24 @@ static struct bfq_queue *bfq_waker_bfqq(struct bfq_queue *bfqq) if (new_bfqq == waker_bfqq) { /* * If waker_bfqq is in the merge chain, and current - * is the only procress. + * is the only process, waker_bfqq can be freed. */ if (bfqq_process_refs(waker_bfqq) == 1) return NULL; - break; + + return waker_bfqq; } new_bfqq = new_bfqq->new_bfqq; } + /* + * If waker_bfqq is not in the merge chain, and it's procress reference + * is 0, waker_bfqq can be freed.
+ */ + if (bfqq_process_refs(waker_bfqq) == 0) + return NULL; + return waker_bfqq; } From c9a40292a44e78f71258b8522655bffaf5753bdb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2025 10:28:05 -0700 Subject: [PATCH 753/807] io_uring/eventfd: ensure io_eventfd_signal() defers another RCU period io_eventfd_do_signal() is invoked from an RCU callback, but when dropping the reference to the io_ev_fd, it calls io_eventfd_free() directly if the refcount drops to zero. This isn't correct, as any potential freeing of the io_ev_fd should be deferred another RCU grace period. Just call io_eventfd_put() rather than open-code the dec-and-test and free, which will correctly defer it another RCU grace period. Fixes: 21a091b970cd ("io_uring: signal registered eventfd to process deferred task work") Reported-by: Jann Horn Cc: stable@vger.kernel.org Tested-by: Li Zetao Reviewed-by: Li Zetao Reviewed-by: Prasanna Kumar T S M Signed-off-by: Jens Axboe --- io_uring/eventfd.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index fab936d31ba8..100d5da94cb9 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -33,20 +33,18 @@ static void io_eventfd_free(struct rcu_head *rcu) kfree(ev_fd); } +static void io_eventfd_put(struct io_ev_fd *ev_fd) +{ + if (refcount_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); +} + static void io_eventfd_do_signal(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - - if (refcount_dec_and_test(&ev_fd->refs)) - io_eventfd_free(rcu); -} - -static void io_eventfd_put(struct io_ev_fd *ev_fd) -{ - if (refcount_dec_and_test(&ev_fd->refs)) - call_rcu(&ev_fd->rcu, io_eventfd_free); + io_eventfd_put(ev_fd); } static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref) From c13094b894de289514d84b8db56d1f2931a0bade Mon Sep 17 00:00:00 2001 From: Marco 
Nelissen Date: Wed, 8 Jan 2025 20:11:50 -0800 Subject: [PATCH 754/807] iomap: avoid truncating 64-bit offset to 32 bits On 32-bit kernels, iomap_write_delalloc_scan() was inadvertently using a 32-bit position due to folio_next_index() returning an unsigned long. This could lead to an infinite loop when writing to an xfs filesystem. Signed-off-by: Marco Nelissen Link: https://lore.kernel.org/r/20250109041253.2494374-1-marco.nelissen@gmail.com Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 54dc27d92781..d303e6c8900c 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1138,7 +1138,7 @@ static void iomap_write_delalloc_scan(struct inode *inode, start_byte, end_byte, iomap, punch); /* move offset to start of next folio in range */ - start_byte = folio_next_index(folio) << PAGE_SHIFT; + start_byte = folio_pos(folio) + folio_size(folio); folio_unlock(folio); folio_put(folio); } From 3699f2c43ea9984e00d70463f8c29baaf260ea97 Mon Sep 17 00:00:00 2001 From: Peter Geis Date: Sat, 14 Dec 2024 22:43:39 +0000 Subject: [PATCH 755/807] arm64: dts: rockchip: add hevc power domain clock to rk3328 There is a race condition at startup between disabling power domains not used and disabling clocks not used on the rk3328. When the clocks are disabled first, the hevc power domain fails to shut off leading to a splat of failures. Add the hevc core clock to the rk3328 power domain node to prevent this condition. rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 3-.... } 1087 jiffies s: 89 root: 0x8/.
rcu: blocking rcu_node structures (internal RCU debug): Sending NMI from CPU 0 to CPUs 3: NMI backtrace for cpu 3 CPU: 3 UID: 0 PID: 86 Comm: kworker/3:3 Not tainted 6.12.0-rc5+ #53 Hardware name: Firefly ROC-RK3328-CC (DT) Workqueue: pm genpd_power_off_work_fn pstate: 20400005 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : regmap_unlock_spinlock+0x18/0x30 lr : regmap_read+0x60/0x88 sp : ffff800081123c00 x29: ffff800081123c00 x28: ffff2fa4c62cad80 x27: 0000000000000000 x26: ffffd74e6e660eb8 x25: ffff2fa4c62cae00 x24: 0000000000000040 x23: ffffd74e6d2f3ab8 x22: 0000000000000001 x21: ffff800081123c74 x20: 0000000000000000 x19: ffff2fa4c0412000 x18: 0000000000000000 x17: 77202c31203d2065 x16: 6c6469203a72656c x15: 6c6f72746e6f632d x14: 7265776f703a6e6f x13: 2063766568206e69 x12: 616d6f64202c3431 x11: 347830206f742030 x10: 3430303034783020 x9 : ffffd74e6c7369e0 x8 : 3030316666206e69 x7 : 205d383738353733 x6 : 332e31202020205b x5 : ffffd74e6c73fc88 x4 : ffffd74e6c73fcd4 x3 : ffffd74e6c740b40 x2 : ffff800080015484 x1 : 0000000000000000 x0 : ffff2fa4c0412000 Call trace: regmap_unlock_spinlock+0x18/0x30 rockchip_pmu_set_idle_request+0xac/0x2c0 rockchip_pd_power+0x144/0x5f8 rockchip_pd_power_off+0x1c/0x30 _genpd_power_off+0x9c/0x180 genpd_power_off.part.0.isra.0+0x130/0x2a8 genpd_power_off_work_fn+0x6c/0x98 process_one_work+0x170/0x3f0 worker_thread+0x290/0x4a8 kthread+0xec/0xf8 ret_from_fork+0x10/0x20 rockchip-pm-domain ff100000.syscon:power-controller: failed to get ack on domain 'hevc', val=0x88220 Fixes: 52e02d377a72 ("arm64: dts: rockchip: add core dtsi file for RK3328 SoCs") Signed-off-by: Peter Geis Reviewed-by: Dragan Simic Link: https://lore.kernel.org/r/20241214224339.24674-1-pgwipeout@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3328.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 0597de415fe0..7d992c3c01ce 100644 --- 
a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -333,6 +333,7 @@ power-domain@RK3328_PD_HEVC { reg = <RK3328_PD_HEVC>; + clocks = <&cru SCLK_VENC_CORE>; #power-domain-cells = <0>; }; power-domain@RK3328_PD_VIDEO { From 344bac8f0d73fe970cd9f5b2f132906317d29e8b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 15 Dec 2024 21:17:05 +0100 Subject: [PATCH 756/807] fs: kill MNT_ONRB Move mnt->mnt_node into the union with mnt->mnt_rcu and mnt->mnt_llist instead of keeping it with mnt->mnt_list. This allows us to use RB_CLEAR_NODE(&mnt->mnt_node) in umount_tree() as well as list_empty(&mnt->mnt_node). That in turn allows us to remove MNT_ONRB. This also fixes the bug reported in [1] where seemingly MNT_ONRB wasn't set in @mnt->mnt_flags even though the mount was present in the mount rbtree of the mount namespace. The root cause is the following race. When a btrfs subvolume is mounted a temporary mount is created: btrfs_get_tree_subvol() { mnt = fc_mount() // Register the newly allocated mount with sb->mounts: lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); } and registered on sb->s_mounts. Later it is added to an anonymous mount namespace via mount_subvol(): -> mount_subvol() -> mount_subtree() -> alloc_mnt_ns() mnt_add_to_ns() vfs_path_lookup() put_mnt_ns() The mnt_add_to_ns() call raises MNT_ONRB in @mnt->mnt_flags. If someone concurrently does a ro remount: reconfigure_super() -> sb_prepare_remount_readonly() { list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { } all mounts registered in sb->s_mounts are visited and first MNT_WRITE_HOLD is raised, then MNT_READONLY is raised, and finally MNT_WRITE_HOLD is removed again. The flag modification for MNT_WRITE_HOLD/MNT_READONLY and MNT_ONRB race so MNT_ONRB might be lost.
Fixes: 2eea9ce4310d ("mounts: keep list of mounts in an rbtree") Cc: # v6.8+ Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-1-fd55922c4af8@kernel.org Link: https://lore.kernel.org/r/ec6784ed-8722-4695-980a-4400d4e7bd1a@gmx.com [1] Signed-off-by: Christian Brauner --- fs/mount.h | 15 +++++++++------ fs/namespace.c | 14 ++++++-------- include/linux/mount.h | 3 +-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 185fc56afc13..179f690a0c72 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -38,6 +38,7 @@ struct mount { struct dentry *mnt_mountpoint; struct vfsmount mnt; union { + struct rb_node mnt_node; /* node in the ns->mounts rbtree */ struct rcu_head mnt_rcu; struct llist_node mnt_llist; }; @@ -51,10 +52,7 @@ struct mount { struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ - union { - struct rb_node mnt_node; /* Under ns->mounts */ - struct list_head mnt_list; - }; + struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list;/* list of slave mounts */ @@ -145,11 +143,16 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) return ns->seq == 0; } +static inline bool mnt_ns_attached(const struct mount *mnt) +{ + return !RB_EMPTY_NODE(&mnt->mnt_node); +} + static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { - WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB)); - mnt->mnt.mnt_flags &= ~MNT_ONRB; + WARN_ON(!mnt_ns_attached(mnt)); rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); + RB_CLEAR_NODE(&mnt->mnt_node); list_add_tail(&mnt->mnt_list, dt_list); } diff --git a/fs/namespace.c b/fs/namespace.c index 23e81c2a1e3f..847fa8443e8a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -344,6 +344,7 @@ static struct 
mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; @@ -1124,7 +1125,7 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) struct rb_node **link = &ns->mounts.rb_node; struct rb_node *parent = NULL; - WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB); + WARN_ON(mnt_ns_attached(mnt)); mnt->mnt_ns = ns; while (*link) { parent = *link; @@ -1135,7 +1136,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) } rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); - mnt->mnt.mnt_flags |= MNT_ONRB; } /* @@ -1305,7 +1305,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags; - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB); + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1763,7 +1763,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; - if (p->mnt.mnt_flags & MNT_ONRB) + if (mnt_ns_attached(p)) move_from_ns(p, &tmp_list); else list_move(&p->mnt_list, &tmp_list); @@ -1912,16 +1912,14 @@ static int do_umount(struct mount *mnt, int flags) event++; if (flags & MNT_DETACH) { - if (mnt->mnt.mnt_flags & MNT_ONRB || - !list_empty(&mnt->mnt_list)) + if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { - if (mnt->mnt.mnt_flags & MNT_ONRB || - !list_empty(&mnt->mnt_list)) + if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); 
retval = 0; } diff --git a/include/linux/mount.h b/include/linux/mount.h index c34c18b4e8f3..04213d8ef837 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,7 +50,7 @@ struct path; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) #define MNT_INTERNAL 0x4000 @@ -64,7 +64,6 @@ struct path; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 -#define MNT_ONRB 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ From 737d4d91d35b5f7fa5bb442651472277318b0bfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 7 Jan 2025 13:01:05 +0100 Subject: [PATCH 757/807] sched: sch_cake: add bounds checks to host bulk flow fairness counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even though we fixed a logic error in the commit cited below, syzbot still managed to trigger an underflow of the per-host bulk flow counters, leading to an out of bounds memory access. To avoid any such logic errors causing out of bounds memory accesses, this commit factors out all accesses to the per-host bulk flow counters to a series of helpers that perform bounds-checking before any increments and decrements. This also has the benefit of improving readability by moving the conditional checks for the flow mode into these helpers, instead of having them spread out throughout the code (which was the cause of the original logic error). As part of this change, the flow quantum calculation is consolidated into a helper function, which means that the dithering applied to the host load scaling is now applied both in the DRR rotation and when a sparse flow's quantum is first initiated.
The only user-visible effect of this is that the maximum packet size that can be sent while a flow stays sparse will now vary with +/- one byte in some cases. This should not make a noticeable difference in practice, and thus it's not worth complicating the code to preserve the old behaviour. Fixes: 546ea84d07e3 ("sched: sch_cake: fix bulk flow accounting logic for host fairness") Reported-by: syzbot+f63600d288bfb7057424@syzkaller.appspotmail.com Signed-off-by: Toke Høiland-Jørgensen Acked-by: Dave Taht Link: https://patch.msgid.link/20250107120105.70685-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 140 +++++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 65 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 8d8b2db4653c..2c2e2a67f3b2 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -627,6 +627,63 @@ static bool cake_ddst(int flow_mode) return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; } +static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count)) + q->hosts[flow->srchost].srchost_bulk_flow_count--; +} + +static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->srchost].srchost_bulk_flow_count++; +} + +static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count--; +} + +static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + 
q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count++; +} + +static u16 cake_get_flow_quantum(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + u16 host_load = 1; + + if (cake_dsrc(flow_mode)) + host_load = max(host_load, + q->hosts[flow->srchost].srchost_bulk_flow_count); + + if (cake_ddst(flow_mode)) + host_load = max(host_load, + q->hosts[flow->dsthost].dsthost_bulk_flow_count); + + /* The get_random_u16() is a way to apply dithering to avoid + * accumulating roundoff errors + */ + return (q->flow_quantum * quantum_div[host_load] + + get_random_u16()) >> 16; +} + static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { @@ -773,10 +830,8 @@ skip_hash: allocate_dst = cake_ddst(flow_mode); if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { - if (allocate_src) - q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; - if (allocate_dst) - q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; + cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); + cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); } found: /* reserve queue for future packets in same flow */ @@ -801,9 +856,10 @@ found: q->hosts[outer_hash + k].srchost_tag = srchost_hash; found_src: srchost_idx = outer_hash + k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - q->hosts[srchost_idx].srchost_bulk_flow_count++; q->flows[reduced_hash].srchost = srchost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } if (allocate_dst) { @@ -824,9 +880,10 @@ found_src: q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; found_dst: dsthost_idx = outer_hash + k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - q->hosts[dsthost_idx].dsthost_bulk_flow_count++; q->flows[reduced_hash].dsthost = dsthost_idx; + 
+ if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } } @@ -1839,10 +1896,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* flowchain */ if (!flow->set || flow->set == CAKE_SET_DECAYING) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - u16 host_load = 1; - if (!flow->set) { list_add_tail(&flow->flowchain, &b->new_flows); } else { @@ -1852,18 +1905,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; b->sparse_flow_count++; - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - flow->deficit = (b->flow_quantum * - quantum_div[host_load]) >> 16; + flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. 
*/ @@ -1871,12 +1914,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; - + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); } if (q->buffer_used > q->buffer_max_used) @@ -1933,13 +1972,11 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; - struct cake_host *srchost, *dsthost; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; bool first_flow = true; struct sk_buff *skb; - u16 host_load; u64 delay; u32 len; @@ -2039,11 +2076,6 @@ retry: q->cur_flow = flow - b->flows; first_flow = false; - /* triple isolation (modified DRR++) */ - srchost = &b->hosts[flow->srchost]; - dsthost = &b->hosts[flow->dsthost]; - host_load = 1; - /* flow isolation (DRR++) */ if (flow->deficit <= 0) { /* Keep all flows with deficits out of the sparse and decaying @@ -2055,11 +2087,8 @@ retry: b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); flow->set = CAKE_SET_BULK; } else { @@ -2071,19 +2100,7 @@ retry: } } - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - WARN_ON(host_load > CAKE_QUEUES); - - /* The get_random_u16() is a way to apply dithering to avoid - * accumulating roundoff errors - */ - flow->deficit += (b->flow_quantum * quantum_div[host_load] + - get_random_u16()) >> 16; + flow->deficit += 
cake_get_flow_quantum(b, flow, q->flow_mode); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2107,11 +2124,8 @@ retry: if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || @@ -2129,12 +2143,8 @@ retry: else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; - + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); } else b->decaying_flow_count--; From 3f6bc9e3ab9b127171d39f9ac6eca1abb693b731 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 7 Jan 2025 18:39:27 +0000 Subject: [PATCH 758/807] netfs: Fix kernel async DIO Netfslib needs to be able to handle kernel-initiated asynchronous DIO that is supplied with a bio_vec[] array. Currently, because of the async flag, this gets passed to netfs_extract_user_iter() which throws a warning and fails because it only handles IOVEC and UBUF iterators. This can be triggered through a combination of cifs and a loopback blockdev with something like: mount //my/cifs/share /foo dd if=/dev/zero of=/foo/m0 bs=4K count=1K losetup --sector-size 4096 --direct-io=on /dev/loop2046 /foo/m0 echo hello >/dev/loop2046 This causes the following to appear in syslog: WARNING: CPU: 2 PID: 109 at fs/netfs/iterator.c:50 netfs_extract_user_iter+0x170/0x250 [netfs] and the write to fail. Fix this by removing the check in netfs_unbuffered_write_iter_locked() that causes async kernel DIO writes to be handled as userspace writes. 
Note that this change relies on the kernel caller maintaining the existence of the bio_vec array (or kvec[] or folio_queue) until the op is complete. Fixes: 153a9961b551 ("netfs: Implement unbuffered/DIO write support") Reported-by: Nicolas Baranger Closes: https://lore.kernel.org/r/fedd8a40d54b2969097ffa4507979858@3xo.fr/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/608725.1736275167@warthog.procyon.org.uk Tested-by: Nicolas Baranger Acked-by: Paulo Alcantara (Red Hat) cc: Steve French cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/direct_write.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 173e8b5e6a93..f9421f3e6d37 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -67,7 +67,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * * allocate a sufficiently large bvec array and may shorten the * request. */ - if (async || user_backed_iter(iter)) { + if (user_backed_iter(iter)) { n = netfs_extract_user_iter(iter, len, &wreq->iter, 0); if (n < 0) { ret = n; @@ -77,6 +77,11 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * wreq->direct_bv_count = n; wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); } else { + /* If this is a kernel-generated async DIO request, + * assume that any resources the iterator points to + * (eg. a bio_vec array) will persist till the end of + * the op. + */ wreq->iter = *iter; } From 904abff4b1b94184aaa0e9f5fce7821f7b5b81a3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 7 Jan 2025 14:43:30 +0000 Subject: [PATCH 759/807] netfs: Fix read-retry for fs with no ->prepare_read() Fix netfslib's read-retry to only call ->prepare_read() in the backing filesystem if such a function is provided.
We can get to this point if there's an active cache as failed reads from the cache need negotiating with the server instead. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://lore.kernel.org/r/529329.1736261010@warthog.procyon.org.uk cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_retry.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 21b4a54e545e..16b676c68dcd 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -152,7 +152,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) BUG_ON(!len); /* Renegotiate max_len (rsize) */ - if (rreq->netfs_ops->prepare_read(subreq) < 0) { + if (rreq->netfs_ops->prepare_read && + rreq->netfs_ops->prepare_read(subreq) < 0) { trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed); __set_bit(NETFS_SREQ_FAILED, &subreq->flags); } From 426046e2d62dd19533808661e912b8e8a9eaec16 Mon Sep 17 00:00:00 2001 From: Parker Newman Date: Tue, 7 Jan 2025 16:24:59 -0500 Subject: [PATCH 760/807] net: stmmac: dwmac-tegra: Read iommu stream id from device tree Nvidia's Tegra MGBE controllers require the IOMMU "Stream ID" (SID) to be written to the MGBE_WRAP_AXI_ASID0_CTRL register. The current driver is hard coded to use MGBE0's SID for all controllers. This causes softirq time outs and kernel panics when using controllers other than MGBE0. Example dmesg errors when an ethernet cable is connected to MGBE1: [ 116.133290] tegra-mgbe 6910000.ethernet eth1: Link is Up - 1Gbps/Full - flow control rx/tx [ 121.851283] tegra-mgbe 6910000.ethernet eth1: NETDEV WATCHDOG: CPU: 5: transmit queue 0 timed out 5690 ms [ 121.851782] tegra-mgbe 6910000.ethernet eth1: Reset adapter.
[ 121.892464] tegra-mgbe 6910000.ethernet eth1: Register MEM_TYPE_PAGE_POOL RxQ-0 [ 121.905920] tegra-mgbe 6910000.ethernet eth1: PHY [stmmac-1:00] driver [Aquantia AQR113] (irq=171) [ 121.907356] tegra-mgbe 6910000.ethernet eth1: Enabling Safety Features [ 121.907578] tegra-mgbe 6910000.ethernet eth1: IEEE 1588-2008 Advanced Timestamp supported [ 121.908399] tegra-mgbe 6910000.ethernet eth1: registered PTP clock [ 121.908582] tegra-mgbe 6910000.ethernet eth1: configuring for phy/10gbase-r link mode [ 125.961292] tegra-mgbe 6910000.ethernet eth1: Link is Up - 1Gbps/Full - flow control rx/tx [ 181.921198] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: [ 181.921404] rcu: 7-....: (1 GPs behind) idle=540c/1/0x4000000000000002 softirq=1748/1749 fqs=2337 [ 181.921684] rcu: (detected by 4, t=6002 jiffies, g=1357, q=1254 ncpus=8) [ 181.921878] Sending NMI from CPU 4 to CPUs 7: [ 181.921886] NMI backtrace for cpu 7 [ 181.922131] CPU: 7 UID: 0 PID: 0 Comm: swapper/7 Kdump: loaded Not tainted 6.13.0-rc3+ #6 [ 181.922390] Hardware name: NVIDIA CTI Forge + Orin AGX/Jetson, BIOS 202402.1-Unknown 10/28/2024 [ 181.922658] pstate: 40400009 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 181.922847] pc : handle_softirqs+0x98/0x368 [ 181.922978] lr : __do_softirq+0x18/0x20 [ 181.923095] sp : ffff80008003bf50 [ 181.923189] x29: ffff80008003bf50 x28: 0000000000000008 x27: 0000000000000000 [ 181.923379] x26: ffffce78ea277000 x25: 0000000000000000 x24: 0000001c61befda0 [ 181.924486] x23: 0000000060400009 x22: ffffce78e99918bc x21: ffff80008018bd70 [ 181.925568] x20: ffffce78e8bb00d8 x19: ffff80008018bc20 x18: 0000000000000000 [ 181.926655] x17: ffff318ebe7d3000 x16: ffff800080038000 x15: 0000000000000000 [ 181.931455] x14: ffff000080816680 x13: ffff318ebe7d3000 x12: 000000003464d91d [ 181.938628] x11: 0000000000000040 x10: ffff000080165a70 x9 : ffffce78e8bb0160 [ 181.945804] x8 : ffff8000827b3160 x7 : f9157b241586f343 x6 : eeb6502a01c81c74 [ 181.953068] x5 : a4acfcdd2e8096bb x4 : 
ffffce78ea277340 x3 : 00000000ffffd1e1 [ 181.960329] x2 : 0000000000000101 x1 : ffffce78ea277340 x0 : ffff318ebe7d3000 [ 181.967591] Call trace: [ 181.970043] handle_softirqs+0x98/0x368 (P) [ 181.974240] __do_softirq+0x18/0x20 [ 181.977743] ____do_softirq+0x14/0x28 [ 181.981415] call_on_irq_stack+0x24/0x30 [ 181.985180] do_softirq_own_stack+0x20/0x30 [ 181.989379] __irq_exit_rcu+0x114/0x140 [ 181.993142] irq_exit_rcu+0x14/0x28 [ 181.996816] el1_interrupt+0x44/0xb8 [ 182.000316] el1h_64_irq_handler+0x14/0x20 [ 182.004343] el1h_64_irq+0x80/0x88 [ 182.007755] cpuidle_enter_state+0xc4/0x4a8 (P) [ 182.012305] cpuidle_enter+0x3c/0x58 [ 182.015980] cpuidle_idle_call+0x128/0x1c0 [ 182.020005] do_idle+0xe0/0xf0 [ 182.023155] cpu_startup_entry+0x3c/0x48 [ 182.026917] secondary_start_kernel+0xdc/0x120 [ 182.031379] __secondary_switched+0x74/0x78 [ 212.971162] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 7-.... } 6103 jiffies s: 417 root: 0x80/. [ 212.985935] rcu: blocking rcu_node structures (internal RCU debug): [ 212.992758] Sending NMI from CPU 0 to CPUs 7: [ 212.998539] NMI backtrace for cpu 7 [ 213.004304] CPU: 7 UID: 0 PID: 0 Comm: swapper/7 Kdump: loaded Not tainted 6.13.0-rc3+ #6 [ 213.016116] Hardware name: NVIDIA CTI Forge + Orin AGX/Jetson, BIOS 202402.1-Unknown 10/28/2024 [ 213.030817] pstate: 40400009 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 213.040528] pc : handle_softirqs+0x98/0x368 [ 213.046563] lr : __do_softirq+0x18/0x20 [ 213.051293] sp : ffff80008003bf50 [ 213.055839] x29: ffff80008003bf50 x28: 0000000000000008 x27: 0000000000000000 [ 213.067304] x26: ffffce78ea277000 x25: 0000000000000000 x24: 0000001c61befda0 [ 213.077014] x23: 0000000060400009 x22: ffffce78e99918bc x21: ffff80008018bd70 [ 213.087339] x20: ffffce78e8bb00d8 x19: ffff80008018bc20 x18: 0000000000000000 [ 213.097313] x17: ffff318ebe7d3000 x16: ffff800080038000 x15: 0000000000000000 [ 213.107201] x14: ffff000080816680 x13: ffff318ebe7d3000 x12: 000000003464d91d [ 
213.116651] x11: 0000000000000040 x10: ffff000080165a70 x9 : ffffce78e8bb0160 [ 213.127500] x8 : ffff8000827b3160 x7 : 0a37b344852820af x6 : 3f049caedd1ff608 [ 213.138002] x5 : cff7cfdbfaf31291 x4 : ffffce78ea277340 x3 : 00000000ffffde04 [ 213.150428] x2 : 0000000000000101 x1 : ffffce78ea277340 x0 : ffff318ebe7d3000 [ 213.162063] Call trace: [ 213.165494] handle_softirqs+0x98/0x368 (P) [ 213.171256] __do_softirq+0x18/0x20 [ 213.177291] ____do_softirq+0x14/0x28 [ 213.182017] call_on_irq_stack+0x24/0x30 [ 213.186565] do_softirq_own_stack+0x20/0x30 [ 213.191815] __irq_exit_rcu+0x114/0x140 [ 213.196891] irq_exit_rcu+0x14/0x28 [ 213.202401] el1_interrupt+0x44/0xb8 [ 213.207741] el1h_64_irq_handler+0x14/0x20 [ 213.213519] el1h_64_irq+0x80/0x88 [ 213.217541] cpuidle_enter_state+0xc4/0x4a8 (P) [ 213.224364] cpuidle_enter+0x3c/0x58 [ 213.228653] cpuidle_idle_call+0x128/0x1c0 [ 213.233993] do_idle+0xe0/0xf0 [ 213.237928] cpu_startup_entry+0x3c/0x48 [ 213.243791] secondary_start_kernel+0xdc/0x120 [ 213.249830] __secondary_switched+0x74/0x78 This bug has existed since the dwmac-tegra driver was added in Dec 2022 (See Fixes tag below for commit hash). The Tegra234 SOC has 4 MGBE controllers, however Nvidia's Developer Kit only uses MGBE0 which is why the bug was not found previously. Connect Tech has many products that use 2 (or more) MGBE controllers. The solution is to read the controller's SID from the existing "iommus" device tree property. The 2nd field of the "iommus" device tree property is the controller's SID. Device tree snippet from tegra234.dtsi showing MGBE1's "iommus" property: smmu_niso0: iommu@12000000 { compatible = "nvidia,tegra234-smmu", "nvidia,smmu-500"; ... } /* MGBE1 */ ethernet@6900000 { compatible = "nvidia,tegra234-mgbe"; ... iommus = <&smmu_niso0 TEGRA234_SID_MGBE_VF1>; ... } Nvidia's arm-smmu driver reads the "iommus" property and stores the SID in the MGBE device's "fwspec" struct. 
The dwmac-tegra driver can access the SID using the tegra_dev_iommu_get_stream_id() helper function found in linux/iommu.h. Calling tegra_dev_iommu_get_stream_id() should not fail unless the "iommus" property is removed from the device tree or the IOMMU is disabled. While the Tegra234 SOC technically supports bypassing the IOMMU, it is not supported by the current firmware, has not been tested and not recommended. More detailed discussion with Thierry Reding from Nvidia linked below. Fixes: d8ca113724e7 ("net: stmmac: tegra: Add MGBE support") Link: https://lore.kernel.org/netdev/cover.1731685185.git.pnewman@connecttech.com Signed-off-by: Parker Newman Reviewed-by: Andrew Lunn Acked-by: Thierry Reding Link: https://patch.msgid.link/6fb97f32cf4accb4f7cf92846f6b60064ba0a3bd.1736284360.git.pnewman@connecttech.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c index 3827997d2132..dc903b846b1b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include @@ -19,6 +20,8 @@ struct tegra_mgbe { struct reset_control *rst_mac; struct reset_control *rst_pcs; + u32 iommu_sid; + void __iomem *hv; void __iomem *regs; void __iomem *xpcs; @@ -50,7 +53,6 @@ struct tegra_mgbe { #define MGBE_WRAP_COMMON_INTR_ENABLE 0x8704 #define MAC_SBD_INTR BIT(2) #define MGBE_WRAP_AXI_ASID0_CTRL 0x8400 -#define MGBE_SID 0x6 static int __maybe_unused tegra_mgbe_suspend(struct device *dev) { @@ -84,7 +86,7 @@ static int __maybe_unused tegra_mgbe_resume(struct device *dev) writel(MAC_SBD_INTR, mgbe->regs + MGBE_WRAP_COMMON_INTR_ENABLE); /* Program SID */ - writel(MGBE_SID, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); + writel(mgbe->iommu_sid, 
mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_STATUS); if ((value & XPCS_WRAP_UPHY_STATUS_TX_P_UP) == 0) { @@ -241,6 +243,12 @@ static int tegra_mgbe_probe(struct platform_device *pdev) if (IS_ERR(mgbe->xpcs)) return PTR_ERR(mgbe->xpcs); + /* get controller's stream id from iommu property in device tree */ + if (!tegra_dev_iommu_get_stream_id(mgbe->dev, &mgbe->iommu_sid)) { + dev_err(mgbe->dev, "failed to get iommu stream id\n"); + return -EINVAL; + } + res.addr = mgbe->regs; res.irq = irq; @@ -346,7 +354,7 @@ static int tegra_mgbe_probe(struct platform_device *pdev) writel(MAC_SBD_INTR, mgbe->regs + MGBE_WRAP_COMMON_INTR_ENABLE); /* Program SID */ - writel(MGBE_SID, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); + writel(mgbe->iommu_sid, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); plat->flags |= STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP; From 17a4fde81d3a7478d97d15304a6d61094a10c2e3 Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Tue, 7 Jan 2025 14:52:32 +0000 Subject: [PATCH 761/807] afs: Fix merge preference rule failure condition syzbot reported a lock held when returning to userspace[1]. This is because if argc is less than 0 and the function returns directly, the held inode lock is not released. Fix this by storing the error in ret and jumping to done to clean up instead of returning directly. [dh: Modified Lizhi Xu's original patch to make it honour the error code from afs_split_string()] [1] WARNING: lock held when returning to user space! 6.13.0-rc3-syzkaller-00209-g499551201b5f #0 Not tainted ------------------------------------------------ syz-executor133/5823 is leaving the kernel with locks still held!
1 lock held by syz-executor133/5823: #0: ffff888071cffc00 (&sb->s_type->i_mutex_key#9){++++}-{4:4}, at: inode_lock include/linux/fs.h:818 [inline] #0: ffff888071cffc00 (&sb->s_type->i_mutex_key#9){++++}-{4:4}, at: afs_proc_addr_prefs_write+0x2bb/0x14e0 fs/afs/addr_prefs.c:388 Reported-by: syzbot+76f33569875eb708e575@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=76f33569875eb708e575 Signed-off-by: Lizhi Xu Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241226012616.2348907-1-lizhi.xu@windriver.com/ Link: https://lore.kernel.org/r/529850.1736261552@warthog.procyon.org.uk Tested-by: syzbot+76f33569875eb708e575@syzkaller.appspotmail.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/addr_prefs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c index a189ff8a5034..c0384201b8fe 100644 --- a/fs/afs/addr_prefs.c +++ b/fs/afs/addr_prefs.c @@ -413,8 +413,10 @@ int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size) do { argc = afs_split_string(&buf, argv, ARRAY_SIZE(argv)); - if (argc < 0) - return argc; + if (argc < 0) { + ret = argc; + goto done; + } if (argc < 2) goto inval; From 2055272e3ae01a954e41a5afb437c5d76f758e0b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Jan 2025 12:15:53 +0300 Subject: [PATCH 762/807] rtase: Fix a check for error in rtase_alloc_msix() The pci_irq_vector() function never returns zero. It returns negative error codes or a positive non-zero IRQ number. Fix the error checking to test for negatives. 
Fixes: a36e9f5cfe9e ("rtase: Add support for a pci table in this module") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Reviewed-by: Kalesh AP Link: https://patch.msgid.link/f2ecc88d-af13-4651-9820-7cc665230019@stanley.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/rtase/rtase_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c index de7f11232593..c42c0516656b 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c @@ -1827,7 +1827,7 @@ static int rtase_alloc_msix(struct pci_dev *pdev, struct rtase_private *tp) for (i = 0; i < tp->int_nums; i++) { irq = pci_irq_vector(pdev, i); - if (!irq) { + if (irq < 0) { pci_disable_msix(pdev); return irq; } From 0e2909c6bec9048f49d0c8e16887c63b50b14647 Mon Sep 17 00:00:00 2001 From: Chenguang Zhao Date: Wed, 8 Jan 2025 11:00:09 +0800 Subject: [PATCH 763/807] net/mlx5: Fix variable not being completed when function returns When cmd_alloc_index() fails, cmd_work_handler() needs to complete ent->slotted before returning early. Otherwise the task which issued the command may hang: mlx5_core 0000:01:00.0: cmd_work_handler:877:(pid 3880418): failed to allocate command entry INFO: task kworker/13:2:4055883 blocked for more than 120 seconds. Not tainted 4.19.90-25.44.v2101.ky10.aarch64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kworker/13:2 D 0 4055883 2 0x00000228 Workqueue: events mlx5e_tx_dim_work [mlx5_core] Call trace: __switch_to+0xe8/0x150 __schedule+0x2a8/0x9b8 schedule+0x2c/0x88 schedule_timeout+0x204/0x478 wait_for_common+0x154/0x250 wait_for_completion+0x28/0x38 cmd_exec+0x7a0/0xa00 [mlx5_core] mlx5_cmd_exec+0x54/0x80 [mlx5_core] mlx5_core_modify_cq+0x6c/0x80 [mlx5_core] mlx5_core_modify_cq_moderation+0xa0/0xb8 [mlx5_core] mlx5e_tx_dim_work+0x54/0x68 [mlx5_core] process_one_work+0x1b0/0x448 worker_thread+0x54/0x468 kthread+0x134/0x138 ret_from_fork+0x10/0x18 Fixes: 485d65e13571 ("net/mlx5: Add a timeout to acquire the command queue semaphore") Signed-off-by: Chenguang Zhao Reviewed-by: Moshe Shemesh Acked-by: Tariq Toukan Link: https://patch.msgid.link/20250108030009.68520-1-zhaochenguang@kylinos.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 6bd8a18e3af3..e733b81e18a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1013,6 +1013,7 @@ static void cmd_work_handler(struct work_struct *work) complete(&ent->done); } up(&cmd->vars.sem); + complete(&ent->slotted); return; } } else { From d58200966ed7985be48d342e99a5e81bc481821c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:35 -0800 Subject: [PATCH 764/807] MAINTAINERS: mark Synopsys DW XPCS as Orphan There's not much review support from Jose, there is a sharp drop in his participation around 4 years ago. The DW XPCS IP is very popular and the driver requires active maintenance. 
gitdm missingmaints says: Subsystem SYNOPSYS DESIGNWARE ETHERNET XPCS DRIVER Changes 33 / 94 (35%) (No activity) Top reviewers: [16]: andrew@lunn.ch [12]: vladimir.oltean@nxp.com [2]: f.fainelli@gmail.com INACTIVE MAINTAINER Jose Abreu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250108155242.2575530-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index b1777b53c63a..2a5f5f49269f 100644 --- a/CREDITS +++ b/CREDITS @@ -20,6 +20,10 @@ N: Thomas Abraham E: thomas.ab@samsung.com D: Samsung pin controller driver +N: Jose Abreu +E: jose.abreu@synopsys.com +D: Synopsys DesignWare XPCS MDIO/PCS driver. + N: Dragos Acostachioaie E: dragos@iname.com W: http://www.arbornet.org/~dragos diff --git a/MAINTAINERS b/MAINTAINERS index 52378f994294..d92c8cd9f805 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22737,9 +22737,8 @@ S: Supported F: drivers/net/ethernet/synopsys/ SYNOPSYS DESIGNWARE ETHERNET XPCS DRIVER -M: Jose Abreu L: netdev@vger.kernel.org -S: Supported +S: Orphan F: drivers/net/pcs/pcs-xpcs.c F: drivers/net/pcs/pcs-xpcs.h F: include/linux/pcs/pcs-xpcs.h From b506668613ef9138cac7479a5dd47559835b6552 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:36 -0800 Subject: [PATCH 765/807] MAINTAINERS: update maintainers for Microchip LAN78xx Woojung Huh seems to have only replied to the list 35 times in the last 5 years, and didn't provide any reviews in 3 years. The LAN78XX driver has seen quite a bit of activity lately. gitdm missingmaints says: Subsystem USB LAN78XX ETHERNET DRIVER Changes 35 / 91 (38%) (No activity) Top reviewers: [23]: andrew@lunn.ch [3]: horms@kernel.org [2]: mateusz.polchlopek@intel.com INACTIVE MAINTAINER Woojung Huh Move Woojung to CREDITS and add new maintainers who are more likely to review LAN78xx patches. 
Acked-by: Woojung Huh Acked-by: Rengarajan Sundararajan Link: https://patch.msgid.link/20250108155242.2575530-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 2a5f5f49269f..7a5332907ef0 100644 --- a/CREDITS +++ b/CREDITS @@ -1816,6 +1816,10 @@ D: Author/maintainer of most DRM drivers (especially ATI, MGA) D: Core DRM templates, general DRM and 3D-related hacking S: No fixed address +N: Woojung Huh +E: woojung.huh@microchip.com +D: Microchip LAN78XX USB Ethernet driver + N: Kenn Humborg E: kenn@wombat.ie D: Mods to loop device to support sparse backing files diff --git a/MAINTAINERS b/MAINTAINERS index d92c8cd9f805..58fb3c81c735 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24252,7 +24252,8 @@ F: Documentation/devicetree/bindings/usb/nxp,isp1760.yaml F: drivers/usb/isp1760/* USB LAN78XX ETHERNET DRIVER -M: Woojung Huh +M: Thangaraj Samynathan +M: Rengarajan Sundararajan M: UNGLinuxDriver@microchip.com L: netdev@vger.kernel.org S: Maintained From e049fb86d39139050bb792b17ef86c3918cc8068 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:37 -0800 Subject: [PATCH 766/807] MAINTAINERS: remove Andy Gospodarek from bonding Andy does not participate much in bonding reviews, unfortunately. Move him to CREDITS. 
gitdm missingmaint says: Subsystem BONDING DRIVER Changes 149 / 336 (44%) Last activity: 2024-09-05 Jay Vosburgh : Tags 68db604e16d5 2024-09-05 00:00:00 8 Andy Gospodarek : Top reviewers: [65]: jay.vosburgh@canonical.com [23]: liuhangbin@gmail.com [16]: razor@blackwall.org INACTIVE MAINTAINER Andy Gospodarek Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250108155242.2575530-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 7a5332907ef0..cda68f04d5f1 100644 --- a/CREDITS +++ b/CREDITS @@ -1432,6 +1432,10 @@ S: 8124 Constitution Apt. 7 S: Sterling Heights, Michigan 48313 S: USA +N: Andy Gospodarek +E: andy@greyhouse.net +D: Maintenance and contributions to the network interface bonding driver. + N: Wolfgang Grandegger E: wg@grandegger.com D: Controller Area Network (device drivers) diff --git a/MAINTAINERS b/MAINTAINERS index 58fb3c81c735..c518bda0215f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4058,7 +4058,6 @@ F: net/bluetooth/ BONDING DRIVER M: Jay Vosburgh -M: Andy Gospodarek L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/bonding.rst From 03868822c553e549ac5c28781c29f80bddee5487 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:38 -0800 Subject: [PATCH 767/807] MAINTAINERS: mark stmmac ethernet as an Orphan I tried a couple of things to reinvigorate the stmmac maintainers over the last few years but with little effect. The maintainers are not active, let the MAINTAINERS file reflect reality. The Synopsys IP this driver supports is very popular we need a solid maintainer to deal with the complexity of the driver. 
gitdm missingmaints says: Subsystem STMMAC ETHERNET DRIVER Changes 344 / 978 (35%) Last activity: 2020-05-01 Alexandre Torgue : Tags 1bb694e20839 2020-05-01 00:00:00 1 Jose Abreu : Top reviewers: [75]: horms@kernel.org [49]: andrew@lunn.ch [46]: fancer.lancer@gmail.com INACTIVE MAINTAINER Jose Abreu Acked-by: Alexandre Torgue Link: https://patch.msgid.link/20250108155242.2575530-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c518bda0215f..955092ed27d6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22500,11 +22500,8 @@ F: Documentation/devicetree/bindings/phy/st,stm32mp25-combophy.yaml F: drivers/phy/st/phy-stm32-combophy.c STMMAC ETHERNET DRIVER -M: Alexandre Torgue -M: Jose Abreu L: netdev@vger.kernel.org -S: Supported -W: http://www.stlinux.com +S: Orphan F: Documentation/networking/device_drivers/ethernet/stmicro/ F: drivers/net/ethernet/stmicro/stmmac/ From 9d7b1191d030bb0f6932722755b1103a2207421d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:39 -0800 Subject: [PATCH 768/807] MAINTAINERS: remove Mark Lee from MediaTek Ethernet The mailing lists have seen no email from Mark Lee in the last 4 years. 
gitdm missingmaints says: Subsystem MEDIATEK ETHERNET DRIVER Changes 103 / 400 (25%) Last activity: 2024-12-19 Felix Fietkau : Author 88806efc034a 2024-10-17 00:00:00 44 Tags 88806efc034a 2024-10-17 00:00:00 51 Sean Wang : Tags a5d75538295b 2020-04-07 00:00:00 1 Mark Lee : Lorenzo Bianconi : Author 0c7469ee718e 2024-12-19 00:00:00 123 Tags 0c7469ee718e 2024-12-19 00:00:00 139 Top reviewers: [32]: horms@kernel.org [15]: leonro@nvidia.com [9]: andrew@lunn.ch INACTIVE MAINTAINER Mark Lee Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250108155242.2575530-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 955092ed27d6..82157f7e01e4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14565,7 +14565,6 @@ F: drivers/dma/mediatek/ MEDIATEK ETHERNET DRIVER M: Felix Fietkau M: Sean Wang -M: Mark Lee M: Lorenzo Bianconi L: netdev@vger.kernel.org S: Maintained From d4782fbab1c06fe1a3b1e064d2d6efd3e281e805 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:40 -0800 Subject: [PATCH 769/807] MAINTAINERS: remove Ying Xue from TIPC There is a steady stream of fixes for TIPC, even tho the development has slowed down a lot. 
Over the last 2 years we have merged almost 70 TIPC patches, but we haven't heard from Ying Xue once: Subsystem TIPC NETWORK LAYER Changes 42 / 69 (60%) Last activity: 2023-10-04 Jon Maloy : Tags 08e50cf07184 2023-10-04 00:00:00 6 Ying Xue : Top reviewers: [9]: horms@kernel.org [8]: tung.q.nguyen@dektech.com.au [4]: jiri@nvidia.com [3]: tung.q.nguyen@endava.com [2]: kuniyu@amazon.com INACTIVE MAINTAINER Ying Xue Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250108155242.2575530-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 82157f7e01e4..b060b1fe8762 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23641,7 +23641,6 @@ F: tools/testing/selftests/timers/ TIPC NETWORK LAYER M: Jon Maloy -M: Ying Xue L: netdev@vger.kernel.org (core kernel code) L: tipc-discussion@lists.sourceforge.net (user apps, general discussion) S: Maintained From d95e2cc737017de537fc07cfc7d59307182bd0bc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:41 -0800 Subject: [PATCH 770/807] MAINTAINERS: remove Noam Dagan from AMAZON ETHERNET Noam Dagan was added to ENA reviewers in 2021, but we have not seen a single email from this person to any list, ever (according to lore). Git history mentions the name in 2 SoB tags from 2020.
Acked-by: Arthur Kiyanovski Link: https://patch.msgid.link/20250108155242.2575530-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b060b1fe8762..4996219d95b2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -949,7 +949,6 @@ AMAZON ETHERNET DRIVERS M: Shay Agroskin M: Arthur Kiyanovski R: David Arinzon -R: Noam Dagan R: Saeed Bishara L: netdev@vger.kernel.org S: Supported From d9e03c6ffc4cd92c99418afc970ea8c8c53c66a8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Jan 2025 07:52:42 -0800 Subject: [PATCH 771/807] MAINTAINERS: remove Lars Povlsen from Microchip Sparx5 SoC We have not seen emails or tags from Lars in almost 4 years. Steen and Daniel are pretty active, but the review coverage isn't stellar (35% of changes go in without a review tag). Subsystem ARM/Microchip Sparx5 SoC support Changes 28 / 79 (35%) Last activity: 2024-11-24 Lars Povlsen : Steen Hegelund : Tags 6c7c4b91aa43 2024-04-08 00:00:00 15 Daniel Machon : Author 48ba00da2eb4 2024-04-09 00:00:00 2 Tags f164b296638d 2024-11-24 00:00:00 6 Top reviewers: [7]: horms@kernel.org [1]: jacob.e.keller@intel.com [1]: jensemil.schulzostergaard@microchip.com [1]: horatiu.vultur@microchip.com INACTIVE MAINTAINER Lars Povlsen Acked-by: Daniel Machon Link: https://patch.msgid.link/20250108155242.2575530-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4996219d95b2..4e93a00df185 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2689,7 +2689,6 @@ N: at91 N: atmel ARM/Microchip Sparx5 SoC support -M: Lars Povlsen M: Steen Hegelund M: Daniel Machon M: UNGLinuxDriver@microchip.com From 771ec78dc8b48d562e6015bb535ed3cd37043d78 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:29 +0100 Subject: [PATCH 772/807] mptcp: sysctl: avail sched: remove write access 'net.mptcp.available_schedulers' 
sysctl knob is there to list available schedulers, not to modify this list. There is therefore no reason to give write access to it. Nothing would have been written anyway, but no errors would have been returned, which is unexpected. Fixes: 73c900aa3660 ("mptcp: add net.mptcp.available_schedulers") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-1-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 38d8121331d4..d9b57fab2a13 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -228,7 +228,7 @@ static struct ctl_table mptcp_sysctl_table[] = { { .procname = "available_schedulers", .maxlen = MPTCP_SCHED_BUF_MAX, - .mode = 0644, + .mode = 0444, .proc_handler = proc_available_schedulers, }, { From d38e26e36206ae3d544d496513212ae931d1da0a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:30 +0100 Subject: [PATCH 773/807] mptcp: sysctl: sched: avoid using current->nsproxy Using the 'net' structure via 'current' is not recommended for different reasons. First, if the goal is to use it to read or write per-netns data, this is inconsistent with how the "generic" sysctl entries do it: directly by only using pointers set to the table entry, e.g. table->data. Linked to that, the per-netns data should always be obtained from the table linked to the netns it had been created for, which may not coincide with the reader's or writer's netns. Another reason is that access to current->nsproxy->net_ns can oops if attempted after current->nsproxy has been dropped because the current task is exiting.
This is what syzbot found, when using acct(2): Oops: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] CPU: 1 UID: 0 PID: 5924 Comm: syz-executor Not tainted 6.13.0-rc5-syzkaller-00004-gccb98ccef0e5 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:proc_scheduler+0xc6/0x3c0 net/mptcp/ctrl.c:125 Code: 03 42 80 3c 38 00 0f 85 fe 02 00 00 4d 8b a4 24 08 09 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7c 24 28 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 cc 02 00 00 4d 8b 7c 24 28 48 8d 84 24 c8 00 00 RSP: 0018:ffffc900034774e8 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 1ffff9200068ee9e RCX: ffffc90003477620 RDX: 0000000000000005 RSI: ffffffff8b08f91e RDI: 0000000000000028 RBP: 0000000000000001 R08: ffffc90003477710 R09: 0000000000000040 R10: 0000000000000040 R11: 00000000726f7475 R12: 0000000000000000 R13: ffffc90003477620 R14: ffffc90003477710 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fee3cd452d8 CR3: 000000007d116000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: proc_sys_call_handler+0x403/0x5d0 fs/proc/proc_sysctl.c:601 __kernel_write_iter+0x318/0xa80 fs/read_write.c:612 __kernel_write+0xf6/0x140 fs/read_write.c:632 do_acct_process+0xcb0/0x14a0 kernel/acct.c:539 acct_pin_kill+0x2d/0x100 kernel/acct.c:192 pin_kill+0x194/0x7c0 fs/fs_pin.c:44 mnt_pin_kill+0x61/0x1e0 fs/fs_pin.c:81 cleanup_mnt+0x3ac/0x450 fs/namespace.c:1366 task_work_run+0x14e/0x250 kernel/task_work.c:239 exit_task_work include/linux/task_work.h:43 [inline] do_exit+0xad8/0x2d70 kernel/exit.c:938 do_group_exit+0xd3/0x2a0 kernel/exit.c:1087 get_signal+0x2576/0x2610 kernel/signal.c:3017 
arch_do_signal_or_restart+0x90/0x7e0 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218 do_syscall_64+0xda/0x250 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fee3cb87a6a Code: Unable to access opcode bytes at 0x7fee3cb87a40. RSP: 002b:00007fffcccac688 EFLAGS: 00000202 ORIG_RAX: 0000000000000037 RAX: 0000000000000000 RBX: 00007fffcccac710 RCX: 00007fee3cb87a6a RDX: 0000000000000041 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 0000000000000003 R08: 00007fffcccac6ac R09: 00007fffcccacac7 R10: 00007fffcccac710 R11: 0000000000000202 R12: 00007fee3cd49500 R13: 00007fffcccac6ac R14: 0000000000000000 R15: 00007fee3cd4b000 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:proc_scheduler+0xc6/0x3c0 net/mptcp/ctrl.c:125 Code: 03 42 80 3c 38 00 0f 85 fe 02 00 00 4d 8b a4 24 08 09 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7c 24 28 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 cc 02 00 00 4d 8b 7c 24 28 48 8d 84 24 c8 00 00 RSP: 0018:ffffc900034774e8 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 1ffff9200068ee9e RCX: ffffc90003477620 RDX: 0000000000000005 RSI: ffffffff8b08f91e RDI: 0000000000000028 RBP: 0000000000000001 R08: ffffc90003477710 R09: 0000000000000040 R10: 0000000000000040 R11: 00000000726f7475 R12: 0000000000000000 R13: ffffc90003477620 R14: ffffc90003477710 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fee3cd452d8 CR3: 000000007d116000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 ---------------- Code disassembly (best guess), 1 bytes skipped: 0: 42 80 3c 38 00 cmpb 
$0x0,(%rax,%r15,1) 5: 0f 85 fe 02 00 00 jne 0x309 b: 4d 8b a4 24 08 09 00 mov 0x908(%r12),%r12 12: 00 13: 48 b8 00 00 00 00 00 movabs $0xdffffc0000000000,%rax 1a: fc ff df 1d: 49 8d 7c 24 28 lea 0x28(%r12),%rdi 22: 48 89 fa mov %rdi,%rdx 25: 48 c1 ea 03 shr $0x3,%rdx * 29: 80 3c 02 00 cmpb $0x0,(%rdx,%rax,1) <-- trapping instruction 2d: 0f 85 cc 02 00 00 jne 0x2ff 33: 4d 8b 7c 24 28 mov 0x28(%r12),%r15 38: 48 rex.W 39: 8d .byte 0x8d 3a: 84 24 c8 test %ah,(%rax,%rcx,8) Here with 'net.mptcp.scheduler', the 'net' structure is not really needed, because the table->data already has a pointer to the current scheduler, the only thing needed from the per-netns data. Simply use 'data', instead of getting (most of the time) the same thing, but from a longer and indirect way. Fixes: 6963c508fd7a ("mptcp: only allow set existing scheduler for net.mptcp.scheduler") Cc: stable@vger.kernel.org Reported-by: syzbot+e364f774c6f57f2c86d1@syzkaller.appspotmail.com Closes: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com Suggested-by: Al Viro Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-2-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index d9b57fab2a13..81c30aa02196 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -102,16 +102,15 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) } #ifdef CONFIG_SYSCTL -static int mptcp_set_scheduler(const struct net *net, const char *name) +static int mptcp_set_scheduler(char *scheduler, const char *name) { - struct mptcp_pernet *pernet = mptcp_get_pernet(net); struct mptcp_sched_ops *sched; int ret = 0; rcu_read_lock(); sched = mptcp_sched_find(name); if (sched) - strscpy(pernet->scheduler, name, MPTCP_SCHED_NAME_MAX); + strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX); else ret 
= -ENOENT; rcu_read_unlock(); @@ -122,7 +121,7 @@ static int mptcp_set_scheduler(const struct net *net, const char *name) static int proc_scheduler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - const struct net *net = current->nsproxy->net_ns; + char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data; char val[MPTCP_SCHED_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -130,11 +129,11 @@ static int proc_scheduler(const struct ctl_table *ctl, int write, }; int ret; - strscpy(val, mptcp_get_scheduler(net), MPTCP_SCHED_NAME_MAX); + strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = mptcp_set_scheduler(net, val); + ret = mptcp_set_scheduler(*scheduler, val); return ret; } From 92cf7a51bdae24a32c592adcdd59a773ae149289 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:31 +0100 Subject: [PATCH 774/807] mptcp: sysctl: blackhole timeout: avoid using current->nsproxy As mentioned in the previous commit, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'pernet' structure can be obtained from the table->data using container_of(). 
Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-3-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 81c30aa02196..b0dd008e2114 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -160,7 +160,9 @@ static int proc_blackhole_detect_timeout(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct mptcp_pernet *pernet = mptcp_get_pernet(current->nsproxy->net_ns); + struct mptcp_pernet *pernet = container_of(table->data, + struct mptcp_pernet, + blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); From ea62dd1383913b5999f3d16ae99d411f41b528d4 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:32 +0100 Subject: [PATCH 775/807] sctp: sysctl: cookie_hmac_alg: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, to use '*data' everywhere 'net->sctp.sctp_hmac_alg' is used. 
Fixes: 3c68198e7511 ("sctp: Make hmac algorithm selection for cookie generation dynamic") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-4-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index e5a5af343c4c..9848d19630a4 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -387,7 +387,8 @@ static struct ctl_table sctp_net_table[] = { static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, + sctp.sctp_hmac_alg); struct ctl_table tbl; bool changed = false; char *none = "none"; From 9fc17b76fc70763780aa78b38fcf4742384044a5 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:33 +0100 Subject: [PATCH 776/807] sctp: sysctl: rto_min/max: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, to use '*data' everywhere 'net->sctp.rto_min/max' is used. 
Fixes: 4f3fdf3bc59c ("sctp: add check rto_min and rto_max in sysctl") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-5-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 9848d19630a4..a5285815264d 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -433,7 +433,7 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.rto_min); unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; @@ -461,7 +461,7 @@ static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, static int proc_sctp_do_rto_max(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.rto_max); unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; From 15649fd5415eda664ef35780c2013adeb5d9c695 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:34 +0100 Subject: [PATCH 777/807] sctp: sysctl: auth_enable: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. 
- current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, but that would increase the size of this fix, while 'sctp.ctl_sock' still needs to be retrieved from 'net' structure. Fixes: b14878ccb7fa ("net: sctp: cache auth_enable per endpoint") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-6-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index a5285815264d..9d29611621fe 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -499,7 +499,7 @@ static int proc_sctp_do_alpha_beta(const struct ctl_table *ctl, int write, static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.auth_enable); struct ctl_table tbl; int new_value, ret; From c10377bbc1972d858eaf0ab366a311b39f8ef1b6 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:35 +0100 Subject: [PATCH 778/807] sctp: sysctl: udp_port: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). 
The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, but that would increase the size of this fix, while 'sctp.ctl_sock' still needs to be retrieved from 'net' structure. Fixes: 046c052b475e ("sctp: enable udp tunneling socks") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-7-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 9d29611621fe..18fa4f44e8ec 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -528,7 +528,7 @@ static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.udp_port); unsigned int min = *(unsigned int *)ctl->extra1; unsigned int max = *(unsigned int *)ctl->extra2; struct ctl_table tbl; From 6259d2484d0ceff42245d1f09cc8cb6ee72d847a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:36 +0100 Subject: [PATCH 779/807] sctp: sysctl: plpmtud_probe_interval: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). 
Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, to use '*data' everywhere 'net->sctp.probe_interval' is used. Fixes: d1e462a7a5f3 ("sctp: add probe_interval in sysctl and sock/asoc/transport") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-8-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 18fa4f44e8ec..8e1e97be4df7 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -569,7 +569,8 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, static int proc_sctp_do_probe_interval(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, + sctp.probe_interval); struct ctl_table tbl; int ret, new_value; From 7f5611cbc4871c7fb1ad36c2e5a9edad63dca95c Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:37 +0100 Subject: [PATCH 780/807] rds: sysctl: rds_tcp_{rcv,snd}buf: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The per-netns structure can be obtained from the table->data using container_of(), then the 'net' one can be retrieved from the listen socket (if available). 
Fixes: c6a58ffed536 ("RDS: TCP: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-9-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/rds/tcp.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 351ac1747224..0581c53e6517 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -61,8 +61,10 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; -static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, - void *buffer, size_t *lenp, loff_t *fpos); +static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos); +static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -74,7 +76,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = { /* data is per-net pointer */ .maxlen = sizeof(int), .mode = 0644, - .proc_handler = rds_tcp_skbuf_handler, + .proc_handler = rds_tcp_sndbuf_handler, .extra1 = &rds_tcp_min_sndbuf, }, #define RDS_TCP_RCVBUF 1 @@ -83,7 +85,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = { /* data is per-net pointer */ .maxlen = sizeof(int), .mode = 0644, - .proc_handler = rds_tcp_skbuf_handler, + .proc_handler = rds_tcp_rcvbuf_handler, .extra1 = &rds_tcp_min_rcvbuf, }, }; @@ -682,10 +684,10 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_unlock_irq(&rds_tcp_conn_lock); } -static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, +static int rds_tcp_skbuf_handler(struct rds_tcp_net *rtn, + const struct ctl_table *ctl, 
int write, void *buffer, size_t *lenp, loff_t *fpos) { - struct net *net = current->nsproxy->net_ns; int err; err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos); @@ -694,11 +696,34 @@ static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, *(int *)(ctl->extra1)); return err; } - if (write) + + if (write && rtn->rds_tcp_listen_sock && rtn->rds_tcp_listen_sock->sk) { + struct net *net = sock_net(rtn->rds_tcp_listen_sock->sk); + rds_tcp_sysctl_reset(net); + } + return 0; } +static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos) +{ + struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net, + sndbuf_size); + + return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos); +} + +static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos) +{ + struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net, + rcvbuf_size); + + return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos); +} + static void rds_tcp_exit(void) { rds_tcp_set_unloading(); From 503465d4dc40849af3cc18a517a5c06e155c5e33 Mon Sep 17 00:00:00 2001 From: Yong-Xuan Wang Date: Fri, 20 Dec 2024 17:17:26 +0800 Subject: [PATCH 781/807] tools: selftests: riscv: Add pass message for v_initval_nolibc Add the pass message after we successfully complete the test. 
Fixes: 5c93c4c72fbc ("selftests: Test RISC-V Vector's first-use handler") Signed-off-by: Yong-Xuan Wang Reviewed-by: Andrew Jones Reviewed-by: Andy Chiu Link: https://lore.kernel.org/r/20241220091730.28006-2-yongxuan.wang@sifive.com Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/vector/v_initval_nolibc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/riscv/vector/v_initval_nolibc.c b/tools/testing/selftests/riscv/vector/v_initval_nolibc.c index 1dd94197da30..6174ffe016dc 100644 --- a/tools/testing/selftests/riscv/vector/v_initval_nolibc.c +++ b/tools/testing/selftests/riscv/vector/v_initval_nolibc.c @@ -25,6 +25,8 @@ int main(void) unsigned long vl; char *datap, *tmp; + ksft_set_plan(1); + datap = malloc(MAX_VSIZE); if (!datap) { ksft_test_result_fail("fail to allocate memory for size = %d\n", MAX_VSIZE); @@ -63,6 +65,8 @@ int main(void) } free(datap); + + ksft_test_result_pass("tests for v_initval_nolibc pass\n"); ksft_exit_pass(); return 0; } From ebdc22c51acee963e26cacb2cb63f8fa2f483808 Mon Sep 17 00:00:00 2001 From: Yong-Xuan Wang Date: Fri, 20 Dec 2024 17:17:27 +0800 Subject: [PATCH 782/807] tools: selftests: riscv: Add test count for vstate_prctl Add the test count to drop the warning message. 
"Planned tests != run tests (0 != 1)" Fixes: 7cf6198ce22d ("selftests: Test RISC-V Vector prctl interface") Signed-off-by: Yong-Xuan Wang Reviewed-by: Andrew Jones Reviewed-by: Andy Chiu Link: https://lore.kernel.org/r/20241220091730.28006-3-yongxuan.wang@sifive.com Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/vector/vstate_prctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/riscv/vector/vstate_prctl.c b/tools/testing/selftests/riscv/vector/vstate_prctl.c index 895177f6bf4c..40b3bffcbb40 100644 --- a/tools/testing/selftests/riscv/vector/vstate_prctl.c +++ b/tools/testing/selftests/riscv/vector/vstate_prctl.c @@ -76,6 +76,8 @@ int main(void) long flag, expected; long rc; + ksft_set_plan(1); + pair.key = RISCV_HWPROBE_KEY_IMA_EXT_0; rc = riscv_hwprobe(&pair, 1, 0, NULL, 0); if (rc < 0) { From fc58db9aeb15e89b69ff5e9abc69ecf9e5f888ed Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 12 Dec 2024 16:09:32 -0800 Subject: [PATCH 783/807] drivers/perf: riscv: Fix Platform firmware event data Platform firmware event data field is allowed to be 62 bits for Linux as uppper most two bits are reserved to indicate SBI fw or platform specific firmware events. However, the event data field is masked as per the hardware raw event mask which is not correct. Fix the platform firmware event data field with proper mask. 
Fixes: f0c9363db2dd ("perf/riscv-sbi: Add platform specific firmware event handling") Signed-off-by: Atish Patra Link: https://lore.kernel.org/r/20241212-pmu_event_fixes_v2-v2-1-813e8a4f5962@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 1 + drivers/perf/riscv_pmu_sbi.c | 12 +++++------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 6c82318065cf..3d250824178b 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -159,6 +159,7 @@ struct riscv_pmu_snapshot_data { }; #define RISCV_PMU_RAW_EVENT_MASK GENMASK_ULL(47, 0) +#define RISCV_PMU_PLAT_FW_EVENT_MASK GENMASK_ULL(61, 0) #define RISCV_PMU_RAW_EVENT_IDX 0x20000 #define RISCV_PLAT_FW_EVENT 0xFFFF diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 1aa303f76cc7..3473ba02abf3 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -507,7 +507,6 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) { u32 type = event->attr.type; u64 config = event->attr.config; - u64 raw_config_val; int ret; /* @@ -528,21 +527,20 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) case PERF_TYPE_RAW: /* * As per SBI specification, the upper 16 bits must be unused - * for a raw event. + * for a hardware raw event. 
* Bits 63:62 are used to distinguish between raw events * 00 - Hardware raw event * 10 - SBI firmware events * 11 - Risc-V platform specific firmware event */ - raw_config_val = config & RISCV_PMU_RAW_EVENT_MASK; + switch (config >> 62) { case 0: ret = RISCV_PMU_RAW_EVENT_IDX; - *econfig = raw_config_val; + *econfig = config & RISCV_PMU_RAW_EVENT_MASK; break; case 2: - ret = (raw_config_val & 0xFFFF) | - (SBI_PMU_EVENT_TYPE_FW << 16); + ret = (config & 0xFFFF) | (SBI_PMU_EVENT_TYPE_FW << 16); break; case 3: /* @@ -551,7 +549,7 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) * Event data - raw event encoding */ ret = SBI_PMU_EVENT_TYPE_FW << 16 | RISCV_PLAT_FW_EVENT; - *econfig = raw_config_val; + *econfig = config & RISCV_PMU_PLAT_FW_EVENT_MASK; break; } break; From 2c206cdede567f53035c622e846678a996f39d69 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 12 Dec 2024 16:09:33 -0800 Subject: [PATCH 784/807] drivers/perf: riscv: Return error for default case If the upper two bits have an invalid value (0x1), the event mapping is not reliable as it returns an uninitialized variable. Return appropriate value for the default case.
Fixes: f0c9363db2dd ("perf/riscv-sbi: Add platform specific firmware event handling") Signed-off-by: Atish Patra Link: https://lore.kernel.org/r/20241212-pmu_event_fixes_v2-v2-2-813e8a4f5962@rivosinc.com Signed-off-by: Palmer Dabbelt --- drivers/perf/riscv_pmu_sbi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 3473ba02abf3..da3651d32906 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -507,7 +507,7 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) { u32 type = event->attr.type; u64 config = event->attr.config; - int ret; + int ret = -ENOENT; /* * Ensure we are finished checking standard hardware events for @@ -551,10 +551,11 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) ret = SBI_PMU_EVENT_TYPE_FW << 16 | RISCV_PLAT_FW_EVENT; *econfig = config & RISCV_PMU_PLAT_FW_EVENT_MASK; break; + default: + break; } break; default: - ret = -ENOENT; break; } From 3aff4cdbe506652da77570baccad623511628250 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 12 Dec 2024 16:09:34 -0800 Subject: [PATCH 785/807] drivers/perf: riscv: Do not allow invalid raw event config The SBI specification allows only the lower 48 bits of hpmeventX to be configured via SBI PMU. Currently, the driver masks off the higher bits but doesn't return an error. This will lead to an additional SBI call for config matching which should return an invalid event error in most of the cases. However, if a platform (e.g. Rocket and SiFive cores) implements a bitmap of all bits in the event encoding this will lead to an incorrect event being programmed leading to user confusion. Report the error to the user if higher bits are set during the event mapping itself to avoid the confusion and save an additional SBI call.
Suggested-by: Samuel Holland Signed-off-by: Atish Patra Link: https://lore.kernel.org/r/20241212-pmu_event_fixes_v2-v2-3-813e8a4f5962@rivosinc.com Signed-off-by: Palmer Dabbelt --- drivers/perf/riscv_pmu_sbi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index da3651d32906..194c153e5d71 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -536,8 +536,11 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) switch (config >> 62) { case 0: - ret = RISCV_PMU_RAW_EVENT_IDX; - *econfig = config & RISCV_PMU_RAW_EVENT_MASK; + /* Return error any bits [48-63] is set as it is not allowed by the spec */ + if (!(config & ~RISCV_PMU_RAW_EVENT_MASK)) { + *econfig = config & RISCV_PMU_RAW_EVENT_MASK; + ret = RISCV_PMU_RAW_EVENT_IDX; + } break; case 2: ret = (config & 0xFFFF) | (SBI_PMU_EVENT_TYPE_FW << 16); From 155c5bf26f983e9988333eeb0ef217138304d13b Mon Sep 17 00:00:00 2001 From: guanjing Date: Fri, 20 Dec 2024 09:33:35 +0100 Subject: [PATCH 786/807] firewall: remove misplaced semicolon from stm32_firewall_get_firewall Remove the misplaced semicolon in stm32_firewall_get_firewall() which results in a syntax error when the code is compiled without CONFIG_STM32_FIREWALL.
Fixes: 5c9668cfc6d7 ("firewall: introduce stm32_firewall framework") Signed-off-by: guanjing Reviewed-by: Gatien Chevallier Signed-off-by: Alexandre Torgue Signed-off-by: Arnd Bergmann --- include/linux/bus/stm32_firewall_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bus/stm32_firewall_device.h b/include/linux/bus/stm32_firewall_device.h index 18e0a2fc3816..5178b72bc920 100644 --- a/include/linux/bus/stm32_firewall_device.h +++ b/include/linux/bus/stm32_firewall_device.h @@ -115,7 +115,7 @@ void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 su #else /* CONFIG_STM32_FIREWALL */ int stm32_firewall_get_firewall(struct device_node *np, struct stm32_firewall *firewall, - unsigned int nb_firewall); + unsigned int nb_firewall) { return -ENODEV; } From 30c8fd31c571db486a5331a92d03eb60a0fb277c Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 9 Jan 2025 23:29:37 +0900 Subject: [PATCH 787/807] tracing/kprobes: Fix to free objects when failed to copy a symbol In __trace_kprobe_create(), if something fails it must goto error block to free objects. But when strdup() a symbol, it returns without that. Fix it to goto the error block to free objects correctly. 
Link: https://lore.kernel.org/all/173643297743.1514810.2408159540454241947.stgit@devnote2/ Fixes: 6212dd29683e ("tracing/kprobes: Use dyn_event framework for kprobe events") Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_kprobe.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 935a886af40c..0642ea174849 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -940,8 +940,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) } /* a symbol specified */ symbol = kstrdup(argv[1], GFP_KERNEL); - if (!symbol) - return -ENOMEM; + if (!symbol) { + ret = -ENOMEM; + goto error; + } tmp = strchr(symbol, '%'); if (tmp) { From b583ef82b671c9a752fbe3e95bd4c1c51eab764d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 9 Jan 2025 15:14:40 +0100 Subject: [PATCH 788/807] uprobes: Fix race in uprobe_free_utask Max Makarov reported kernel panic [1] in perf user callchain code. The reason for that is the race between uprobe_free_utask and bpf profiler code doing the perf user stack unwind and is triggered within uprobe_free_utask function: - after current->utask is freed and - before current->utask is set to NULL general protection fault, probably for non-canonical address 0x9e759c37ee555c76: 0000 [#1] SMP PTI RIP: 0010:is_uprobe_at_func_entry+0x28/0x80 ... ? die_addr+0x36/0x90 ? exc_general_protection+0x217/0x420 ? asm_exc_general_protection+0x26/0x30 ? is_uprobe_at_func_entry+0x28/0x80 perf_callchain_user+0x20a/0x360 get_perf_callchain+0x147/0x1d0 bpf_get_stackid+0x60/0x90 bpf_prog_9aac297fb833e2f5_do_perf_event+0x434/0x53b ? __smp_call_single_queue+0xad/0x120 bpf_overflow_handler+0x75/0x110 ... asm_sysvec_apic_timer_interrupt+0x1a/0x20 RIP: 0010:__kmem_cache_free+0x1cb/0x350 ... ? uprobe_free_utask+0x62/0x80 ? 
acct_collect+0x4c/0x220 uprobe_free_utask+0x62/0x80 mm_release+0x12/0xb0 do_exit+0x26b/0xaa0 __x64_sys_exit+0x1b/0x20 do_syscall_64+0x5a/0x80 It can be easily reproduced by running following commands in separate terminals: # while :; do bpftrace -e 'uprobe:/bin/ls:_start { printf("hit\n"); }' -c ls; done # bpftrace -e 'profile:hz:100000 { @[ustack()] = count(); }' Fixing this by making sure current->utask pointer is set to NULL before we start to release the utask object. [1] https://github.com/grafana/pyroscope/issues/3673 Fixes: cfa7f3d2c526 ("perf,x86: avoid missing caller address in stack traces captured in uprobe") Reported-by: Max Makarov Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250109141440.2692173-1-jolsa@kernel.org --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index fa04b14a7d72..5d71ef85420c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1915,6 +1915,7 @@ void uprobe_free_utask(struct task_struct *t) if (!utask) return; + t->utask = NULL; WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); timer_delete_sync(&utask->ri_timer); @@ -1924,7 +1925,6 @@ void uprobe_free_utask(struct task_struct *t) ri = free_ret_instance(ri, true /* cleanup_hprobe */); kfree(utask); - t->utask = NULL; } #define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ From 67510d7e2e5f5bdc020bf9d759aa575cce48c8e1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Jan 2025 08:58:14 +0100 Subject: [PATCH 789/807] fs: debugfs: fix open proxy for unsafe files In the previous commit referenced below, I had to split the short fops handling into different proxy fops. This necessitated knowing out-of-band whether or not the ops are short or full, when attempting to convert from fops to allocated fsdata. 
Unfortunately, I only converted full_proxy_open() which is used for the new full_proxy_open_regular() and full_proxy_open_short(), but forgot about the call in open_proxy_open(), used for debugfs_create_file_unsafe(). Fix that, it never has short fops. Fixes: f8f25893a477 ("fs: debugfs: differentiate short fops with proxy ops") Reported-by: Suresh Kumar Kurmi Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202501101055.bb8bf3e7-lkp@intel.com Reported-by: Venkat Rao Bagalkote Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20250110085826.cd74f3b7a36b.I430c79c82ec3f954c2ff9665753bf6ac9e63eef8@changeid Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index bdb4f2ca0506..16e198a26339 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -280,7 +280,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) const struct file_operations *real_fops = NULL; int r; - r = debugfs_file_get(dentry); + r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR); if (r) return r == -EIO ? -ENOENT : r; From 111d36d6278756128b7d7fab787fdcbf8221cd98 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 8 Jan 2025 16:54:02 -0800 Subject: [PATCH 790/807] xfs: lock dquot buffer before detaching dquot from b_li_list We have to lock the buffer before we can delete the dquot log item from the buffer's log item list. Cc: stable@vger.kernel.org # v6.13-rc3 Fixes: acc8f8628c3737 ("xfs: attach dquot buffer to dquot log item buffer") Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_dquot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index f11d475898f2..201c26322ede 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -87,8 +87,9 @@ xfs_dquot_detach_buf( } spin_unlock(&qlip->qli_lock); if (bp) { + xfs_buf_lock(bp); list_del_init(&qlip->qli_item.li_bio_list); - xfs_buf_rele(bp); + xfs_buf_relse(bp); } } From cacd9ae4bf801ff4125d8961bb9a3ba955e51680 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:17 +0100 Subject: [PATCH 791/807] poll_wait: add mb() to fix theoretical race between waitqueue_active() and .poll() As the comment above waitqueue_active() explains, it can only be used if both waker and waiter have mb()'s that pair with each other. However __pollwait() is broken in this respect. This is not pipe-specific, but let's look at pipe_poll() for example: poll_wait(...); // -> __pollwait() -> add_wait_queue() LOAD(pipe->head); LOAD(pipe->head); In theory these LOAD()'s can leak into the critical section inside add_wait_queue() and can happen before list_add(entry, wq_head), in this case pipe_poll() can race with wakeup_pipe_readers/writers which do smp_mb(); if (waitqueue_active(wq_head)) wake_up_interruptible(wq_head); There are more __pollwait()-like functions (grep init_poll_funcptr), and it seems that at least ep_ptable_queue_proc() has the same problem, so the patch adds smp_mb() into poll_wait(). 
Link: https://lore.kernel.org/all/20250102163320.GA17691@redhat.com/ Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162717.GA18922@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/poll.h b/include/linux/poll.h index d1ea4f3714a8..fc641b50f129 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -41,8 +41,16 @@ typedef struct poll_table_struct { static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { - if (p && p->_qproc && wait_address) + if (p && p->_qproc && wait_address) { p->_qproc(filp, wait_address, p); + /* + * This memory barrier is paired in the wq_has_sleeper(). + * See the comment above prepare_to_wait(), we need to + * ensure that subsequent tests in this thread can't be + * reordered with __add_wait_queue() in _qproc() paths. + */ + smp_mb(); + } } /* From 10b02a2cfec2f106db4897ad87732db56d71e6fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:24 +0100 Subject: [PATCH 792/807] poll_wait: kill the obsolete wait_address check This check is historical and no longer needed, wait_address is never NULL. These days we rely on the poll_table->_qproc check. NULL if select/poll is not going to sleep, or it already has a data to report, or all waiters have already been registered after the 1st iteration. However, poll_table *p can be NULL, see p9_fd_poll() for example, so we can't remove the "p != NULL" check. 
Link: https://lore.kernel.org/all/20250106180325.GF7233@redhat.com/ Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162724.GA18926@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/poll.h b/include/linux/poll.h index fc641b50f129..57b6d1ccd8bf 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -41,7 +41,7 @@ typedef struct poll_table_struct { static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { - if (p && p->_qproc && wait_address) { + if (p && p->_qproc) { p->_qproc(filp, wait_address, p); /* * This memory barrier is paired in the wq_has_sleeper(). From 4e15fa8305deecdf20233558ed9f7a8a62b708fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:30 +0100 Subject: [PATCH 793/807] io_uring_poll: kill the no longer necessary barrier after poll_wait() Now that poll_wait() provides a full barrier we can remove smp_rmb() from io_uring_poll(). In fact I don't think smp_rmb() was correct, it can't serialize LOADs and STOREs. 
Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162730.GA18940@redhat.com Signed-off-by: Christian Brauner --- io_uring/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 06ff41484e29..a64a82b93b86 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2809,13 +2809,12 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) if (unlikely(!ctx->poll_activated)) io_activate_pollwq(ctx); - - poll_wait(file, &ctx->poll_wq, wait); /* - * synchronizes with barrier from wq_has_sleeper call in - * io_commit_cqring + * provides mb() which pairs with barrier from wq_has_sleeper + * call in io_commit_cqring */ - smp_rmb(); + poll_wait(file, &ctx->poll_wq, wait); + if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; From b2849867b3a70c2d675ddca01c4e4540f7d3b8e9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:36 +0100 Subject: [PATCH 794/807] sock_poll_wait: kill the no longer necessary barrier after poll_wait() Now that poll_wait() provides a full barrier we can remove smp_mb() from sock_poll_wait(). Also, the poll_does_not_wait() check before poll_wait() just adds the unnecessary confusion, kill it. poll_wait() does the same "p && p->_qproc" check. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162736.GA18944@redhat.com Signed-off-by: Christian Brauner --- include/net/sock.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 7464e9f9f47c..305f3ae5edc2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2291,7 +2291,7 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) } /** - * sock_poll_wait - place memory barrier behind the poll_wait call. + * sock_poll_wait - wrapper for the poll_wait call. 
* @filp: file * @sock: socket to wait on * @p: poll_table @@ -2301,15 +2301,12 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) static inline void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p) { - if (!poll_does_not_wait(p)) { - poll_wait(filp, &sock->wq.wait, p); - /* We need to be sure we are in sync with the - * socket flags modification. - * - * This memory barrier is paired in the wq_has_sleeper. - */ - smp_mb(); - } + /* Provides a barrier we need to be sure we are in sync + * with the socket flags modification. + * + * This memory barrier is paired in the wq_has_sleeper. + */ + poll_wait(filp, &sock->wq.wait, p); } static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) From f005bf18a57aadf3af1e85a0f0151cb3688ee606 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:43 +0100 Subject: [PATCH 795/807] poll: kill poll_does_not_wait() It no longer has users. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162743.GA18947@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/include/linux/poll.h b/include/linux/poll.h index 57b6d1ccd8bf..12bb18e8b978 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -25,14 +25,14 @@ struct poll_table_struct; -/* +/* * structures and helpers for f_op->poll implementations */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); /* - * Do not touch the structure directly, use the access functions - * poll_does_not_wait() and poll_requested_events() instead. + * Do not touch the structure directly, use the access function + * poll_requested_events() instead. 
*/ typedef struct poll_table_struct { poll_queue_proc _qproc; @@ -53,16 +53,6 @@ static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_addres } } -/* - * Return true if it is guaranteed that poll will not wait. This is the case - * if the poll() of another file descriptor in the set got an event, so there - * is no need for waiting. - */ -static inline bool poll_does_not_wait(const poll_table *p) -{ - return p == NULL || p->_qproc == NULL; -} - /* * Return the set of events that the application wants to poll for. * This is useful for drivers that need to know whether a DMA transfer has From da30ba227c41762ac98e993a1453460450b3e642 Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Fri, 10 Jan 2025 10:27:11 +1100 Subject: [PATCH 796/807] workqueue: warn if delayed_work is queued to an offlined cpu. delayed_work submitted to an offlined cpu, will not get executed, after the specified delay if the cpu remains offline. If the cpu never comes online the work will never get executed. checking for online cpu in __queue_delayed_work, does not sound like a good idea because to do this reliably we need hotplug lock and since work may be submitted from atomic contexts, we would have to use cpus_read_trylock. But if trylock fails we would queue the work on any cpu and this may not be optimal because our intended cpu might still be online. Putting a WARN_ON_ONCE for an already offlined cpu, will indicate users of queue_delayed_work_on, if they are (wrongly) trying to queue delayed_work on offlined cpu. Also indicate the problem of using offlined cpu with queue_delayed_work_on, in its description. 
Signed-off-by: Imran Khan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f7d8fc204579..9362484a653c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2508,6 +2508,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, return; } + WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu)); dwork->wq = wq; dwork->cpu = cpu; timer->expires = jiffies + delay; @@ -2533,6 +2534,12 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, * @dwork: work to queue * @delay: number of jiffies to wait before queueing * + * We queue the delayed_work to a specific CPU, for non-zero delays the + * caller must ensure it is online and can't go away. Callers that fail + * to ensure this, may get @dwork->timer queued to an offlined CPU and + * this will prevent queueing of @dwork->work unless the offlined CPU + * becomes online again. + * * Return: %false if @work was already on a queue, %true otherwise. If * @delay is zero and @dwork is idle, it will be scheduled for immediate * execution. From 4b7cfa8b6c28a9fa22b86894166a1a34f6d630ba Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 10 Jan 2025 14:31:23 +0000 Subject: [PATCH 797/807] io_uring/sqpoll: zero sqd->thread on tctx errors Syzkaller reports: BUG: KASAN: slab-use-after-free in thread_group_cputime+0x409/0x700 kernel/sched/cputime.c:341 Read of size 8 at addr ffff88803578c510 by task syz.2.3223/27552 Call Trace: ... kasan_report+0x143/0x180 mm/kasan/report.c:602 thread_group_cputime+0x409/0x700 kernel/sched/cputime.c:341 thread_group_cputime_adjusted+0xa6/0x340 kernel/sched/cputime.c:639 getrusage+0x1000/0x1340 kernel/sys.c:1863 io_uring_show_fdinfo+0xdfe/0x1770 io_uring/fdinfo.c:197 seq_show+0x608/0x770 fs/proc/fd.c:68 ...
That's due to sqd->thread not being cleared properly in cases where SQPOLL task tctx setup fails, which can essentially only happen with fault injection to insert allocation errors. Cc: stable@vger.kernel.org Fixes: 1251d2025c3e1 ("io_uring/sqpoll: early exit thread if task_context wasn't allocated") Reported-by: syzbot+3d92cfcfa84070b0a470@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/efc7ec7010784463b2e7466d7b5c02c2cb381635.1736519461.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 9e5bd79fd2b5..8961a3c1e73c 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -268,8 +268,12 @@ static int io_sq_thread(void *data) DEFINE_WAIT(wait); /* offload context creation failed, just exit */ - if (!current->io_uring) + if (!current->io_uring) { + mutex_lock(&sqd->lock); + sqd->thread = NULL; + mutex_unlock(&sqd->lock); goto err_out; + } snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); set_task_comm(current, buf); From bd2703b42decebdcddf76e277ba76b4c4a142d73 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 10 Jan 2025 20:36:45 +0000 Subject: [PATCH 798/807] io_uring: don't touch sqd->thread off tw add With IORING_SETUP_SQPOLL all requests are created by the SQPOLL task, which means that req->task should always match sqd->thread. Since accesses to sqd->thread should be separately protected, use req->task in io_req_normal_work_add() instead. Note, in the eyes of io_req_normal_work_add(), the SQPOLL task struct is always pinned and alive, and sqd->thread can either be the task or NULL. It's only problematic if the compiler decides to reload the value after the null check, which is not so likely.
Cc: stable@vger.kernel.org Cc: Bui Quang Minh Reported-by: lizetao Fixes: 78f9b61bd8e54 ("io_uring: wake SQPOLL task when task_work is added to an empty queue") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1cbbe72cf32c45a8fee96026463024cd8564a7d7.1736541357.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d3403c8216db..5eb119002099 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1226,10 +1226,7 @@ static void io_req_normal_work_add(struct io_kiocb *req) /* SQPOLL doesn't need the task_work added, it'll run it itself */ if (ctx->flags & IORING_SETUP_SQPOLL) { - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd->thread) - __set_notify_signal(sqd->thread); + __set_notify_signal(tctx->task); return; } From a2a3374c47c428c0edb0bbc693638d4783f81e31 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 10 Jan 2025 23:16:31 +0100 Subject: [PATCH 799/807] sched_ext: idle: Refresh idle masks during idle-to-idle transitions With the consolidation of put_prev_task/set_next_task(), see commit 436f3eed5c69 ("sched: Combine the last put_prev_task() and the first set_next_task()"), we are now skipping the transition between these two functions when the previous and the next tasks are the same. As a result, the scx idle state of a CPU is updated only when transitioning to or from the idle thread. While this is generally correct, it can lead to uneven and inefficient core utilization in certain scenarios [1]. A typical scenario involves proactive wake-ups: scx_bpf_pick_idle_cpu() selects and marks an idle CPU as busy, followed by a wake-up via scx_bpf_kick_cpu(), without dispatching any tasks. In this case, the CPU continues running the idle thread, returns to idle, but remains marked as busy, preventing it from being selected again as an idle CPU (until a task eventually runs on it and releases the CPU). 
For example, running a workload that uses 20% of each CPU, combined with an scx scheduler using proactive wake-ups, results in the following core utilization: CPU 0: 25.7% CPU 1: 29.3% CPU 2: 26.5% CPU 3: 25.5% CPU 4: 0.0% CPU 5: 25.5% CPU 6: 0.0% CPU 7: 10.5% To address this, refresh the idle state also in pick_task_idle(), during idle-to-idle transitions, but only trigger ops.update_idle() on actual state changes to prevent unnecessary updates to the scx scheduler and maintain balanced state transitions. With this change in place, the core utilization in the previous example becomes the following: CPU 0: 18.8% CPU 1: 19.4% CPU 2: 18.0% CPU 3: 18.7% CPU 4: 19.3% CPU 5: 18.9% CPU 6: 18.7% CPU 7: 19.3% [1] https://github.com/sched-ext/scx/pull/1139 Fixes: 7c65ae81ea86 ("sched_ext: Don't call put_prev_task_scx() before picking the next task") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 61 ++++++++++++++++++++++++++++++++++++++------- kernel/sched/ext.h | 8 +++--- kernel/sched/idle.c | 5 ++-- 3 files changed, 59 insertions(+), 15 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 68150e110451..19813b387ef9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3590,16 +3590,8 @@ static void reset_idle_masks(void) cpumask_copy(idle_masks.smt, cpu_online_mask); } -void __scx_update_idle(struct rq *rq, bool idle) +static void update_builtin_idle(int cpu, bool idle) { - int cpu = cpu_of(rq); - - if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); - if (!static_branch_unlikely(&scx_builtin_idle_enabled)) - return; - } - if (idle) cpumask_set_cpu(cpu, idle_masks.cpu); else @@ -3626,6 +3618,57 @@ void __scx_update_idle(struct rq *rq, bool idle) #endif } +/* + * Update the idle state of a CPU to @idle. 
+ * + * If @do_notify is true, ops.update_idle() is invoked to notify the scx + * scheduler of an actual idle state transition (idle to busy or vice + * versa). If @do_notify is false, only the idle state in the idle masks is + * refreshed without invoking ops.update_idle(). + * + * This distinction is necessary, because an idle CPU can be "reserved" and + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as + * busy even if no tasks are dispatched. In this case, the CPU may return + * to idle without a true state transition. Refreshing the idle masks + * without invoking ops.update_idle() ensures accurate idle state tracking + * while avoiding unnecessary updates and maintaining balanced state + * transitions. + */ +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + */ + if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + + /* + * Update the idle masks: + * - for real idle transitions (do_notify == true) + * - for idle-to-idle transitions (indicated by the previous task + * being the idle thread, managed by pick_task_idle()) + * + * Skip updating idle masks if the previous task is not the idle + * thread, since set_next_task_idle() has already handled it when + * transitioning from a task to the idle thread (calling this + * function with do_notify == true). + * + * In this way we can avoid updating the idle masks twice, + * unnecessarily. 
+ */ + if (static_branch_likely(&scx_builtin_idle_enabled)) + if (do_notify || is_idle_task(rq->curr)) + update_builtin_idle(cpu, idle); +} + static void handle_hotplug(struct rq *rq, bool online) { int cpu = cpu_of(rq); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index b1675bb59fc4..4d022d17ac7d 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ #if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) -void __scx_update_idle(struct rq *rq, bool idle); +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify); -static inline void scx_update_idle(struct rq *rq, bool idle) +static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) { if (scx_enabled()) - __scx_update_idle(rq, idle); + __scx_update_idle(rq, idle, do_notify); } #else -static inline void scx_update_idle(struct rq *rq, bool idle) {} +static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} #endif #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 621696269584..2c85c86b455f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -452,19 +452,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) { dl_server_update_idle_time(rq, prev); - scx_update_idle(rq, false); + scx_update_idle(rq, false, true); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { update_idle_core(rq); - scx_update_idle(rq, true); + scx_update_idle(rq, true, true); schedstat_inc(rq->sched_goidle); next->se.exec_start = rq_clock_task(rq); } struct task_struct *pick_task_idle(struct rq *rq) { + scx_update_idle(rq, true, false); return rq->idle; } From 20b1aa912316ffb7fbb5f407f17c330f2a22ddff Mon Sep 17 00:00:00 2001 From: Meetakshi Setiya Date: Wed, 8 Jan 2025 
05:10:34 -0500 Subject: [PATCH 800/807] smb: client: sync the root session and superblock context passwords before automounting In some cases, when password2 becomes the working password, the client swaps the two password fields in the root session struct, but not in the smb3_fs_context struct in cifs_sb. DFS automounts inherit fs context from their parent mounts. Therefore, they might end up getting the passwords in the stale order. The automount should succeed, because the mount function will end up retrying with the actual password anyway. But to reduce these unnecessary session setup retries for automounts, we can sync the parent context's passwords with the root session's passwords before duplicating it to the child's fs context. Cc: stable@vger.kernel.org Signed-off-by: Meetakshi Setiya Reviewed-by: Shyam Prasad N Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/namespace.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c index 0f788031b740..e3f9213131c4 100644 --- a/fs/smb/client/namespace.c +++ b/fs/smb/client/namespace.c @@ -196,11 +196,28 @@ static struct vfsmount *cifs_do_automount(struct path *path) struct smb3_fs_context tmp; char *full_path; struct vfsmount *mnt; + struct cifs_sb_info *mntpt_sb; + struct cifs_ses *ses; if (IS_ROOT(mntpt)) return ERR_PTR(-ESTALE); - cur_ctx = CIFS_SB(mntpt->d_sb)->ctx; + mntpt_sb = CIFS_SB(mntpt->d_sb); + ses = cifs_sb_master_tcon(mntpt_sb)->ses; + cur_ctx = mntpt_sb->ctx; + + /* + * At this point, the root session should be in the mntpt sb. We should + * bring the sb context passwords in sync with the root session's + * passwords. This would help prevent unnecessary retries and password + * swaps for automounts. 
+ */ + mutex_lock(&ses->session_mutex); + rc = smb3_sync_session_ctx_passwords(mntpt_sb, ses); + mutex_unlock(&ses->session_mutex); + + if (rc) + return ERR_PTR(rc); fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, mntpt); if (IS_ERR(fc)) From 77a903cd8e5a91d120ee014c8f8eae74d6c5d0f6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 11 Jan 2025 10:57:38 +1100 Subject: [PATCH 801/807] MAINTAINERS: powerpc: Update my status Maddy is taking over the day-to-day maintenance of powerpc. I will still be around to help, and as a backup. Re-order the main POWERPC list to put Maddy first to reflect that. KVM/powerpc patches will be handled by Maddy via the powerpc tree with review from Nick, so replace myself with Maddy there. Remove myself from BPF, leaving Hari & Christophe as maintainers. Signed-off-by: Michael Ellerman Signed-off-by: Linus Torvalds --- MAINTAINERS | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index dba82fb32d04..a87ddad78e26 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4128,7 +4128,6 @@ S: Odd Fixes F: drivers/net/ethernet/netronome/nfp/bpf/ BPF JIT for POWERPC (32-BIT AND 64-BIT) -M: Michael Ellerman M: Hari Bathini M: Christophe Leroy R: Naveen N Rao @@ -12629,7 +12628,7 @@ F: arch/mips/include/uapi/asm/kvm* F: arch/mips/kvm/ KERNEL VIRTUAL MACHINE FOR POWERPC (KVM/powerpc) -M: Michael Ellerman +M: Madhavan Srinivasan R: Nicholas Piggin L: linuxppc-dev@lists.ozlabs.org L: kvm@vger.kernel.org @@ -13208,11 +13207,11 @@ X: drivers/macintosh/adb-iop.c X: drivers/macintosh/via-macii.c LINUX FOR POWERPC (32-BIT AND 64-BIT) +M: Madhavan Srinivasan M: Michael Ellerman R: Nicholas Piggin R: Christophe Leroy R: Naveen N Rao -M: Madhavan Srinivasan L: linuxppc-dev@lists.ozlabs.org S: Supported W: https://github.com/linuxppc/wiki/wiki From 87ecfdbc699cc95fac73291b52650283ddcf929d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 12 Jan 2025 10:34:44 +0100 Subject: [PATCH 802/807] KVM: e500: 
always restore irqs If find_linux_pte fails, IRQs will not be restored. This is unlikely to happen in practice since it would have been reported as hanging hosts, but it should of course be fixed anyway. Cc: stable@vger.kernel.org Reported-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500_mmu_host.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index e5a145b578a4..6824e8139801 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -479,7 +479,6 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (pte_present(pte)) { wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; - local_irq_restore(flags); } else { local_irq_restore(flags); pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n", @@ -488,8 +487,9 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, goto out; } } - writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + local_irq_restore(flags); + writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); From e97fbb43fb1b2e909dfd726204af3cdcb971517e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Jan 2025 16:19:28 +0100 Subject: [PATCH 803/807] KVM: e500: use shadow TLB entry as witness for writability kvmppc_e500_ref_setup is returning whether the guest TLB entry is writable, which is then passed to kvm_release_faultin_page. This makes little sense for two reasons: first, because the function sets up the private data for the page and the return value feels like it has been bolted on the side; second, because what really matters is whether the _shadow_ TLB entry is writable. If it is not writable, the page can be released as non-dirty. Shift from using tlbe_is_writable(gtlbe) to doing the same check on the shadow TLB entry.
Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500_mmu_host.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 6824e8139801..c266c02f120f 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -242,7 +242,7 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); } -static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref, +static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, struct kvm_book3e_206_tlb_entry *gtlbe, kvm_pfn_t pfn, unsigned int wimg) { @@ -251,8 +251,6 @@ static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref, /* Use guest supplied MAS2_G and MAS2_E */ ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg; - - return tlbe_is_writable(gtlbe); } static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) @@ -489,9 +487,10 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } local_irq_restore(flags); - writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); + writable = tlbe_is_writable(stlbe); /* Clear i-cache for new pages */ kvmppc_mmu_flush_icache(pfn); From f2104bf22f0475a08a0feeee40d8e45ce38ed5a1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Jan 2025 16:21:38 +0100 Subject: [PATCH 804/807] KVM: e500: track host-writability of pages Add the possibility of marking a page so that the UW and SW bits are force-cleared. This is stored in the private info so that it persists across multiple calls to kvmppc_e500_setup_stlbe. 
Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500.h | 2 ++ arch/powerpc/kvm/e500_mmu_host.c | 15 +++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index 6d0d329cbb35..f9acf866c709 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -34,6 +34,8 @@ enum vcpu_ftr { #define E500_TLB_BITMAP (1 << 30) /* TLB1 entry is mapped by host TLB0 */ #define E500_TLB_TLB0 (1 << 29) +/* entry is writable on the host */ +#define E500_TLB_WRITABLE (1 << 28) /* bits [6-5] MAS2_X1 and MAS2_X0 and [4-0] bits for WIMGE */ #define E500_TLB_MAS2_ATTR (0x7f) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index c266c02f120f..b1be39639d4a 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -45,11 +45,14 @@ static inline unsigned int tlb1_max_shadow_size(void) return host_tlb_params[1].entries - tlbcam_index - 1; } -static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) +static inline u32 e500_shadow_mas3_attrib(u32 mas3, bool writable, int usermode) { /* Mask off reserved bits. 
*/ mas3 &= MAS3_ATTRIB_MASK; + if (!writable) + mas3 &= ~(MAS3_UW|MAS3_SW); + #ifndef CONFIG_KVM_BOOKE_HV if (!usermode) { /* Guest is in supervisor mode, @@ -244,10 +247,13 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, struct kvm_book3e_206_tlb_entry *gtlbe, - kvm_pfn_t pfn, unsigned int wimg) + kvm_pfn_t pfn, unsigned int wimg, + bool writable) { ref->pfn = pfn; ref->flags = E500_TLB_VALID; + if (writable) + ref->flags |= E500_TLB_WRITABLE; /* Use guest supplied MAS2_G and MAS2_E */ ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg; @@ -303,6 +309,7 @@ static void kvmppc_e500_setup_stlbe( { kvm_pfn_t pfn = ref->pfn; u32 pr = vcpu->arch.shared->msr & MSR_PR; + bool writable = !!(ref->flags & E500_TLB_WRITABLE); BUG_ON(!(ref->flags & E500_TLB_VALID)); @@ -310,7 +317,7 @@ static void kvmppc_e500_setup_stlbe( stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR); stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | - e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); + e500_shadow_mas3_attrib(gtlbe->mas7_3, writable, pr); } static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, @@ -487,7 +494,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } local_irq_restore(flags); - kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, true); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); writable = tlbe_is_writable(stlbe); From 03b755b2aa48d242440cbfbd365e153b4b20fe54 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Jan 2025 16:14:55 +0100 Subject: [PATCH 805/807] KVM: e500: map readonly host pages for read The new __kvm_faultin_pfn() function is upset by the fact that e500 KVM ignores host page permissions - __kvm_faultin requires a "writable" outgoing argument, but e500 KVM is 
nonchalantly passing NULL. If the host page permissions do not include writability, the shadow TLB entry is forcibly mapped read-only. Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500_mmu_host.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index b1be39639d4a..b38679e5821b 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -374,6 +374,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, unsigned long slot_start, slot_end; pfnmap = 1; + writable = vma->vm_flags & VM_WRITE; start = vma->vm_pgoff; end = start + @@ -449,7 +450,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (likely(!pfnmap)) { tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT); - pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, NULL, &page); + pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page); if (is_error_noslot_pfn(pfn)) { if (printk_ratelimit()) pr_err("%s: real page not found for gfn %lx\n", @@ -494,7 +495,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } local_irq_restore(flags); - kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, true); + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, writable); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); writable = tlbe_is_writable(stlbe); From 55f4db79c4d94d4bb757f7a31a7f14de22fe517d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Jan 2025 16:49:50 +0100 Subject: [PATCH 806/807] KVM: e500: perform hugepage check after looking up the PFN e500 KVM tries to bypass __kvm_faultin_pfn() in order to map VM_PFNMAP VMAs as huge pages. This is a Bad Idea because VM_PFNMAP VMAs could become noncontiguous as a result of calls to remap_pfn_range(). Instead, use the already existing host PTE lookup to retrieve a valid host-side mapping level after __kvm_faultin_pfn() has returned.
Then find the largest size that will satisfy the guest's request while staying within a single host PTE. Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500_mmu_host.c | 178 ++++++++++++------------------- 1 file changed, 69 insertions(+), 109 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index b38679e5821b..06caf8bbbe2b 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -326,15 +326,14 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, struct tlbe_ref *ref) { struct kvm_memory_slot *slot; - unsigned long pfn = 0; /* silence GCC warning */ + unsigned int psize; + unsigned long pfn; struct page *page = NULL; unsigned long hva; - int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; int ret = 0; unsigned long mmu_seq; struct kvm *kvm = vcpu_e500->vcpu.kvm; - unsigned long tsize_pages = 0; pte_t *ptep; unsigned int wimg = 0; pgd_t *pgdir; @@ -356,111 +355,12 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn); hva = gfn_to_hva_memslot(slot, gfn); - if (tlbsel == 1) { - struct vm_area_struct *vma; - mmap_read_lock(kvm->mm); - - vma = find_vma(kvm->mm, hva); - if (vma && hva >= vma->vm_start && - (vma->vm_flags & VM_PFNMAP)) { - /* - * This VMA is a physically contiguous region (e.g. - * /dev/mem) that bypasses normal Linux page - * management. Find the overlap between the - * vma and the memslot. 
- */ - - unsigned long start, end; - unsigned long slot_start, slot_end; - - pfnmap = 1; - writable = vma->vm_flags & VM_WRITE; - - start = vma->vm_pgoff; - end = start + - vma_pages(vma); - - pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); - - slot_start = pfn - (gfn - slot->base_gfn); - slot_end = slot_start + slot->npages; - - if (start < slot_start) - start = slot_start; - if (end > slot_end) - end = slot_end; - - tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> - MAS1_TSIZE_SHIFT; - - /* - * e500 doesn't implement the lowest tsize bit, - * or 1K pages. - */ - tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); - - /* - * Now find the largest tsize (up to what the guest - * requested) that will cover gfn, stay within the - * range, and for which gfn and pfn are mutually - * aligned. - */ - - for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { - unsigned long gfn_start, gfn_end; - tsize_pages = 1UL << (tsize - 2); - - gfn_start = gfn & ~(tsize_pages - 1); - gfn_end = gfn_start + tsize_pages; - - if (gfn_start + pfn - gfn < start) - continue; - if (gfn_end + pfn - gfn > end) - continue; - if ((gfn & (tsize_pages - 1)) != - (pfn & (tsize_pages - 1))) - continue; - - gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); - pfn &= ~(tsize_pages - 1); - break; - } - } else if (vma && hva >= vma->vm_start && - is_vm_hugetlb_page(vma)) { - unsigned long psize = vma_kernel_pagesize(vma); - - tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> - MAS1_TSIZE_SHIFT; - - /* - * Take the largest page size that satisfies both host - * and guest mapping - */ - tsize = min(__ilog2(psize) - 10, tsize); - - /* - * e500 doesn't implement the lowest tsize bit, - * or 1K pages. 
- */ - tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); - } - - mmap_read_unlock(kvm->mm); - } - - if (likely(!pfnmap)) { - tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT); - pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page); - if (is_error_noslot_pfn(pfn)) { - if (printk_ratelimit()) - pr_err("%s: real page not found for gfn %lx\n", - __func__, (long)gfn); - return -EINVAL; - } - - /* Align guest and physical address to page map boundaries */ - pfn &= ~(tsize_pages - 1); - gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page); + if (is_error_noslot_pfn(pfn)) { + if (printk_ratelimit()) + pr_err("%s: real page not found for gfn %lx\n", + __func__, (long)gfn); + return -EINVAL; } spin_lock(&kvm->mmu_lock); @@ -478,7 +378,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, * can't run hence pfn won't change. */ local_irq_save(flags); - ptep = find_linux_pte(pgdir, hva, NULL, NULL); + ptep = find_linux_pte(pgdir, hva, NULL, &psize); if (ptep) { pte_t pte = READ_ONCE(*ptep); @@ -495,6 +395,66 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } local_irq_restore(flags); + if (psize && tlbsel == 1) { + unsigned long psize_pages, tsize_pages; + unsigned long start, end; + unsigned long slot_start, slot_end; + + psize_pages = 1UL << (psize - PAGE_SHIFT); + start = pfn & ~(psize_pages - 1); + end = start + psize_pages; + + slot_start = pfn - (gfn - slot->base_gfn); + slot_end = slot_start + slot->npages; + + if (start < slot_start) + start = slot_start; + if (end > slot_end) + end = slot_end; + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * Any page size that doesn't satisfy the host mapping + * will fail the start and end tests. + */ + tsize = min(psize - PAGE_SHIFT + BOOK3E_PAGESZ_4K, tsize); + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages. 
+ */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); + + /* + * Now find the largest tsize (up to what the guest + * requested) that will cover gfn, stay within the + * range, and for which gfn and pfn are mutually + * aligned. + */ + + for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { + unsigned long gfn_start, gfn_end; + tsize_pages = 1UL << (tsize - 2); + + gfn_start = gfn & ~(tsize_pages - 1); + gfn_end = gfn_start + tsize_pages; + + if (gfn_start + pfn - gfn < start) + continue; + if (gfn_end + pfn - gfn > end) + continue; + if ((gfn & (tsize_pages - 1)) != + (pfn & (tsize_pages - 1))) + continue; + + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + pfn &= ~(tsize_pages - 1); + break; + } + } + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, writable); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); From 5bc55a333a2f7316b58edc7573e8e893f7acb532 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 12 Jan 2025 14:37:56 -0800 Subject: [PATCH 807/807] Linux 6.13-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7904d5d88088..e20a62ad397f 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Baby Opossum Posse # *DOCUMENTATION*