From e1b1d282d5ccabbf17e5556191af248a35293e16 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:02 -0800 Subject: [PATCH 01/88] net: fill in MODULE_DESCRIPTION()s for SLIP W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Serial Line (SLIP) protocol modules. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240108181610.2697017-3-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/slip/slhc.c | 1 + drivers/net/slip/slip.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c index ba93bab948e0..18df7ca66198 100644 --- a/drivers/net/slip/slhc.c +++ b/drivers/net/slip/slhc.c @@ -752,4 +752,5 @@ EXPORT_SYMBOL(slhc_compress); EXPORT_SYMBOL(slhc_uncompress); EXPORT_SYMBOL(slhc_toss); +MODULE_DESCRIPTION("Compression helpers for SLIP (serial line)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index e4280e37fec9..0aba3569ccc0 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -1437,5 +1437,6 @@ out: } #endif +MODULE_DESCRIPTION("SLIP (serial line) protocol module"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_SLIP); From 417d8c571cb49fcace83a6b6477da2970802bf26 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:03 -0800 Subject: [PATCH 02/88] net: fill in MODULE_DESCRIPTION()s for HSR W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to High-availability Seamless Redundancy (HSR) driver. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240108181610.2697017-4-leitao@debian.org Signed-off-by: Jakub Kicinski --- net/hsr/hsr_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index b099c3150150..cb83c8feb746 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -167,4 +167,5 @@ static void __exit hsr_exit(void) module_init(hsr_init); module_exit(hsr_exit); +MODULE_DESCRIPTION("High-availability Seamless Redundancy (HSR) driver"); MODULE_LICENSE("GPL"); From 95c236cc5fc9f09fc4cf58767fa7c01f2425ad6b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:04 -0800 Subject: [PATCH 03/88] net: fill in MODULE_DESCRIPTION()s for NFC W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to all NFC Controller Interface (NCI) modules. Signed-off-by: Breno Leitao Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240108181610.2697017-5-leitao@debian.org Signed-off-by: Jakub Kicinski --- net/nfc/digital_core.c | 1 + net/nfc/nci/core.c | 1 + net/nfc/nci/spi.c | 1 + 3 files changed, 3 insertions(+) diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index d63d2e5dc60c..dae378f1d52b 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -858,4 +858,5 @@ void nfc_digital_unregister_device(struct nfc_digital_dev *ddev) } EXPORT_SYMBOL(nfc_digital_unregister_device); +MODULE_DESCRIPTION("NFC Digital protocol stack"); MODULE_LICENSE("GPL"); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 6c9592d05120..97348cedb16b 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1577,4 +1577,5 @@ static void nci_cmd_work(struct work_struct *work) } } +MODULE_DESCRIPTION("NFC Controller Interface"); MODULE_LICENSE("GPL"); diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c index b68150c971d0..6a93533c480e 100644 --- a/net/nfc/nci/spi.c +++ b/net/nfc/nci/spi.c @@ -319,4 +319,5 @@ done: } EXPORT_SYMBOL_GPL(nci_spi_read); +MODULE_DESCRIPTION("NFC Controller Interface (NCI) SPI link layer"); MODULE_LICENSE("GPL"); From d8610e431fe53ee3a1a11a7f25528b03f8a0b1c7 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:05 -0800 Subject: [PATCH 04/88] net: fill in MODULE_DESCRIPTION()s for Sun RPC W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to Sun RPC modules. Signed-off-by: Breno Leitao Reviewed-by: Jeff Layton Link: https://lore.kernel.org/r/20240108181610.2697017-6-leitao@debian.org Signed-off-by: Jakub Kicinski --- net/sunrpc/auth_gss/auth_gss.c | 1 + net/sunrpc/auth_gss/gss_krb5_mech.c | 1 + net/sunrpc/sunrpc_syms.c | 1 + 3 files changed, 3 insertions(+) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 1af71fbb0d80..c7af0220f82f 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -2280,6 +2280,7 @@ static void __exit exit_rpcsec_gss(void) } MODULE_ALIAS("rpc-auth-6"); +MODULE_DESCRIPTION("Sun RPC Kerberos RPCSEC_GSS client authentication"); MODULE_LICENSE("GPL"); module_param_named(expired_cred_retry_delay, gss_expired_cred_retry_delay, diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index e31cfdf7eadc..64cff717c3d9 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -650,6 +650,7 @@ static void __exit cleanup_kerberos_module(void) gss_mech_unregister(&gss_kerberos_mech); } +MODULE_DESCRIPTION("Sun RPC Kerberos 5 module"); MODULE_LICENSE("GPL"); module_init(init_kerberos_module); module_exit(cleanup_kerberos_module); diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 691c0000e9ea..bab6cab29405 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -148,6 +148,7 @@ cleanup_sunrpc(void) #endif rcu_barrier(); /* Wait for completion of call_rcu()'s */ } +MODULE_DESCRIPTION("Sun RPC core"); MODULE_LICENSE("GPL"); fs_initcall(init_sunrpc); /* Ensure we're initialised before nfs */ module_exit(cleanup_sunrpc); From ade98756128a63dbb801b9d3948c6954cc81b473 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:08 -0800 Subject: [PATCH 05/88] net: fill in MODULE_DESCRIPTION()s for ds26522 module W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to Slic Maxim ds26522 card driver. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240108181610.2697017-9-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/wan/slic_ds26522.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wan/slic_ds26522.c b/drivers/net/wan/slic_ds26522.c index 8a51cfcff99e..cbb99fc5ea9f 100644 --- a/drivers/net/wan/slic_ds26522.c +++ b/drivers/net/wan/slic_ds26522.c @@ -28,6 +28,7 @@ static struct spi_device *g_spi; +MODULE_DESCRIPTION("Slic Maxim DS26522 driver"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Zhao Qiang"); From c155eca07647bac84cbce690b4257605f5740dda Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:09 -0800 Subject: [PATCH 06/88] net: fill in MODULE_DESCRIPTION()s for s2io W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Neterion 10GbE (s2io) driver. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240108181610.2697017-10-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/neterion/s2io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index 61d8bfd12d5f..55408f16fbbc 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -414,6 +414,7 @@ static const u64 fix_mac[] = { END_SIGN }; +MODULE_DESCRIPTION("Neterion 10GbE driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); From da14d1fed9c1e2f56a58dcd980d1395121c4665f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:11 -0800 Subject: [PATCH 07/88] MAINTAINERS: eth: mtk: move John to CREDITS John is still active in other bits of the kernel but not much on the MediaTek ethernet switch side. Our scripts report: Subsystem MEDIATEK ETHERNET DRIVER Changes 81 / 384 (21%) Last activity: 2023-12-21 Felix Fietkau : Author c6d96df9fa2c 2023-05-02 00:00:00 42 Tags c6d96df9fa2c 2023-05-02 00:00:00 48 John Crispin : Sean Wang : Author 880c2d4b2fdf 2019-06-03 00:00:00 5 Tags a5d75538295b 2020-04-07 00:00:00 7 Mark Lee : Author 8d66a8183d0c 2019-11-14 00:00:00 4 Tags 8d66a8183d0c 2019-11-14 00:00:00 4 Lorenzo Bianconi : Author 7cb8cd4daacf 2023-12-21 00:00:00 98 Tags 7cb8cd4daacf 2023-12-21 00:00:00 112 Top reviewers: [18]: horms@kernel.org [15]: leonro@nvidia.com [8]: rmk+kernel@armlinux.org.uk INACTIVE MAINTAINER John Crispin Reviewed-by: Simon Horman Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20240109164517.3063131-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index d61ba9b5117f..d174c65e1031 100644 --- a/CREDITS +++ b/CREDITS @@ -814,6 +814,10 @@ D: Support for Xircom PGSDB9 (firmware and host driver) S: Bucharest S: Romania +N: John Crispin +E: john@phrozen.org +D: MediaTek MT7623 Gigabit ethernet support + N: Laurence Culhane E: loz@holmes.demon.co.uk D: Wrote the initial alpha SLIP code diff --git a/MAINTAINERS b/MAINTAINERS index c36618d4659e..f04fe483ecfb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13509,7 +13509,6 @@ F: drivers/dma/mediatek/ MEDIATEK ETHERNET DRIVER M: Felix Fietkau -M: John Crispin M: Sean Wang M: Mark Lee M: Lorenzo Bianconi From b59d8485fe7fda864e10800085ce1d19c321922a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:12 -0800 Subject: [PATCH 08/88] MAINTAINERS: eth: mt7530: move Landen Chao to CREDITS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mt7530 is a pretty active driver and last we have heard from Landen Chao on the list was March. There were total of 4 message from them in the last 2.5 years. I think it's time to move to CREDITS. Subsystem MEDIATEK SWITCH DRIVER Changes 94 / 169 (55%) Last activity: 2023-10-11 Arınç ÜNAL : Author e94b590abfff 2023-08-19 00:00:00 12 Tags e94b590abfff 2023-08-19 00:00:00 16 Daniel Golle : Author 91daa4f62ce8 2023-04-19 00:00:00 17 Tags ac49b992578d 2023-10-11 00:00:00 20 Landen Chao : DENG Qingfang : Author 342afce10d6f 2021-10-18 00:00:00 24 Tags 342afce10d6f 2021-10-18 00:00:00 25 Sean Wang : Tags c288575f7810 2020-09-14 00:00:00 5 Top reviewers: [46]: f.fainelli@gmail.com [29]: andrew@lunn.ch [19]: olteanv@gmail.com INACTIVE MAINTAINER Landen Chao Acked-by: Arınç ÜNAL Link: https://lore.kernel.org/r/20240109164517.3063131-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index d174c65e1031..76bc0091f9cd 100644 --- a/CREDITS +++ b/CREDITS @@ -677,6 +677,10 @@ D: Media subsystem (V4L/DVB) drivers and core D: EDAC drivers and EDAC 3.0 core rework S: Brazil +N: Landen Chao +E: Landen.Chao@mediatek.com +D: MT7531 Ethernet switch support + N: Raymond Chen E: raymondc@microsoft.com D: Author of Configure script diff --git a/MAINTAINERS b/MAINTAINERS index f04fe483ecfb..b29c132f4754 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13664,7 +13664,6 @@ F: include/soc/mediatek/smi.h MEDIATEK SWITCH DRIVER M: Arınç ÜNAL M: Daniel Golle -M: Landen Chao M: DENG Qingfang M: Sean Wang L: netdev@vger.kernel.org From 009a98bca6347d0bd9cf0c31691331e68f0ee380 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:13 -0800 Subject: [PATCH 09/88] MAINTAINERS: eth: mvneta: move Thomas to CREDITS Thomas is still active in other bits of the kernel and beyond but not as much on the Marvell Ethernet devices. Our scripts report: Subsystem MARVELL MVNETA ETHERNET DRIVER Changes 54 / 176 (30%) (No activity) Top reviewers: [12]: hawk@kernel.org [9]: toke@redhat.com [9]: john.fastabend@gmail.com INACTIVE MAINTAINER Thomas Petazzoni Acked-by: Thomas Petazzoni Link: https://lore.kernel.org/r/20240109164517.3063131-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index 76bc0091f9cd..4d6beeaaca3c 100644 --- a/CREDITS +++ b/CREDITS @@ -3049,6 +3049,10 @@ S: Demonstratsii 8-382 S: Tula 300000 S: Russia +N: Thomas Petazzoni +E: thomas.petazzoni@bootlin.com +D: Driver for the Marvell Armada 370/XP network unit. + N: Gordon Peters E: GordPeters@smarttech.com D: Isochronous receive for IEEE 1394 driver (OHCI module). diff --git a/MAINTAINERS b/MAINTAINERS index b29c132f4754..ceb9b669dc6c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12875,9 +12875,8 @@ S: Maintained F: drivers/thermal/armada_thermal.c MARVELL MVNETA ETHERNET DRIVER -M: Thomas Petazzoni L: netdev@vger.kernel.org -S: Maintained +S: Orphan F: drivers/net/ethernet/marvell/mvneta.* MARVELL MVPP2 ETHERNET DRIVER From 384a35866f3a2b75f0c12e09e1bc486ef96cb5f5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:14 -0800 Subject: [PATCH 10/88] MAINTAINERS: eth: mark Cavium liquidio as an Orphan We haven't seen much review activity from the liquidio maintainers for years. Reflect that reality in MAINTAINERS. Our scripts report: Subsystem CAVIUM LIQUIDIO NETWORK DRIVER Changes 30 / 87 (34%) Last activity: 2019-01-28 Derek Chickles : Tags ac93e2fa8550 2019-01-28 00:00:00 1 Satanand Burla : Felix Manlunas : Tags ac93e2fa8550 2019-01-28 00:00:00 1 Top reviewers: [5]: simon.horman@corigine.com [4]: keescook@chromium.org [4]: jiri@nvidia.com INACTIVE MAINTAINER Satanand Burla Acked-by: Derek Chickles Link: https://lore.kernel.org/r/20240109164517.3063131-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ceb9b669dc6c..1ef18977427e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4697,11 +4697,8 @@ F: drivers/i2c/busses/i2c-octeon* F: drivers/i2c/busses/i2c-thunderx* CAVIUM LIQUIDIO NETWORK DRIVER -M: Derek Chickles -M: Satanand Burla -M: Felix Manlunas L: netdev@vger.kernel.org -S: Supported +S: Orphan W: http://www.marvell.com F: drivers/net/ethernet/cavium/liquidio/ From 0bfcdce867f70a8309fe120eb5a9fff2fc54e8b6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:15 -0800 Subject: [PATCH 11/88] MAINTAINERS: Bluetooth: retire Johan (for now?) Johan moved to maintaining the Zephyr Bluetooth stack, and we haven't heard from him on the ML in 3 years (according to lore), and seen any tags in git in 4 years. Trade the MAINTAINER entry for CREDITS, we can revert whenever Johan comes back to Linux hacking :) Subsystem BLUETOOTH SUBSYSTEM Changes 173 / 986 (17%) Last activity: 2023-12-22 Marcel Holtmann : Author 91cb4c19118a 2022-01-27 00:00:00 52 Committer edcb185fa9c4 2022-05-23 00:00:00 446 Tags 000c2fa2c144 2023-04-23 00:00:00 523 Johan Hedberg : Luiz Augusto von Dentz : Author d03376c18592 2023-12-22 00:00:00 241 Committer da9065caa594 2023-12-22 00:00:00 341 Tags da9065caa594 2023-12-22 00:00:00 493 Top reviewers: [33]: alainm@chromium.org [31]: mcchou@chromium.org [27]: abhishekpandit@chromium.org INACTIVE MAINTAINER Johan Hedberg Link: https://lore.kernel.org/r/20240109164517.3063131-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 4d6beeaaca3c..3c392f7ebc6e 100644 --- a/CREDITS +++ b/CREDITS @@ -1542,6 +1542,10 @@ N: Andrew Haylett E: ajh@primag.co.uk D: Selection mechanism +N: Johan Hedberg +E: johan.hedberg@gmail.com +D: Bluetooth subsystem maintainer + N: Andre Hedrick E: andre@linux-ide.org E: andre@linuxdiskcert.org diff --git a/MAINTAINERS b/MAINTAINERS index 1ef18977427e..80422f639352 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3609,7 +3609,6 @@ F: drivers/mtd/devices/block2mtd.c BLUETOOTH DRIVERS M: Marcel Holtmann -M: Johan Hedberg M: Luiz Augusto von Dentz L: linux-bluetooth@vger.kernel.org S: Supported From bd93edbfd70cdea6e2313c87c3bae5b05bbb947e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:16 -0800 Subject: [PATCH 12/88] MAINTAINERS: mark ax25 as Orphan We haven't heard from Ralf for two years, according to lore. We get a constant stream of "fixes" to ax25 from people using code analysis tools. Nobody is reviewing those, let's reflect this reality in MAINTAINERS. Subsystem AX.25 NETWORK LAYER Changes 9 / 59 (15%) (No activity) Top reviewers: [2]: mkl@pengutronix.de [2]: edumazet@google.com [2]: stefan@datenfreihafen.org INACTIVE MAINTAINER Ralf Baechle Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240109164517.3063131-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 1 + MAINTAINERS | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index 3c392f7ebc6e..f7888de50220 100644 --- a/CREDITS +++ b/CREDITS @@ -179,6 +179,7 @@ E: ralf@gnu.org P: 1024/AF7B30C1 CF 97 C2 CC 6D AE A7 FE C8 BA 9C FC 88 DE 32 C3 D: Linux/MIPS port D: Linux/68k hacker +D: AX25 maintainer S: Hauptstrasse 19 S: 79837 St. Blasien S: Germany diff --git a/MAINTAINERS b/MAINTAINERS index 80422f639352..b736b708b34b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3376,9 +3376,8 @@ F: Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml F: drivers/iio/adc/hx711.c AX.25 NETWORK LAYER -M: Ralf Baechle L: linux-hams@vger.kernel.org -S: Maintained +S: Orphan W: https://linux-ax25.in-berlin.de F: include/net/ax25.h F: include/uapi/linux/ax25.h From f9678f5825dd852b7201132e36691fefb17d49ce Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:17 -0800 Subject: [PATCH 13/88] MAINTAINERS: ibmvnic: drop Dany from reviewers I missed that Dany uses a different email address when tagging patches (drt@linux.ibm.com) and asked him if he's still actively working on ibmvnic. He doesn't really fall under our removal criteria, but he admitted that he already moved on to other projects. Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240109164517.3063131-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b736b708b34b..4194f414dd2c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10155,7 +10155,6 @@ IBM Power SRIOV Virtual NIC Device Driver M: Haren Myneni M: Rick Lindsley R: Nick Child -R: Dany Madden R: Thomas Falcon L: netdev@vger.kernel.org S: Supported From b3739fb3a9e6633b233d829ee799323d75162775 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 Jan 2024 17:27:53 +0100 Subject: [PATCH 14/88] wangxunx: select CONFIG_PHYLINK where needed The ngbe driver needs phylink: arm-linux-gnueabi-ld: drivers/net/ethernet/wangxun/libwx/wx_ethtool.o: in function `wx_nway_reset': wx_ethtool.c:(.text+0x458): undefined reference to `phylink_ethtool_nway_reset' arm-linux-gnueabi-ld: drivers/net/ethernet/wangxun/ngbe/ngbe_main.o: in function `ngbe_remove': ngbe_main.c:(.text+0x7c): undefined reference to `phylink_destroy' arm-linux-gnueabi-ld: drivers/net/ethernet/wangxun/ngbe/ngbe_main.o: in function `ngbe_open': ngbe_main.c:(.text+0xf90): undefined reference to `phylink_connect_phy' arm-linux-gnueabi-ld: drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.o: in function `ngbe_mdio_init': ngbe_mdio.c:(.text+0x314): undefined reference to `phylink_create' Add the missing Kconfig description for this. Fixes: bc2426d74aa3 ("net: ngbe: convert phylib to phylink") Signed-off-by: Arnd Bergmann Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240111162828.68564-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/Kconfig b/drivers/net/ethernet/wangxun/Kconfig index 23cd610bd376..85cdbdd44fec 100644 --- a/drivers/net/ethernet/wangxun/Kconfig +++ b/drivers/net/ethernet/wangxun/Kconfig @@ -26,7 +26,7 @@ config NGBE tristate "Wangxun(R) GbE PCI Express adapters support" depends on PCI select LIBWX - select PHYLIB + select PHYLINK help This driver supports Wangxun(R) GbE PCI Express family of adapters. From e689a876969833cb1317f8752cd064595e7b61e2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 10 Jan 2024 21:34:10 +0000 Subject: [PATCH 15/88] selftests/net/tcp-ao: Use LDLIBS instead of LDFLAGS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rules to link selftests are: > $(OUTPUT)/%_ipv4: %.c > $(LINK.c) $^ $(LDLIBS) -o $@ > > $(OUTPUT)/%_ipv6: %.c > $(LINK.c) -DIPV6_TEST $^ $(LDLIBS) -o $@ The intel test robot uses only selftest's Makefile, not the top linux Makefile: > make W=1 O=/tmp/kselftest -C tools/testing/selftests So, $(LINK.c) is determined by environment, rather than by kernel Makefiles. On my machine (as well as other people that ran tcp-ao selftests) GNU/Make implicit definition does use $(LDFLAGS): > [dima@Mindolluin ~]$ make -p -f/dev/null | grep '^LINK.c\>' > make: *** No targets. Stop. > LINK.c = $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) But, according to build robot report, it's not the case for them. While I could just avoid using pre-defined $(LINK.c), it's also used by selftests/lib.mk by default. Anyways, according to GNU/Make documentation [1], I should have used $(LDLIBS) instead of $(LDFLAGS) in the first place, so let's just do it: > LDFLAGS > Extra flags to give to compilers when they are supposed to invoke > the linker, ‘ld’, such as -L. Libraries (-lfoo) should be added > to the LDLIBS variable instead. > LDLIBS > Library flags or names given to compilers when they are supposed > to invoke the linker, ‘ld’. LOADLIBES is a deprecated (but still > supported) alternative to LDLIBS. Non-library linker flags, such > as -L, should go in the LDFLAGS variable. [1]: https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401011151.veyYTJzq-lkp@intel.com/ Signed-off-by: Dmitry Safonov Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240110-tcp_ao-selftests-makefile-v1-1-aa07d043f052@arista.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/tcp_ao/Makefile b/tools/testing/selftests/net/tcp_ao/Makefile index 8e60bae67aa9..522d991e310e 100644 --- a/tools/testing/selftests/net/tcp_ao/Makefile +++ b/tools/testing/selftests/net/tcp_ao/Makefile @@ -52,5 +52,5 @@ $(OUTPUT)/%_ipv6: %.c $(OUTPUT)/icmps-accept_ipv4: CFLAGS+= -DTEST_ICMPS_ACCEPT $(OUTPUT)/icmps-accept_ipv6: CFLAGS+= -DTEST_ICMPS_ACCEPT -$(OUTPUT)/bench-lookups_ipv4: LDFLAGS+= -lm -$(OUTPUT)/bench-lookups_ipv6: LDFLAGS+= -lm +$(OUTPUT)/bench-lookups_ipv4: LDLIBS+= -lm +$(OUTPUT)/bench-lookups_ipv6: LDLIBS+= -lm From b33fb5b801c6db408b774a68e7c8722796b59ecc Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 10 Jan 2024 14:14:00 +0800 Subject: [PATCH 16/88] net: qualcomm: rmnet: fix global oob in rmnet_policy The variable rmnet_link_ops assign a *bigger* maxtype which leads to a global out-of-bounds read when parsing the netlink attributes. See bug trace below: ================================================================== BUG: KASAN: global-out-of-bounds in validate_nla lib/nlattr.c:386 [inline] BUG: KASAN: global-out-of-bounds in __nla_validate_parse+0x24af/0x2750 lib/nlattr.c:600 Read of size 1 at addr ffffffff92c438d0 by task syz-executor.6/84207 CPU: 0 PID: 84207 Comm: syz-executor.6 Tainted: G N 6.1.0 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x8b/0xb3 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:284 [inline] print_report+0x172/0x475 mm/kasan/report.c:395 kasan_report+0xbb/0x1c0 mm/kasan/report.c:495 validate_nla lib/nlattr.c:386 [inline] __nla_validate_parse+0x24af/0x2750 lib/nlattr.c:600 __nla_parse+0x3e/0x50 lib/nlattr.c:697 nla_parse_nested_deprecated include/net/netlink.h:1248 [inline] __rtnl_newlink+0x50a/0x1880 net/core/rtnetlink.c:3485 rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3594 rtnetlink_rcv_msg+0x43c/0xd70 net/core/rtnetlink.c:6091 netlink_rcv_skb+0x14f/0x410 net/netlink/af_netlink.c:2540 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x54e/0x800 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x930/0xe50 net/netlink/af_netlink.c:1921 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0x154/0x190 net/socket.c:734 ____sys_sendmsg+0x6df/0x840 net/socket.c:2482 ___sys_sendmsg+0x110/0x1b0 net/socket.c:2536 __sys_sendmsg+0xf3/0x1c0 net/socket.c:2565 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fdcf2072359 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 f1 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fdcf13e3168 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007fdcf219ff80 RCX: 00007fdcf2072359 RDX: 0000000000000000 RSI: 0000000020000200 RDI: 0000000000000003 RBP: 00007fdcf20bd493 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fffbb8d7bdf R14: 00007fdcf13e3300 R15: 0000000000022000 The buggy address belongs to the variable: rmnet_policy+0x30/0xe0 The buggy address belongs to the physical page: page:0000000065bdeb3c refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x155243 flags: 0x200000000001000(reserved|node=0|zone=2) raw: 0200000000001000 ffffea00055490c8 ffffea00055490c8 0000000000000000 raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffffffff92c43780: f9 f9 f9 f9 00 00 00 02 f9 f9 f9 f9 00 00 00 07 ffffffff92c43800: f9 f9 f9 f9 00 00 00 05 f9 f9 f9 f9 06 f9 f9 f9 >ffffffff92c43880: f9 f9 f9 f9 00 00 00 00 00 00 f9 f9 f9 f9 f9 f9 ^ ffffffff92c43900: 00 00 00 00 00 00 00 00 07 f9 f9 f9 f9 f9 f9 f9 ffffffff92c43980: 00 00 00 07 f9 f9 f9 f9 00 00 00 05 f9 f9 f9 f9 According to the comment of `nla_parse_nested_deprecated`, the maxtype should be len(destination array) - 1. Hence use `IFLA_RMNET_MAX` here. Fixes: 14452ca3b5ce ("net: qualcomm: rmnet: Export mux_id and flags to netlink") Signed-off-by: Lin Ma Reviewed-by: Subash Abhinov Kasiviswanathan Reviewed-by: Simon Horman Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240110061400.3356108-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 39d24e07f306..5b69b9268c75 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -396,7 +396,7 @@ nla_put_failure: struct rtnl_link_ops rmnet_link_ops __read_mostly = { .kind = "rmnet", - .maxtype = __IFLA_RMNET_MAX, + .maxtype = IFLA_RMNET_MAX, .priv_size = sizeof(struct rmnet_priv), .setup = rmnet_vnd_setup, .validate = rmnet_rtnl_validate, From 844f104790bd69c2e4dbb9ee3eba46fde1fcea7b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 10 Jan 2024 02:33:54 +0200 Subject: [PATCH 17/88] net: dsa: fix netdev_priv() dereference before check on non-DSA netdevice events After the blamed commit, we started doing this dereference for every NETDEV_CHANGEUPPER and NETDEV_PRECHANGEUPPER event in the system. static inline struct dsa_port *dsa_user_to_port(const struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); return p->dp; } Which is obviously bogus, because not all net_devices have a netdev_priv() of type struct dsa_user_priv. But struct dsa_user_priv is fairly small, and p->dp means dereferencing 8 bytes starting with offset 16. Most drivers allocate that much private memory anyway, making our access not fault, and we discard the bogus data quickly afterwards, so this wasn't caught. But the dummy interface is somewhat special in that it calls alloc_netdev() with a priv size of 0. So every netdev_priv() dereference is invalid, and we get this when we emit a NETDEV_PRECHANGEUPPER event with a VLAN as its new upper: $ ip link add dummy1 type dummy $ ip link add link dummy1 name dummy1.100 type vlan id 100 [ 43.309174] ================================================================== [ 43.316456] BUG: KASAN: slab-out-of-bounds in dsa_user_prechangeupper+0x30/0xe8 [ 43.323835] Read of size 8 at addr ffff3f86481d2990 by task ip/374 [ 43.330058] [ 43.342436] Call trace: [ 43.366542] dsa_user_prechangeupper+0x30/0xe8 [ 43.371024] dsa_user_netdevice_event+0xb38/0xee8 [ 43.375768] notifier_call_chain+0xa4/0x210 [ 43.379985] raw_notifier_call_chain+0x24/0x38 [ 43.384464] __netdev_upper_dev_link+0x3ec/0x5d8 [ 43.389120] netdev_upper_dev_link+0x70/0xa8 [ 43.393424] register_vlan_dev+0x1bc/0x310 [ 43.397554] vlan_newlink+0x210/0x248 [ 43.401247] rtnl_newlink+0x9fc/0xe30 [ 43.404942] rtnetlink_rcv_msg+0x378/0x580 Avoid the kernel oops by dereferencing after the type check, as customary. Fixes: 4c3f80d22b2e ("net: dsa: walk through all changeupper notifier functions") Reported-and-tested-by: syzbot+d81bcd883824180500c8@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/0000000000001d4255060e87545c@google.com/ Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240110003354.2796778-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/user.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/dsa/user.c b/net/dsa/user.c index b738a466e2dc..b15e71cc342c 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -2806,13 +2806,14 @@ EXPORT_SYMBOL_GPL(dsa_user_dev_check); static int dsa_user_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { - struct dsa_port *dp = dsa_user_to_port(dev); struct netlink_ext_ack *extack; int err = NOTIFY_DONE; + struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return err; + dp = dsa_user_to_port(dev); extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_bridge_master(info->upper_dev)) { @@ -2865,11 +2866,13 @@ static int dsa_user_changeupper(struct net_device *dev, static int dsa_user_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { - struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return NOTIFY_DONE; + dp = dsa_user_to_port(dev); + if (netif_is_bridge_master(info->upper_dev) && !info->linking) dsa_port_pre_bridge_leave(dp, info->upper_dev); else if (netif_is_lag_master(info->upper_dev) && !info->linking) From 8722014311e613244f33952354956a82fa4b0472 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 9 Jan 2024 15:10:48 +0000 Subject: [PATCH 18/88] rxrpc: Fix use of Don't Fragment flag rxrpc normally has the Don't Fragment flag set on the UDP packets it transmits, except when it has decided that DATA packets aren't getting through - in which case it turns it off just for the DATA transmissions. This can be a problem, however, for RESPONSE packets that convey authentication and crypto data from the client to the server as ticket may be larger than can fit in the MTU. In such a case, rxrpc gets itself into an infinite loop as the sendmsg returns an error (EMSGSIZE), which causes rxkad_send_response() to return -EAGAIN - and the CHALLENGE packet is put back on the Rx queue to retry, leading to the I/O thread endlessly attempting to perform the transmission. Fix this by disabling DF on RESPONSE packets for now. The use of DF and best data MTU determination needs reconsidering at some point in the future. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Reported-by: Marc Dionne Signed-off-by: David Howells cc: linux-afs@lists.infradead.org Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/1581852.1704813048@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/local_object.c | 13 ++++++++++++- net/rxrpc/output.c | 6 ++---- net/rxrpc/rxkad.c | 2 ++ 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 2f8b39a614c3..dbeb75c29857 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1079,6 +1079,7 @@ void rxrpc_send_version_request(struct rxrpc_local *local, /* * local_object.c */ +void rxrpc_local_dont_fragment(const struct rxrpc_local *local, bool set); struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *); struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *, enum rxrpc_local_trace); struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *, enum rxrpc_local_trace); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index c553a30e9c83..34d307368135 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -36,6 +36,17 @@ static void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err, return ipv6_icmp_error(sk, skb, err, port, info, payload); } +/* + * Set or clear the Don't Fragment flag on a socket. + */ +void rxrpc_local_dont_fragment(const struct rxrpc_local *local, bool set) +{ + if (set) + ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DO); + else + ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DONT); +} + /* * Compare a local to an address. Return -ve, 0 or +ve to indicate less than, * same or greater than. @@ -203,7 +214,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) ip_sock_set_recverr(usk); /* we want to set the don't fragment bit */ - ip_sock_set_mtu_discover(usk, IP_PMTUDISC_DO); + rxrpc_local_dont_fragment(local, true); /* We want receive timestamps. */ sock_enable_timestamps(usk); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5e53429c6922..a0906145e829 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -494,14 +494,12 @@ send_fragmentable: switch (conn->local->srx.transport.family) { case AF_INET6: case AF_INET: - ip_sock_set_mtu_discover(conn->local->socket->sk, - IP_PMTUDISC_DONT); + rxrpc_local_dont_fragment(conn->local, false); rxrpc_inc_stat(call->rxnet, stat_tx_data_send_frag); ret = do_udp_sendmsg(conn->local->socket, &msg, len); conn->peer->last_tx_at = ktime_get_seconds(); - ip_sock_set_mtu_discover(conn->local->socket->sk, - IP_PMTUDISC_DO); + rxrpc_local_dont_fragment(conn->local, true); break; default: diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 1bf571a66e02..b52dedcebce0 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -724,7 +724,9 @@ static int rxkad_send_response(struct rxrpc_connection *conn, serial = atomic_inc_return(&conn->serial); whdr.serial = htonl(serial); + rxrpc_local_dont_fragment(conn->local, false); ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len); + rxrpc_local_dont_fragment(conn->local, true); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, rxrpc_tx_point_rxkad_response); From ec4ffd100ffb396eca13ebe7d18938ea80f399c3 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 8 Jan 2024 10:41:02 +0100 Subject: [PATCH 19/88] Revert "net: rtnetlink: Enslave device before bringing it up" This reverts commit a4abfa627c3865c37e036bccb681619a50d3d93c. The patch broke: > ip link set dummy0 up > ip link set dummy0 master bond0 down This last command is useful to be able to enslave an interface with only one netlink message. After discussion, there is no good reason to support: > ip link set dummy0 down > ip link set dummy0 master bond0 up because the bond interface already set the slave up when it is up. Cc: stable@vger.kernel.org Fixes: a4abfa627c38 ("net: rtnetlink: Enslave device before bringing it up") Signed-off-by: Nicolas Dichtel Reviewed-by: Jiri Pirko Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20240108094103.2001224-2-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5f6ed6da3cfc..f6f29eb03ec2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2899,13 +2899,6 @@ static int do_setlink(const struct sk_buff *skb, call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); - if (err) - goto errout; - status |= DO_SETLINK_MODIFIED; - } - if (ifm->ifi_flags || ifm->ifi_change) { err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), extack); @@ -2913,6 +2906,13 @@ static int do_setlink(const struct sk_buff *skb, goto errout; } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) From a159cbe81d3b88bf35cd4edb4e6040d015972fdd Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 8 Jan 2024 10:41:03 +0100 Subject: [PATCH 20/88] selftests: rtnetlink: check enslaving iface in a bond The goal is to check the following two sequences: > ip link set dummy0 up > ip link set dummy0 master bond0 down Signed-off-by: Nicolas Dichtel Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240108094103.2001224-3-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/rtnetlink.sh | 28 ++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index a10a32952f21..a31be0eaaa50 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -28,6 +28,7 @@ ALL_TESTS=" kci_test_neigh_get kci_test_bridge_parent_id kci_test_address_proto + kci_test_enslave_bonding " devdummy="test-dummy0" @@ -1241,6 +1242,33 @@ kci_test_address_proto() return $ret } +kci_test_enslave_bonding() +{ + local testns="testns" + local bond="bond123" + local dummy="dummy123" + local ret=0 + + run_cmd ip netns add "$testns" + if [ $ret -ne 0 ]; then + end_test "SKIP bonding tests: cannot add net namespace $testns" + return $ksft_skip + fi + + run_cmd ip -netns $testns link add dev $bond type bond mode balance-rr + run_cmd ip -netns $testns link add dev $dummy type dummy + run_cmd ip -netns $testns link set dev $dummy up + run_cmd ip -netns $testns link set dev $dummy master $bond down + if [ $ret -ne 0 ]; then + end_test "FAIL: initially up interface added to a bond and set down" + ip netns del "$testns" + return 1 + fi + + end_test "PASS: enslave interface in a bond" + ip netns del "$testns" +} + kci_test_rtnl() { local current_test From a0cb76a770083a22167659e64ba69af6425b1d9b Mon Sep 17 00:00:00 2001 From: Nithin Dabilpuram Date: Mon, 8 Jan 2024 13:00:36 +0530 Subject: [PATCH 21/88] octeontx2-af: CN10KB: Fix FIFO length calculation for RPM2 RPM0 and RPM1 on the CN10KB SoC have 8 LMACs each, whereas RPM2 has only 4 LMACs. Similarly, the RPM0 and RPM1 have 256KB FIFO, whereas RPM2 has 128KB FIFO. This patch fixes an issue with improper TX credit programming for the RPM2 link. Fixes: b9d0fedc6234 ("octeontx2-af: cn10kb: Add RPM_USX MAC support") Signed-off-by: Nithin Dabilpuram Signed-off-by: Naveen Mamindlapalli Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240108073036.8766-1-naveenm@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/rpm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c index 4728ba34b0e3..76218f1cb459 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c @@ -506,6 +506,7 @@ u32 rpm2_get_lmac_fifo_len(void *rpmd, int lmac_id) rpm_t *rpm = rpmd; u8 num_lmacs; u32 fifo_len; + u16 max_lmac; lmac_info = rpm_read(rpm, 0, RPM2_CMRX_RX_LMACS); /* LMACs are divided into two groups and each group @@ -513,7 +514,11 @@ u32 rpm2_get_lmac_fifo_len(void *rpmd, int lmac_id) * Group0 lmac_id range {0..3} * Group1 lmac_id range {4..7} */ - fifo_len = rpm->mac_ops->fifo_len / 2; + max_lmac = (rpm_read(rpm, 0, CGX_CONST) >> 24) & 0xFF; + if (max_lmac > 4) + fifo_len = rpm->mac_ops->fifo_len / 2; + else + fifo_len = rpm->mac_ops->fifo_len; if (lmac_id < 4) { num_lmacs = hweight8(lmac_info & 0xF); From e3fe8d28c67bf6c291e920c6d04fa22afa14e6e4 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 4 Jan 2024 10:09:02 +0800 Subject: [PATCH 22/88] =?UTF-8?q?virtio=5Fnet:=20Fix=20"=E2=80=98%d?= =?UTF-8?q?=E2=80=99=20directive=20writing=20between=201=20and=2011=20byte?= =?UTF-8?q?s=20into=20a=20region=20of=20size=2010"=20warnings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the warnings when building virtio_net driver. " drivers/net/virtio_net.c: In function ‘init_vqs’: drivers/net/virtio_net.c:4551:48: warning: ‘%d’ directive writing between 1 and 11 bytes into a region of size 10 [-Wformat-overflow=] 4551 | sprintf(vi->rq[i].name, "input.%d", i); | ^~ In function ‘virtnet_find_vqs’, inlined from ‘init_vqs’ at drivers/net/virtio_net.c:4645:8: drivers/net/virtio_net.c:4551:41: note: directive argument in the range [-2147483643, 65534] 4551 | sprintf(vi->rq[i].name, "input.%d", i); | ^~~~~~~~~~ drivers/net/virtio_net.c:4551:17: note: ‘sprintf’ output between 8 and 18 bytes into a destination of size 16 4551 | sprintf(vi->rq[i].name, "input.%d", i); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/net/virtio_net.c: In function ‘init_vqs’: drivers/net/virtio_net.c:4552:49: warning: ‘%d’ directive writing between 1 and 11 bytes into a region of size 9 [-Wformat-overflow=] 4552 | sprintf(vi->sq[i].name, "output.%d", i); | ^~ In function ‘virtnet_find_vqs’, inlined from ‘init_vqs’ at drivers/net/virtio_net.c:4645:8: drivers/net/virtio_net.c:4552:41: note: directive argument in the range [-2147483643, 65534] 4552 | sprintf(vi->sq[i].name, "output.%d", i); | ^~~~~~~~~~~ drivers/net/virtio_net.c:4552:17: note: ‘sprintf’ output between 9 and 19 bytes into a destination of size 16 4552 | sprintf(vi->sq[i].name, "output.%d", i); " Reviewed-by: Xuan Zhuo Signed-off-by: Zhu Yanjun Link: https://lore.kernel.org/r/20240104020902.2753599-1-yanjun.zhu@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3cb8aa193884..d7ce4a1011ea 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -4295,10 +4295,11 @@ static int virtnet_find_vqs(struct virtnet_info *vi) { vq_callback_t **callbacks; struct virtqueue **vqs; - int ret = -ENOMEM; - int i, total_vqs; const char **names; + int ret = -ENOMEM; + int total_vqs; bool *ctx; + u16 i; /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by @@ -4335,8 +4336,8 @@ static int virtnet_find_vqs(struct virtnet_info *vi) for (i = 0; i < vi->max_queue_pairs; i++) { callbacks[rxq2vq(i)] = skb_recv_done; callbacks[txq2vq(i)] = skb_xmit_done; - sprintf(vi->rq[i].name, "input.%d", i); - sprintf(vi->sq[i].name, "output.%d", i); + sprintf(vi->rq[i].name, "input.%u", i); + sprintf(vi->sq[i].name, "output.%u", i); names[rxq2vq(i)] = vi->rq[i].name; names[txq2vq(i)] = vi->sq[i].name; if (ctx) From 64e47d8afb5ca533b27efc006405e5bcae2c4a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sanju=C3=A1n=20Garc=C3=ADa=2C=20Jorge?= Date: Fri, 5 Jan 2024 08:55:43 +0000 Subject: [PATCH 23/88] net: ethernet: ti: am65-cpsw: Fix max mtu to fit ethernet frames The value of AM65_CPSW_MAX_PACKET_SIZE represents the maximum length of a received frame. This value is written to the register AM65_CPSW_PORT_REG_RX_MAXLEN. The maximum MTU configured on the network device should then leave some room for the ethernet headers and frame check. Otherwise, if the network interface is configured to its maximum mtu possible, the frames will be larger than AM65_CPSW_MAX_PACKET_SIZE and will get dropped as oversized. The switch supports ethernet frame sizes between 64 and 2024 bytes (including VLAN) as stated in the technical reference manual, so define AM65_CPSW_MAX_PACKET_SIZE with that maximum size. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Signed-off-by: Jorge Sanjuan Garcia Reviewed-by: Horatiu Vultur Reviewed-by: Siddharth Vadapalli Link: https://lore.kernel.org/r/20240105085530.14070-2-jorge.sanjuangarcia@duagon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index faa0561e988e..9d2f4ac783e4 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -56,7 +56,7 @@ #define AM65_CPSW_MAX_PORTS 8 #define AM65_CPSW_MIN_PACKET_SIZE VLAN_ETH_ZLEN -#define AM65_CPSW_MAX_PACKET_SIZE (VLAN_ETH_FRAME_LEN + ETH_FCS_LEN) +#define AM65_CPSW_MAX_PACKET_SIZE 2024 #define AM65_CPSW_REG_CTL 0x004 #define AM65_CPSW_REG_STAT_PORT_EN 0x014 @@ -2244,7 +2244,8 @@ am65_cpsw_nuss_init_port_ndev(struct am65_cpsw_common *common, u32 port_idx) eth_hw_addr_set(port->ndev, port->slave.mac_addr); port->ndev->min_mtu = AM65_CPSW_MIN_PACKET_SIZE; - port->ndev->max_mtu = AM65_CPSW_MAX_PACKET_SIZE; + port->ndev->max_mtu = AM65_CPSW_MAX_PACKET_SIZE - + (VLAN_ETH_HLEN + ETH_FCS_LEN); port->ndev->hw_features = NETIF_F_SG | NETIF_F_RXCSUM | NETIF_F_HW_CSUM | From bec161add35b478a7746bf58bcdea6faa19129ef Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 7 Jan 2024 14:42:41 +0000 Subject: [PATCH 24/88] amt: do not use overwrapped cb area amt driver uses skb->cb for storing tunnel information. This job is worked before TC layer and then amt driver load tunnel info from skb->cb after TC layer. So, its cb area should not be overwrapped with CB area used by TC. In order to not use cb area used by TC, it skips the biggest cb structure used by TC, which was qdisc_skb_cb. But it's not anymore. Currently, biggest structure of TC's CB is tc_skb_cb. So, it should skip size of tc_skb_cb instead of qdisc_skb_cb. Fixes: ec624fe740b4 ("net/sched: Extend qdisc control block with tc control block") Signed-off-by: Taehee Yoo Acked-by: Paolo Abeni Reviewed-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240107144241.4169520-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 53415e83821c..68e79b1272f6 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include @@ -80,11 +80,11 @@ static struct mld2_grec mldv2_zero_grec; static struct amt_skb_cb *amt_skb_cb(struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(struct amt_skb_cb) + sizeof(struct qdisc_skb_cb) > + BUILD_BUG_ON(sizeof(struct amt_skb_cb) + sizeof(struct tc_skb_cb) > sizeof_field(struct sk_buff, cb)); return (struct amt_skb_cb *)((void *)skb->cb + - sizeof(struct qdisc_skb_cb)); + sizeof(struct tc_skb_cb)); } static void __amt_source_gc_work(void) From acd66c2126eb9b5da2d89ae07dbcd73b909c2111 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Wed, 10 Jan 2024 12:37:30 +0100 Subject: [PATCH 25/88] net: micrel: Fix PTP frame parsing for lan8841 The HW has the capability to check each frame if it is a PTP frame, which domain it is, which ptp frame type it is, different ip address in the frame. And if one of these checks fail then the frame is not timestamp. Most of these checks were disabled except checking the field minorVersionPTP inside the PTP header. Meaning that once a partner sends a frame compliant to 8021AS which has minorVersionPTP set to 1, then the frame was not timestamp because the HW expected by default a value of 0 in minorVersionPTP. Fix this issue by removing this check so the userspace can decide on this. Fixes: cafc3662ee3f ("net: micrel: Add PHC support for lan8841") Signed-off-by: Horatiu Vultur Reviewed-by: Divya Koppera Reviewed-by: Rahul Rameshbabu Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index d2aa3d0695e3..bf4053431dcb 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -3338,8 +3338,10 @@ static int lan8814_probe(struct phy_device *phydev) #define LAN8841_ADC_CHANNEL_MASK 198 #define LAN8841_PTP_RX_PARSE_L2_ADDR_EN 370 #define LAN8841_PTP_RX_PARSE_IP_ADDR_EN 371 +#define LAN8841_PTP_RX_VERSION 374 #define LAN8841_PTP_TX_PARSE_L2_ADDR_EN 434 #define LAN8841_PTP_TX_PARSE_IP_ADDR_EN 435 +#define LAN8841_PTP_TX_VERSION 438 #define LAN8841_PTP_CMD_CTL 256 #define LAN8841_PTP_CMD_CTL_PTP_ENABLE BIT(2) #define LAN8841_PTP_CMD_CTL_PTP_DISABLE BIT(1) @@ -3383,6 +3385,12 @@ static int lan8841_config_init(struct phy_device *phydev) phy_write_mmd(phydev, KSZ9131RN_MMD_COMMON_CTRL_REG, LAN8841_PTP_RX_PARSE_IP_ADDR_EN, 0); + /* Disable checking for minorVersionPTP field */ + phy_write_mmd(phydev, KSZ9131RN_MMD_COMMON_CTRL_REG, + LAN8841_PTP_RX_VERSION, 0xff00); + phy_write_mmd(phydev, KSZ9131RN_MMD_COMMON_CTRL_REG, + LAN8841_PTP_TX_VERSION, 0xff00); + /* 100BT Clause 40 improvenent errata */ phy_write_mmd(phydev, LAN8841_MMD_ANALOG_REG, LAN8841_ANALOG_CONTROL_1, From e398822c4751017fe401f57409488f5948d12fb5 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Fri, 5 Jan 2024 10:52:42 +0200 Subject: [PATCH 26/88] net: phy: micrel: populate .soft_reset for KSZ9131 The RZ/G3S SMARC Module has 2 KSZ9131 PHYs. In this setup, the KSZ9131 PHY is used with the ravb Ethernet driver. It has been discovered that when bringing the Ethernet interface down/up continuously, e.g., with the following sh script: $ while :; do ifconfig eth0 down; ifconfig eth0 up; done the link speed and duplex are wrong after interrupting the bring down/up operation even though the Ethernet interface is up. To recover from this state the following configuration sequence is necessary (executed manually): $ ifconfig eth0 down $ ifconfig eth0 up The behavior has been identified also on the Microchip SAMA7G5-EK board which runs the macb driver and uses the same PHY. The order of PHY-related operations in ravb_open() is as follows: ravb_open() -> ravb_phy_start() -> ravb_phy_init() -> of_phy_connect() -> phy_connect_direct() -> phy_attach_direct() -> phy_init_hw() -> phydev->drv->soft_reset() phydev->drv->config_init() phydev->drv->config_intr() phy_resume() kszphy_resume() The order of PHY-related operations in ravb_close is as follows: ravb_close() -> phy_stop() -> phy_suspend() -> kszphy_suspend() -> genphy_suspend() // set BMCR_PDOWN bit in MII_BMCR In genphy_suspend() setting the BMCR_PDWN bit in MII_BMCR switches the PHY to Software Power-Down (SPD) mode (according to the KSZ9131 datasheet). Thus, when opening the interface after it has been previously closed (via ravb_close()), the phydev->drv->config_init() and phydev->drv->config_intr() reach the KSZ9131 PHY driver via the ksz9131_config_init() and kszphy_config_intr() functions. KSZ9131 specifies that the MII management interface remains operational during SPD (Software Power-Down), but (according to manual): - Only access to the standard registers (0 through 31) is supported. - Access to MMD address spaces other than MMD address space 1 is possible if the spd_clock_gate_override bit is set. - Access to MMD address space 1 is not possible. The spd_clock_gate_override bit is not used in the KSZ9131 driver. ksz9131_config_init() configures RGMII delay, pad skews and LEDs by accessesing MMD registers other than those in address space 1. The datasheet for the KSZ9131 does not specify what happens if registers from an unsupported address space are accessed while the PHY is in SPD. To fix the issue the .soft_reset method has been instantiated for KSZ9131, too. This resets the PHY to the default state before doing any configurations to it, thus switching it out of SPD. Fixes: bff5b4b37372 ("net: phy: micrel: add Microchip KSZ9131 initial driver") Signed-off-by: Claudiu Beznea Reviewed-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index bf4053431dcb..81c20eb4b54b 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -4847,6 +4847,7 @@ static struct phy_driver ksphy_driver[] = { .flags = PHY_POLL_CABLE_TEST, .driver_data = &ksz9131_type, .probe = kszphy_probe, + .soft_reset = genphy_soft_reset, .config_init = ksz9131_config_init, .config_intr = kszphy_config_intr, .config_aneg = ksz9131_config_aneg, From 907ee6681788556b9ade3ad0a1f6f4aea192399c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2024 11:33:11 -0800 Subject: [PATCH 27/88] net: fill in MODULE_DESCRIPTION()s for wx_lib W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add a description to Wangxun's common code lib. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 23355cc408fd..8706223a6e5a 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -2769,4 +2769,5 @@ void wx_set_ring(struct wx *wx, u32 new_tx_count, } EXPORT_SYMBOL(wx_set_ring); +MODULE_DESCRIPTION("Common library for Wangxun(R) Ethernet drivers."); MODULE_LICENSE("GPL"); From cbdd50ec8b1daef6d71fb3325e436c8dec2d2e15 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 11 Jan 2024 19:24:29 +0300 Subject: [PATCH 28/88] net: liquidio: fix clang-specific W=1 build warnings When compiling with clang-18 and W=1, I've noticed the following warnings: drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c:1493:16: warning: cast from 'void (*)(struct octeon_device *, struct octeon_mbox_cmd *, void *)' to 'octeon_mbox_callback_t' (aka 'void (*)(void *, void *, void *)') converts to incompatible function type [-Wcast-function-type-strict] 1493 | mbox_cmd.fn = (octeon_mbox_callback_t)cn23xx_get_vf_stats_callback; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ and: drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c:432:16: warning: cast from 'void (*)(struct octeon_device *, struct octeon_mbox_cmd *, void *)' to 'octeon_mbox_callback_t' (aka 'void (*)(void *, void *, void *)') converts to incompatible function type [-Wcast-function-type-strict] 432 | mbox_cmd.fn = (octeon_mbox_callback_t)octeon_pfvf_hs_callback; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix both of the above by adjusting 'octeon_mbox_callback_t' to match actual callback definitions (at the cost of adding an extra forward declaration). Signed-off-by: Dmitry Antipov Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240111162432.124014-1-dmantipov@yandex.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c | 2 +- drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c | 2 +- drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c index 068ed52b66c9..b3c81a2e9d46 100644 --- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c +++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c @@ -1490,7 +1490,7 @@ int cn23xx_get_vf_stats(struct octeon_device *oct, int vfidx, mbox_cmd.q_no = vfidx * oct->sriov_info.rings_per_vf; mbox_cmd.recv_len = 0; mbox_cmd.recv_status = 0; - mbox_cmd.fn = (octeon_mbox_callback_t)cn23xx_get_vf_stats_callback; + mbox_cmd.fn = cn23xx_get_vf_stats_callback; ctx.stats = stats; atomic_set(&ctx.status, 0); mbox_cmd.fn_arg = (void *)&ctx; diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c b/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c index dd5d80fee24f..d2fcb3da484e 100644 --- a/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c +++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c @@ -429,7 +429,7 @@ int cn23xx_octeon_pfvf_handshake(struct octeon_device *oct) mbox_cmd.q_no = 0; mbox_cmd.recv_len = 0; mbox_cmd.recv_status = 0; - mbox_cmd.fn = (octeon_mbox_callback_t)octeon_pfvf_hs_callback; + mbox_cmd.fn = octeon_pfvf_hs_callback; mbox_cmd.fn_arg = &status; octeon_mbox_write(oct, &mbox_cmd); diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h b/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h index d92bd7e16477..9ac85d22c615 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h @@ -57,7 +57,10 @@ union octeon_mbox_message { } s; }; -typedef void (*octeon_mbox_callback_t)(void *, void *, void *); +struct octeon_mbox_cmd; + +typedef void (*octeon_mbox_callback_t)(struct octeon_device *, + struct octeon_mbox_cmd *, void *); struct octeon_mbox_cmd { union octeon_mbox_message msg; From 89e23277f9c16df6f9f9c1a1a07f8f132339c15c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2024 19:49:13 +0000 Subject: [PATCH 29/88] mptcp: mptcp_parse_option() fix for MPTCPOPT_MP_JOIN mptcp_parse_option() currently sets OPTIONS_MPTCP_MPJ, for the three possible cases handled for MPTCPOPT_MP_JOIN option. OPTIONS_MPTCP_MPJ is the combination of three flags: - OPTION_MPTCP_MPJ_SYN - OPTION_MPTCP_MPJ_SYNACK - OPTION_MPTCP_MPJ_ACK This is a problem, because backup, join_id, token, nonce and/or hmac fields could be left uninitialized in some cases. Distinguish the three cases, as following patches will need this step. Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Peter Krystad Cc: Matthieu Baerts Cc: Mat Martineau Cc: Geliang Tang Reviewed-by: Simon Horman Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20240111194917.4044654-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index c53914012d01..d2527d189a79 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -123,8 +123,8 @@ static void mptcp_parse_option(const struct sk_buff *skb, break; case MPTCPOPT_MP_JOIN: - mp_opt->suboptions |= OPTIONS_MPTCP_MPJ; if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { + mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYN; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->token = get_unaligned_be32(ptr); @@ -135,6 +135,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->backup, mp_opt->join_id, mp_opt->token, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { + mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYNACK; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->thmac = get_unaligned_be64(ptr); @@ -145,11 +146,10 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->backup, mp_opt->join_id, mp_opt->thmac, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { + mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK; ptr += 2; memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); pr_debug("MP_JOIN hmac"); - } else { - mp_opt->suboptions &= ~OPTIONS_MPTCP_MPJ; } break; From c1665273bdc7c201766c65e561c06711f2e050dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2024 19:49:14 +0000 Subject: [PATCH 30/88] mptcp: strict validation before using mp_opt->hmac mp_opt->hmac contains uninitialized data unless OPTION_MPTCP_MPJ_ACK was set in mptcp_parse_option(). We must refine the condition before we call subflow_hmac_valid(). Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Peter Krystad Cc: Matthieu Baerts Cc: Mat Martineau Cc: Geliang Tang Reviewed-by: Simon Horman Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20240111194917.4044654-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3eacd04e7099..bb05477006a6 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -788,7 +788,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); - if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) || + if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) || !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); From be1d9d9d38da922bd4beeec5b6dd821ff5a1dfeb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2024 19:49:15 +0000 Subject: [PATCH 31/88] mptcp: use OPTION_MPTCP_MPJ_SYNACK in subflow_finish_connect() subflow_finish_connect() uses four fields (backup, join_id, thmac, none) that may contain garbage unless OPTION_MPTCP_MPJ_SYNACK has been set in mptcp_parse_option() Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Peter Krystad Cc: Matthieu Baerts Cc: Mat Martineau Cc: Geliang Tang Reviewed-by: Simon Horman Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20240111194917.4044654-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bb05477006a6..103ff5471993 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -506,7 +506,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; - if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ)) { + if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) { subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } From 66ff70df1a919a066942844bb095d6fcb748d78d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2024 19:49:16 +0000 Subject: [PATCH 32/88] mptcp: use OPTION_MPTCP_MPJ_SYN in subflow_check_req() syzbot reported that subflow_check_req() was using uninitialized data in subflow_check_req() [1] This is because mp_opt.token is only set when OPTION_MPTCP_MPJ_SYN is also set. While we are are it, fix mptcp_subflow_init_cookie_req() to test for OPTION_MPTCP_MPJ_ACK. [1] BUG: KMSAN: uninit-value in subflow_token_join_request net/mptcp/subflow.c:91 [inline] BUG: KMSAN: uninit-value in subflow_check_req+0x1028/0x15d0 net/mptcp/subflow.c:209 subflow_token_join_request net/mptcp/subflow.c:91 [inline] subflow_check_req+0x1028/0x15d0 net/mptcp/subflow.c:209 subflow_v6_route_req+0x269/0x410 net/mptcp/subflow.c:367 tcp_conn_request+0x153a/0x4240 net/ipv4/tcp_input.c:7164 subflow_v6_conn_request+0x3ee/0x510 tcp_rcv_state_process+0x2e1/0x4ac0 net/ipv4/tcp_input.c:6659 tcp_v6_do_rcv+0x11bf/0x1fe0 net/ipv6/tcp_ipv6.c:1669 tcp_v6_rcv+0x480b/0x4fb0 net/ipv6/tcp_ipv6.c:1900 ip6_protocol_deliver_rcu+0xda6/0x2a60 net/ipv6/ip6_input.c:438 ip6_input_finish net/ipv6/ip6_input.c:483 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] ip6_input+0x15d/0x430 net/ipv6/ip6_input.c:492 dst_input include/net/dst.h:461 [inline] ip6_rcv_finish+0x5db/0x870 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:314 [inline] ipv6_rcv+0xda/0x390 net/ipv6/ip6_input.c:310 __netif_receive_skb_one_core net/core/dev.c:5532 [inline] __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5646 netif_receive_skb_internal net/core/dev.c:5732 [inline] netif_receive_skb+0x58/0x660 net/core/dev.c:5791 tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555 tun_get_user+0x53af/0x66d0 drivers/net/tun.c:2002 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2020 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x8ef/0x1490 fs/read_write.c:584 ksys_write+0x20f/0x4c0 fs/read_write.c:637 __do_sys_write fs/read_write.c:649 [inline] __se_sys_write fs/read_write.c:646 [inline] __x64_sys_write+0x93/0xd0 fs/read_write.c:646 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Local variable mp_opt created at: subflow_check_req+0x6d/0x15d0 net/mptcp/subflow.c:145 subflow_v6_route_req+0x269/0x410 net/mptcp/subflow.c:367 CPU: 1 PID: 5924 Comm: syz-executor.3 Not tainted 6.7.0-rc8-syzkaller-00055-g5eff55d725a4 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Peter Krystad Cc: Matthieu Baerts Cc: Mat Martineau Cc: Geliang Tang Reviewed-by: Simon Horman Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20240111194917.4044654-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 103ff5471993..13039ac8d1ab 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -158,7 +158,7 @@ static int subflow_check_req(struct request_sock *req, mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); - opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); + opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN); if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); @@ -255,7 +255,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); - opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); + opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK); if (opt_mp_capable && opt_mp_join) return -EINVAL; From 724b00c12957973656d312dce2a110c75ae2c680 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2024 19:49:17 +0000 Subject: [PATCH 33/88] mptcp: refine opt_mp_capable determination OPTIONS_MPTCP_MPC is a combination of three flags. It would be better to be strict about testing what flag is expected, at least for code readability. mptcp_parse_option() already makes the distinction. - subflow_check_req() should use OPTION_MPTCP_MPC_SYN. - mptcp_subflow_init_cookie_req() should use OPTION_MPTCP_MPC_ACK. - subflow_finish_connect() should use OPTION_MPTCP_MPC_SYNACK - subflow_syn_recv_sock should use OPTION_MPTCP_MPC_ACK Suggested-by: Paolo Abeni Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Fixes: 74c7dfbee3e1 ("mptcp: consolidate in_opt sub-options fields in a bitmask") Link: https://lore.kernel.org/r/20240111194917.4044654-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 13039ac8d1ab..1117d1e84274 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -157,7 +157,7 @@ static int subflow_check_req(struct request_sock *req, mptcp_get_options(skb, &mp_opt); - opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); + opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN); opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN); if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); @@ -254,7 +254,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, subflow_init_req(req, sk_listener); mptcp_get_options(skb, &mp_opt); - opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); + opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK); opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK); if (opt_mp_capable && opt_mp_join) return -EINVAL; @@ -486,7 +486,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { - if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { + if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); mptcp_do_fallback(sk); @@ -783,7 +783,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * options. */ mptcp_get_options(skb, &mp_opt); - if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) + if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK)) fallback = true; } else if (subflow_req->mp_join) { From 08300adac3b8dab9e2fd3be0155c7d3093c755f4 Mon Sep 17 00:00:00 2001 From: Sneh Shah Date: Tue, 9 Jan 2024 20:17:29 +0530 Subject: [PATCH 34/88] net: stmmac: Fix ethool link settings ops for integrated PCS Currently get/set_link_ksettings ethtool ops are dependent on PCS. When PCS is integrated, it will not have separate link config. Bypass configuring and checking PCS for integrated PCS. Fixes: aa571b6275fb ("net: stmmac: add new switch to struct plat_stmmacenet_data") Tested-by: Andrew Halaney # sa8775p-ride Signed-off-by: Sneh Shah Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index dd05437b51f9..0f8bfba4443d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -321,8 +321,9 @@ static int stmmac_ethtool_get_link_ksettings(struct net_device *dev, { struct stmmac_priv *priv = netdev_priv(dev); - if (priv->hw->pcs & STMMAC_PCS_RGMII || - priv->hw->pcs & STMMAC_PCS_SGMII) { + if (!(priv->plat->flags & STMMAC_FLAG_HAS_INTEGRATED_PCS) && + (priv->hw->pcs & STMMAC_PCS_RGMII || + priv->hw->pcs & STMMAC_PCS_SGMII)) { struct rgmii_adv adv; u32 supported, advertising, lp_advertising; @@ -407,8 +408,9 @@ stmmac_ethtool_set_link_ksettings(struct net_device *dev, { struct stmmac_priv *priv = netdev_priv(dev); - if (priv->hw->pcs & STMMAC_PCS_RGMII || - priv->hw->pcs & STMMAC_PCS_SGMII) { + if (!(priv->plat->flags & STMMAC_FLAG_HAS_INTEGRATED_PCS) && + (priv->hw->pcs & STMMAC_PCS_RGMII || + priv->hw->pcs & STMMAC_PCS_SGMII)) { /* Only support ANE */ if (cmd->base.autoneg != AUTONEG_ENABLE) return -EINVAL; From 482521d8e0c6520429478aa6866cd44128b33d5d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jan 2024 10:44:27 +0000 Subject: [PATCH 35/88] udp: annotate data-races around up->pending up->pending can be read without holding the socket lock, as pointed out by syzbot [1] Add READ_ONCE() in lockless contexts, and WRITE_ONCE() on write side. [1] BUG: KCSAN: data-race in udpv6_sendmsg / udpv6_sendmsg write to 0xffff88814e5eadf0 of 4 bytes by task 15547 on cpu 1: udpv6_sendmsg+0x1405/0x1530 net/ipv6/udp.c:1596 inet6_sendmsg+0x63/0x80 net/ipv6/af_inet6.c:657 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] __sys_sendto+0x257/0x310 net/socket.c:2192 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __x64_sys_sendto+0x78/0x90 net/socket.c:2200 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b read to 0xffff88814e5eadf0 of 4 bytes by task 15551 on cpu 0: udpv6_sendmsg+0x22c/0x1530 net/ipv6/udp.c:1373 inet6_sendmsg+0x63/0x80 net/ipv6/af_inet6.c:657 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x37c/0x4d0 net/socket.c:2586 ___sys_sendmsg net/socket.c:2640 [inline] __sys_sendmmsg+0x269/0x500 net/socket.c:2726 __do_sys_sendmmsg net/socket.c:2755 [inline] __se_sys_sendmmsg net/socket.c:2752 [inline] __x64_sys_sendmmsg+0x57/0x60 net/socket.c:2752 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b value changed: 0x00000000 -> 0x0000000a Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 15551 Comm: syz-executor.1 Tainted: G W 6.7.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+8d482d0e407f665d9d10@syzkaller.appspotmail.com Link: https://lore.kernel.org/netdev/0000000000009e46c3060ebcdffd@google.com/ Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/udp.c | 12 ++++++------ net/ipv6/udp.c | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 89e5a806b82e..5f742d0b9e07 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -805,7 +805,7 @@ void udp_flush_pending_frames(struct sock *sk) if (up->pending) { up->len = 0; - up->pending = 0; + WRITE_ONCE(up->pending, 0); ip_flush_pending_frames(sk); } } @@ -993,7 +993,7 @@ int udp_push_pending_frames(struct sock *sk) out: up->len = 0; - up->pending = 0; + WRITE_ONCE(up->pending, 0); return err; } EXPORT_SYMBOL(udp_push_pending_frames); @@ -1070,7 +1070,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; - if (up->pending) { + if (READ_ONCE(up->pending)) { /* * There are pending frames. * The socket lock must be held while it's corked. @@ -1269,7 +1269,7 @@ back_from_confirm: fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = inet->inet_sport; - up->pending = AF_INET; + WRITE_ONCE(up->pending, AF_INET); do_append_data: up->len += ulen; @@ -1281,7 +1281,7 @@ do_append_data: else if (!corkreq) err = udp_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) - up->pending = 0; + WRITE_ONCE(up->pending, 0); release_sock(sk); out: @@ -1319,7 +1319,7 @@ void udp_splice_eof(struct socket *sock) struct sock *sk = sock->sk; struct udp_sock *up = udp_sk(sk); - if (!up->pending || udp_test_bit(CORK, sk)) + if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk)) return; lock_sock(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 594e3f23c129..3f2249b4cd5f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1135,7 +1135,7 @@ static void udp_v6_flush_pending_frames(struct sock *sk) udp_flush_pending_frames(sk); else if (up->pending) { up->len = 0; - up->pending = 0; + WRITE_ONCE(up->pending, 0); ip6_flush_pending_frames(sk); } } @@ -1313,7 +1313,7 @@ static int udp_v6_push_pending_frames(struct sock *sk) &inet_sk(sk)->cork.base); out: up->len = 0; - up->pending = 0; + WRITE_ONCE(up->pending, 0); return err; } @@ -1370,7 +1370,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) default: return -EINVAL; } - } else if (!up->pending) { + } else if (!READ_ONCE(up->pending)) { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; @@ -1401,8 +1401,8 @@ do_udp_sendmsg: return -EMSGSIZE; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; - if (up->pending) { - if (up->pending == AF_INET) + if (READ_ONCE(up->pending)) { + if (READ_ONCE(up->pending) == AF_INET) return udp_sendmsg(sk, msg, len); /* * There are pending frames. @@ -1593,7 +1593,7 @@ back_from_confirm: goto out; } - up->pending = AF_INET6; + WRITE_ONCE(up->pending, AF_INET6); do_append_data: if (ipc6.dontfrag < 0) @@ -1607,7 +1607,7 @@ do_append_data: else if (!corkreq) err = udp_v6_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) - up->pending = 0; + WRITE_ONCE(up->pending, 0); if (err > 0) err = inet6_test_bit(RECVERR6, sk) ? net_xmit_errno(err) : 0; @@ -1648,7 +1648,7 @@ static void udpv6_splice_eof(struct socket *sock) struct sock *sk = sock->sk; struct udp_sock *up = udp_sk(sk); - if (!up->pending || udp_test_bit(CORK, sk)) + if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk)) return; lock_sock(sk); From e18405d0be8001fa4c5f9e61471f6ffd59c7a1b3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 12 Jan 2024 12:39:30 +0100 Subject: [PATCH 36/88] net: sched: track device in tcf_block_get/put_ext() only for clsact binder types Clsact/ingress qdisc is not the only one using shared block, red is also using it. The device tracking was originally introduced by commit 913b47d3424e ("net/sched: Introduce tc block netdev tracking infra") for clsact/ingress only. Commit 94e2557d086a ("net: sched: move block device tracking into tcf_block_get/put_ext()") mistakenly enabled that for red as well. Fix that by adding a check for the binder type being clsact when adding device to the block->ports xarray. Reported-by: Ido Schimmel Closes: https://lore.kernel.org/all/ZZ6JE0odnu1lLPtu@shredder/ Fixes: 94e2557d086a ("net: sched: move block device tracking into tcf_block_get/put_ext()") Signed-off-by: Jiri Pirko Tested-by: Ido Schimmel Acked-by: Jamal Hadi Salim Tested-by: Victor Nogueira Signed-off-by: David S. Miller --- net/sched/cls_api.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e3236a3169c3..92a12e3d0fe6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1424,6 +1424,14 @@ static void tcf_block_owner_del(struct tcf_block *block, WARN_ON(1); } +static bool tcf_block_tracks_dev(struct tcf_block *block, + struct tcf_block_ext_info *ei) +{ + return tcf_block_shared(block) && + (ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS || + ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS); +} + int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) @@ -1462,7 +1470,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, if (err) goto err_block_offload_bind; - if (tcf_block_shared(block)) { + if (tcf_block_tracks_dev(block, ei)) { err = xa_insert(&block->ports, dev->ifindex, dev, GFP_KERNEL); if (err) { NL_SET_ERR_MSG(extack, "block dev insert failed"); @@ -1516,7 +1524,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, if (!block) return; - if (tcf_block_shared(block)) + if (tcf_block_tracks_dev(block, ei)) xa_erase(&block->ports, dev->ifindex); tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); From 9181d6f8a2bb32d158de66a84164fac05e3ddd18 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jan 2024 12:28:16 +0000 Subject: [PATCH 37/88] net: add more sanity check in virtio_net_hdr_to_skb() syzbot/KMSAN reports access to uninitialized data from gso_features_check() [1] The repro use af_packet, injecting a gso packet and hdrlen == 0. We could fix the issue making gso_features_check() more careful while dealing with NETIF_F_TSO_MANGLEID in fast path. Or we can make sure virtio_net_hdr_to_skb() pulls minimal network and transport headers as intended. Note that for GSO packets coming from untrusted sources, SKB_GSO_DODGY bit forces a proper header validation (and pull) before the packet can hit any device ndo_start_xmit(), thus we do not need a precise disection at virtio_net_hdr_to_skb() stage. [1] BUG: KMSAN: uninit-value in skb_gso_segment include/net/gso.h:83 [inline] BUG: KMSAN: uninit-value in validate_xmit_skb+0x10f2/0x1930 net/core/dev.c:3629 skb_gso_segment include/net/gso.h:83 [inline] validate_xmit_skb+0x10f2/0x1930 net/core/dev.c:3629 __dev_queue_xmit+0x1eac/0x5130 net/core/dev.c:4341 dev_queue_xmit include/linux/netdevice.h:3134 [inline] packet_xmit+0x9c/0x6b0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3087 [inline] packet_sendmsg+0x8b1d/0x9f30 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x318/0x740 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6334 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2780 packet_alloc_skb net/packet/af_packet.c:2936 [inline] packet_snd net/packet/af_packet.c:3030 [inline] packet_sendmsg+0x70e8/0x9f30 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 0 PID: 5025 Comm: syz-executor279 Not tainted 6.7.0-rc7-syzkaller-00003-gfbafc3e621c3 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 Reported-by: syzbot+7f4d0ea3df4d4fa9a65f@syzkaller.appspotmail.com Link: https://lore.kernel.org/netdev/0000000000005abd7b060eb160cd@google.com/ Fixes: 9274124f023b ("net: stricter validation of untrusted gso packets") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 27cc1d464321..4dfa9b69ca8d 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -3,6 +3,8 @@ #define _LINUX_VIRTIO_NET_H #include +#include +#include #include #include #include @@ -49,6 +51,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) { + unsigned int nh_min_len = sizeof(struct iphdr); unsigned int gso_type = 0; unsigned int thlen = 0; unsigned int p_off = 0; @@ -65,6 +68,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, gso_type = SKB_GSO_TCPV6; ip_proto = IPPROTO_TCP; thlen = sizeof(struct tcphdr); + nh_min_len = sizeof(struct ipv6hdr); break; case VIRTIO_NET_HDR_GSO_UDP: gso_type = SKB_GSO_UDP; @@ -100,7 +104,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; - p_off = skb_transport_offset(skb) + thlen; + nh_min_len = max_t(u32, nh_min_len, skb_transport_offset(skb)); + p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } else { @@ -140,7 +145,7 @@ retry: skb_set_transport_header(skb, keys.control.thoff); } else if (gso_type) { - p_off = thlen; + p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } From 894d7508316e7ad722df597d68b4b1797a9eee11 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 12 Jan 2024 17:13:14 +0100 Subject: [PATCH 38/88] net: netdev_queue: netdev_txq_completed_mb(): fix wake condition netif_txq_try_stop() uses "get_desc >= start_thrs" as the check for the call to netif_tx_start_queue(). Use ">=" i netdev_txq_completed_mb(), too. Fixes: c91c46de6bbc ("net: provide macros for commonly copied lockless queue stop/wake code") Signed-off-by: Marc Kleine-Budde Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netdev_queues.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index d68b0a483431..8b8ed4e13d74 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -128,7 +128,7 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue, netdev_txq_completed_mb(txq, pkts, bytes); \ \ _res = -1; \ - if (pkts && likely(get_desc > start_thrs)) { \ + if (pkts && likely(get_desc >= start_thrs)) { \ _res = 1; \ if (unlikely(netif_tx_queue_stopped(txq)) && \ !(down_cond)) { \ From 19ca0823f6eaad01d18f664a00550abe912c034c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 12 Jan 2024 11:05:28 -0800 Subject: [PATCH 39/88] bpf: iter_udp: Retry with a larger batch size without going back to the previous bucket The current logic is to use a default size 16 to batch the whole bucket. If it is too small, it will retry with a larger batch size. The current code accidentally does a state->bucket-- before retrying. This goes back to retry with the previous bucket which has already been done. This patch fixed it. It is hard to create a selftest. I added a WARN_ON(state->bucket < 0), forced a particular port to be hashed to the first bucket, created >16 sockets, and observed the for-loop went back to the "-1" bucket. Cc: Aditi Ghag Fixes: c96dac8d369f ("bpf: udp: Implement batching for sockets iterator") Acked-by: Yonghong Song Signed-off-by: Martin KaFai Lau Reviewed-by: Aditi Ghag Link: https://lore.kernel.org/r/20240112190530.3751661-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- net/ipv4/udp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5f742d0b9e07..79050d83e736 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3213,7 +3213,6 @@ again: /* After allocating a larger batch, retry one more time to grab * the whole bucket. */ - state->bucket--; goto again; } done: From 2242fd537fab52d5f4d2fbb1845f047c01fad0cf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 12 Jan 2024 11:05:29 -0800 Subject: [PATCH 40/88] bpf: Avoid iter->offset making backward progress in bpf_iter_udp There is a bug in the bpf_iter_udp_batch() function that stops the userspace from making forward progress. The case that triggers the bug is the userspace passed in a very small read buffer. When the bpf prog does bpf_seq_printf, the userspace read buffer is not enough to capture the whole bucket. When the read buffer is not large enough, the kernel will remember the offset of the bucket in iter->offset such that the next userspace read() can continue from where it left off. The kernel will skip the number (== "iter->offset") of sockets in the next read(). However, the code directly decrements the "--iter->offset". This is incorrect because the next read() may not consume the whole bucket either and then the next-next read() will start from offset 0. The net effect is the userspace will keep reading from the beginning of a bucket and the process will never finish. "iter->offset" must always go forward until the whole bucket is consumed. This patch fixes it by using a local variable "resume_offset" and "resume_bucket". "iter->offset" is always reset to 0 before it may be used. "iter->offset" will be advanced to the "resume_offset" when it continues from the "resume_bucket" (i.e. "state->bucket == resume_bucket"). This brings it closer to the bpf_iter_tcp's offset handling which does not suffer the same bug. Cc: Aditi Ghag Fixes: c96dac8d369f ("bpf: udp: Implement batching for sockets iterator") Acked-by: Yonghong Song Signed-off-by: Martin KaFai Lau Reviewed-by: Aditi Ghag Link: https://lore.kernel.org/r/20240112190530.3751661-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- net/ipv4/udp.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 79050d83e736..148ffb007969 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3137,16 +3137,18 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq) struct bpf_udp_iter_state *iter = seq->private; struct udp_iter_state *state = &iter->state; struct net *net = seq_file_net(seq); + int resume_bucket, resume_offset; struct udp_table *udptable; unsigned int batch_sks = 0; bool resized = false; struct sock *sk; + resume_bucket = state->bucket; + resume_offset = iter->offset; + /* The current batch is done, so advance the bucket. */ - if (iter->st_bucket_done) { + if (iter->st_bucket_done) state->bucket++; - iter->offset = 0; - } udptable = udp_get_table_seq(seq, net); @@ -3166,19 +3168,19 @@ again: for (; state->bucket <= udptable->mask; state->bucket++) { struct udp_hslot *hslot2 = &udptable->hash2[state->bucket]; - if (hlist_empty(&hslot2->head)) { - iter->offset = 0; + if (hlist_empty(&hslot2->head)) continue; - } + iter->offset = 0; spin_lock_bh(&hslot2->lock); udp_portaddr_for_each_entry(sk, &hslot2->head) { if (seq_sk_match(seq, sk)) { /* Resume from the last iterated socket at the * offset in the bucket before iterator was stopped. */ - if (iter->offset) { - --iter->offset; + if (state->bucket == resume_bucket && + iter->offset < resume_offset) { + ++iter->offset; continue; } if (iter->end_sk < iter->max_sk) { @@ -3192,9 +3194,6 @@ again: if (iter->end_sk) break; - - /* Reset the current bucket's offset before moving to the next bucket. */ - iter->offset = 0; } /* All done: no batch made. */ From dbd7db7787ba1743a505a495e479550932836fa4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 12 Jan 2024 11:05:30 -0800 Subject: [PATCH 41/88] selftests/bpf: Test udp and tcp iter batching The patch adds a test to exercise the bpf_iter_udp batching logic. It specifically tests the case that there are multiple so_reuseport udp_sk in a bucket of the udp_table. The test creates two sets of so_reuseport sockets and each set on a different port. Meaning there will be two buckets in the udp_table. The test does the following: 1. read() 3 out of 4 sockets in the first bucket. 2. close() all sockets in the first bucket. This will ensure the current bucket's offset in the kernel does not affect the read() of the following bucket. 3. read() all 4 sockets in the second bucket. The test also reads one udp_sk at a time from the bpf_iter_udp prog. The true case in "do_test(..., bool onebyone)". This is the buggy case that the previous patch fixed. It also tests the "false" case in "do_test(..., bool onebyone)", meaning the userspace reads the whole bucket. There is no bug in this case but adding this test also while at it. Considering the way to have multiple tcp_sk in the same bucket is similar (by using so_reuseport), this patch also tests the bpf_iter_tcp even though the bpf_iter_tcp batching logic works correctly. Both IP v4 and v6 are exercising the same bpf_iter batching code path, so only v6 is tested. Acked-by: Yonghong Song Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240112190530.3751661-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/sock_iter_batch.c | 135 ++++++++++++++++++ .../selftests/bpf/progs/bpf_tracing_net.h | 3 + .../selftests/bpf/progs/sock_iter_batch.c | 91 ++++++++++++ .../testing/selftests/bpf/progs/test_jhash.h | 31 ++++ 4 files changed, 260 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c create mode 100644 tools/testing/selftests/bpf/progs/sock_iter_batch.c diff --git a/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c b/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c new file mode 100644 index 000000000000..0c365f36c73b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Meta + +#include +#include "network_helpers.h" +#include "sock_iter_batch.skel.h" + +#define TEST_NS "sock_iter_batch_netns" + +static const int nr_soreuse = 4; + +static void do_test(int sock_type, bool onebyone) +{ + int err, i, nread, to_read, total_read, iter_fd = -1; + int first_idx, second_idx, indices[nr_soreuse]; + struct bpf_link *link = NULL; + struct sock_iter_batch *skel; + int *fds[2] = {}; + + skel = sock_iter_batch__open(); + if (!ASSERT_OK_PTR(skel, "sock_iter_batch__open")) + return; + + /* Prepare 2 buckets of sockets in the kernel hashtable */ + for (i = 0; i < ARRAY_SIZE(fds); i++) { + int local_port; + + fds[i] = start_reuseport_server(AF_INET6, sock_type, "::1", 0, 0, + nr_soreuse); + if (!ASSERT_OK_PTR(fds[i], "start_reuseport_server")) + goto done; + local_port = get_socket_local_port(*fds[i]); + if (!ASSERT_GE(local_port, 0, "get_socket_local_port")) + goto done; + skel->rodata->ports[i] = ntohs(local_port); + } + + err = sock_iter_batch__load(skel); + if (!ASSERT_OK(err, "sock_iter_batch__load")) + goto done; + + link = bpf_program__attach_iter(sock_type == SOCK_STREAM ? + skel->progs.iter_tcp_soreuse : + skel->progs.iter_udp_soreuse, + NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter")) + goto done; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_GE(iter_fd, 0, "bpf_iter_create")) + goto done; + + /* Test reading a bucket (either from fds[0] or fds[1]). + * Only read "nr_soreuse - 1" number of sockets + * from a bucket and leave one socket out from + * that bucket on purpose. + */ + to_read = (nr_soreuse - 1) * sizeof(*indices); + total_read = 0; + first_idx = -1; + do { + nread = read(iter_fd, indices, onebyone ? sizeof(*indices) : to_read); + if (nread <= 0 || nread % sizeof(*indices)) + break; + total_read += nread; + + if (first_idx == -1) + first_idx = indices[0]; + for (i = 0; i < nread / sizeof(*indices); i++) + ASSERT_EQ(indices[i], first_idx, "first_idx"); + } while (total_read < to_read); + ASSERT_EQ(nread, onebyone ? sizeof(*indices) : to_read, "nread"); + ASSERT_EQ(total_read, to_read, "total_read"); + + free_fds(fds[first_idx], nr_soreuse); + fds[first_idx] = NULL; + + /* Read the "whole" second bucket */ + to_read = nr_soreuse * sizeof(*indices); + total_read = 0; + second_idx = !first_idx; + do { + nread = read(iter_fd, indices, onebyone ? sizeof(*indices) : to_read); + if (nread <= 0 || nread % sizeof(*indices)) + break; + total_read += nread; + + for (i = 0; i < nread / sizeof(*indices); i++) + ASSERT_EQ(indices[i], second_idx, "second_idx"); + } while (total_read <= to_read); + ASSERT_EQ(nread, 0, "nread"); + /* Both so_reuseport ports should be in different buckets, so + * total_read must equal to the expected to_read. + * + * For a very unlikely case, both ports collide at the same bucket, + * the bucket offset (i.e. 3) will be skipped and it cannot + * expect the to_read number of bytes. + */ + if (skel->bss->bucket[0] != skel->bss->bucket[1]) + ASSERT_EQ(total_read, to_read, "total_read"); + +done: + for (i = 0; i < ARRAY_SIZE(fds); i++) + free_fds(fds[i], nr_soreuse); + if (iter_fd < 0) + close(iter_fd); + bpf_link__destroy(link); + sock_iter_batch__destroy(skel); +} + +void test_sock_iter_batch(void) +{ + struct nstoken *nstoken = NULL; + + SYS_NOFAIL("ip netns del " TEST_NS " &> /dev/null"); + SYS(done, "ip netns add %s", TEST_NS); + SYS(done, "ip -net %s link set dev lo up", TEST_NS); + + nstoken = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto done; + + if (test__start_subtest("tcp")) { + do_test(SOCK_STREAM, true); + do_test(SOCK_STREAM, false); + } + if (test__start_subtest("udp")) { + do_test(SOCK_DGRAM, true); + do_test(SOCK_DGRAM, false); + } + close_netns(nstoken); + +done: + SYS_NOFAIL("ip netns del " TEST_NS " &> /dev/null"); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 1bdc680b0e0e..e8bd4b7b5ef7 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -72,6 +72,8 @@ #define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr #define inet_dport sk.__sk_common.skc_dport +#define udp_portaddr_hash inet.sk.__sk_common.skc_u16hashes[1] + #define ir_loc_addr req.__req_common.skc_rcv_saddr #define ir_num req.__req_common.skc_num #define ir_rmt_addr req.__req_common.skc_daddr @@ -85,6 +87,7 @@ #define sk_rmem_alloc sk_backlog.rmem_alloc #define sk_refcnt __sk_common.skc_refcnt #define sk_state __sk_common.skc_state +#define sk_net __sk_common.skc_net #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_flags __sk_common.skc_flags diff --git a/tools/testing/selftests/bpf/progs/sock_iter_batch.c b/tools/testing/selftests/bpf/progs/sock_iter_batch.c new file mode 100644 index 000000000000..ffbbfe1fa1c1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/sock_iter_batch.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Meta + +#include "vmlinux.h" +#include +#include +#include +#include "bpf_tracing_net.h" +#include "bpf_kfuncs.h" + +#define ATTR __always_inline +#include "test_jhash.h" + +static bool ipv6_addr_loopback(const struct in6_addr *a) +{ + return (a->s6_addr32[0] | a->s6_addr32[1] | + a->s6_addr32[2] | (a->s6_addr32[3] ^ bpf_htonl(1))) == 0; +} + +volatile const __u16 ports[2]; +unsigned int bucket[2]; + +SEC("iter/tcp") +int iter_tcp_soreuse(struct bpf_iter__tcp *ctx) +{ + struct sock *sk = (struct sock *)ctx->sk_common; + struct inet_hashinfo *hinfo; + unsigned int hash; + struct net *net; + int idx; + + if (!sk) + return 0; + + sk = bpf_rdonly_cast(sk, bpf_core_type_id_kernel(struct sock)); + if (sk->sk_family != AF_INET6 || + sk->sk_state != TCP_LISTEN || + !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) + return 0; + + if (sk->sk_num == ports[0]) + idx = 0; + else if (sk->sk_num == ports[1]) + idx = 1; + else + return 0; + + /* bucket selection as in inet_lhash2_bucket_sk() */ + net = sk->sk_net.net; + hash = jhash2(sk->sk_v6_rcv_saddr.s6_addr32, 4, net->hash_mix); + hash ^= sk->sk_num; + hinfo = net->ipv4.tcp_death_row.hashinfo; + bucket[idx] = hash & hinfo->lhash2_mask; + bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx)); + + return 0; +} + +#define udp_sk(ptr) container_of(ptr, struct udp_sock, inet.sk) + +SEC("iter/udp") +int iter_udp_soreuse(struct bpf_iter__udp *ctx) +{ + struct sock *sk = (struct sock *)ctx->udp_sk; + struct udp_table *udptable; + int idx; + + if (!sk) + return 0; + + sk = bpf_rdonly_cast(sk, bpf_core_type_id_kernel(struct sock)); + if (sk->sk_family != AF_INET6 || + !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) + return 0; + + if (sk->sk_num == ports[0]) + idx = 0; + else if (sk->sk_num == ports[1]) + idx = 1; + else + return 0; + + /* bucket selection as in udp_hashslot2() */ + udptable = sk->sk_net.net->ipv4.udp_table; + bucket[idx] = udp_sk(sk)->udp_portaddr_hash & udptable->mask; + bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx)); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_jhash.h b/tools/testing/selftests/bpf/progs/test_jhash.h index c300734d26f6..ef53559bbbdf 100644 --- a/tools/testing/selftests/bpf/progs/test_jhash.h +++ b/tools/testing/selftests/bpf/progs/test_jhash.h @@ -69,3 +69,34 @@ u32 jhash(const void *key, u32 length, u32 initval) return c; } + +static __always_inline u32 jhash2(const u32 *k, u32 length, u32 initval) +{ + u32 a, b, c; + + /* Set up the internal state */ + a = b = c = JHASH_INITVAL + (length<<2) + initval; + + /* Handle most of the key */ + while (length > 3) { + a += k[0]; + b += k[1]; + c += k[2]; + __jhash_mix(a, b, c); + length -= 3; + k += 3; + } + + /* Handle the last 3 u32's */ + switch (length) { + case 3: c += k[2]; + case 2: b += k[1]; + case 1: a += k[0]; + __jhash_final(a, b, c); + break; + case 0: /* Nothing left to add */ + break; + } + + return c; +} From dc9dfc8dc629e42f2234e3327b75324ffc752bc9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Jan 2024 16:32:57 -0800 Subject: [PATCH 42/88] net: tls, fix WARNIING in __sk_msg_free A splice with MSG_SPLICE_PAGES will cause tls code to use the tls_sw_sendmsg_splice path in the TLS sendmsg code to move the user provided pages from the msg into the msg_pl. This will loop over the msg until msg_pl is full, checked by sk_msg_full(msg_pl). The user can also set the MORE flag to hint stack to delay sending until receiving more pages and ideally a full buffer. If the user adds more pages to the msg than can fit in the msg_pl scatterlist (MAX_MSG_FRAGS) we should ignore the MORE flag and send the buffer anyways. What actually happens though is we abort the msg to msg_pl scatterlist setup and then because we forget to set 'full record' indicating we can no longer consume data without a send we fallthrough to the 'continue' path which will check if msg_data_left(msg) has more bytes to send and then attempts to fit them in the already full msg_pl. Then next iteration of sender doing send will encounter a full msg_pl and throw the warning in the syzbot report. To fix simply check if we have a full_record in splice code path and if not send the msg regardless of MORE flag. Reported-and-tested-by: syzbot+f2977222e0e95cec15c8@syzkaller.appspotmail.com Reported-by: Edward Adam Davis Fixes: fe1e81d4f73b ("tls/sw: Support MSG_SPLICE_PAGES") Reviewed-by: Jakub Kicinski Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e37b4d2e2acd..31e8a94dfc11 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1052,7 +1052,11 @@ alloc_encrypted: if (ret < 0) goto send_end; tls_ctx->pending_open_record_frags = true; - if (full_record || eor || sk_msg_full(msg_pl)) + + if (sk_msg_full(msg_pl)) + full_record = true; + + if (full_record || eor) goto copied; continue; } From 034ea1305e659ddae44c19ba8449166fec318e2d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Jan 2024 16:32:58 -0800 Subject: [PATCH 43/88] net: tls, add test to capture error on large splice syzbot found an error with how splice() is handled with a msg greater than 32. This was fixed in previous patch, but lets add a test for it to ensure it continues to work. Signed-off-by: John Fastabend Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 464853a7f982..7799e042a971 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -707,6 +707,20 @@ TEST_F(tls, splice_from_pipe) EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); } +TEST_F(tls, splice_more) +{ + unsigned int f = SPLICE_F_NONBLOCK | SPLICE_F_MORE | SPLICE_F_GIFT; + int send_len = TLS_PAYLOAD_MAX_LEN; + char mem_send[TLS_PAYLOAD_MAX_LEN]; + int i, send_pipe = 1; + int p[2]; + + ASSERT_GE(pipe(p), 0); + EXPECT_GE(write(p[1], mem_send, send_len), 0); + for (i = 0; i < 32; i++) + EXPECT_EQ(splice(p[0], NULL, self->fd, NULL, send_pipe, f), 1); +} + TEST_F(tls, splice_from_pipe2) { int send_len = 16000; From c061be1bd5e74e9685630cc95b818e13dbff9713 Mon Sep 17 00:00:00 2001 From: Marcin Wojtas Date: Fri, 12 Jan 2024 14:06:28 +0000 Subject: [PATCH 44/88] MAINTAINERS: eth: mvneta: update entry Add myself as driver maintainer and restore the maintained status. While at it, update the file field to cover mvneta_bm part of the driver. Signed-off-by: Marcin Wojtas Signed-off-by: David S. Miller --- MAINTAINERS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4194f414dd2c..39eb0a6a2927 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12869,9 +12869,10 @@ S: Maintained F: drivers/thermal/armada_thermal.c MARVELL MVNETA ETHERNET DRIVER +M: Marcin Wojtas L: netdev@vger.kernel.org -S: Orphan -F: drivers/net/ethernet/marvell/mvneta.* +S: Maintained +F: drivers/net/ethernet/marvell/mvneta* MARVELL MVPP2 ETHERNET DRIVER M: Marcin Wojtas From e327b2372bc0f18c30433ac40be07741b59231c5 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Sat, 13 Jan 2024 10:22:21 +0600 Subject: [PATCH 45/88] net: ravb: Fix dma_addr_t truncation in error case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In ravb_start_xmit(), ravb driver uses u32 variable to store result of dma_map_single() call. Since ravb hardware has 32-bit address fields in descriptors, this works properly when mapping is successful - it is platform's job to provide mapping addresses that fit into hardware limitations. However, in failure case dma_map_single() returns DMA_MAPPING_ERROR constant that is 64-bit when dma_addr_t is 64-bit. Storing this constant in u32 leads to truncation, and further call to dma_mapping_error() fails to notice the error. Fix that by storing result of dma_map_single() in a dma_addr_t variable. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Nikita Yushchenko Reviewed-by: Niklas Söderlund Reviewed-by: Sergey Shtylyov Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/ravb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 8649b3e90edb..0e3731f50fc2 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1949,7 +1949,7 @@ static netdev_tx_t ravb_start_xmit(struct sk_buff *skb, struct net_device *ndev) struct ravb_tstamp_skb *ts_skb; struct ravb_tx_desc *desc; unsigned long flags; - u32 dma_addr; + dma_addr_t dma_addr; void *buffer; u32 entry; u32 len; From c2518da8e6b0e248cfff1d4b6682e14020bd4d3f Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 10 Jan 2024 09:14:35 -0500 Subject: [PATCH 46/88] selftests: bonding: Change script interpreter The tests changed by this patch, as well as the scripts they source, use features which are not part of POSIX sh (ex. 'source' and 'local'). As a result, these tests fail when /bin/sh is dash such as on Debian. Change the interpreter to bash so that these tests can run successfully. Fixes: d43eff0b85ae ("selftests: bonding: up/down delay w/ slave link flapping") Tested-by: Hangbin Liu Reviewed-by: Hangbin Liu Reviewed-by: Petr Machata Signed-off-by: Benjamin Poirier Reviewed-by: Przemek Kitszel Signed-off-by: Paolo Abeni --- .../selftests/drivers/net/bonding/mode-1-recovery-updelay.sh | 2 +- .../selftests/drivers/net/bonding/mode-2-recovery-updelay.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/bonding/mode-1-recovery-updelay.sh b/tools/testing/selftests/drivers/net/bonding/mode-1-recovery-updelay.sh index ad4c845a4ac7..b76bf5030952 100755 --- a/tools/testing/selftests/drivers/net/bonding/mode-1-recovery-updelay.sh +++ b/tools/testing/selftests/drivers/net/bonding/mode-1-recovery-updelay.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # Regression Test: diff --git a/tools/testing/selftests/drivers/net/bonding/mode-2-recovery-updelay.sh b/tools/testing/selftests/drivers/net/bonding/mode-2-recovery-updelay.sh index 2330d37453f9..8c2619002147 100755 --- a/tools/testing/selftests/drivers/net/bonding/mode-2-recovery-updelay.sh +++ b/tools/testing/selftests/drivers/net/bonding/mode-2-recovery-updelay.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # Regression Test: From 49078c1b80b6f7e214ff60cf6f44750b33019cad Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 10 Jan 2024 09:14:36 -0500 Subject: [PATCH 47/88] selftests: forwarding: Remove executable bits from lib.sh The lib.sh script is meant to be sourced from other scripts, not executed directly. Therefore, remove the executable bits from lib.sh's permissions. Fixes: fe32dffdcd33 ("selftests: forwarding: add TCPDUMP_EXTRA_FLAGS to lib.sh") Tested-by: Hangbin Liu Reviewed-by: Hangbin Liu Reviewed-by: Vladimir Oltean Signed-off-by: Benjamin Poirier Reviewed-by: Przemek Kitszel Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 tools/testing/selftests/net/forwarding/lib.sh diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh old mode 100755 new mode 100644 From a23aa04042187cbde16f470b49d4ad60d32e9206 Mon Sep 17 00:00:00 2001 From: Qiang Ma Date: Fri, 12 Jan 2024 10:12:49 +0800 Subject: [PATCH 48/88] net: stmmac: ethtool: Fixed calltrace caused by unbalanced disable_irq_wake calls We found the following dmesg calltrace when testing the GMAC NIC notebook: [9.448656] ------------[ cut here ]------------ [9.448658] Unbalanced IRQ 43 wake disable [9.448673] WARNING: CPU: 3 PID: 1083 at kernel/irq/manage.c:688 irq_set_irq_wake+0xe0/0x128 [9.448717] CPU: 3 PID: 1083 Comm: ethtool Tainted: G O 4.19 #1 [9.448773] ... [9.448774] Call Trace: [9.448781] [<9000000000209b5c>] show_stack+0x34/0x140 [9.448788] [<9000000000d52700>] dump_stack+0x98/0xd0 [9.448794] [<9000000000228610>] __warn+0xa8/0x120 [9.448797] [<9000000000d2fb60>] report_bug+0x98/0x130 [9.448800] [<900000000020a418>] do_bp+0x248/0x2f0 [9.448805] [<90000000002035f4>] handle_bp_int+0x4c/0x78 [9.448808] [<900000000029ea40>] irq_set_irq_wake+0xe0/0x128 [9.448813] [<9000000000a96a7c>] stmmac_set_wol+0x134/0x150 [9.448819] [<9000000000be6ed0>] dev_ethtool+0x1368/0x2440 [9.448824] [<9000000000c08350>] dev_ioctl+0x1f8/0x3e0 [9.448827] [<9000000000bb2a34>] sock_ioctl+0x2a4/0x450 [9.448832] [<900000000046f044>] do_vfs_ioctl+0xa4/0x738 [9.448834] [<900000000046f778>] ksys_ioctl+0xa0/0xe8 [9.448837] [<900000000046f7d8>] sys_ioctl+0x18/0x28 [9.448840] [<9000000000211ab4>] syscall_common+0x20/0x34 [9.448842] ---[ end trace 40c18d9aec863c3e ]--- Multiple disable_irq_wake() calls will keep decreasing the IRQ wake_depth, When wake_depth is 0, calling disable_irq_wake() again, will report the above calltrace. Due to the need to appear in pairs, we cannot call disable_irq_wake() without calling enable_irq_wake(). Fix this by making sure there are no unbalanced disable_irq_wake() calls. Fixes: 3172d3afa998 ("stmmac: support wake up irq from external sources (v3)") Signed-off-by: Qiang Ma Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240112021249.24598-1-maqianga@uniontech.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 10 ++++++++-- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 9f89acf31050..f155e4841c62 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -267,6 +267,7 @@ struct stmmac_priv { u32 msg_enable; int wolopts; int wol_irq; + bool wol_irq_disabled; int clk_csr; struct timer_list eee_ctrl_timer; int lpi_irq; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 0f8bfba4443d..42d27b97dd1d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -832,10 +832,16 @@ static int stmmac_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) if (wol->wolopts) { pr_info("stmmac: wakeup enable\n"); device_set_wakeup_enable(priv->device, 1); - enable_irq_wake(priv->wol_irq); + /* Avoid unbalanced enable_irq_wake calls */ + if (priv->wol_irq_disabled) + enable_irq_wake(priv->wol_irq); + priv->wol_irq_disabled = false; } else { device_set_wakeup_enable(priv->device, 0); - disable_irq_wake(priv->wol_irq); + /* Avoid unbalanced disable_irq_wake calls */ + if (!priv->wol_irq_disabled) + disable_irq_wake(priv->wol_irq); + priv->wol_irq_disabled = true; } mutex_lock(&priv->lock); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 47de466e432c..c78a96b8eb64 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3628,6 +3628,7 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev) /* Request the Wake IRQ in case of another line * is used for WoL */ + priv->wol_irq_disabled = true; if (priv->wol_irq > 0 && priv->wol_irq != dev->irq) { int_name = priv->int_name_wol; sprintf(int_name, "%s:%s", dev->name, "wol"); From 2c4ca7977298c6a3d237d7d703df97e043b75c12 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 14 Jan 2024 14:47:26 -0800 Subject: [PATCH 49/88] selftests: netdevsim: sprinkle more udevadm settle Number of tests are failing when netdev renaming is active on the system. Add udevadm settle in logic determining the names. Fixes: 242aaf03dc9b ("selftests: add a test for ethtool pause stats") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240114224726.1210532-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh | 1 + tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh index 922744059aaa..80160579e0cc 100644 --- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh @@ -51,6 +51,7 @@ function make_netdev { fi echo $NSIM_ID $@ > /sys/bus/netdevsim/new_device + udevadm settle # get new device name ls /sys/bus/netdevsim/devices/netdevsim${NSIM_ID}/net/ } diff --git a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh index 1b08e042cf94..4855ef597a15 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh @@ -233,6 +233,7 @@ function print_tables { function get_netdev_name { local -n old=$1 + udevadm settle new=$(ls /sys/class/net) for netdev in $new; do From 4697381bd076adfea4d67394189c8bf27b9cc95b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 14 Jan 2024 14:47:48 -0800 Subject: [PATCH 50/88] selftests: netdevsim: correct expected FEC strings ethtool CLI has changed its output. Make the test compatible. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240114224748.1210578-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- .../drivers/net/netdevsim/ethtool-fec.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh index 0c56746e9ce0..7d7829f57550 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh @@ -8,16 +8,20 @@ NSIM_NETDEV=$(make_netdev) set -o pipefail +# Since commit 2b3ddcb35357 ("ethtool: fec: Change the prompt ...") +# in ethtool CLI the Configured lines start with Supported/Configured. +configured=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2 | head -1 | cut -d' ' -f1) + # netdevsim starts out with None/None s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) -check $? "$s" "Configured FEC encodings: None +check $? "$s" "$configured FEC encodings: None Active FEC encoding: None" # Test Auto $ETHTOOL --set-fec $NSIM_NETDEV encoding auto check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) -check $? "$s" "Configured FEC encodings: Auto +check $? "$s" "$configured FEC encodings: Auto Active FEC encoding: Off" # Test case in-sensitivity @@ -25,7 +29,7 @@ for o in off Off OFF; do $ETHTOOL --set-fec $NSIM_NETDEV encoding $o check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) - check $? "$s" "Configured FEC encodings: Off + check $? "$s" "$configured FEC encodings: Off Active FEC encoding: Off" done @@ -33,7 +37,7 @@ for o in BaseR baser BAser; do $ETHTOOL --set-fec $NSIM_NETDEV encoding $o check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) - check $? "$s" "Configured FEC encodings: BaseR + check $? "$s" "$configured FEC encodings: BaseR Active FEC encoding: BaseR" done @@ -41,7 +45,7 @@ for o in llrs rs; do $ETHTOOL --set-fec $NSIM_NETDEV encoding $o check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) - check $? "$s" "Configured FEC encodings: ${o^^} + check $? "$s" "$configured FEC encodings: ${o^^} Active FEC encoding: ${o^^}" done @@ -49,13 +53,13 @@ done $ETHTOOL --set-fec $NSIM_NETDEV encoding rs llrs check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) -check $? "$s" "Configured FEC encodings: RS LLRS +check $? "$s" "$configured FEC encodings: RS LLRS Active FEC encoding: LLRS" $ETHTOOL --set-fec $NSIM_NETDEV encoding rs off auto check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) -check $? "$s" "Configured FEC encodings: Auto Off RS +check $? "$s" "$configured FEC encodings: Auto Off RS Active FEC encoding: RS" # Make sure other link modes are rejected From 03fb8565c880d57952d9b4ba0b36468bae52b554 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Jan 2024 18:02:01 -0800 Subject: [PATCH 51/88] selftests: bonding: add missing build configs bonding tests also try to create bridge, veth and dummy interfaces. These are not currently listed in config. Fixes: bbb774d921e2 ("net: Add tests for bonding and team address list management") Fixes: c078290a2b76 ("selftests: include bonding tests into the kselftest infra") Acked-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240116020201.1883023-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/bonding/config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 70638fa50b2c..f85b16fc5128 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -1,2 +1,5 @@ CONFIG_BONDING=y +CONFIG_BRIDGE=y +CONFIG_DUMMY=y CONFIG_MACVLAN=y +CONFIG_VETH=y From 22c7fa171a02d310e3a3f6ed46a698ca8a0060ed Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Mon, 15 Jan 2024 09:20:27 +0100 Subject: [PATCH 52/88] bpf: Reject variable offset alu on PTR_TO_FLOW_KEYS For PTR_TO_FLOW_KEYS, check_flow_keys_access() only uses fixed off for validation. However, variable offset ptr alu is not prohibited for this ptr kind. So the variable offset is not checked. The following prog is accepted: func#0 @0 0: R1=ctx() R10=fp0 0: (bf) r6 = r1 ; R1=ctx() R6_w=ctx() 1: (79) r7 = *(u64 *)(r6 +144) ; R6_w=ctx() R7_w=flow_keys() 2: (b7) r8 = 1024 ; R8_w=1024 3: (37) r8 /= 1 ; R8_w=scalar() 4: (57) r8 &= 1024 ; R8_w=scalar(smin=smin32=0, smax=umax=smax32=umax32=1024,var_off=(0x0; 0x400)) 5: (0f) r7 += r8 mark_precise: frame0: last_idx 5 first_idx 0 subseq_idx -1 mark_precise: frame0: regs=r8 stack= before 4: (57) r8 &= 1024 mark_precise: frame0: regs=r8 stack= before 3: (37) r8 /= 1 mark_precise: frame0: regs=r8 stack= before 2: (b7) r8 = 1024 6: R7_w=flow_keys(smin=smin32=0,smax=umax=smax32=umax32=1024,var_off =(0x0; 0x400)) R8_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=1024, var_off=(0x0; 0x400)) 6: (79) r0 = *(u64 *)(r7 +0) ; R0_w=scalar() 7: (95) exit This prog loads flow_keys to r7, and adds the variable offset r8 to r7, and finally causes out-of-bounds access: BUG: unable to handle page fault for address: ffffc90014c80038 [...] Call Trace: bpf_dispatcher_nop_func include/linux/bpf.h:1231 [inline] __bpf_prog_run include/linux/filter.h:651 [inline] bpf_prog_run include/linux/filter.h:658 [inline] bpf_prog_run_pin_on_cpu include/linux/filter.h:675 [inline] bpf_flow_dissect+0x15f/0x350 net/core/flow_dissector.c:991 bpf_prog_test_run_flow_dissector+0x39d/0x620 net/bpf/test_run.c:1359 bpf_prog_test_run kernel/bpf/syscall.c:4107 [inline] __sys_bpf+0xf8f/0x4560 kernel/bpf/syscall.c:5475 __do_sys_bpf kernel/bpf/syscall.c:5561 [inline] __se_sys_bpf kernel/bpf/syscall.c:5559 [inline] __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:5559 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Fix this by rejecting ptr alu with variable offset on flow_keys. Applying the patch rejects the program with "R7 pointer arithmetic on flow_keys prohibited". Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Signed-off-by: Hao Sun Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240115082028.9992-1-sunhao.th@gmail.com --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index adbf330d364b..65f598694d55 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12826,6 +12826,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } switch (base_type(ptr_reg->type)) { + case PTR_TO_FLOW_KEYS: + if (known) + break; + fallthrough; case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) From 33772ff3b887eb2f426ed66bcb1808837a40669c Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Mon, 15 Jan 2024 09:20:28 +0100 Subject: [PATCH 53/88] selftests/bpf: Add test for alu on PTR_TO_FLOW_KEYS Add a test case for PTR_TO_FLOW_KEYS alu. Testing if alu with variable offset on flow_keys is rejected. For the fixed offset success case, we already have C code coverage to verify (e.g. via bpf_flow.c). Signed-off-by: Hao Sun Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240115082028.9992-2-sunhao.th@gmail.com --- .../bpf/progs/verifier_value_illegal_alu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c index 71814a753216..a9ab37d3b9e2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c +++ b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c @@ -146,4 +146,23 @@ l0_%=: exit; \ : __clobber_all); } +SEC("flow_dissector") +__description("flow_keys illegal alu op with variable offset") +__failure __msg("R7 pointer arithmetic on flow_keys prohibited") +__naked void flow_keys_illegal_variable_offset_alu(void) +{ + asm volatile(" \ + r6 = r1; \ + r7 = *(u64*)(r6 + %[flow_keys_off]); \ + r8 = 8; \ + r8 /= 1; \ + r8 &= 8; \ + r7 += r8; \ + r0 = *(u64*)(r7 + 0); \ + exit; \ +" : + : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; From 776dac5a662774f07a876b650ba578d0a62d20db Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 11 Jan 2024 15:20:18 +0800 Subject: [PATCH 54/88] net: dsa: vsc73xx: Add null pointer check to vsc73xx_gpio_probe devm_kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Fixes: 05bd97fc559d ("net: dsa: Add Vitesse VSC73xx DSA router driver") Signed-off-by: Kunwu Chan Suggested-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240111072018.75971-1-chentao@kylinos.cn Signed-off-by: Jakub Kicinski --- drivers/net/dsa/vitesse-vsc73xx-core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index dd50502e2122..ae70eac3be28 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -1135,6 +1135,8 @@ static int vsc73xx_gpio_probe(struct vsc73xx *vsc) vsc->gc.label = devm_kasprintf(vsc->dev, GFP_KERNEL, "VSC%04x", vsc->chipid); + if (!vsc->gc.label) + return -ENOMEM; vsc->gc.ngpio = 4; vsc->gc.owner = THIS_MODULE; vsc->gc.parent = vsc->dev; From 97eb5d51b4a584a60e5d096bdb6b33edc9f50d8d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 15 Jan 2024 12:43:38 +0000 Subject: [PATCH 55/88] net: sfp-bus: fix SFP mode detect from bitrate The referenced commit moved the setting of the Autoneg and pause bits early in sfp_parse_support(). However, we check whether the modes are empty before using the bitrate to set some modes. Setting these bits so early causes that test to always be false, preventing this working, and thus some modules that used to work no longer do. Move them just before the call to the quirk. Fixes: 8110633db49d ("net: sfp-bus: allow SFP quirks to override Autoneg and pause bits") Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Link: https://lore.kernel.org/r/E1rPMJW-001Ahf-L0@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/sfp-bus.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 6fa679b36290..db39dec7f247 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -151,10 +151,6 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned int br_min, br_nom, br_max; __ETHTOOL_DECLARE_LINK_MODE_MASK(modes) = { 0, }; - phylink_set(modes, Autoneg); - phylink_set(modes, Pause); - phylink_set(modes, Asym_Pause); - /* Decode the bitrate information to MBd */ br_min = br_nom = br_max = 0; if (id->base.br_nominal) { @@ -339,6 +335,10 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, } } + phylink_set(modes, Autoneg); + phylink_set(modes, Pause); + phylink_set(modes, Asym_Pause); + if (bus->sfp_quirk && bus->sfp_quirk->modes) bus->sfp_quirk->modes(id, modes, interfaces); From e9ce7ededf14af3396312f02d1622f4889d676ca Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 15 Jan 2024 14:59:22 +0100 Subject: [PATCH 56/88] selftests: rtnetlink: use setup_ns in bonding test This is a follow-up of commit a159cbe81d3b ("selftests: rtnetlink: check enslaving iface in a bond") after the merge of net-next into net. The goal is to follow the new convention, see commit d3b6b1116127 ("selftests/net: convert rtnetlink.sh to run it in unique namespace") for more details. Let's use also the generic dummy name instead of defining a new one. Signed-off-by: Nicolas Dichtel Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20240115135922.3662648-1-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/rtnetlink.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index a31be0eaaa50..4667d74579d1 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -1244,21 +1244,19 @@ kci_test_address_proto() kci_test_enslave_bonding() { - local testns="testns" local bond="bond123" - local dummy="dummy123" local ret=0 - run_cmd ip netns add "$testns" - if [ $ret -ne 0 ]; then + setup_ns testns + if [ $? -ne 0 ]; then end_test "SKIP bonding tests: cannot add net namespace $testns" return $ksft_skip fi run_cmd ip -netns $testns link add dev $bond type bond mode balance-rr - run_cmd ip -netns $testns link add dev $dummy type dummy - run_cmd ip -netns $testns link set dev $dummy up - run_cmd ip -netns $testns link set dev $dummy master $bond down + run_cmd ip -netns $testns link add dev $devdummy type dummy + run_cmd ip -netns $testns link set dev $devdummy up + run_cmd ip -netns $testns link set dev $devdummy master $bond down if [ $ret -ne 0 ]; then end_test "FAIL: initially up interface added to a bond and set down" ip netns del "$testns" From c2945c435c999c63e47f337bc7c13c98c21d0bcc Mon Sep 17 00:00:00 2001 From: Romain Gantois Date: Tue, 16 Jan 2024 13:19:17 +0100 Subject: [PATCH 57/88] net: stmmac: Prevent DSA tags from breaking COE Some DSA tagging protocols change the EtherType field in the MAC header e.g. DSA_TAG_PROTO_(DSA/EDSA/BRCM/MTK/RTL4C_A/SJA1105). On TX these tagged frames are ignored by the checksum offload engine and IP header checker of some stmmac cores. On RX, the stmmac driver wrongly assumes that checksums have been computed for these tagged packets, and sets CHECKSUM_UNNECESSARY. Add an additional check in the stmmac TX and RX hotpaths so that COE is deactivated for packets with ethertypes that will not trigger the COE and IP header checks. Fixes: 6b2c6e4a938f ("net: stmmac: propagate feature flags to vlan") Cc: Reported-by: Richard Tresidder Link: https://lore.kernel.org/netdev/e5c6c75f-2dfa-4e50-a1fb-6bf4cdb617c2@electromag.com.au/ Reported-by: Romain Gantois Link: https://lore.kernel.org/netdev/c57283ed-6b9b-b0e6-ee12-5655c1c54495@bootlin.com/ Reviewed-by: Vladimir Oltean Reviewed-by: Linus Walleij Signed-off-by: Romain Gantois Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index c78a96b8eb64..a0e46369ae15 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4435,6 +4435,28 @@ dma_map_err: return NETDEV_TX_OK; } +/** + * stmmac_has_ip_ethertype() - Check if packet has IP ethertype + * @skb: socket buffer to check + * + * Check if a packet has an ethertype that will trigger the IP header checks + * and IP/TCP checksum engine of the stmmac core. + * + * Return: true if the ethertype can trigger the checksum engine, false + * otherwise + */ +static bool stmmac_has_ip_ethertype(struct sk_buff *skb) +{ + int depth = 0; + __be16 proto; + + proto = __vlan_get_protocol(skb, eth_header_parse_protocol(skb), + &depth); + + return (depth <= ETH_HLEN) && + (proto == htons(ETH_P_IP) || proto == htons(ETH_P_IPV6)); +} + /** * stmmac_xmit - Tx entry point of the driver * @skb : the socket buffer @@ -4499,9 +4521,13 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) /* DWMAC IPs can be synthesized to support tx coe only for a few tx * queues. In that case, checksum offloading for those queues that don't * support tx coe needs to fallback to software checksum calculation. + * + * Packets that won't trigger the COE e.g. most DSA-tagged packets will + * also have to be checksummed in software. */ if (csum_insertion && - priv->plat->tx_queues_cfg[queue].coe_unsupported) { + (priv->plat->tx_queues_cfg[queue].coe_unsupported || + !stmmac_has_ip_ethertype(skb))) { if (unlikely(skb_checksum_help(skb))) goto dma_map_err; csum_insertion = !csum_insertion; @@ -5066,7 +5092,7 @@ static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue, stmmac_rx_vlan(priv->dev, skb); skb->protocol = eth_type_trans(skb, priv->dev); - if (unlikely(!coe)) + if (unlikely(!coe) || !stmmac_has_ip_ethertype(skb)) skb_checksum_none_assert(skb); else skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -5589,7 +5615,7 @@ drain_data: skb->protocol = eth_type_trans(skb, priv->dev); - if (unlikely(!coe)) + if (unlikely(!coe) || !stmmac_has_ip_ethertype(skb)) skb_checksum_none_assert(skb); else skb->ip_summed = CHECKSUM_UNNECESSARY; From c0f5aec28edf98906d28f08daace6522adf9ee7a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 16 Jan 2024 18:18:47 +0100 Subject: [PATCH 58/88] mptcp: relax check on MPC passive fallback While testing the blamed commit below, I was able to miss (!) packetdrill failures in the fastopen test-cases. On passive fastopen the child socket is created by incoming TCP MPC syn, allow for both MPC_SYN and MPC_ACK header. Fixes: 724b00c12957 ("mptcp: refine opt_mp_capable determination") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 1117d1e84274..0dcb721c89d1 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -783,7 +783,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * options. */ mptcp_get_options(skb, &mp_opt); - if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK)) + if (!(mp_opt.suboptions & + (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK))) fallback = true; } else if (subflow_req->mp_join) { From ea937f77208323d35ffe2f8d8fc81b00118bfcda Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2024 11:14:00 -0800 Subject: [PATCH 59/88] net: netdevsim: don't try to destroy PHC on VFs PHC gets initialized in nsim_init_netdevsim(), which is only called if (nsim_dev_port_is_pf()). Create a counterpart of nsim_init_netdevsim() and move the mock_phc_destroy() there. This fixes a crash trying to destroy netdevsim with VFs instantiated, as caught by running the devlink.sh test: BUG: kernel NULL pointer dereference, address: 00000000000000b8 RIP: 0010:mock_phc_destroy+0xd/0x30 Call Trace: nsim_destroy+0x4a/0x70 [netdevsim] __nsim_dev_port_del+0x47/0x70 [netdevsim] nsim_dev_reload_destroy+0x105/0x120 [netdevsim] nsim_drv_remove+0x2f/0xb0 [netdevsim] device_release_driver_internal+0x1a1/0x210 bus_remove_device+0xd5/0x120 device_del+0x159/0x490 device_unregister+0x12/0x30 del_device_store+0x11a/0x1a0 [netdevsim] kernfs_fop_write_iter+0x130/0x1d0 vfs_write+0x30b/0x4b0 ksys_write+0x69/0xf0 do_syscall_64+0xcc/0x1e0 entry_SYSCALL_64_after_hwframe+0x6f/0x77 Fixes: b63e78fca889 ("net: netdevsim: use mock PHC driver") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/netdevsim/netdev.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index aecaf5f44374..77e8250282a5 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -369,6 +369,12 @@ static int nsim_init_netdevsim_vf(struct netdevsim *ns) return err; } +static void nsim_exit_netdevsim(struct netdevsim *ns) +{ + nsim_udp_tunnels_info_destroy(ns->netdev); + mock_phc_destroy(ns->phc); +} + struct netdevsim * nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port) { @@ -417,8 +423,7 @@ void nsim_destroy(struct netdevsim *ns) } rtnl_unlock(); if (nsim_dev_port_is_pf(ns->nsim_dev_port)) - nsim_udp_tunnels_info_destroy(dev); - mock_phc_destroy(ns->phc); + nsim_exit_netdevsim(ns); free_netdev(dev); } From 0617c3de9b4026b87be12b0cb5c35f42c7c66fcb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 3 Jan 2024 23:34:58 +0100 Subject: [PATCH 60/88] netfilter: nf_tables: reject invalid set policy Report -EINVAL in case userspace provides a unsupported set backend policy. Fixes: c50b960ccc59 ("netfilter: nf_tables: implement proper set selection") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8438a8922e4a..a90a364f5be5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5048,8 +5048,16 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, } desc.policy = NFT_SET_POL_PERFORMANCE; - if (nla[NFTA_SET_POLICY] != NULL) + if (nla[NFTA_SET_POLICY] != NULL) { desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + switch (desc.policy) { + case NFT_SET_POL_PERFORMANCE: + case NFT_SET_POL_MEMORY: + break; + default: + return -EOPNOTSUPP; + } + } if (nla[NFTA_SET_DESC] != NULL) { err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); From 65b3bd600e15e00fb9e433bbc8aa55e42b202055 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 6 Jan 2024 23:52:02 +0100 Subject: [PATCH 61/88] netfilter: nf_tables: validate .maxattr at expression registration struct nft_expr_info allows to store up to NFT_EXPR_MAXATTR (16) attributes when parsing netlink attributes. Rise a warning in case there is ever a nft expression whose .maxattr goes beyond this number of expressions, in such case, struct nft_expr_info needs to be updated. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a90a364f5be5..2548d7d56408 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2977,6 +2977,9 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, */ int nft_register_expr(struct nft_expr_type *type) { + if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR)) + return -ENOMEM; + nfnl_lock(NFNL_SUBSYS_NFTABLES); if (type->family == NFPROTO_UNSPEC) list_add_tail_rcu(&type->list, &nf_tables_expressions); From 3c13725f43dcf43ad8a9bcd6a9f12add19a8f93e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 7 Jan 2024 23:00:15 +0100 Subject: [PATCH 62/88] netfilter: nf_tables: bail out if stateful expression provides no .clone All existing NFT_EXPR_STATEFUL provide a .clone interface, remove fallback to copy content of stateful expression since this is never exercised and bail out if .clone interface is not defined. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2548d7d56408..b3db97440db1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3274,14 +3274,13 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) { int err; - if (src->ops->clone) { - dst->ops = src->ops; - err = src->ops->clone(dst, src); - if (err < 0) - return err; - } else { - memcpy(dst, src, src->ops->size); - } + if (WARN_ON_ONCE(!src->ops->clone)) + return -EINVAL; + + dst->ops = src->ops; + err = src->ops->clone(dst, src); + if (err < 0) + return err; __module_get(src->ops->type->owner); From 91a139cee1202a4599a380810d93c69b5bac6197 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 10 Jan 2024 00:42:37 +0100 Subject: [PATCH 63/88] netfilter: nft_limit: do not ignore unsupported flags Bail out if userspace provides unsupported flags, otherwise future extensions to the limit expression will be silently ignored by the kernel. Fixes: c7862a5f0de5 ("netfilter: nft_limit: allow to invert matching criteria") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 145dc62c6247..79039afde34e 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -58,6 +58,7 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost) static int nft_limit_init(struct nft_limit_priv *priv, const struct nlattr * const tb[], bool pkts) { + bool invert = false; u64 unit, tokens; if (tb[NFTA_LIMIT_RATE] == NULL || @@ -90,19 +91,23 @@ static int nft_limit_init(struct nft_limit_priv *priv, priv->rate); } + if (tb[NFTA_LIMIT_FLAGS]) { + u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); + + if (flags & ~NFT_LIMIT_F_INV) + return -EOPNOTSUPP; + + if (flags & NFT_LIMIT_F_INV) + invert = true; + } + priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT); if (!priv->limit) return -ENOMEM; priv->limit->tokens = tokens; priv->tokens_max = priv->limit->tokens; - - if (tb[NFTA_LIMIT_FLAGS]) { - u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); - - if (flags & NFT_LIMIT_F_INV) - priv->invert = true; - } + priv->invert = invert; priv->limit->last = ktime_get_ns(); spin_lock_init(&priv->limit->lock); From c3f9fd54cd87233f53bdf0e191a86b3a5e960e02 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 11 Jan 2024 23:06:37 +0800 Subject: [PATCH 64/88] netfilter: nfnetlink_log: use proper helper for fetching physinif We don't use physindev in __build_packet_message except for getting physinif from it. So let's switch to nf_bridge_get_physinif to get what we want directly. Signed-off-by: Pavel Tikhomirov Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index f03f4d4d7d88..134e05d31061 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -508,7 +508,7 @@ __build_packet_message(struct nfnl_log_net *log, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { - struct net_device *physindev; + int physinif; /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -516,10 +516,10 @@ __build_packet_message(struct nfnl_log_net *log, htonl(indev->ifindex))) goto nla_put_failure; - physindev = nf_bridge_get_physindev(skb); - if (physindev && + physinif = nf_bridge_get_physinif(skb); + if (physinif && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, - htonl(physindev->ifindex))) + htonl(physinif))) goto nla_put_failure; } #endif From aeaa44075f8e49e2e0ad4507d925e690b7950145 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 11 Jan 2024 23:06:38 +0800 Subject: [PATCH 65/88] netfilter: nf_queue: remove excess nf_bridge variable We don't really need nf_bridge variable here. And nf_bridge_info_exists is better replacement for nf_bridge_info_get in case we are only checking for existence. Signed-off-by: Pavel Tikhomirov Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_queue.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 63d1516816b1..3dfcb3ac5cb4 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -82,10 +82,8 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const struct sk_buff *skb = entry->skb; - struct nf_bridge_info *nf_bridge; - nf_bridge = nf_bridge_info_get(skb); - if (nf_bridge) { + if (nf_bridge_info_exists(skb)) { entry->physin = nf_bridge_get_physindev(skb); entry->physout = nf_bridge_get_physoutdev(skb); } else { From a54e72197037d2c9bfcd70dddaac8c8ccb5b41ba Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 11 Jan 2024 23:06:39 +0800 Subject: [PATCH 66/88] netfilter: propagate net to nf_bridge_get_physindev This is a preparation patch for replacing physindev with physinif on nf_bridge_info structure. We will use dev_get_by_index_rcu to resolve device, when needed, and it requires net to be available. Signed-off-by: Pavel Tikhomirov Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 2 +- net/ipv4/netfilter/nf_reject_ipv4.c | 2 +- net/ipv6/netfilter/nf_reject_ipv6.c | 2 +- net/netfilter/ipset/ip_set_hash_netiface.c | 8 ++++---- net/netfilter/nf_log_syslog.c | 13 +++++++------ net/netfilter/nf_queue.c | 2 +- net/netfilter/xt_physdev.c | 2 +- 7 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index f980edfdd278..e927b9a15a55 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -56,7 +56,7 @@ static inline int nf_bridge_get_physoutif(const struct sk_buff *skb) } static inline struct net_device * -nf_bridge_get_physindev(const struct sk_buff *skb) +nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index f01b038fc1cd..86e7d390671a 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -289,7 +289,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - br_indev = nf_bridge_get_physindev(oldskb); + br_indev = nf_bridge_get_physindev(oldskb, net); if (br_indev) { struct ethhdr *oeth = eth_hdr(oldskb); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index d45bc54b7ea5..27b2164f4c43 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -354,7 +354,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - br_indev = nf_bridge_get_physindev(oldskb); + br_indev = nf_bridge_get_physindev(oldskb, net); if (br_indev) { struct ethhdr *oeth = eth_hdr(oldskb); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 95aeb31c60e0..30a655e5c4fd 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -138,9 +138,9 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next, #include "ip_set_hash_gen.h" #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -static const char *get_physindev_name(const struct sk_buff *skb) +static const char *get_physindev_name(const struct sk_buff *skb, struct net *net) { - struct net_device *dev = nf_bridge_get_physindev(skb); + struct net_device *dev = nf_bridge_get_physindev(skb, net); return dev ? dev->name : NULL; } @@ -177,7 +177,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - const char *eiface = SRCDIR ? get_physindev_name(skb) : + const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) : get_physoutdev_name(skb); if (!eiface) @@ -395,7 +395,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - const char *eiface = SRCDIR ? get_physindev_name(skb) : + const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) : get_physoutdev_name(skb); if (!eiface) diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index c66689ad2b49..58402226045e 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -111,7 +111,8 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const struct nf_loginfo *loginfo, const char *prefix) + const struct nf_loginfo *loginfo, const char *prefix, + struct net *net) { const struct net_device *physoutdev __maybe_unused; const struct net_device *physindev __maybe_unused; @@ -121,7 +122,7 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf, in ? in->name : "", out ? out->name : ""); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - physindev = nf_bridge_get_physindev(skb); + physindev = nf_bridge_get_physindev(skb, net); if (physindev && in != physindev) nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); physoutdev = nf_bridge_get_physoutdev(skb); @@ -148,7 +149,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, - prefix); + prefix, net); dump_arp_packet(m, loginfo, skb, skb_network_offset(skb)); nf_log_buf_close(m); @@ -845,7 +846,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, - out, loginfo, prefix); + out, loginfo, prefix, net); if (in) dump_mac_header(m, loginfo, skb); @@ -880,7 +881,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, - loginfo, prefix); + loginfo, prefix, net); if (in) dump_mac_header(m, loginfo, skb); @@ -916,7 +917,7 @@ static void nf_log_unknown_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, - prefix); + prefix, net); dump_mac_header(m, loginfo, skb); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 3dfcb3ac5cb4..e2f334f70281 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -84,7 +84,7 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) const struct sk_buff *skb = entry->skb; if (nf_bridge_info_exists(skb)) { - entry->physin = nf_bridge_get_physindev(skb); + entry->physin = nf_bridge_get_physindev(skb, entry->state.net); entry->physout = nf_bridge_get_physoutdev(skb); } else { entry->physin = NULL; diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index ec6ed6fda96c..343e65f377d4 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -59,7 +59,7 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par) (!!outdev ^ !(info->invert & XT_PHYSDEV_OP_BRIDGED))) return false; - physdev = nf_bridge_get_physindev(skb); + physdev = nf_bridge_get_physindev(skb, xt_net(par)); indev = physdev ? physdev->name : NULL; if ((info->bitmask & XT_PHYSDEV_OP_ISIN && From 9874808878d9eed407e3977fd11fee49de1e1d86 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 11 Jan 2024 23:06:40 +0800 Subject: [PATCH 67/88] netfilter: bridge: replace physindev with physinif in nf_bridge_info An skb can be added to a neigh->arp_queue while waiting for an arp reply. Where original skb's skb->dev can be different to neigh's neigh->dev. For instance in case of bridging dnated skb from one veth to another, the skb would be added to a neigh->arp_queue of the bridge. As skb->dev can be reset back to nf_bridge->physindev and used, and as there is no explicit mechanism that prevents this physindev from been freed under us (for instance neigh_flush_dev doesn't cleanup skbs from different device's neigh queue) we can crash on e.g. this stack: arp_process neigh_update skb = __skb_dequeue(&neigh->arp_queue) neigh_resolve_output(..., skb) ... br_nf_dev_xmit br_nf_pre_routing_finish_bridge_slow skb->dev = nf_bridge->physindev br_handle_frame_finish Let's use plain ifindex instead of net_device link. To peek into the original net_device we will use dev_get_by_index_rcu(). Thus either we get device and are safe to use it or we don't get it and drop skb. Fixes: c4e70a87d975 ("netfilter: bridge: rename br_netfilter.c to br_netfilter_hooks.c") Suggested-by: Florian Westphal Signed-off-by: Pavel Tikhomirov Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 4 +-- include/linux/skbuff.h | 2 +- net/bridge/br_netfilter_hooks.c | 42 +++++++++++++++++++++++------ net/bridge/br_netfilter_ipv6.c | 14 +++++++--- net/ipv4/netfilter/nf_reject_ipv4.c | 9 ++++--- net/ipv6/netfilter/nf_reject_ipv6.c | 11 +++++--- 6 files changed, 61 insertions(+), 21 deletions(-) diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index e927b9a15a55..743475ca7e9d 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -42,7 +42,7 @@ static inline int nf_bridge_get_physinif(const struct sk_buff *skb) if (!nf_bridge) return 0; - return nf_bridge->physindev ? nf_bridge->physindev->ifindex : 0; + return nf_bridge->physinif; } static inline int nf_bridge_get_physoutif(const struct sk_buff *skb) @@ -60,7 +60,7 @@ nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - return nf_bridge ? nf_bridge->physindev : NULL; + return nf_bridge ? dev_get_by_index_rcu(net, nf_bridge->physinif) : NULL; } static inline struct net_device * diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a5ae952454c8..2dde34c29203 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -295,7 +295,7 @@ struct nf_bridge_info { u8 bridged_dnat:1; u8 sabotage_in_done:1; __u16 frag_max_size; - struct net_device *physindev; + int physinif; /* always valid & non-NULL from FORWARD on, for physdev match */ struct net_device *physoutdev; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 6adcb45bca75..ed1720890757 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -279,8 +279,17 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_ if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) && READ_ONCE(neigh->hh.hh_len)) { + struct net_device *br_indev; + + br_indev = nf_bridge_get_physindev(skb, net); + if (!br_indev) { + neigh_release(neigh); + goto free_skb; + } + neigh_hh_bridge(&neigh->hh, skb); - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; + ret = br_handle_frame_finish(net, sk, skb); } else { /* the neighbour function below overwrites the complete @@ -352,12 +361,18 @@ br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb, */ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb->dev; + struct net_device *dev = skb->dev, *br_indev; struct iphdr *iph = ip_hdr(skb); struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; int err; + br_indev = nf_bridge_get_physindev(skb, net); + if (!br_indev) { + kfree_skb(skb); + return 0; + } + nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; if (nf_bridge->pkt_otherhost) { @@ -397,7 +412,7 @@ free_skb: } else { if (skb_dst(skb)->dev == dev) { bridged_dnat: - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, @@ -410,7 +425,7 @@ bridged_dnat: skb->pkt_type = PACKET_HOST; } } else { - rt = bridge_parent_rtable(nf_bridge->physindev); + rt = bridge_parent_rtable(br_indev); if (!rt) { kfree_skb(skb); return 0; @@ -419,7 +434,7 @@ bridged_dnat: skb_dst_set_noref(skb, &rt->dst); } - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, @@ -456,7 +471,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net) } nf_bridge->in_prerouting = 1; - nf_bridge->physindev = skb->dev; + nf_bridge->physinif = skb->dev->ifindex; skb->dev = brnf_get_logical_dev(skb, skb->dev, net); if (skb->protocol == htons(ETH_P_8021Q)) @@ -553,7 +568,11 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff if (skb->protocol == htons(ETH_P_IPV6)) nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; - in = nf_bridge->physindev; + in = nf_bridge_get_physindev(skb, net); + if (!in) { + kfree_skb(skb); + return 0; + } if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; @@ -899,6 +918,13 @@ static unsigned int ip_sabotage_in(void *priv, static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct net_device *br_indev; + + br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev)); + if (!br_indev) { + kfree_skb(skb); + return; + } skb_pull(skb, ETH_HLEN); nf_bridge->bridged_dnat = 0; @@ -908,7 +934,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN), nf_bridge->neigh_header, ETH_HLEN - ETH_ALEN); - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; nf_bridge->physoutdev = NULL; br_handle_frame_finish(dev_net(skb->dev), NULL, skb); diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 2e24a743f917..e0421eaa3abc 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -102,9 +102,15 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; - struct net_device *dev = skb->dev; + struct net_device *dev = skb->dev, *br_indev; const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + br_indev = nf_bridge_get_physindev(skb, net); + if (!br_indev) { + kfree_skb(skb); + return 0; + } + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; if (nf_bridge->pkt_otherhost) { @@ -122,7 +128,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc } if (skb_dst(skb)->dev == dev) { - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, @@ -133,7 +139,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); skb->pkt_type = PACKET_HOST; } else { - rt = bridge_parent_rtable(nf_bridge->physindev); + rt = bridge_parent_rtable(br_indev); if (!rt) { kfree_skb(skb); return 0; @@ -142,7 +148,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc skb_dst_set_noref(skb, &rt->dst); } - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 86e7d390671a..04504b2b51df 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -239,7 +239,6 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook) { - struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; struct iphdr *niph; const struct tcphdr *oth; @@ -289,9 +288,13 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - br_indev = nf_bridge_get_physindev(oldskb, net); - if (br_indev) { + if (nf_bridge_info_exists(oldskb)) { struct ethhdr *oeth = eth_hdr(oldskb); + struct net_device *br_indev; + + br_indev = nf_bridge_get_physindev(oldskb, net); + if (!br_indev) + goto free_nskb; nskb->dev = br_indev; niph->tot_len = htons(nskb->len); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 27b2164f4c43..196dd4ecb5e2 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -278,7 +278,6 @@ static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook) { - struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; struct tcphdr _otcph; const struct tcphdr *otcph; @@ -354,9 +353,15 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - br_indev = nf_bridge_get_physindev(oldskb, net); - if (br_indev) { + if (nf_bridge_info_exists(oldskb)) { struct ethhdr *oeth = eth_hdr(oldskb); + struct net_device *br_indev; + + br_indev = nf_bridge_get_physindev(oldskb, net); + if (!br_indev) { + kfree_skb(nskb); + return; + } nskb->dev = br_indev; nskb->protocol = htons(ETH_P_IPV6); From b1db244ffd041a49ecc9618e8feb6b5c1afcdaa7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 12 Jan 2024 23:28:45 +0100 Subject: [PATCH 68/88] netfilter: nf_tables: check if catch-all set element is active in next generation When deactivating the catch-all set element, check the state in the next generation that represents this transaction. This bug uncovered after the recent removal of the element busy mark a2dd0233cbc4 ("netfilter: nf_tables: remove busy mark and gc batch API"). Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Cc: stable@vger.kernel.org Reported-by: lonial con Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b3db97440db1..50b595ef6389 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6578,7 +6578,7 @@ static int nft_setelem_catchall_deactivate(const struct net *net, list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_is_active(net, ext)) + if (!nft_is_active_next(net, ext)) continue; kfree(elem->priv); From 3ce67e3793f48c1b9635beb9bb71116ca1e51b58 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 14 Jan 2024 23:53:39 +0100 Subject: [PATCH 69/88] netfilter: nf_tables: do not allow mismatch field size and set key length The set description provides the size of each field in the set whose sum should not mismatch the set key length, bail out otherwise. I did not manage to crash nft_set_pipapo with mismatch fields and set key length so far, but this is UB which must be disallowed. Fixes: f3a2181e16f1 ("netfilter: nf_tables: Support for sets with multiple ranged fields") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 50b595ef6389..e9fa4a32c093 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4813,8 +4813,8 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { + u32 num_regs = 0, key_num_regs = 0; struct nlattr *attr; - u32 num_regs = 0; int rem, err, i; nla_for_each_nested(attr, nla, rem) { @@ -4829,6 +4829,10 @@ static int nft_set_desc_concat(struct nft_set_desc *desc, for (i = 0; i < desc->field_count; i++) num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32)); + key_num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); + if (key_num_regs != num_regs) + return -EINVAL; + if (num_regs > NFT_REG32_COUNT) return -E2BIG; From 6b1ca88e4bb63673dc9f9c7f23c899f22c3cb17a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 Jan 2024 00:14:38 +0100 Subject: [PATCH 70/88] netfilter: nf_tables: skip dead set elements in netlink dump Delete from packet path relies on the garbage collector to purge elements with NFT_SET_ELEM_DEAD_BIT on. Skip these dead elements from nf_tables_dump_setelem() path, I very rarely see tests/shell/testcases/maps/typeof_maps_add_delete reports [DUMP FAILED] showing a mismatch in the expected output with an element that should not be there. If the netlink dump happens before GC worker run, it might show dead elements in the ruleset listing. nft_rhash_get() already skips dead elements in nft_rhash_cmp(), therefore, it already does not show the element when getting a single element via netlink control plane. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e9fa4a32c093..88a6a858383b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5718,7 +5718,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; - if (nft_set_elem_expired(ext)) + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; args = container_of(iter, struct nft_set_dump_args, iter); From 113661e07460a6604aacc8ae1b23695a89e7d4b3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 Jan 2024 12:50:29 +0100 Subject: [PATCH 71/88] netfilter: nf_tables: reject NFT_SET_CONCAT with not field length description It is still possible to set on the NFT_SET_CONCAT flag by specifying a set size and no field description, report EINVAL in such case. Fixes: 1b6345d4160e ("netfilter: nf_tables: check NFT_SET_CONCAT flag if field_count is specified") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 88a6a858383b..4b55533ce5ca 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5070,8 +5070,12 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (err < 0) return err; - if (desc.field_count > 1 && !(flags & NFT_SET_CONCAT)) + if (desc.field_count > 1) { + if (!(flags & NFT_SET_CONCAT)) + return -EINVAL; + } else if (flags & NFT_SET_CONCAT) { return -EINVAL; + } } else if (flags & NFT_SET_CONCAT) { return -EINVAL; } From d6938c1c76c64f42363d0d1f051e1b4641c2ad40 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Mon, 15 Jan 2024 17:39:22 +0300 Subject: [PATCH 72/88] ipvs: avoid stat macros calls from preemptible context Inside decrement_ttl() upon discovering that the packet ttl has exceeded, __IP_INC_STATS and __IP6_INC_STATS macros can be called from preemptible context having the following backtrace: check_preemption_disabled: 48 callbacks suppressed BUG: using __this_cpu_add() in preemptible [00000000] code: curl/1177 caller is decrement_ttl+0x217/0x830 CPU: 5 PID: 1177 Comm: curl Not tainted 6.7.0+ #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 04/01/2014 Call Trace: dump_stack_lvl+0xbd/0xe0 check_preemption_disabled+0xd1/0xe0 decrement_ttl+0x217/0x830 __ip_vs_get_out_rt+0x4e0/0x1ef0 ip_vs_nat_xmit+0x205/0xcd0 ip_vs_in_hook+0x9b1/0x26a0 nf_hook_slow+0xc2/0x210 nf_hook+0x1fb/0x770 __ip_local_out+0x33b/0x640 ip_local_out+0x2a/0x490 __ip_queue_xmit+0x990/0x1d10 __tcp_transmit_skb+0x288b/0x3d10 tcp_connect+0x3466/0x5180 tcp_v4_connect+0x1535/0x1bb0 __inet_stream_connect+0x40d/0x1040 inet_stream_connect+0x57/0xa0 __sys_connect_file+0x162/0x1a0 __sys_connect+0x137/0x160 __x64_sys_connect+0x72/0xb0 do_syscall_64+0x6f/0x140 entry_SYSCALL_64_after_hwframe+0x6e/0x76 RIP: 0033:0x7fe6dbbc34e0 Use the corresponding preemption-aware variants: IP_INC_STATS and IP6_INC_STATS. Found by Linux Verification Center (linuxtesting.org). Fixes: 8d8e20e2d7bb ("ipvs: Decrement ttl") Signed-off-by: Fedor Pchelkin Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_xmit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 9193e109e6b3..65e0259178da 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -271,7 +271,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return false; } @@ -286,7 +286,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, { if (ip_hdr(skb)->ttl <= 1) { /* Tell the sender its packet died... */ - __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); return false; } From 01b55f4f0cd6ad1a16eca6c43a3190005892ef91 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:39 -0800 Subject: [PATCH 73/88] libbpf: feature-detect arg:ctx tag support in kernel Add feature detector of kernel-side arg:ctx (__arg_ctx) tag support. If this is detected, libbpf will avoid doing any __arg_ctx-related BTF rewriting and checks in favor of letting kernel handle this completely. test_global_funcs/ctx_arg_rewrite subtest is adjusted to do the same feature detection (albeit in much simpler, though round-about and inefficient, way), and skip the tests. This is done to still be able to execute this test on older kernels (like in libbpf CI). Note, BPF token series ([0]) does a major refactor and code moving of libbpf-internal feature detection "framework", so to avoid unnecessary conflicts we keep newly added feature detection stand-alone with ad-hoc result caching. Once things settle, there will be a small follow up to re-integrate everything back and move code into its final place in newly-added (by BPF token series) features.c file. [0] https://patchwork.kernel.org/project/netdevbpf/list/?series=814209&state=* Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 67 +++++++++++++++++++ .../bpf/prog_tests/test_global_funcs.c | 13 ++++ 2 files changed, 80 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c5a42ac309fd..61db92189517 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6757,6 +6757,69 @@ static int clone_func_btf_info(struct btf *btf, int orig_fn_id, struct bpf_progr return fn_id; } +static int probe_kern_arg_ctx_tag(void) +{ + /* To minimize merge conflicts with BPF token series that refactors + * feature detection code a lot, we don't integrate + * probe_kern_arg_ctx_tag() into kernel_supports() feature-detection + * framework yet, doing our own caching internally. + * This will be cleaned up a bit later when bpf/bpf-next trees settle. + */ + static int cached_result = -1; + static const char strs[] = "\0a\0b\0arg:ctx\0"; + const __u32 types[] = { + /* [1] INT */ + BTF_TYPE_INT_ENC(1 /* "a" */, BTF_INT_SIGNED, 0, 32, 4), + /* [2] PTR -> VOID */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0), + /* [3] FUNC_PROTO `int(void *a)` */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1), + BTF_PARAM_ENC(1 /* "a" */, 2), + /* [4] FUNC 'a' -> FUNC_PROTO (main prog) */ + BTF_TYPE_ENC(1 /* "a" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 3), + /* [5] FUNC_PROTO `int(void *b __arg_ctx)` */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1), + BTF_PARAM_ENC(3 /* "b" */, 2), + /* [6] FUNC 'b' -> FUNC_PROTO (subprog) */ + BTF_TYPE_ENC(3 /* "b" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 5), + /* [7] DECL_TAG 'arg:ctx' -> func 'b' arg 'b' */ + BTF_TYPE_DECL_TAG_ENC(5 /* "arg:ctx" */, 6, 0), + }; + const struct bpf_insn insns[] = { + /* main prog */ + BPF_CALL_REL(+1), + BPF_EXIT_INSN(), + /* global subprog */ + BPF_EMIT_CALL(BPF_FUNC_get_func_ip), /* needs PTR_TO_CTX */ + BPF_EXIT_INSN(), + }; + const struct bpf_func_info_min func_infos[] = { + { 0, 4 }, /* main prog -> FUNC 'a' */ + { 2, 6 }, /* subprog -> FUNC 'b' */ + }; + LIBBPF_OPTS(bpf_prog_load_opts, opts); + int prog_fd, btf_fd, insn_cnt = ARRAY_SIZE(insns); + + if (cached_result >= 0) + return cached_result; + + btf_fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs)); + if (btf_fd < 0) + return 0; + + opts.prog_btf_fd = btf_fd; + opts.func_info = &func_infos; + opts.func_info_cnt = ARRAY_SIZE(func_infos); + opts.func_info_rec_size = sizeof(func_infos[0]); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, "det_arg_ctx", + "GPL", insns, insn_cnt, &opts); + close(btf_fd); + + cached_result = probe_fd(prog_fd); + return cached_result; +} + /* Check if main program or global subprog's function prototype has `arg:ctx` * argument tags, and, if necessary, substitute correct type to match what BPF * verifier would expect, taking into account specific program type. This @@ -6780,6 +6843,10 @@ static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_progra if (!obj->btf_ext || !prog->func_info) return 0; + /* don't do any fix ups if kernel natively supports __arg_ctx */ + if (probe_kern_arg_ctx_tag() > 0) + return 0; + /* some BPF program types just don't have named context structs, so * this fallback mechanism doesn't work for them */ diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c index 67d4ef9e62b3..e905cbaf6b3d 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c @@ -47,6 +47,19 @@ static void subtest_ctx_arg_rewrite(void) struct btf *btf = NULL; __u32 info_len = sizeof(info); int err, fd, i; + struct btf *kern_btf = NULL; + + kern_btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK_PTR(kern_btf, "kern_btf_load")) + return; + + /* simple detection of kernel native arg:ctx tag support */ + if (btf__find_by_name_kind(kern_btf, "bpf_subprog_arg_info", BTF_KIND_STRUCT) > 0) { + test__skip(); + btf__free(kern_btf); + return; + } + btf__free(kern_btf); skel = test_global_func_ctx_args__open(); if (!ASSERT_OK_PTR(skel, "skel_open")) From 66967a32d3b16ed447e76fed4d946bab52e43d86 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:40 -0800 Subject: [PATCH 74/88] bpf: extract bpf_ctx_convert_map logic and make it more reusable Refactor btf_get_prog_ctx_type() a bit to allow reuse of bpf_ctx_convert_map logic in more than one places. Simplify interface by returning btf_type instead of btf_member (field reference in BTF). To do the above we need to touch and start untangling btf_translate_to_vmlinux() implementation. We do the bare minimum to not regress anything for btf_translate_to_vmlinux(), but its implementation is very questionable for what it claims to be doing. Mapping kfunc argument types to kernel corresponding types conceptually is quite different from recognizing program context types. Fixing this is out of scope for this change though. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 2 +- kernel/bpf/btf.c | 71 ++++++++++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 59d404e22814..cf5c6ff48981 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -512,7 +512,7 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); -const struct btf_member * +const struct btf_type * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 51e8b4bee0c8..10ac9efc662d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5615,21 +5615,46 @@ static u8 bpf_ctx_convert_map[] = { #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE -const struct btf_member * +static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type) +{ + const struct btf_type *conv_struct; + const struct btf_member *ctx_type; + + conv_struct = bpf_ctx_convert.t; + if (!conv_struct) + return NULL; + /* prog_type is valid bpf program type. No need for bounds check. */ + ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; + /* ctx_type is a pointer to prog_ctx_type in vmlinux. + * Like 'struct __sk_buff' + */ + return btf_type_by_id(btf_vmlinux, ctx_type->type); +} + +static int find_kern_ctx_type_id(enum bpf_prog_type prog_type) +{ + const struct btf_type *conv_struct; + const struct btf_member *ctx_type; + + conv_struct = bpf_ctx_convert.t; + if (!conv_struct) + return -EFAULT; + /* prog_type is valid bpf program type. No need for bounds check. */ + ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; + /* ctx_type is a pointer to prog_ctx_type in vmlinux. + * Like 'struct sk_buff' + */ + return ctx_type->type; +} + +const struct btf_type * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) { - const struct btf_type *conv_struct; - const struct btf_type *ctx_struct; - const struct btf_member *ctx_type; + const struct btf_type *ctx_type; const char *tname, *ctx_tname; - conv_struct = bpf_ctx_convert.t; - if (!conv_struct) { - bpf_log(log, "btf_vmlinux is malformed\n"); - return NULL; - } t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); @@ -5646,17 +5671,15 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, bpf_log(log, "arg#%d struct doesn't have a name\n", arg); return NULL; } - /* prog_type is valid bpf program type. No need for bounds check. */ - ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; - /* ctx_struct is a pointer to prog_ctx_type in vmlinux. - * Like 'struct __sk_buff' - */ - ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type); - if (!ctx_struct) + + ctx_type = find_canonical_prog_ctx_type(prog_type); + if (!ctx_type) { + bpf_log(log, "btf_vmlinux is malformed\n"); /* should not happen */ return NULL; + } again: - ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off); + ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!ctx_tname) { /* should not happen */ bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); @@ -5677,10 +5700,10 @@ again: /* bpf_user_pt_regs_t is a typedef, so resolve it to * underlying struct and check name again */ - if (!btf_type_is_modifier(ctx_struct)) + if (!btf_type_is_modifier(ctx_type)) return NULL; - while (btf_type_is_modifier(ctx_struct)) - ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type); + while (btf_type_is_modifier(ctx_type)) + ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); goto again; } return ctx_type; @@ -5692,13 +5715,9 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, enum bpf_prog_type prog_type, int arg) { - const struct btf_member *prog_ctx_type, *kern_ctx_type; - - prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg); - if (!prog_ctx_type) + if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg)) return -ENOENT; - kern_ctx_type = prog_ctx_type + 1; - return kern_ctx_type->type; + return find_kern_ctx_type_id(prog_type); } int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) From 0ba971511d16603599f947459e59b435cc465b0d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:41 -0800 Subject: [PATCH 75/88] bpf: enforce types for __arg_ctx-tagged arguments in global subprogs Add enforcement of expected types for context arguments tagged with arg:ctx (__arg_ctx) tag. First, any program type will accept generic `void *` context type when combined with __arg_ctx tag. Besides accepting "canonical" struct names and `void *`, for a bunch of program types for which program context is actually a named struct, we allows a bunch of pragmatic exceptions to match real-world and expected usage: - for both kprobes and perf_event we allow `bpf_user_pt_regs_t *` as canonical context argument type, where `bpf_user_pt_regs_t` is a *typedef*, not a struct; - for kprobes, we also always accept `struct pt_regs *`, as that's what actually is passed as a context to any kprobe program; - for perf_event, we resolve typedefs (unless it's `bpf_user_pt_regs_t`) down to actual struct type and accept `struct pt_regs *`, or `struct user_pt_regs *`, or `struct user_regs_struct *`, depending on the actual struct type kernel architecture points `bpf_user_pt_regs_t` typedef to; otherwise, canonical `struct bpf_perf_event_data *` is expected; - for raw_tp/raw_tp.w programs, `u64/long *` are accepted, as that's what's expected with BPF_PROG() usage; otherwise, canonical `struct bpf_raw_tracepoint_args *` is expected; - tp_btf supports both `struct bpf_raw_tracepoint_args *` and `u64 *` formats, both are coded as expections as tp_btf is actually a TRACING program type, which has no canonical context type; - iterator programs accept `struct bpf_iter__xxx *` structs, currently with no further iterator-type specific enforcement; - fentry/fexit/fmod_ret/lsm/struct_ops all accept `u64 *`; - classic tracepoint programs, as well as syscall and freplace programs allow any user-provided type. In all other cases kernel will enforce exact match of struct name to expected canonical type. And if user-provided type doesn't match that expectation, verifier will emit helpful message with expected type name. Note a bit unnatural way the check is done after processing all the arguments. This is done to avoid conflict between bpf and bpf-next trees. Once trees converge, a small follow up patch will place a simple btf_validate_prog_ctx_type() check into a proper ARG_PTR_TO_CTX branch (which bpf-next tree patch refactored already), removing duplicated arg:ctx detection logic. Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 10ac9efc662d..596471189176 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5709,6 +5709,149 @@ again: return ctx_type; } +/* forward declarations for arch-specific underlying types of + * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef + * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still + * works correctly with __builtin_types_compatible_p() on respective + * architectures + */ +struct user_regs_struct; +struct user_pt_regs; + +static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int arg, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type) +{ + const struct btf_type *ctx_type; + const char *tname, *ctx_tname; + + if (!btf_is_ptr(t)) { + bpf_log(log, "arg#%d type isn't a pointer\n", arg); + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); + + /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */ + if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) { + while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) + t = btf_type_by_id(btf, t->type); + + if (btf_type_is_typedef(t)) { + tname = btf_name_by_offset(btf, t->name_off); + if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) + return 0; + } + } + + /* all other program types don't use typedefs for context type */ + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + + /* `void *ctx __arg_ctx` is always valid */ + if (btf_type_is_void(t)) + return 0; + + tname = btf_name_by_offset(btf, t->name_off); + if (str_is_empty(tname)) { + bpf_log(log, "arg#%d type doesn't have a name\n", arg); + return -EINVAL; + } + + /* special cases */ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return 0; + break; + case BPF_PROG_TYPE_PERF_EVENT: + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) && + __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return 0; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) && + __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0) + return 0; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) && + __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0) + return 0; + break; + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_PROG_TYPE_TRACING: + switch (attach_type) { + case BPF_TRACE_RAW_TP: + /* tp_btf program is TRACING, so need special case here */ + if (__btf_type_is_struct(t) && + strcmp(tname, "bpf_raw_tracepoint_args") == 0) + return 0; + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_TRACE_ITER: + /* allow struct bpf_iter__xxx types only */ + if (__btf_type_is_struct(t) && + strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0) + return 0; + break; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + default: + break; + } + break; + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_SYSCALL: + case BPF_PROG_TYPE_EXT: + return 0; /* anything goes */ + default: + break; + } + + ctx_type = find_canonical_prog_ctx_type(prog_type); + if (!ctx_type) { + /* should not happen */ + bpf_log(log, "btf_vmlinux is malformed\n"); + return -EINVAL; + } + + /* resolve typedefs and check that underlying structs are matching as well */ + while (btf_type_is_modifier(ctx_type)) + ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); + + /* if program type doesn't have distinctly named struct type for + * context, then __arg_ctx argument can only be `void *`, which we + * already checked above + */ + if (!__btf_type_is_struct(ctx_type)) { + bpf_log(log, "arg#%d should be void pointer\n", arg); + return -EINVAL; + } + + ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); + if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) { + bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname); + return -EINVAL; + } + + return 0; +} + static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, @@ -6953,6 +7096,23 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return -EINVAL; } + for (i = 0; i < nargs; i++) { + const char *tag; + + if (sub->args[i].arg_type != ARG_PTR_TO_CTX) + continue; + + /* check if arg has "arg:ctx" tag */ + t = btf_type_by_id(btf, args[i].type); + tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:"); + if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0) + continue; + + if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type, + prog->expected_attach_type)) + return -EINVAL; + } + sub->arg_cnt = nargs; sub->args_cached = true; From 989410cde81959c4033dc287d79b42b6eb04f04f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:42 -0800 Subject: [PATCH 76/88] selftests/bpf: add tests confirming type logic in kernel for __arg_ctx Add a bunch of global subprogs across variety of program types to validate expected kernel type enforcement logic for __arg_ctx arguments. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_global_subprogs.c | 164 +++++++++++++++++- 1 file changed, 161 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index 9eeb2d89cda8..67dddd941891 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -3,6 +3,7 @@ #include #include +#include #include "bpf_misc.h" #include "xdp_metadata.h" #include "bpf_kfuncs.h" @@ -138,25 +139,182 @@ __weak int subprog_ctx_tag(void *ctx __arg_ctx) return bpf_get_stack(ctx, stack, sizeof(stack), 0); } +__weak int raw_tp_canonical(struct bpf_raw_tracepoint_args *ctx __arg_ctx) +{ + return 0; +} + +__weak int raw_tp_u64_array(u64 *ctx __arg_ctx) +{ + return 0; +} + SEC("?raw_tp") __success __log_level(2) int arg_tag_ctx_raw_tp(void *ctx) { - return subprog_ctx_tag(ctx); + return subprog_ctx_tag(ctx) + raw_tp_canonical(ctx) + raw_tp_u64_array(ctx); +} + +SEC("?raw_tp.w") +__success __log_level(2) +int arg_tag_ctx_raw_tp_writable(void *ctx) +{ + return subprog_ctx_tag(ctx) + raw_tp_canonical(ctx) + raw_tp_u64_array(ctx); +} + +SEC("?tp_btf/sys_enter") +__success __log_level(2) +int arg_tag_ctx_raw_tp_btf(void *ctx) +{ + return subprog_ctx_tag(ctx) + raw_tp_canonical(ctx) + raw_tp_u64_array(ctx); +} + +struct whatever { }; + +__weak int tp_whatever(struct whatever *ctx __arg_ctx) +{ + return 0; } SEC("?tp") __success __log_level(2) int arg_tag_ctx_tp(void *ctx) { - return subprog_ctx_tag(ctx); + return subprog_ctx_tag(ctx) + tp_whatever(ctx); +} + +__weak int kprobe_subprog_pt_regs(struct pt_regs *ctx __arg_ctx) +{ + return 0; +} + +__weak int kprobe_subprog_typedef(bpf_user_pt_regs_t *ctx __arg_ctx) +{ + return 0; } SEC("?kprobe") __success __log_level(2) int arg_tag_ctx_kprobe(void *ctx) { - return subprog_ctx_tag(ctx); + return subprog_ctx_tag(ctx) + + kprobe_subprog_pt_regs(ctx) + + kprobe_subprog_typedef(ctx); +} + +__weak int perf_subprog_regs( +#if defined(bpf_target_riscv) + struct user_regs_struct *ctx __arg_ctx +#elif defined(bpf_target_s390) + /* user_pt_regs typedef is anonymous struct, so only `void *` works */ + void *ctx __arg_ctx +#elif defined(bpf_target_loongarch) || defined(bpf_target_arm64) || defined(bpf_target_powerpc) + struct user_pt_regs *ctx __arg_ctx +#else + struct pt_regs *ctx __arg_ctx +#endif +) +{ + return 0; +} + +__weak int perf_subprog_typedef(bpf_user_pt_regs_t *ctx __arg_ctx) +{ + return 0; +} + +__weak int perf_subprog_canonical(struct bpf_perf_event_data *ctx __arg_ctx) +{ + return 0; +} + +SEC("?perf_event") +__success __log_level(2) +int arg_tag_ctx_perf(void *ctx) +{ + return subprog_ctx_tag(ctx) + + perf_subprog_regs(ctx) + + perf_subprog_typedef(ctx) + + perf_subprog_canonical(ctx); +} + +__weak int iter_subprog_void(void *ctx __arg_ctx) +{ + return 0; +} + +__weak int iter_subprog_typed(struct bpf_iter__task *ctx __arg_ctx) +{ + return 0; +} + +SEC("?iter/task") +__success __log_level(2) +int arg_tag_ctx_iter_task(struct bpf_iter__task *ctx) +{ + return (iter_subprog_void(ctx) + iter_subprog_typed(ctx)) & 1; +} + +__weak int tracing_subprog_void(void *ctx __arg_ctx) +{ + return 0; +} + +__weak int tracing_subprog_u64(u64 *ctx __arg_ctx) +{ + return 0; +} + +int acc; + +SEC("?fentry/" SYS_PREFIX "sys_nanosleep") +__success __log_level(2) +int BPF_PROG(arg_tag_ctx_fentry) +{ + acc += tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); + return 0; +} + +SEC("?fexit/" SYS_PREFIX "sys_nanosleep") +__success __log_level(2) +int BPF_PROG(arg_tag_ctx_fexit) +{ + acc += tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); + return 0; +} + +SEC("?fmod_ret/" SYS_PREFIX "sys_nanosleep") +__success __log_level(2) +int BPF_PROG(arg_tag_ctx_fmod_ret) +{ + return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); +} + +SEC("?lsm/bpf") +__success __log_level(2) +int BPF_PROG(arg_tag_ctx_lsm) +{ + return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); +} + +SEC("?struct_ops/test_1") +__success __log_level(2) +int BPF_PROG(arg_tag_ctx_struct_ops) +{ + return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); +} + +SEC(".struct_ops") +struct bpf_dummy_ops dummy_1 = { + .test_1 = (void *)arg_tag_ctx_struct_ops, +}; + +SEC("?syscall") +__success __log_level(2) +int arg_tag_ctx_syscall(void *ctx) +{ + return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx) + tp_whatever(ctx); } __weak int subprog_dynptr(struct bpf_dynptr *dptr) From 76ec90a996e3c707eb6772510afa36faeba2ecff Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:43 -0800 Subject: [PATCH 77/88] libbpf: warn on unexpected __arg_ctx type when rewriting BTF On kernel that don't support arg:ctx tag, before adjusting global subprog BTF information to match kernel's expected canonical type names, make sure that types used by user are meaningful, and if not, warn and don't do BTF adjustments. This is similar to checks that kernel performs, but narrower in scope, as only a small subset of BPF program types can be accommodated by libbpf using canonical type names. Libbpf unconditionally allows `struct pt_regs *` for perf_event program types, unlike kernel, which supports that conditionally on architecture. This is done to keep things simple and not cause unnecessary false positives. This seems like a minor and harmless deviation, which in real-world programs will be caught by kernels with arg:ctx tag support anyways. So KISS principle. This logic is hard to test (especially on latest kernels), so manual testing was performed instead. Libbpf emitted the following warning for perf_event program with wrong context argument type: libbpf: prog 'arg_tag_ctx_perf': subprog 'subprog_ctx_tag' arg#0 is expected to be of `struct bpf_perf_event_data *` type Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 75 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 61db92189517..afd09571c482 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6695,6 +6695,67 @@ static struct { /* all other program types don't have "named" context structs */ }; +static bool need_func_arg_type_fixup(const struct btf *btf, const struct bpf_program *prog, + const char *subprog_name, int arg_idx, + int arg_type_id, const char *ctx_name) +{ + const struct btf_type *t; + const char *tname; + + /* check if existing parameter already matches verifier expectations */ + t = skip_mods_and_typedefs(btf, arg_type_id, NULL); + if (!btf_is_ptr(t)) + goto out_warn; + + /* typedef bpf_user_pt_regs_t is a special PITA case, valid for kprobe + * and perf_event programs, so check this case early on and forget + * about it for subsequent checks + */ + while (btf_is_mod(t)) + t = btf__type_by_id(btf, t->type); + if (btf_is_typedef(t) && + (prog->type == BPF_PROG_TYPE_KPROBE || prog->type == BPF_PROG_TYPE_PERF_EVENT)) { + tname = btf__str_by_offset(btf, t->name_off) ?: ""; + if (strcmp(tname, "bpf_user_pt_regs_t") == 0) + return false; /* canonical type for kprobe/perf_event */ + } + + /* now we can ignore typedefs moving forward */ + t = skip_mods_and_typedefs(btf, t->type, NULL); + + /* if it's `void *`, definitely fix up BTF info */ + if (btf_is_void(t)) + return true; + + /* if it's already proper canonical type, no need to fix up */ + tname = btf__str_by_offset(btf, t->name_off) ?: ""; + if (btf_is_struct(t) && strcmp(tname, ctx_name) == 0) + return false; + + /* special cases */ + switch (prog->type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_PERF_EVENT: + /* `struct pt_regs *` is expected, but we need to fix up */ + if (btf_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return true; + break; + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return true; + break; + default: + break; + } + +out_warn: + pr_warn("prog '%s': subprog '%s' arg#%d is expected to be of `struct %s *` type\n", + prog->name, subprog_name, arg_idx, ctx_name); + return false; +} + static int clone_func_btf_info(struct btf *btf, int orig_fn_id, struct bpf_program *prog) { int fn_id, fn_proto_id, ret_type_id, orig_proto_id; @@ -6829,7 +6890,7 @@ static int probe_kern_arg_ctx_tag(void) */ static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_program *prog) { - const char *ctx_name = NULL, *ctx_tag = "arg:ctx"; + const char *ctx_name = NULL, *ctx_tag = "arg:ctx", *fn_name; struct bpf_func_info_min *func_rec; struct btf_type *fn_t, *fn_proto_t; struct btf *btf = obj->btf; @@ -6909,15 +6970,11 @@ static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_progra if (arg_idx < 0 || arg_idx >= arg_cnt) continue; - /* check if existing parameter already matches verifier expectations */ + /* check if we should fix up argument type */ p = &btf_params(fn_proto_t)[arg_idx]; - t = skip_mods_and_typedefs(btf, p->type, NULL); - if (btf_is_ptr(t) && - (t = skip_mods_and_typedefs(btf, t->type, NULL)) && - btf_is_struct(t) && - strcmp(btf__str_by_offset(btf, t->name_off), ctx_name) == 0) { - continue; /* no need for fix up */ - } + fn_name = btf__str_by_offset(btf, fn_t->name_off) ?: ""; + if (!need_func_arg_type_fixup(btf, prog, fn_name, arg_idx, p->type, ctx_name)) + continue; /* clone fn/fn_proto, unless we already did it for another arg */ if (func_rec->type_id == orig_fn_id) { From 39369c9a6e090619a7b5eedb2212c99ac8a03890 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2024 07:43:11 -0800 Subject: [PATCH 78/88] selftests: netdevsim: add a config file netdevsim tests aren't very well integrated with kselftest, which has its advantages and disadvantages. But regardless of the intended integration - a config file to know what kernel to build is very useful, add one. Fixes: fc4c93f145d7 ("selftests: add basic netdevsim devlink flash testing") Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240116154311.1945801-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/netdevsim/config | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 tools/testing/selftests/drivers/net/netdevsim/config diff --git a/tools/testing/selftests/drivers/net/netdevsim/config b/tools/testing/selftests/drivers/net/netdevsim/config new file mode 100644 index 000000000000..adf45a3a78b4 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/config @@ -0,0 +1,10 @@ +CONFIG_DUMMY=y +CONFIG_GENEVE=m +CONFIG_IPV6=y +CONFIG_NETDEVSIM=m +CONFIG_NET_SCH_MQPRIO=y +CONFIG_NET_SCH_MULTIQ=y +CONFIG_NET_SCH_PRIO=y +CONFIG_PSAMPLE=y +CONFIG_PTP_1588_CLOCK_MOCK=y +CONFIG_VXLAN=m From dd2d40acdbb2b9e6bcddd5424b0e00c1760ecf26 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Tue, 16 Jan 2024 10:49:26 -0500 Subject: [PATCH 79/88] selftests: bonding: Add more missing config options As a followup to commit 03fb8565c880 ("selftests: bonding: add missing build configs"), add more networking-specific config options which are needed for bonding tests. For testing, I used the minimal config generated by virtme-ng and I added the options in the config file. All bonding tests passed. Fixes: bbb774d921e2 ("net: Add tests for bonding and team address list management") # for ipv6 Fixes: 6cbe791c0f4e ("kselftest: bonding: add num_grat_arp test") # for tc options Fixes: 222c94ec0ad4 ("selftests: bonding: add tests for ether type changes") # for nlmon Suggested-by: Jakub Kicinski Signed-off-by: Benjamin Poirier Link: https://lore.kernel.org/r/20240116154926.202164-1-bpoirier@nvidia.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/bonding/config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index f85b16fc5128..899d7fb6ea8e 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -1,5 +1,10 @@ CONFIG_BONDING=y CONFIG_BRIDGE=y CONFIG_DUMMY=y +CONFIG_IPV6=y CONFIG_MACVLAN=y +CONFIG_NET_ACT_GACT=y +CONFIG_NET_CLS_FLOWER=y +CONFIG_NET_SCH_INGRESS=y +CONFIG_NLMON=y CONFIG_VETH=y From f1172f3ee3a98754d95b968968920a7d03fdebcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludvig=20P=C3=A4rsson?= Date: Wed, 17 Jan 2024 13:03:14 +0100 Subject: [PATCH 80/88] ethtool: netlink: Add missing ethnl_ops_begin/complete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accessing an ethernet device that is powered off or clock gated might cause the CPU to hang. Add ethnl_ops_begin/complete in ethnl_set_features() to protect against this. Fixes: 0980bfcd6954 ("ethtool: set netdev features with FEATURES_SET request") Signed-off-by: Ludvig Pärsson Link: https://lore.kernel.org/r/20240117-etht2-v2-1-1a96b6e8c650@axis.com Signed-off-by: Paolo Abeni --- net/ethtool/features.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/ethtool/features.c b/net/ethtool/features.c index a79af8c25a07..b6cb101d7f19 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -234,17 +234,20 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) dev = req_info.dev; rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; ethnl_features_to_bitmap(old_active, dev->features); ethnl_features_to_bitmap(old_wanted, dev->wanted_features); ret = ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT, tb[ETHTOOL_A_FEATURES_WANTED], netdev_features_strings, info->extack); if (ret < 0) - goto out_rtnl; + goto out_ops; if (ethnl_bitmap_to_features(req_mask) & ~NETIF_F_ETHTOOL_BITS) { GENL_SET_ERR_MSG(info, "attempt to change non-ethtool features"); ret = -EINVAL; - goto out_rtnl; + goto out_ops; } /* set req_wanted bits not in req_mask from old_wanted */ @@ -281,6 +284,8 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) if (mod) netdev_features_change(dev); +out_ops: + ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); From 6d6eeabcfaba2fcadf5443b575789ea606f9de83 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Jan 2024 16:04:16 +0100 Subject: [PATCH 81/88] mlxsw: spectrum_acl_erp: Fix error flow of pool allocation failure Lately, a bug was found when many TC filters are added - at some point, several bugs are printed to dmesg [1] and the switch is crashed with segmentation fault. The issue starts when gen_pool_free() fails because of unexpected behavior - a try to free memory which is already freed, this leads to BUG() call which crashes the switch and makes many other bugs. Trying to track down the unexpected behavior led to a bug in eRP code. The function mlxsw_sp_acl_erp_table_alloc() gets a pointer to the allocated index, sets the value and returns an error code. When gen_pool_alloc() fails it returns address 0, we track it and return -ENOBUFS outside, BUT the call for gen_pool_alloc() already override the index in erp_table structure. This is a problem when such allocation is done as part of table expansion. This is not a new table, which will not be used in case of allocation failure. We try to expand eRP table and override the current index (non-zero) with zero. Then, it leads to an unexpected behavior when address 0 is freed twice. Note that address 0 is valid in erp_table->base_index and indeed other tables use it. gen_pool_alloc() fails in case that there is no space left in the pre-allocated pool, in our case, the pool is limited to ACL_MAX_ERPT_BANK_SIZE, which is read from hardware. When more than max erp entries are required, we exceed the limit and return an error, this error leads to "Failed to migrate vregion" print. Fix this by changing erp_table->base_index only in case of a successful allocation. Add a test case for such a scenario. Without this fix it causes segmentation fault: $ TESTS="max_erp_entries_test" ./tc_flower.sh ./tc_flower.sh: line 988: 1560 Segmentation fault tc filter del dev $h2 ingress chain $i protocol ip pref $i handle $j flower &>/dev/null [1]: kernel BUG at lib/genalloc.c:508! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 6 PID: 3531 Comm: tc Not tainted 6.7.0-rc5-custom-ga6893f479f5e #1 Hardware name: Mellanox Technologies Ltd. MSN4700/VMOD0010, BIOS 5.11 07/12/2021 RIP: 0010:gen_pool_free_owner+0xc9/0xe0 ... Call Trace: __mlxsw_sp_acl_erp_table_other_dec+0x70/0xa0 [mlxsw_spectrum] mlxsw_sp_acl_erp_mask_destroy+0xf5/0x110 [mlxsw_spectrum] objagg_obj_root_destroy+0x18/0x80 [objagg] objagg_obj_destroy+0x12c/0x130 [objagg] mlxsw_sp_acl_erp_mask_put+0x37/0x50 [mlxsw_spectrum] mlxsw_sp_acl_ctcam_region_entry_remove+0x74/0xa0 [mlxsw_spectrum] mlxsw_sp_acl_ctcam_entry_del+0x1e/0x40 [mlxsw_spectrum] mlxsw_sp_acl_tcam_ventry_del+0x78/0xd0 [mlxsw_spectrum] mlxsw_sp_flower_destroy+0x4d/0x70 [mlxsw_spectrum] mlxsw_sp_flow_block_cb+0x73/0xb0 [mlxsw_spectrum] tc_setup_cb_destroy+0xc1/0x180 fl_hw_destroy_filter+0x94/0xc0 [cls_flower] __fl_delete+0x1ac/0x1c0 [cls_flower] fl_destroy+0xc2/0x150 [cls_flower] tcf_proto_destroy+0x1a/0xa0 ... mlxsw_spectrum3 0000:07:00.0: Failed to migrate vregion mlxsw_spectrum3 0000:07:00.0: Failed to migrate vregion Fixes: f465261aa105 ("mlxsw: spectrum_acl: Implement common eRP core") Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/4cfca254dfc0e5d283974801a24371c7b6db5989.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlxsw/spectrum_acl_erp.c | 8 +-- .../drivers/net/mlxsw/spectrum-2/tc_flower.sh | 52 ++++++++++++++++++- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c index 4c98950380d5..d231f4d2888b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c @@ -301,6 +301,7 @@ mlxsw_sp_acl_erp_table_alloc(struct mlxsw_sp_acl_erp_core *erp_core, unsigned long *p_index) { unsigned int num_rows, entry_size; + unsigned long index; /* We only allow allocations of entire rows */ if (num_erps % erp_core->num_erp_banks != 0) @@ -309,10 +310,11 @@ mlxsw_sp_acl_erp_table_alloc(struct mlxsw_sp_acl_erp_core *erp_core, entry_size = erp_core->erpt_entries_size[region_type]; num_rows = num_erps / erp_core->num_erp_banks; - *p_index = gen_pool_alloc(erp_core->erp_tables, num_rows * entry_size); - if (*p_index == 0) + index = gen_pool_alloc(erp_core->erp_tables, num_rows * entry_size); + if (!index) return -ENOBUFS; - *p_index -= MLXSW_SP_ACL_ERP_GENALLOC_OFFSET; + + *p_index = index - MLXSW_SP_ACL_ERP_GENALLOC_OFFSET; return 0; } diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh index fb850e0ec837..7bf56ea161e3 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh @@ -10,7 +10,8 @@ lib_dir=$(dirname $0)/../../../../net/forwarding ALL_TESTS="single_mask_test identical_filters_test two_masks_test \ multiple_masks_test ctcam_edge_cases_test delta_simple_test \ delta_two_masks_one_key_test delta_simple_rehash_test \ - bloom_simple_test bloom_complex_test bloom_delta_test" + bloom_simple_test bloom_complex_test bloom_delta_test \ + max_erp_entries_test" NUM_NETIFS=2 source $lib_dir/lib.sh source $lib_dir/tc_common.sh @@ -983,6 +984,55 @@ bloom_delta_test() log_test "bloom delta test ($tcflags)" } +max_erp_entries_test() +{ + # The number of eRP entries is limited. Once the maximum number of eRPs + # has been reached, filters cannot be added. This test verifies that + # when this limit is reached, inserstion fails without crashing. + + RET=0 + + local num_masks=32 + local num_regions=15 + local chain_failed + local mask_failed + local ret + + if [[ "$tcflags" != "skip_sw" ]]; then + return 0; + fi + + for ((i=1; i < $num_regions; i++)); do + for ((j=$num_masks; j >= 0; j--)); do + tc filter add dev $h2 ingress chain $i protocol ip \ + pref $i handle $j flower $tcflags \ + dst_ip 192.1.0.0/$j &> /dev/null + ret=$? + + if [ $ret -ne 0 ]; then + chain_failed=$i + mask_failed=$j + break 2 + fi + done + done + + # We expect to exceed the maximum number of eRP entries, so that + # insertion eventually fails. Otherwise, the test should be adjusted to + # add more filters. + check_fail $ret "expected to exceed number of eRP entries" + + for ((; i >= 1; i--)); do + for ((j=0; j <= $num_masks; j++)); do + tc filter del dev $h2 ingress chain $i protocol ip \ + pref $i handle $j flower &> /dev/null + done + done + + log_test "max eRP entries test ($tcflags). " \ + "max chain $chain_failed, mask $mask_failed" +} + setup_prepare() { h1=${NETIFS[p1]} From efeb7dfea8ee10cdec11b6b6ba4e405edbe75809 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 17 Jan 2024 16:04:17 +0100 Subject: [PATCH 82/88] mlxsw: spectrum_acl_tcam: Fix NULL pointer dereference in error path When calling mlxsw_sp_acl_tcam_region_destroy() from an error path after failing to attach the region to an ACL group, we hit a NULL pointer dereference upon 'region->group->tcam' [1]. Fix by retrieving the 'tcam' pointer using mlxsw_sp_acl_to_tcam(). [1] BUG: kernel NULL pointer dereference, address: 0000000000000000 [...] RIP: 0010:mlxsw_sp_acl_tcam_region_destroy+0xa0/0xd0 [...] Call Trace: mlxsw_sp_acl_tcam_vchunk_get+0x88b/0xa20 mlxsw_sp_acl_tcam_ventry_add+0x25/0xe0 mlxsw_sp_acl_rule_add+0x47/0x240 mlxsw_sp_flower_replace+0x1a9/0x1d0 tc_setup_cb_add+0xdc/0x1c0 fl_hw_replace_filter+0x146/0x1f0 fl_change+0xc17/0x1360 tc_new_tfilter+0x472/0xb90 rtnetlink_rcv_msg+0x313/0x3b0 netlink_rcv_skb+0x58/0x100 netlink_unicast+0x244/0x390 netlink_sendmsg+0x1e4/0x440 ____sys_sendmsg+0x164/0x260 ___sys_sendmsg+0x9a/0xe0 __sys_sendmsg+0x7a/0xc0 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x63/0x6b Fixes: 22a677661f56 ("mlxsw: spectrum: Introduce ACL core with simple TCAM implementation") Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Reviewed-by: Jiri Pirko Signed-off-by: Petr Machata Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/fb6a4542bbc9fcab5a523802d97059bffbca7126.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c index d50786b0a6ce..7d1e91196e94 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c @@ -681,13 +681,13 @@ static void mlxsw_sp_acl_tcam_region_destroy(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_tcam_region *region) { + struct mlxsw_sp_acl_tcam *tcam = mlxsw_sp_acl_to_tcam(mlxsw_sp->acl); const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops; ops->region_fini(mlxsw_sp, region->priv); mlxsw_sp_acl_tcam_region_disable(mlxsw_sp, region); mlxsw_sp_acl_tcam_region_free(mlxsw_sp, region); - mlxsw_sp_acl_tcam_region_id_put(region->group->tcam, - region->id); + mlxsw_sp_acl_tcam_region_id_put(tcam, region->id); kfree(region); } From 483ae90d8f976f8339cf81066312e1329f2d3706 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 17 Jan 2024 16:04:18 +0100 Subject: [PATCH 83/88] mlxsw: spectrum_acl_tcam: Fix stack corruption When tc filters are first added to a net device, the corresponding local port gets bound to an ACL group in the device. The group contains a list of ACLs. In turn, each ACL points to a different TCAM region where the filters are stored. During forwarding, the ACLs are sequentially evaluated until a match is found. One reason to place filters in different regions is when they are added with decreasing priorities and in an alternating order so that two consecutive filters can never fit in the same region because of their key usage. In Spectrum-2 and newer ASICs the firmware started to report that the maximum number of ACLs in a group is more than 16, but the layout of the register that configures ACL groups (PAGT) was not updated to account for that. It is therefore possible to hit stack corruption [1] in the rare case where more than 16 ACLs in a group are required. Fix by limiting the maximum ACL group size to the minimum between what the firmware reports and the maximum ACLs that fit in the PAGT register. Add a test case to make sure the machine does not crash when this condition is hit. [1] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: mlxsw_sp_acl_tcam_group_update+0x116/0x120 [...] dump_stack_lvl+0x36/0x50 panic+0x305/0x330 __stack_chk_fail+0x15/0x20 mlxsw_sp_acl_tcam_group_update+0x116/0x120 mlxsw_sp_acl_tcam_group_region_attach+0x69/0x110 mlxsw_sp_acl_tcam_vchunk_get+0x492/0xa20 mlxsw_sp_acl_tcam_ventry_add+0x25/0xe0 mlxsw_sp_acl_rule_add+0x47/0x240 mlxsw_sp_flower_replace+0x1a9/0x1d0 tc_setup_cb_add+0xdc/0x1c0 fl_hw_replace_filter+0x146/0x1f0 fl_change+0xc17/0x1360 tc_new_tfilter+0x472/0xb90 rtnetlink_rcv_msg+0x313/0x3b0 netlink_rcv_skb+0x58/0x100 netlink_unicast+0x244/0x390 netlink_sendmsg+0x1e4/0x440 ____sys_sendmsg+0x164/0x260 ___sys_sendmsg+0x9a/0xe0 __sys_sendmsg+0x7a/0xc0 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x63/0x6b Fixes: c3ab435466d5 ("mlxsw: spectrum: Extend to support Spectrum-2 ASIC") Reported-by: Orel Hagag Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Signed-off-by: Petr Machata Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/2d91c89afba59c22587b444994ae419dbea8d876.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlxsw/spectrum_acl_tcam.c | 2 + .../drivers/net/mlxsw/spectrum-2/tc_flower.sh | 56 ++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c index 7d1e91196e94..50ea1eff02b2 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c @@ -1564,6 +1564,8 @@ int mlxsw_sp_acl_tcam_init(struct mlxsw_sp *mlxsw_sp, tcam->max_groups = max_groups; tcam->max_group_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_GROUP_SIZE); + tcam->max_group_size = min_t(unsigned int, tcam->max_group_size, + MLXSW_REG_PAGT_ACL_MAX_NUM); err = ops->init(mlxsw_sp, tcam->priv, tcam); if (err) diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh index 7bf56ea161e3..616d3581419c 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh @@ -11,7 +11,7 @@ ALL_TESTS="single_mask_test identical_filters_test two_masks_test \ multiple_masks_test ctcam_edge_cases_test delta_simple_test \ delta_two_masks_one_key_test delta_simple_rehash_test \ bloom_simple_test bloom_complex_test bloom_delta_test \ - max_erp_entries_test" + max_erp_entries_test max_group_size_test" NUM_NETIFS=2 source $lib_dir/lib.sh source $lib_dir/tc_common.sh @@ -1033,6 +1033,60 @@ max_erp_entries_test() "max chain $chain_failed, mask $mask_failed" } +max_group_size_test() +{ + # The number of ACLs in an ACL group is limited. Once the maximum + # number of ACLs has been reached, filters cannot be added. This test + # verifies that when this limit is reached, insertion fails without + # crashing. + + RET=0 + + local num_acls=32 + local max_size + local ret + + if [[ "$tcflags" != "skip_sw" ]]; then + return 0; + fi + + for ((i=1; i < $num_acls; i++)); do + if [[ $(( i % 2 )) == 1 ]]; then + tc filter add dev $h2 ingress pref $i proto ipv4 \ + flower $tcflags dst_ip 198.51.100.1/32 \ + ip_proto tcp tcp_flags 0x01/0x01 \ + action drop &> /dev/null + else + tc filter add dev $h2 ingress pref $i proto ipv6 \ + flower $tcflags dst_ip 2001:db8:1::1/128 \ + action drop &> /dev/null + fi + + ret=$? + [[ $ret -ne 0 ]] && max_size=$((i - 1)) && break + done + + # We expect to exceed the maximum number of ACLs in a group, so that + # insertion eventually fails. Otherwise, the test should be adjusted to + # add more filters. + check_fail $ret "expected to exceed number of ACLs in a group" + + for ((; i >= 1; i--)); do + if [[ $(( i % 2 )) == 1 ]]; then + tc filter del dev $h2 ingress pref $i proto ipv4 \ + flower $tcflags dst_ip 198.51.100.1/32 \ + ip_proto tcp tcp_flags 0x01/0x01 \ + action drop &> /dev/null + else + tc filter del dev $h2 ingress pref $i proto ipv6 \ + flower $tcflags dst_ip 2001:db8:1::1/128 \ + action drop &> /dev/null + fi + done + + log_test "max ACL group size test ($tcflags). max size $max_size" +} + setup_prepare() { h1=${NETIFS[p1]} From 62bef63646c194e0f82b40304a0f2d060b28687b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 17 Jan 2024 16:04:19 +0100 Subject: [PATCH 84/88] mlxsw: spectrum_router: Register netdevice notifier before nexthop If there are IPIP nexthops at the time when the driver is loaded (or the devlink instance reloaded), the driver looks up the corresponding IPIP entry. But IPIP entries are only created as a result of netdevice notifications. Since the netdevice notifier is registered after the nexthop notifier, mlxsw_sp_nexthop_type_init() never finds the IPIP entry, registers the nexthop MLXSW_SP_NEXTHOP_TYPE_ETH, and fails to assign a CRIF to the nexthop. Later on when the CRIF is necessary, the WARN_ON in mlxsw_sp_nexthop_rif() triggers, causing the splat [1]. In order to fix the issue, reorder the netdevice notifier to be registered before the nexthop one. [1] (edited for clarity): WARNING: CPU: 1 PID: 1364 at drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3245 mlxsw_sp_nexthop_rif (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3246 (discriminator 1)) mlxsw_spectrum Hardware name: Mellanox Technologies Ltd. MSN4410/VMOD0010, BIOS 5.11 01/06/2019 Call Trace: ? mlxsw_sp_nexthop_rif (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3246 (discriminator 1)) mlxsw_spectrum __mlxsw_sp_nexthop_eth_update (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3637) mlxsw_spectrum mlxsw_sp_nexthop_update (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3679 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3727) mlxsw_spectrum mlxsw_sp_nexthop_group_update (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3757) mlxsw_spectrum mlxsw_sp_nexthop_group_refresh (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:4112) mlxsw_spectrum mlxsw_sp_nexthop_obj_event (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:5118 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:5191 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:5315 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:5500) mlxsw_spectrum nexthops_dump (net/ipv4/nexthop.c:217 net/ipv4/nexthop.c:440 net/ipv4/nexthop.c:3609) register_nexthop_notifier (net/ipv4/nexthop.c:3624) mlxsw_sp_router_init (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:11486) mlxsw_spectrum mlxsw_sp_init (drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3267) mlxsw_spectrum __mlxsw_core_bus_device_register (drivers/net/ethernet/mellanox/mlxsw/core.c:2202) mlxsw_core mlxsw_devlink_core_bus_device_reload_up (drivers/net/ethernet/mellanox/mlxsw/core.c:2265 drivers/net/ethernet/mellanox/mlxsw/core.c:1603) mlxsw_core devlink_reload (net/devlink/dev.c:314 net/devlink/dev.c:475) [...] Fixes: 9464a3d68ea9 ("mlxsw: spectrum_router: Track next hops at CRIFs") Reported-by: Maksym Yaremchuk Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/74edb8d45d004e8d8f5318eede6ccc3d786d8ba9.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 2c255ed9b8a9..7164f9e6370f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -11472,6 +11472,13 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp, if (err) goto err_register_netevent_notifier; + mlxsw_sp->router->netdevice_nb.notifier_call = + mlxsw_sp_router_netdevice_event; + err = register_netdevice_notifier_net(mlxsw_sp_net(mlxsw_sp), + &mlxsw_sp->router->netdevice_nb); + if (err) + goto err_register_netdev_notifier; + mlxsw_sp->router->nexthop_nb.notifier_call = mlxsw_sp_nexthop_obj_event; err = register_nexthop_notifier(mlxsw_sp_net(mlxsw_sp), @@ -11487,22 +11494,15 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp, if (err) goto err_register_fib_notifier; - mlxsw_sp->router->netdevice_nb.notifier_call = - mlxsw_sp_router_netdevice_event; - err = register_netdevice_notifier_net(mlxsw_sp_net(mlxsw_sp), - &mlxsw_sp->router->netdevice_nb); - if (err) - goto err_register_netdev_notifier; - return 0; -err_register_netdev_notifier: - unregister_fib_notifier(mlxsw_sp_net(mlxsw_sp), - &mlxsw_sp->router->fib_nb); err_register_fib_notifier: unregister_nexthop_notifier(mlxsw_sp_net(mlxsw_sp), &mlxsw_sp->router->nexthop_nb); err_register_nexthop_notifier: + unregister_netdevice_notifier_net(mlxsw_sp_net(mlxsw_sp), + &router->netdevice_nb); +err_register_netdev_notifier: unregister_netevent_notifier(&mlxsw_sp->router->netevent_nb); err_register_netevent_notifier: unregister_inet6addr_validator_notifier(&router->inet6addr_valid_nb); @@ -11550,11 +11550,11 @@ void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) { struct mlxsw_sp_router *router = mlxsw_sp->router; - unregister_netdevice_notifier_net(mlxsw_sp_net(mlxsw_sp), - &router->netdevice_nb); unregister_fib_notifier(mlxsw_sp_net(mlxsw_sp), &router->fib_nb); unregister_nexthop_notifier(mlxsw_sp_net(mlxsw_sp), &router->nexthop_nb); + unregister_netdevice_notifier_net(mlxsw_sp_net(mlxsw_sp), + &router->netdevice_nb); unregister_netevent_notifier(&router->netevent_nb); unregister_inet6addr_validator_notifier(&router->inet6addr_valid_nb); unregister_inetaddr_validator_notifier(&router->inetaddr_valid_nb); From 40cc674bafd50fc728c4a73d7dc77728a0b86759 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Jan 2024 16:04:20 +0100 Subject: [PATCH 85/88] selftests: mlxsw: qos_pfc: Remove wrong description In the diagram of the topology, $swp3 and $swp4 are described as 1Gbps ports. This is wrong information, the test does not configure such speed. Cc: Shuah Khan Fixes: bfa804784e32 ("selftests: mlxsw: Add a PFC test") Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/0087e2d416aff7e444d15f7c2958fc1d438dc27e.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh index 42ce602d8d49..49bef76083b8 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh @@ -40,7 +40,6 @@ # | + $swp1 $swp3 + + $swp4 | # | | iPOOL1 iPOOL0 | | iPOOL2 | # | | ePOOL4 ePOOL5 | | ePOOL4 | -# | | 1Gbps | | 1Gbps | # | | PFC:enabled=1 | | PFC:enabled=1 | # | +-|----------------------|-+ +-|------------------------+ | # | | + $swp1.111 $swp3.111 + | | + $swp4.111 | | From b34f4de6d30cbaa8fed905a5080b6eace8c84dc7 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Jan 2024 16:04:21 +0100 Subject: [PATCH 86/88] selftests: mlxsw: qos_pfc: Adjust the test to support 8 lanes 'qos_pfc' test checks PFC behavior. The idea is to limit the traffic using a shaper somewhere in the flow of the packets. In this area, the buffer is smaller than the buffer at the beginning of the flow, so it fills up until there is no more space left. The test configures there PFC which is supposed to notice that the headroom is filling up and send PFC Xoff to indicate the transmitter to stop sending traffic for the priorities sharing this PG. The Xon/Xoff threshold is auto-configured and always equal to 2*(MTU rounded up to cell size). Even after sending the PFC Xoff packet, traffic will keep arriving until the transmitter receives and processes the PFC packet. This amount of traffic is known as the PFC delay allowance. Currently the buffer for the delay traffic is configured as 100KB. The MTU in the test is 10KB, therefore the threshold for Xoff is about 20KB. This allows 80KB extra to be stored in this buffer. 8-lane ports use two buffers among which the configured buffer is split, the Xoff threshold then applies to each buffer in parallel. The test does not take into account the behavior of 8-lane ports, when the ports are configured to 400Gbps with 8 lanes or 800Gbps with 8 lanes, packets are dropped and the test fails. Check if the relevant ports use 8 lanes, in such case double the size of the buffer, as the headroom is split half-half. Cc: Shuah Khan Fixes: bfa804784e32 ("selftests: mlxsw: Add a PFC test") Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/23ff11b7dff031eb04a41c0f5254a2b636cd8ebb.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/mlxsw/qos_pfc.sh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh index 49bef76083b8..0f0f4f05807c 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh @@ -119,6 +119,9 @@ h2_destroy() switch_create() { + local lanes_swp4 + local pg1_size + # pools # ----- @@ -228,7 +231,20 @@ switch_create() dcb pfc set dev $swp4 prio-pfc all:off 1:on # PG0 will get autoconfigured to Xoff, give PG1 arbitrarily 100K, which # is (-2*MTU) about 80K of delay provision. - dcb buffer set dev $swp4 buffer-size all:0 1:$_100KB + pg1_size=$_100KB + + setup_wait_dev_with_timeout $swp4 + + lanes_swp4=$(ethtool $swp4 | grep 'Lanes:') + lanes_swp4=${lanes_swp4#*"Lanes: "} + + # 8-lane ports use two buffers among which the configured buffer + # is split, so double the size to get twice (20K + 80K). + if [[ $lanes_swp4 -eq 8 ]]; then + pg1_size=$((pg1_size * 2)) + fi + + dcb buffer set dev $swp4 buffer-size all:0 1:$pg1_size # bridges # ------- From 2e7ef287f07c74985f1bf2858bedc62bd9ebf155 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Wed, 17 Jan 2024 09:21:02 -0800 Subject: [PATCH 87/88] ipv6: mcast: fix data-race in ipv6_mc_down / mld_ifc_work idev->mc_ifc_count can be written over without proper locking. Originally found by syzbot [1], fix this issue by encapsulating calls to mld_ifc_stop_work() (and mld_gq_stop_work() for good measure) with mutex_lock() and mutex_unlock() accordingly as these functions should only be called with mc_lock per their declarations. [1] BUG: KCSAN: data-race in ipv6_mc_down / mld_ifc_work write to 0xffff88813a80c832 of 1 bytes by task 3771 on cpu 0: mld_ifc_stop_work net/ipv6/mcast.c:1080 [inline] ipv6_mc_down+0x10a/0x280 net/ipv6/mcast.c:2725 addrconf_ifdown+0xe32/0xf10 net/ipv6/addrconf.c:3949 addrconf_notify+0x310/0x980 notifier_call_chain kernel/notifier.c:93 [inline] raw_notifier_call_chain+0x6b/0x1c0 kernel/notifier.c:461 __dev_notify_flags+0x205/0x3d0 dev_change_flags+0xab/0xd0 net/core/dev.c:8685 do_setlink+0x9f6/0x2430 net/core/rtnetlink.c:2916 rtnl_group_changelink net/core/rtnetlink.c:3458 [inline] __rtnl_newlink net/core/rtnetlink.c:3717 [inline] rtnl_newlink+0xbb3/0x1670 net/core/rtnetlink.c:3754 rtnetlink_rcv_msg+0x807/0x8c0 net/core/rtnetlink.c:6558 netlink_rcv_skb+0x126/0x220 net/netlink/af_netlink.c:2545 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:6576 netlink_unicast_kernel net/netlink/af_netlink.c:1342 [inline] netlink_unicast+0x589/0x650 net/netlink/af_netlink.c:1368 netlink_sendmsg+0x66e/0x770 net/netlink/af_netlink.c:1910 ... write to 0xffff88813a80c832 of 1 bytes by task 22 on cpu 1: mld_ifc_work+0x54c/0x7b0 net/ipv6/mcast.c:2653 process_one_work kernel/workqueue.c:2627 [inline] process_scheduled_works+0x5b8/0xa30 kernel/workqueue.c:2700 worker_thread+0x525/0x730 kernel/workqueue.c:2781 ... Fixes: 2d9a93b4902b ("mld: convert from timer to delayed work") Reported-by: syzbot+a9400cabb1d784e49abf@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000994e09060ebcdffb@google.com/ Signed-off-by: Nikita Zhandarovich Acked-by: Taehee Yoo Reviewed-by: Eric Dumazet Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20240117172102.12001-1-n.zhandarovich@fintech.ru Signed-off-by: Jakub Kicinski --- net/ipv6/mcast.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index b75d3c9d41bb..bc6e0a0bad3c 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2722,8 +2722,12 @@ void ipv6_mc_down(struct inet6_dev *idev) synchronize_net(); mld_query_stop_work(idev); mld_report_stop_work(idev); + + mutex_lock(&idev->mc_lock); mld_ifc_stop_work(idev); mld_gq_stop_work(idev); + mutex_unlock(&idev->mc_lock); + mld_dad_stop_work(idev); } From 9cfd3b502153810b66ac0ce47f1fba682228f2d2 Mon Sep 17 00:00:00 2001 From: Tony Nguyen Date: Wed, 17 Jan 2024 09:25:32 -0800 Subject: [PATCH 88/88] i40e: Include types.h to some headers Commit 56df345917c0 ("i40e: Remove circular header dependencies and fix headers") redistributed a number of includes from one large header file to the locations they were needed. In some environments, types.h is not included and causing compile issues. The driver should not rely on implicit inclusion from other locations; explicitly include it to these files. Snippet of issue. Entire log can be seen through the Closes: link. In file included from drivers/net/ethernet/intel/i40e/i40e_diag.h:7, from drivers/net/ethernet/intel/i40e/i40e_diag.c:4: drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h:33:9: error: unknown type name '__le16' 33 | __le16 flags; | ^~~~~~ drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h:34:9: error: unknown type name '__le16' 34 | __le16 opcode; | ^~~~~~ ... drivers/net/ethernet/intel/i40e/i40e_diag.h:22:9: error: unknown type name 'u32' 22 | u32 elements; /* number of elements if array */ | ^~~ drivers/net/ethernet/intel/i40e/i40e_diag.h:23:9: error: unknown type name 'u32' 23 | u32 stride; /* bytes between each element */ Reported-by: Martin Zaharinov Closes: https://lore.kernel.org/netdev/21BBD62A-F874-4E42-B347-93087EEA8126@gmail.com/ Fixes: 56df345917c0 ("i40e: Remove circular header dependencies and fix headers") Reviewed-by: Jesse Brandeburg Reviewed-by: Simon Horman Tested-by: Arpana Arland (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20240117172534.3555162-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h | 1 + drivers/net/ethernet/intel/i40e/i40e_diag.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index 18a1c3b6d72c..c8f35d4de271 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -5,6 +5,7 @@ #define _I40E_ADMINQ_CMD_H_ #include +#include /* This header file defines the i40e Admin Queue commands and is shared between * i40e Firmware and Software. diff --git a/drivers/net/ethernet/intel/i40e/i40e_diag.h b/drivers/net/ethernet/intel/i40e/i40e_diag.h index ece3a6b9a5c6..ab20202a3da3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_diag.h +++ b/drivers/net/ethernet/intel/i40e/i40e_diag.h @@ -4,6 +4,7 @@ #ifndef _I40E_DIAG_H_ #define _I40E_DIAG_H_ +#include #include "i40e_adminq_cmd.h" /* forward-declare the HW struct for the compiler */