From 0e8c52091633b354b12d0c29a27a22077584c111 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 12 Dec 2024 13:29:42 +0200 Subject: [PATCH 01/50] wifi: iwlwifi: fix CRF name for Bz We had BE201 hard coded. Look at the RF_ID and decide based on its value. Fixes: 6795a37161fb ("wifi: iwlwifi: Print a specific device name.") Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241212132940.b9eebda1ca60.I36791a134ed5e538e059418eb6520761da97b44c@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/cfg/bz.c | 1 + .../net/wireless/intel/iwlwifi/iwl-config.h | 1 + drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 41 +++++++++++++++++-- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/bz.c b/drivers/net/wireless/intel/iwlwifi/cfg/bz.c index cd1fe8490ae5..1c43f283ac4a 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/bz.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/bz.c @@ -161,6 +161,7 @@ const struct iwl_cfg_trans_params iwl_gl_trans_cfg = { const char iwl_bz_name[] = "Intel(R) TBD Bz device"; const char iwl_fm_name[] = "Intel(R) Wi-Fi 7 BE201 320MHz"; +const char iwl_wh_name[] = "Intel(R) Wi-Fi 7 BE211 320MHz"; const char iwl_gl_name[] = "Intel(R) Wi-Fi 7 BE200 320MHz"; const char iwl_mtp_name[] = "Intel(R) Wi-Fi 7 BE202 160MHz"; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index 34c91deca57b..17721bb47e25 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -545,6 +545,7 @@ extern const char iwl_ax231_name[]; extern const char iwl_ax411_name[]; extern const char iwl_bz_name[]; extern const char iwl_fm_name[]; +extern const char iwl_wh_name[]; extern const char iwl_gl_name[]; extern const char iwl_mtp_name[]; extern const char iwl_sc_name[]; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 805fb249a0c6..8fb2aa282242 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1106,18 +1106,53 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { iwlax210_2ax_cfg_so_jf_b0, iwl9462_name), /* Bz */ -/* FIXME: need to change the naming according to the actual CRF */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax201_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax211_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_fm_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_wh_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax201_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax211_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_fm_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, - iwl_cfg_bz, iwl_fm_name), + iwl_cfg_bz, iwl_wh_name), /* Ga (Gl) */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, From b83accfec0811421df065f820e73ca8df7f6439a Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 12 Dec 2024 08:27:05 -0800 Subject: [PATCH 02/50] MAINTAINERS: wifi: ath: add Jeff Johnson as maintainer The "ATHEROS ATH GENERIC UTILITIES" entry shares the same git tree as the ATH10K, ATH11K, and ATH12K entries which I already maintain, so add me to that entry as well. Signed-off-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241212-ath-maintainer-v1-1-7ea5e86780a8@kernel.org --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e6e71b05710b..158740a571b3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3608,6 +3608,7 @@ F: drivers/phy/qualcomm/phy-ath79-usb.c ATHEROS ATH GENERIC UTILITIES M: Kalle Valo +M: Jeff Johnson L: linux-wireless@vger.kernel.org S: Supported F: drivers/net/wireless/ath/* From cc0c53f4fac562efb3aca2bc493515e77642ae33 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Jun 2024 14:12:45 -0700 Subject: [PATCH 03/50] wifi: iwlwifi: mvm: Fix __counted_by usage in cfg80211_wowlan_nd_* Both struct cfg80211_wowlan_nd_match and struct cfg80211_wowlan_nd_info pre-allocate space for channels and matches, but then may end up using fewer that the full allocation. Shrink the associated counter (n_channels and n_matches) after counting the results. This avoids compile-time (and run-time) warnings from __counted_by. (The counter member needs to be updated _before_ accessing the array index.) Seen with coming GCC 15: drivers/net/wireless/intel/iwlwifi/mvm/d3.c: In function 'iwl_mvm_query_set_freqs': drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2877:66: warning: operation on 'match->n_channels' may be undefined [-Wsequence-point] 2877 | match->channels[match->n_channels++] = | ~~~~~~~~~~~~~~~~~^~ drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2885:66: warning: operation on 'match->n_channels' may be undefined [-Wsequence-point] 2885 | match->channels[match->n_channels++] = | ~~~~~~~~~~~~~~~~~^~ drivers/net/wireless/intel/iwlwifi/mvm/d3.c: In function 'iwl_mvm_query_netdetect_reasons': drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2982:58: warning: operation on 'net_detect->n_matches' may be undefined [-Wsequence-point] 2982 | net_detect->matches[net_detect->n_matches++] = match; | ~~~~~~~~~~~~~~~~~~~~~^~ Cc: stable@vger.kernel.org Fixes: aa4ec06c455d ("wifi: cfg80211: use __counted_by where appropriate") Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20240619211233.work.355-kees@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index f85c01e04ebf..7d973546c9fb 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -2954,6 +2954,7 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, int idx) { int i; + int n_channels = 0; if (fw_has_api(&mvm->fw->ucode_capa, IWL_UCODE_TLV_API_SCAN_OFFLOAD_CHANS)) { @@ -2962,7 +2963,7 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, for (i = 0; i < SCAN_OFFLOAD_MATCHING_CHANNELS_LEN * 8; i++) if (matches[idx].matching_channels[i / 8] & (BIT(i % 8))) - match->channels[match->n_channels++] = + match->channels[n_channels++] = mvm->nd_channels[i]->center_freq; } else { struct iwl_scan_offload_profile_match_v1 *matches = @@ -2970,9 +2971,11 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, for (i = 0; i < SCAN_OFFLOAD_MATCHING_CHANNELS_LEN_V1 * 8; i++) if (matches[idx].matching_channels[i / 8] & (BIT(i % 8))) - match->channels[match->n_channels++] = + match->channels[n_channels++] = mvm->nd_channels[i]->center_freq; } + /* We may have ended up with fewer channels than we allocated. */ + match->n_channels = n_channels; } /** @@ -3053,6 +3056,8 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, GFP_KERNEL); if (!net_detect || !n_matches) goto out_report_nd; + net_detect->n_matches = n_matches; + n_matches = 0; for_each_set_bit(i, &matched_profiles, mvm->n_nd_match_sets) { struct cfg80211_wowlan_nd_match *match; @@ -3066,8 +3071,9 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, GFP_KERNEL); if (!match) goto out_report_nd; + match->n_channels = n_channels; - net_detect->matches[net_detect->n_matches++] = match; + net_detect->matches[n_matches++] = match; /* We inverted the order of the SSIDs in the scan * request, so invert the index here. @@ -3082,6 +3088,8 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, iwl_mvm_query_set_freqs(mvm, d3_data->nd_results, match, i); } + /* We may have fewer matches than we allocated. */ + net_detect->n_matches = n_matches; out_report_nd: wakeup.net_detect = net_detect; From 146b6057e1fd28fb1a38d300bf76a38dfba7f9fb Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 17 Dec 2024 13:55:48 +0100 Subject: [PATCH 04/50] wifi: cw1200: Fix potential NULL dereference A recent refactoring was identified by smatch to cause another potential NULL dereference: drivers/net/wireless/st/cw1200/cw1200_spi.c:440 cw1200_spi_disconnect() error: we previously assumed 'self' could be null (see line 433) Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202411271742.Xa7CNVh1-lkp@intel.com/ Fixes: 2719a9e7156c ("wifi: cw1200: Convert to GPIO descriptors") Signed-off-by: Linus Walleij Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241217-cw1200-fix-v1-1-911e6b5823ec@linaro.org --- drivers/net/wireless/st/cw1200/cw1200_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/st/cw1200/cw1200_spi.c b/drivers/net/wireless/st/cw1200/cw1200_spi.c index 862964a8cc87..52386dfb5f4a 100644 --- a/drivers/net/wireless/st/cw1200/cw1200_spi.c +++ b/drivers/net/wireless/st/cw1200/cw1200_spi.c @@ -442,8 +442,8 @@ static void cw1200_spi_disconnect(struct spi_device *func) cw1200_core_release(self->core); self->core = NULL; } + cw1200_spi_off(self, dev_get_platdata(&func->dev)); } - cw1200_spi_off(self, dev_get_platdata(&func->dev)); } static int __maybe_unused cw1200_spi_suspend(struct device *dev) From 25c6a5ab151fb9c886552bf5aa7cbf2a5c6e96af Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 17 Dec 2024 14:35:00 +0800 Subject: [PATCH 05/50] net: phy: micrel: Dynamically control external clock of KSZ PHY On the i.MX6ULL-14x14-EVK board, enet1_ref and enet2_ref are used as the clock sources for two external KSZ PHYs. However, after closing the two FEC ports, the clk_enable_count of the enet1_ref and enet2_ref clocks is not 0. The root cause is that since the commit 985329462723 ("net: phy: micrel: use devm_clk_get_optional_enabled for the rmii-ref clock"), the external clock of KSZ PHY has been enabled when the PHY driver probes, and it can only be disabled when the PHY driver is removed. This causes the clock to continue working when the system is suspended or the network port is down. Although Heiko explained in the commit message that the patch was because some clock suppliers need to enable the clock to get the valid clock rate , it seems that the simple fix is to disable the clock after getting the clock rate to solve the current problem. This is indeed true, but we need to admit that Heiko's patch has been applied for more than a year, and we cannot guarantee whether there are platforms that only enable rmii-ref in the KSZ PHY driver during this period. If this is the case, disabling rmii-ref will cause RMII on these platforms to not work. Secondly, commit 99ac4cbcc2a5 ("net: phy: micrel: allow usage of generic ethernet-phy clock") just simply enables the generic clock permanently, which seems like the generic clock may only be enabled in the PHY driver. If we simply disable the generic clock, RMII may not work. If we keep it as it is, the platform using the generic clock will have the same problem as the i.MX6ULL platform. To solve this problem, the clock is enabled when phy_driver::resume() is called, and the clock is disabled when phy_driver::suspend() is called. Since phy_driver::resume() and phy_driver::suspend() are not called in pairs, an additional clk_enable flag is added. When phy_driver::suspend() is called, the clock is disabled only if clk_enable is true. Conversely, when phy_driver::resume() is called, the clock is enabled if clk_enable is false. The changes that introduced the problem were only a few lines, while the current fix is about a hundred lines, which seems out of proportion, but it is necessary because kszphy_probe() is used by multiple KSZ PHYs and we need to fix all of them. Fixes: 985329462723 ("net: phy: micrel: use devm_clk_get_optional_enabled for the rmii-ref clock") Fixes: 99ac4cbcc2a5 ("net: phy: micrel: allow usage of generic ethernet-phy clock") Signed-off-by: Wei Fang Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20241217063500.1424011-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 114 ++++++++++++++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 3ef508840674..eeb33eb181ac 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -432,10 +432,12 @@ struct kszphy_ptp_priv { struct kszphy_priv { struct kszphy_ptp_priv ptp_priv; const struct kszphy_type *type; + struct clk *clk; int led_mode; u16 vct_ctrl1000; bool rmii_ref_clk_sel; bool rmii_ref_clk_sel_val; + bool clk_enable; u64 stats[ARRAY_SIZE(kszphy_hw_stats)]; }; @@ -2050,6 +2052,46 @@ static void kszphy_get_stats(struct phy_device *phydev, data[i] = kszphy_get_stat(phydev, i); } +static void kszphy_enable_clk(struct phy_device *phydev) +{ + struct kszphy_priv *priv = phydev->priv; + + if (!priv->clk_enable && priv->clk) { + clk_prepare_enable(priv->clk); + priv->clk_enable = true; + } +} + +static void kszphy_disable_clk(struct phy_device *phydev) +{ + struct kszphy_priv *priv = phydev->priv; + + if (priv->clk_enable && priv->clk) { + clk_disable_unprepare(priv->clk); + priv->clk_enable = false; + } +} + +static int kszphy_generic_resume(struct phy_device *phydev) +{ + kszphy_enable_clk(phydev); + + return genphy_resume(phydev); +} + +static int kszphy_generic_suspend(struct phy_device *phydev) +{ + int ret; + + ret = genphy_suspend(phydev); + if (ret) + return ret; + + kszphy_disable_clk(phydev); + + return 0; +} + static int kszphy_suspend(struct phy_device *phydev) { /* Disable PHY Interrupts */ @@ -2059,7 +2101,7 @@ static int kszphy_suspend(struct phy_device *phydev) phydev->drv->config_intr(phydev); } - return genphy_suspend(phydev); + return kszphy_generic_suspend(phydev); } static void kszphy_parse_led_mode(struct phy_device *phydev) @@ -2090,7 +2132,9 @@ static int kszphy_resume(struct phy_device *phydev) { int ret; - genphy_resume(phydev); + ret = kszphy_generic_resume(phydev); + if (ret) + return ret; /* After switching from power-down to normal mode, an internal global * reset is automatically generated. Wait a minimum of 1 ms before @@ -2112,6 +2156,24 @@ static int kszphy_resume(struct phy_device *phydev) return 0; } +/* Because of errata DS80000700A, receiver error following software + * power down. Suspend and resume callbacks only disable and enable + * external rmii reference clock. + */ +static int ksz8041_resume(struct phy_device *phydev) +{ + kszphy_enable_clk(phydev); + + return 0; +} + +static int ksz8041_suspend(struct phy_device *phydev) +{ + kszphy_disable_clk(phydev); + + return 0; +} + static int ksz9477_resume(struct phy_device *phydev) { int ret; @@ -2159,7 +2221,10 @@ static int ksz8061_resume(struct phy_device *phydev) if (!(ret & BMCR_PDOWN)) return 0; - genphy_resume(phydev); + ret = kszphy_generic_resume(phydev); + if (ret) + return ret; + usleep_range(1000, 2000); /* Re-program the value after chip is reset. */ @@ -2177,6 +2242,11 @@ static int ksz8061_resume(struct phy_device *phydev) return 0; } +static int ksz8061_suspend(struct phy_device *phydev) +{ + return kszphy_suspend(phydev); +} + static int kszphy_probe(struct phy_device *phydev) { const struct kszphy_type *type = phydev->drv->driver_data; @@ -2217,10 +2287,14 @@ static int kszphy_probe(struct phy_device *phydev) } else if (!clk) { /* unnamed clock from the generic ethernet-phy binding */ clk = devm_clk_get_optional_enabled(&phydev->mdio.dev, NULL); - if (IS_ERR(clk)) - return PTR_ERR(clk); } + if (IS_ERR(clk)) + return PTR_ERR(clk); + + clk_disable_unprepare(clk); + priv->clk = clk; + if (ksz8041_fiber_mode(phydev)) phydev->port = PORT_FIBRE; @@ -5290,6 +5364,21 @@ static int lan8841_probe(struct phy_device *phydev) return 0; } +static int lan8804_resume(struct phy_device *phydev) +{ + return kszphy_resume(phydev); +} + +static int lan8804_suspend(struct phy_device *phydev) +{ + return kszphy_generic_suspend(phydev); +} + +static int lan8841_resume(struct phy_device *phydev) +{ + return kszphy_generic_resume(phydev); +} + static int lan8841_suspend(struct phy_device *phydev) { struct kszphy_priv *priv = phydev->priv; @@ -5298,7 +5387,7 @@ static int lan8841_suspend(struct phy_device *phydev) if (ptp_priv->ptp_clock) ptp_cancel_worker_sync(ptp_priv->ptp_clock); - return genphy_suspend(phydev); + return kszphy_generic_suspend(phydev); } static struct phy_driver ksphy_driver[] = { @@ -5358,9 +5447,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - /* No suspend/resume callbacks because of errata DS80000700A, - * receiver error following software power down. - */ + .suspend = ksz8041_suspend, + .resume = ksz8041_resume, }, { .phy_id = PHY_ID_KSZ8041RNLI, .phy_id_mask = MICREL_PHY_ID_MASK, @@ -5436,7 +5524,7 @@ static struct phy_driver ksphy_driver[] = { .soft_reset = genphy_soft_reset, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, - .suspend = kszphy_suspend, + .suspend = ksz8061_suspend, .resume = ksz8061_resume, }, { .phy_id = PHY_ID_KSZ9021, @@ -5507,8 +5595,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, - .resume = kszphy_resume, + .suspend = lan8804_suspend, + .resume = lan8804_resume, .config_intr = lan8804_config_intr, .handle_interrupt = lan8804_handle_interrupt, }, { @@ -5526,7 +5614,7 @@ static struct phy_driver ksphy_driver[] = { .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, .suspend = lan8841_suspend, - .resume = genphy_resume, + .resume = lan8841_resume, .cable_test_start = lan8814_cable_test_start, .cable_test_get_status = ksz886x_cable_test_get_status, }, { From 262bfba8ab820641c8cfbbf03b86d6c00242c078 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 17 Dec 2024 18:02:23 -0800 Subject: [PATCH 06/50] net: dsa: microchip: Fix KSZ9477 set_ageing_time function The aging count is not a simple 11-bit value but comprises a 3-bit multiplier and an 8-bit second count. The code tries to use the original multiplier which is 4 as the second count is still 300 seconds by default. Fixes: 2c119d9982b1 ("net: dsa: microchip: add the support for set_ageing_time") Signed-off-by: Tristram Ha Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218020224.70590-2-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz9477.c | 47 +++++++++++++++++++------ drivers/net/dsa/microchip/ksz9477_reg.h | 4 +-- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index d16817e0476f..29fe79ea74cd 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -2,7 +2,7 @@ /* * Microchip KSZ9477 switch driver main logic * - * Copyright (C) 2017-2019 Microchip Technology Inc. + * Copyright (C) 2017-2024 Microchip Technology Inc. */ #include @@ -983,26 +983,51 @@ void ksz9477_get_caps(struct ksz_device *dev, int port, int ksz9477_set_ageing_time(struct ksz_device *dev, unsigned int msecs) { u32 secs = msecs / 1000; - u8 value; - u8 data; + u8 data, mult, value; + u32 max_val; int ret; - value = FIELD_GET(SW_AGE_PERIOD_7_0_M, secs); +#define MAX_TIMER_VAL ((1 << 8) - 1) - ret = ksz_write8(dev, REG_SW_LUE_CTRL_3, value); - if (ret < 0) - return ret; + /* The aging timer comprises a 3-bit multiplier and an 8-bit second + * value. Either of them cannot be zero. The maximum timer is then + * 7 * 255 = 1785 seconds. + */ + if (!secs) + secs = 1; - data = FIELD_GET(SW_AGE_PERIOD_10_8_M, secs); + /* Return error if too large. */ + else if (secs > 7 * MAX_TIMER_VAL) + return -EINVAL; ret = ksz_read8(dev, REG_SW_LUE_CTRL_0, &value); if (ret < 0) return ret; - value &= ~SW_AGE_CNT_M; - value |= FIELD_PREP(SW_AGE_CNT_M, data); + /* Check whether there is need to update the multiplier. */ + mult = FIELD_GET(SW_AGE_CNT_M, value); + max_val = MAX_TIMER_VAL; + if (mult > 0) { + /* Try to use the same multiplier already in the register as + * the hardware default uses multiplier 4 and 75 seconds for + * 300 seconds. + */ + max_val = DIV_ROUND_UP(secs, mult); + if (max_val > MAX_TIMER_VAL || max_val * mult != secs) + max_val = MAX_TIMER_VAL; + } - return ksz_write8(dev, REG_SW_LUE_CTRL_0, value); + data = DIV_ROUND_UP(secs, max_val); + if (mult != data) { + value &= ~SW_AGE_CNT_M; + value |= FIELD_PREP(SW_AGE_CNT_M, data); + ret = ksz_write8(dev, REG_SW_LUE_CTRL_0, value); + if (ret < 0) + return ret; + } + + value = DIV_ROUND_UP(secs, data); + return ksz_write8(dev, REG_SW_LUE_CTRL_3, value); } void ksz9477_port_queue_split(struct ksz_device *dev, int port) diff --git a/drivers/net/dsa/microchip/ksz9477_reg.h b/drivers/net/dsa/microchip/ksz9477_reg.h index 04235c22bf40..ff579920078e 100644 --- a/drivers/net/dsa/microchip/ksz9477_reg.h +++ b/drivers/net/dsa/microchip/ksz9477_reg.h @@ -2,7 +2,7 @@ /* * Microchip KSZ9477 register definitions * - * Copyright (C) 2017-2018 Microchip Technology Inc. + * Copyright (C) 2017-2024 Microchip Technology Inc. */ #ifndef __KSZ9477_REGS_H @@ -165,8 +165,6 @@ #define SW_VLAN_ENABLE BIT(7) #define SW_DROP_INVALID_VID BIT(6) #define SW_AGE_CNT_M GENMASK(5, 3) -#define SW_AGE_CNT_S 3 -#define SW_AGE_PERIOD_10_8_M GENMASK(10, 8) #define SW_RESV_MCAST_ENABLE BIT(2) #define SW_HASH_OPTION_M 0x03 #define SW_HASH_OPTION_CRC 1 From bb9869043438af5b94230f94fb4c39206525d758 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 17 Dec 2024 18:02:24 -0800 Subject: [PATCH 07/50] net: dsa: microchip: Fix LAN937X set_ageing_time function The aging count is not a simple 20-bit value but comprises a 3-bit multiplier and a 20-bit second time. The code tries to use the original multiplier which is 4 as the second count is still 300 seconds by default. As the 20-bit number is now too large for practical use there is an option to interpret it as microseconds instead of seconds. Fixes: 2c119d9982b1 ("net: dsa: microchip: add the support for set_ageing_time") Signed-off-by: Tristram Ha Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218020224.70590-3-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/lan937x_main.c | 62 ++++++++++++++++++++++-- drivers/net/dsa/microchip/lan937x_reg.h | 9 ++-- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/microchip/lan937x_main.c b/drivers/net/dsa/microchip/lan937x_main.c index b7652efd632e..b1ae3b9de3d1 100644 --- a/drivers/net/dsa/microchip/lan937x_main.c +++ b/drivers/net/dsa/microchip/lan937x_main.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Microchip LAN937X switch driver main logic - * Copyright (C) 2019-2022 Microchip Technology Inc. + * Copyright (C) 2019-2024 Microchip Technology Inc. */ #include #include @@ -461,10 +461,66 @@ int lan937x_change_mtu(struct ksz_device *dev, int port, int new_mtu) int lan937x_set_ageing_time(struct ksz_device *dev, unsigned int msecs) { - u32 secs = msecs / 1000; - u32 value; + u8 data, mult, value8; + bool in_msec = false; + u32 max_val, value; + u32 secs = msecs; int ret; +#define MAX_TIMER_VAL ((1 << 20) - 1) + + /* The aging timer comprises a 3-bit multiplier and a 20-bit second + * value. Either of them cannot be zero. The maximum timer is then + * 7 * 1048575 = 7340025 seconds. As this value is too large for + * practical use it can be interpreted as microseconds, making the + * maximum timer 7340 seconds with finer control. This allows for + * maximum 122 minutes compared to 29 minutes in KSZ9477 switch. + */ + if (msecs % 1000) + in_msec = true; + else + secs /= 1000; + if (!secs) + secs = 1; + + /* Return error if too large. */ + else if (secs > 7 * MAX_TIMER_VAL) + return -EINVAL; + + /* Configure how to interpret the number value. */ + ret = ksz_rmw8(dev, REG_SW_LUE_CTRL_2, SW_AGE_CNT_IN_MICROSEC, + in_msec ? SW_AGE_CNT_IN_MICROSEC : 0); + if (ret < 0) + return ret; + + ret = ksz_read8(dev, REG_SW_LUE_CTRL_0, &value8); + if (ret < 0) + return ret; + + /* Check whether there is need to update the multiplier. */ + mult = FIELD_GET(SW_AGE_CNT_M, value8); + max_val = MAX_TIMER_VAL; + if (mult > 0) { + /* Try to use the same multiplier already in the register as + * the hardware default uses multiplier 4 and 75 seconds for + * 300 seconds. + */ + max_val = DIV_ROUND_UP(secs, mult); + if (max_val > MAX_TIMER_VAL || max_val * mult != secs) + max_val = MAX_TIMER_VAL; + } + + data = DIV_ROUND_UP(secs, max_val); + if (mult != data) { + value8 &= ~SW_AGE_CNT_M; + value8 |= FIELD_PREP(SW_AGE_CNT_M, data); + ret = ksz_write8(dev, REG_SW_LUE_CTRL_0, value8); + if (ret < 0) + return ret; + } + + secs = DIV_ROUND_UP(secs, data); + value = FIELD_GET(SW_AGE_PERIOD_7_0_M, secs); ret = ksz_write8(dev, REG_SW_AGE_PERIOD__1, value); diff --git a/drivers/net/dsa/microchip/lan937x_reg.h b/drivers/net/dsa/microchip/lan937x_reg.h index 4ec93e421da4..72042fd64e5b 100644 --- a/drivers/net/dsa/microchip/lan937x_reg.h +++ b/drivers/net/dsa/microchip/lan937x_reg.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Microchip LAN937X switch register definitions - * Copyright (C) 2019-2021 Microchip Technology Inc. + * Copyright (C) 2019-2024 Microchip Technology Inc. */ #ifndef __LAN937X_REG_H #define __LAN937X_REG_H @@ -56,8 +56,7 @@ #define SW_VLAN_ENABLE BIT(7) #define SW_DROP_INVALID_VID BIT(6) -#define SW_AGE_CNT_M 0x7 -#define SW_AGE_CNT_S 3 +#define SW_AGE_CNT_M GENMASK(5, 3) #define SW_RESV_MCAST_ENABLE BIT(2) #define REG_SW_LUE_CTRL_1 0x0311 @@ -70,6 +69,10 @@ #define SW_FAST_AGING BIT(1) #define SW_LINK_AUTO_AGING BIT(0) +#define REG_SW_LUE_CTRL_2 0x0312 + +#define SW_AGE_CNT_IN_MICROSEC BIT(7) + #define REG_SW_AGE_PERIOD__1 0x0313 #define SW_AGE_PERIOD_7_0_M GENMASK(7, 0) From 6321f5fb70d502d95de8a212a7b484c297ec9644 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:11 -0800 Subject: [PATCH 08/50] gve: clean XDP queues in gve_tx_stop_ring_gqi When stopping XDP TX rings, the XDP clean function needs to be called to clean out the entire queue, similar to what happens in the normal TX queue case. Otherwise, the FIFO won't be cleared correctly, and xsk_tx_completed won't be reported. Fixes: 75eaae158b1b ("gve: Add XDP DROP and TX support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_tx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index e7fb7d6d283d..83ad278ec91f 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -206,7 +206,10 @@ void gve_tx_stop_ring_gqi(struct gve_priv *priv, int idx) return; gve_remove_napi(priv, ntfy_idx); - gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); + if (tx->q_num < priv->tx_cfg.num_queues) + gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); + else + gve_clean_xdp_done(priv, tx, priv->tx_desc_cnt); netdev_tx_reset_queue(tx->netdev_txq); gve_tx_remove_from_block(priv, idx); } From ff7c2dea9dd1a436fc79d6273adffdcc4a7ffea3 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:12 -0800 Subject: [PATCH 09/50] gve: guard XDP xmit NDO on existence of xdp queues In GVE, dedicated XDP queues only exist when an XDP program is installed and the interface is up. As such, the NDO XDP XMIT callback should return early if either of these conditions are false. In the case of no loaded XDP program, priv->num_xdp_queues=0 which can cause a divide-by-zero error, and in the case of interface down, num_xdp_queues remains untouched to persist XDP queue count for the next interface up, but the TX pointer itself would be NULL. The XDP xmit callback also needs to synchronize with a device transitioning from open to close. This synchronization will happen via the GVE_PRIV_FLAGS_NAPI_ENABLED bit along with a synchronize_net() call, which waits for any RCU critical sections at call-time to complete. Fixes: 39a7f4aa3e4a ("gve: Add XDP REDIRECT support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 3 +++ drivers/net/ethernet/google/gve/gve_tx.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index e171ca248f9a..5d7b0cc59959 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1899,6 +1899,9 @@ static void gve_turndown(struct gve_priv *priv) gve_clear_napi_enabled(priv); gve_clear_report_stats(priv); + + /* Make sure that all traffic is finished processing. */ + synchronize_net(); } static void gve_turnup(struct gve_priv *priv) diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 83ad278ec91f..852f8c7e39d2 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -837,9 +837,12 @@ int gve_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, struct gve_tx_ring *tx; int i, err = 0, qid; - if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK) || !priv->xdp_prog) return -EINVAL; + if (!gve_get_napi_enabled(priv)) + return -ENETDOWN; + qid = gve_xdp_tx_queue_id(priv, smp_processor_id() % priv->num_xdp_queues); From 40338d7987d810fcaa95c500b1068a52b08eec9b Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:13 -0800 Subject: [PATCH 10/50] gve: guard XSK operations on the existence of queues This patch predicates the enabling and disabling of XSK pools on the existence of queues. As it stands, if the interface is down, disabling or enabling XSK pools would result in a crash, as the RX queue pointer would be NULL. XSK pool registration will occur as part of the next interface up. Similarly, xsk_wakeup needs be guarded against queues disappearing while the function is executing, so a check against the GVE_PRIV_FLAGS_NAPI_ENABLED flag is added to synchronize with the disabling of the bit and the synchronize_net() in gve_turndown. Fixes: fd8e40321a12 ("gve: Add AF_XDP zero-copy support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Reviewed-by: Larysa Zaremba Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 5d7b0cc59959..e4e8ff4f9f80 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1623,8 +1623,8 @@ static int gve_xsk_pool_enable(struct net_device *dev, if (err) return err; - /* If XDP prog is not installed, return */ - if (!priv->xdp_prog) + /* If XDP prog is not installed or interface is down, return. */ + if (!priv->xdp_prog || !netif_running(dev)) return 0; rx = &priv->rx[qid]; @@ -1669,21 +1669,16 @@ static int gve_xsk_pool_disable(struct net_device *dev, if (qid >= priv->rx_cfg.num_queues) return -EINVAL; - /* If XDP prog is not installed, unmap DMA and return */ - if (!priv->xdp_prog) + /* If XDP prog is not installed or interface is down, unmap DMA and + * return. + */ + if (!priv->xdp_prog || !netif_running(dev)) goto done; - tx_qid = gve_xdp_tx_queue_id(priv, qid); - if (!netif_running(dev)) { - priv->rx[qid].xsk_pool = NULL; - xdp_rxq_info_unreg(&priv->rx[qid].xsk_rxq); - priv->tx[tx_qid].xsk_pool = NULL; - goto done; - } - napi_rx = &priv->ntfy_blocks[priv->rx[qid].ntfy_id].napi; napi_disable(napi_rx); /* make sure current rx poll is done */ + tx_qid = gve_xdp_tx_queue_id(priv, qid); napi_tx = &priv->ntfy_blocks[priv->tx[tx_qid].ntfy_id].napi; napi_disable(napi_tx); /* make sure current tx poll is done */ @@ -1711,6 +1706,9 @@ static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) struct gve_priv *priv = netdev_priv(dev); int tx_queue_id = gve_xdp_tx_queue_id(priv, queue_id); + if (!gve_get_napi_enabled(priv)) + return -ENETDOWN; + if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog) return -EINVAL; From ba0925c34e0fa6fe02d3d642bc02ab099ab312c7 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:14 -0800 Subject: [PATCH 11/50] gve: process XSK TX descriptors as part of RX NAPI When busy polling is enabled, xsk_sendmsg for AF_XDP zero copy marks the NAPI ID corresponding to the memory pool allocated for the socket. In GVE, this NAPI ID will never correspond to a NAPI ID of one of the dedicated XDP TX queues registered with the umem because XDP TX is not set up to share a NAPI with a corresponding RX queue. This patch moves XSK TX descriptor processing from the TX NAPI to the RX NAPI, and the gve_xsk_wakeup callback is updated to use the RX NAPI instead of the TX NAPI, accordingly. The branch on if the wakeup is for TX is removed, as the NAPI poll should be invoked whether the wakeup is for TX or for RX. Fixes: fd8e40321a12 ("gve: Add AF_XDP zero-copy support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Praveen Kaligineedi Signed-off-by: Joshua Washington Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve.h | 1 + drivers/net/ethernet/google/gve/gve_main.c | 8 +++++ drivers/net/ethernet/google/gve/gve_tx.c | 36 +++++++++++++--------- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index dd92949bb214..8167cc5fb0df 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1140,6 +1140,7 @@ int gve_xdp_xmit_one(struct gve_priv *priv, struct gve_tx_ring *tx, void gve_xdp_tx_flush(struct gve_priv *priv, u32 xdp_qid); bool gve_tx_poll(struct gve_notify_block *block, int budget); bool gve_xdp_poll(struct gve_notify_block *block, int budget); +int gve_xsk_tx_poll(struct gve_notify_block *block, int budget); int gve_tx_alloc_rings_gqi(struct gve_priv *priv, struct gve_tx_alloc_rings_cfg *cfg); void gve_tx_free_rings_gqi(struct gve_priv *priv, diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index e4e8ff4f9f80..5cab7b88610f 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -333,6 +333,14 @@ int gve_napi_poll(struct napi_struct *napi, int budget) if (block->rx) { work_done = gve_rx_poll(block, budget); + + /* Poll XSK TX as part of RX NAPI. Setup re-poll based on max of + * TX and RX work done. + */ + if (priv->xdp_prog) + work_done = max_t(int, work_done, + gve_xsk_tx_poll(block, budget)); + reschedule |= work_done == budget; } diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 852f8c7e39d2..4350ebd9c2bd 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -981,33 +981,41 @@ out: return sent; } +int gve_xsk_tx_poll(struct gve_notify_block *rx_block, int budget) +{ + struct gve_rx_ring *rx = rx_block->rx; + struct gve_priv *priv = rx->gve; + struct gve_tx_ring *tx; + int sent = 0; + + tx = &priv->tx[gve_xdp_tx_queue_id(priv, rx->q_num)]; + if (tx->xsk_pool) { + sent = gve_xsk_tx(priv, tx, budget); + + u64_stats_update_begin(&tx->statss); + tx->xdp_xsk_sent += sent; + u64_stats_update_end(&tx->statss); + if (xsk_uses_need_wakeup(tx->xsk_pool)) + xsk_set_tx_need_wakeup(tx->xsk_pool); + } + + return sent; +} + bool gve_xdp_poll(struct gve_notify_block *block, int budget) { struct gve_priv *priv = block->priv; struct gve_tx_ring *tx = block->tx; u32 nic_done; - bool repoll; u32 to_do; /* Find out how much work there is to be done */ nic_done = gve_tx_load_event_counter(priv, tx); to_do = min_t(u32, (nic_done - tx->done), budget); gve_clean_xdp_done(priv, tx, to_do); - repoll = nic_done != tx->done; - - if (tx->xsk_pool) { - int sent = gve_xsk_tx(priv, tx, budget); - - u64_stats_update_begin(&tx->statss); - tx->xdp_xsk_sent += sent; - u64_stats_update_end(&tx->statss); - repoll |= (sent == budget); - if (xsk_uses_need_wakeup(tx->xsk_pool)) - xsk_set_tx_need_wakeup(tx->xsk_pool); - } /* If we still have work we want to repoll */ - return repoll; + return nic_done != tx->done; } bool gve_tx_poll(struct gve_notify_block *block, int budget) From de63ac44a527b2c5067551dbd70d939fe151325a Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:15 -0800 Subject: [PATCH 12/50] gve: fix XDP allocation path in edge cases This patch fixes a number of consistency issues in the queue allocation path related to XDP. As it stands, the number of allocated XDP queues changes in three different scenarios. 1) Adding an XDP program while the interface is up via gve_add_xdp_queues 2) Removing an XDP program while the interface is up via gve_remove_xdp_queues 3) After queues have been allocated and the old queue memory has been removed in gve_queues_start. However, the requirement for the interface to be up for gve_(add|remove)_xdp_queues to be called, in conjunction with the fact that the number of queues stored in priv isn't updated until _after_ XDP queues have been allocated in the normal queue allocation path means that if an XDP program is added while the interface is down, XDP queues won't be added until the _second_ if_up, not the first. Given the expectation that the number of XDP queues is equal to the number of RX queues, scenario (3) has another problematic implication. When changing the number of queues while an XDP program is loaded, the number of XDP queues must be updated as well, as there is logic in the driver (gve_xdp_tx_queue_id()) which relies on every RX queue having a corresponding XDP TX queue. However, the number of XDP queues stored in priv would not be updated until _after_ a close/open leading to a mismatch in the number of XDP queues reported vs the number of XDP queues which actually exist after the queue count update completes. This patch remedies these issues by doing the following: 1) The allocation config getter function is set up to retrieve the _expected_ number of XDP queues to allocate instead of relying on the value stored in `priv` which is only updated once the queues have been allocated. 2) When adjusting queues, XDP queues are adjusted to match the number of RX queues when XDP is enabled. This only works in the case when queues are live, so part (1) of the fix must still be available in the case that queues are adjusted when there is an XDP program and the interface is down. Fixes: 5f08cd3d6423 ("gve: Alloc before freeing when adjusting queues") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 5cab7b88610f..09fb7f16f73e 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -930,11 +930,13 @@ static void gve_init_sync_stats(struct gve_priv *priv) static void gve_tx_get_curr_alloc_cfg(struct gve_priv *priv, struct gve_tx_alloc_rings_cfg *cfg) { + int num_xdp_queues = priv->xdp_prog ? priv->rx_cfg.num_queues : 0; + cfg->qcfg = &priv->tx_cfg; cfg->raw_addressing = !gve_is_qpl(priv); cfg->ring_size = priv->tx_desc_cnt; cfg->start_idx = 0; - cfg->num_rings = gve_num_tx_queues(priv); + cfg->num_rings = priv->tx_cfg.num_queues + num_xdp_queues; cfg->tx = priv->tx; } @@ -1843,6 +1845,7 @@ int gve_adjust_queues(struct gve_priv *priv, { struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0}; struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0}; + int num_xdp_queues; int err; gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg); @@ -1853,6 +1856,10 @@ int gve_adjust_queues(struct gve_priv *priv, rx_alloc_cfg.qcfg = &new_rx_config; tx_alloc_cfg.num_rings = new_tx_config.num_queues; + /* Add dedicated XDP TX queues if enabled. */ + num_xdp_queues = priv->xdp_prog ? new_rx_config.num_queues : 0; + tx_alloc_cfg.num_rings += num_xdp_queues; + if (netif_running(priv->dev)) { err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg); return err; From 246068b86b1c36e4590388ab8f278e21f1997dc1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 19 Dec 2024 17:54:10 +0200 Subject: [PATCH 13/50] selftests: net: local_termination: require mausezahn Since the blamed commit, we require mausezahn because send_raw() uses it. Remove the "REQUIRE_MZ=no" line, which overwrites the default of requiring it. Fixes: 237979504264 ("selftests: net: local_termination: add PTP frames to the mix") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241219155410.1856868-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/local_termination.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index c35548767756..ecd34f364125 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -7,7 +7,6 @@ ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge test_vlan \ NUM_NETIFS=2 PING_COUNT=1 REQUIRE_MTOOLS=yes -REQUIRE_MZ=no source lib.sh From 4a25201aa46ce88e8e31f9ccdec0e4e3dd6bb736 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Dec 2024 19:28:32 -0800 Subject: [PATCH 14/50] netdev-genl: avoid empty messages in napi get Empty netlink responses from do() are not correct (as opposed to dump() where not dumping anything is perfectly fine). We should return an error if the target object does not exist, in this case if the netdev is down we "hide" the NAPI instances. Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241219032833.1165433-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 2d3ae0cd3ad2..b0772d135efb 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -246,8 +246,12 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rcu_read_unlock(); rtnl_unlock(); - if (err) + if (err) { goto err_free_msg; + } else if (!rsp->len) { + err = -ENOENT; + goto err_free_msg; + } return genlmsg_reply(rsp, info); From 30b981796b94b083da8fdded7cb74cb493608760 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Dec 2024 19:28:33 -0800 Subject: [PATCH 15/50] selftests: drv-net: test empty queue and NAPI responses in netlink Make sure kernel doesn't respond to GETs for queues and NAPIs when link is down. Not with valid data, or with empty message, we want a ENOENT. Link: https://patch.msgid.link/20241219032833.1165433-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/queues.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 9c5473abbd78..38303da957ee 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_run, ksft_exit, ksft_eq, KsftSkipEx -from lib.py import EthtoolFamily, NetdevFamily +from lib.py import ksft_disruptive, ksft_exit, ksft_run +from lib.py import ksft_eq, ksft_raises, KsftSkipEx +from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import NetDrvEnv -from lib.py import cmd +from lib.py import cmd, defer, ip +import errno import glob @@ -59,9 +61,27 @@ def addremove_queues(cfg, nl) -> None: ksft_eq(queues, expected) +@ksft_disruptive +def check_down(cfg, nl) -> None: + # Check the NAPI IDs before interface goes down and hides them + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + + ip(f"link set dev {cfg.dev['ifname']} down") + defer(ip, f"link set dev {cfg.dev['ifname']} up") + + with ksft_raises(NlError) as cm: + nl.queue_get({'ifindex': cfg.ifindex, 'id': 0, 'type': 'rx'}) + ksft_eq(cm.exception.nl_msg.error, -errno.ENOENT) + + if napis: + with ksft_raises(NlError) as cm: + nl.napi_get({'id': napis[0]['id']}) + ksft_eq(cm.exception.nl_msg.error, -errno.ENOENT) + + def main() -> None: with NetDrvEnv(__file__, queue_count=100) as cfg: - ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) + ksft_run([get_queues, addremove_queues, check_down], args=(cfg, NetdevFamily())) ksft_exit() From a072ffd896efa6a6c8a0334c712fbc98a63c789c Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Wed, 18 Dec 2024 15:25:58 -0800 Subject: [PATCH 16/50] eth: fbnic: fix csr boundary for RPM RAM section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CSR dump support leverages the FBNIC_BOUNDS macro, which pads the end condition for each section by adding an offset of 1. However, the RPC RAM section, which is dumped differently from other sections, does not rely on this macro and instead directly uses end boundary address. Hence, subtracting 1 from the end address results in skipping a register. Fixes 3d12862b216d (“eth: fbnic: Add support to dump registers”) Signed-off-by: Mohsin Bashir Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20241218232614.439329-1-mohsin.bashr@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/fbnic_csr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.c b/drivers/net/ethernet/meta/fbnic/fbnic_csr.c index 2118901b25e9..aeb9f333f4c7 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.c @@ -64,7 +64,7 @@ static void fbnic_csr_get_regs_rpc_ram(struct fbnic_dev *fbd, u32 **data_p) u32 i, j; *(data++) = start; - *(data++) = end - 1; + *(data++) = end; /* FBNIC_RPC_TCAM_ACT */ for (i = 0; i < FBNIC_RPC_TCAM_ACT_NUM_ENTRIES; i++) { From 2b6ffcd7873b7e8a62c3e15a6f305bfc747c466b Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Thu, 19 Dec 2024 11:41:19 +0900 Subject: [PATCH 17/50] net: stmmac: restructure the error path of stmmac_probe_config_dt() Current implementation of stmmac_probe_config_dt() does not release the OF node reference obtained by of_parse_phandle() in some error paths. The problem is that some error paths call stmmac_remove_config_dt() to clean up but others use and unwind ladder. These two types of error handling have not kept in sync and have been a recurring source of bugs. Re-write the error handling in stmmac_probe_config_dt() to use an unwind ladder. Consequently, stmmac_remove_config_dt() is not needed anymore, thus remove it. This bug was found by an experimental verification tool that I am developing. Fixes: 4838a5405028 ("net: stmmac: Fix wrapper drivers not detecting PHY") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241219024119.2017012-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/stmmac_platform.c | 43 ++++++++----------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 3ac32444e492..dc9884130b91 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -405,22 +405,6 @@ static int stmmac_of_get_mac_mode(struct device_node *np) return -ENODEV; } -/** - * stmmac_remove_config_dt - undo the effects of stmmac_probe_config_dt() - * @pdev: platform_device structure - * @plat: driver data platform structure - * - * Release resources claimed by stmmac_probe_config_dt(). - */ -static void stmmac_remove_config_dt(struct platform_device *pdev, - struct plat_stmmacenet_data *plat) -{ - clk_disable_unprepare(plat->stmmac_clk); - clk_disable_unprepare(plat->pclk); - of_node_put(plat->phy_node); - of_node_put(plat->mdio_node); -} - /** * stmmac_probe_config_dt - parse device-tree driver parameters * @pdev: platform_device structure @@ -490,8 +474,10 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dev_warn(&pdev->dev, "snps,phy-addr property is deprecated\n"); rc = stmmac_mdio_setup(plat, np, &pdev->dev); - if (rc) - return ERR_PTR(rc); + if (rc) { + ret = ERR_PTR(rc); + goto error_put_phy; + } of_property_read_u32(np, "tx-fifo-depth", &plat->tx_fifo_size); @@ -581,8 +567,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg), GFP_KERNEL); if (!dma_cfg) { - stmmac_remove_config_dt(pdev, plat); - return ERR_PTR(-ENOMEM); + ret = ERR_PTR(-ENOMEM); + goto error_put_mdio; } plat->dma_cfg = dma_cfg; @@ -610,8 +596,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) rc = stmmac_mtl_setup(pdev, plat); if (rc) { - stmmac_remove_config_dt(pdev, plat); - return ERR_PTR(rc); + ret = ERR_PTR(rc); + goto error_put_mdio; } /* clock setup */ @@ -663,6 +649,10 @@ error_hw_init: clk_disable_unprepare(plat->pclk); error_pclk_get: clk_disable_unprepare(plat->stmmac_clk); +error_put_mdio: + of_node_put(plat->mdio_node); +error_put_phy: + of_node_put(plat->phy_node); return ret; } @@ -671,16 +661,17 @@ static void devm_stmmac_remove_config_dt(void *data) { struct plat_stmmacenet_data *plat = data; - /* Platform data argument is unused */ - stmmac_remove_config_dt(NULL, plat); + clk_disable_unprepare(plat->stmmac_clk); + clk_disable_unprepare(plat->pclk); + of_node_put(plat->mdio_node); + of_node_put(plat->phy_node); } /** * devm_stmmac_probe_config_dt * @pdev: platform_device structure * @mac: MAC address to use - * Description: Devres variant of stmmac_probe_config_dt(). Does not require - * the user to call stmmac_remove_config_dt() at driver detach. + * Description: Devres variant of stmmac_probe_config_dt(). */ struct plat_stmmacenet_data * devm_stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) From 4f4aa4aa28142d53f8b06585c478476cfe325cfc Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Thu, 19 Dec 2024 15:28:59 +0800 Subject: [PATCH 18/50] net: fix memory leak in tcp_conn_request() If inet_csk_reqsk_queue_hash_add() return false, tcp_conn_request() will return without free the dst memory, which allocated in af_ops->route_req. Here is the kmemleak stack: unreferenced object 0xffff8881198631c0 (size 240): comm "softirq", pid 0, jiffies 4299266571 (age 1802.392s) hex dump (first 32 bytes): 00 10 9b 03 81 88 ff ff 80 98 da bc ff ff ff ff ................ 81 55 18 bb ff ff ff ff 00 00 00 00 00 00 00 00 .U.............. backtrace: [] kmem_cache_alloc+0x60c/0xa80 [] dst_alloc+0x55/0x250 [] rt_dst_alloc+0x46/0x1d0 [] __mkroute_output+0x29a/0xa50 [] ip_route_output_key_hash+0x10b/0x240 [] ip_route_output_flow+0x1d/0x90 [] inet_csk_route_req+0x2c5/0x500 [] tcp_conn_request+0x691/0x12c0 [] tcp_rcv_state_process+0x3c8/0x11b0 [] tcp_v4_do_rcv+0x156/0x3b0 [] tcp_v4_rcv+0x1cf8/0x1d80 [] ip_protocol_deliver_rcu+0xf6/0x360 [] ip_local_deliver_finish+0xe6/0x1e0 [] ip_local_deliver+0xee/0x360 [] ip_rcv+0xad/0x2f0 [] __netif_receive_skb_one_core+0x123/0x140 Call dst_release() to free the dst memory when inet_csk_reqsk_queue_hash_add() return false in tcp_conn_request(). Fixes: ff46e3b44219 ("Fix race for duplicate reqsk on identical SYN") Signed-off-by: Wang Liang Link: https://patch.msgid.link/20241219072859.3783576-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bdf13ac26ef..4811727b8a02 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7328,6 +7328,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, req->timeout))) { reqsk_free(req); + dst_release(dst); return 0; } From b5a7b661a073727219fedc35f5619f62418ffe72 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Thu, 19 Dec 2024 21:03:36 +0800 Subject: [PATCH 19/50] net: Fix netns for ip_tunnel_init_flow() The device denoted by tunnel->parms.link resides in the underlay net namespace. Therefore pass tunnel->net to ip_tunnel_init_flow(). Fixes: db53cd3d88dc ("net: Handle l3mdev in ip_tunnel_init_flow") Signed-off-by: Xiao Liang Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241219130336.103839-1-shaw.leon@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 3 +-- net/ipv4/ip_tunnel.c | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 4b5fd71c897d..32d2e61f2b82 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -423,8 +423,7 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev, parms = mlxsw_sp_ipip_netdev_parms4(to_dev); ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp, - 0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0, - 0); + 0, 0, tun->net, parms.link, tun->fwmark, 0, 0); rt = ip_route_output_key(tun->net, &fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 25505f9b724c..09b73acf037a 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -294,7 +294,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, - iph->tos & INET_DSCP_MASK, dev_net(dev), + iph->tos & INET_DSCP_MASK, tunnel->net, tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); @@ -611,7 +611,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), - tos & INET_DSCP_MASK, dev_net(dev), 0, skb->mark, + tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark, skb_get_hash(skb), key->flow_flags); if (!tunnel_hlen) @@ -774,7 +774,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, tos & INET_DSCP_MASK, - dev_net(dev), READ_ONCE(tunnel->parms.link), + tunnel->net, READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) From a4fd163aed2edd967a244499754dec991d8b4c7d Mon Sep 17 00:00:00 2001 From: Ilya Shchipletsov Date: Thu, 19 Dec 2024 08:23:07 +0000 Subject: [PATCH 20/50] netrom: check buffer length before accessing it Syzkaller reports an uninit value read from ax25cmp when sending raw message through ieee802154 implementation. ===================================================== BUG: KMSAN: uninit-value in ax25cmp+0x3a5/0x460 net/ax25/ax25_addr.c:119 ax25cmp+0x3a5/0x460 net/ax25/ax25_addr.c:119 nr_dev_get+0x20e/0x450 net/netrom/nr_route.c:601 nr_route_frame+0x1a2/0xfc0 net/netrom/nr_route.c:774 nr_xmit+0x5a/0x1c0 net/netrom/nr_dev.c:144 __netdev_start_xmit include/linux/netdevice.h:4940 [inline] netdev_start_xmit include/linux/netdevice.h:4954 [inline] xmit_one net/core/dev.c:3548 [inline] dev_hard_start_xmit+0x247/0xa10 net/core/dev.c:3564 __dev_queue_xmit+0x33b8/0x5130 net/core/dev.c:4349 dev_queue_xmit include/linux/netdevice.h:3134 [inline] raw_sendmsg+0x654/0xc10 net/ieee802154/socket.c:299 ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x318/0x740 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6334 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2780 sock_alloc_send_skb include/net/sock.h:1884 [inline] raw_sendmsg+0x36d/0xc10 net/ieee802154/socket.c:282 ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 0 PID: 5037 Comm: syz-executor166 Not tainted 6.7.0-rc7-syzkaller-00003-gfbafc3e621c3 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 ===================================================== This issue occurs because the skb buffer is too small, and it's actual allocation is aligned. This hides an actual issue, which is that nr_route_frame does not validate the buffer size before using it. Fix this issue by checking skb->len before accessing any fields in skb->data. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Co-developed-by: Nikita Marushkin Signed-off-by: Nikita Marushkin Signed-off-by: Ilya Shchipletsov Link: https://patch.msgid.link/20241219082308.3942-1-rabbelkin@mail.ru Signed-off-by: Jakub Kicinski --- net/netrom/nr_route.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 2b5e246b8d9a..b94cb2ffbaf8 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -754,6 +754,12 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) int ret; struct sk_buff *skbn; + /* + * Reject malformed packets early. Check that it contains at least 2 + * addresses and 1 byte more for Time-To-Live + */ + if (skb->len < 2 * sizeof(ax25_address) + 1) + return 0; nr_src = (ax25_address *)(skb->data + 0); nr_dest = (ax25_address *)(skb->data + 7); From 4e86729d1ff329815a6e8a920cb554a1d4cb5b8d Mon Sep 17 00:00:00 2001 From: Nikolay Kuratov Date: Thu, 19 Dec 2024 19:21:14 +0300 Subject: [PATCH 21/50] net/sctp: Prevent autoclose integer overflow in sctp_association_init() While by default max_autoclose equals to INT_MAX / HZ, one may set net.sctp.max_autoclose to UINT_MAX. There is code in sctp_association_init() that can consequently trigger overflow. Cc: stable@vger.kernel.org Fixes: 9f70f46bd4c7 ("sctp: properly latch and use autoclose value from sock to association") Signed-off-by: Nikolay Kuratov Acked-by: Xin Long Link: https://patch.msgid.link/20241219162114.2863827-1-kniv@yandex-team.ru Signed-off-by: Jakub Kicinski --- net/sctp/associola.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index c45c192b7878..0b0794f164cf 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -137,7 +137,8 @@ static struct sctp_association *sctp_association_init( = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; - asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = + (unsigned long)sp->autoclose * HZ; /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) From 4a4d38ace1fb0586bffd2aab03caaa05d6011748 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Fri, 20 Dec 2024 13:26:14 +0530 Subject: [PATCH 22/50] net: ethernet: ti: am65-cpsw: default to round-robin for host port receive The Host Port (i.e. CPU facing port) of CPSW receives traffic from Linux via TX DMA Channels which are Hardware Queues consisting of traffic categorized according to their priority. The Host Port is configured to dequeue traffic from these Hardware Queues on the basis of priority i.e. as long as traffic exists on a Hardware Queue of a higher priority, the traffic on Hardware Queues of lower priority isn't dequeued. An alternate operation is also supported wherein traffic can be dequeued by the Host Port in a Round-Robin manner. Until commit under Fixes, the am65-cpsw driver enabled a single TX DMA Channel, due to which, unless modified by user via "ethtool", all traffic from Linux is transmitted on DMA Channel 0. Therefore, configuring the Host Port for priority based dequeuing or Round-Robin operation is identical since there is a single DMA Channel. Since commit under Fixes, all 8 TX DMA Channels are enabled by default. Additionally, the default "tc mapping" doesn't take into account the possibility of different traffic profiles which various users might have. This results in traffic starvation at the Host Port due to the priority based dequeuing which has been enabled by default since the inception of the driver. The traffic starvation triggers NETDEV WATCHDOG timeout for all TX DMA Channels that haven't been serviced due to the presence of traffic on the higher priority TX DMA Channels. Fix this by defaulting to Round-Robin dequeuing at the Host Port, which shall ensure that traffic is dequeued from all TX DMA Channels irrespective of the traffic profile. This will address the NETDEV WATCHDOG timeouts. At the same time, users can still switch from Round-Robin to Priority based dequeuing at the Host Port with the help of the "p0-rx-ptype-rrobin" private flag of "ethtool". Users are expected to setup an appropriate "tc mapping" that suits their traffic profile when switching to priority based dequeuing at the Host Port. Fixes: be397ea3473d ("net: ethernet: am65-cpsw: Set default TX channels to maximum") Cc: Signed-off-by: Siddharth Vadapalli Link: https://patch.msgid.link/20241220075618.228202-1-s-vadapalli@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 14e1df721f2e..5465bf872734 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -3551,7 +3551,7 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev) init_completion(&common->tdown_complete); common->tx_ch_num = AM65_CPSW_DEFAULT_TX_CHNS; common->rx_ch_num_flows = AM65_CPSW_DEFAULT_RX_CHN_FLOWS; - common->pf_p0_rx_ptype_rrobin = false; + common->pf_p0_rx_ptype_rrobin = true; common->default_vlan = 1; common->ports = devm_kcalloc(dev, common->port_num, From 75221e96101fa93390d3db5c23e026f5e3565d9b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 20 Dec 2024 18:04:00 +0100 Subject: [PATCH 23/50] net: pse-pd: tps23881: Fix power on/off issue An issue was present in the initial driver implementation. The driver read the power status of all channels before toggling the bit of the desired one. Using the power status register as a base value introduced a problem, because only the bit corresponding to the concerned channel ID should be set in the write-only power enable register. This led to cases where disabling power for one channel also powered off other channels. This patch removes the power status read and ensures the value is limited to the bit matching the channel index of the PI. Fixes: 20e6d190ffe1 ("net: pse-pd: Add TI TPS23881 PSE controller driver") Signed-off-by: Kory Maincent Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20241220170400.291705-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/tps23881.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/net/pse-pd/tps23881.c b/drivers/net/pse-pd/tps23881.c index 5c4e88be46ee..8797ca1a8a21 100644 --- a/drivers/net/pse-pd/tps23881.c +++ b/drivers/net/pse-pd/tps23881.c @@ -64,15 +64,11 @@ static int tps23881_pi_enable(struct pse_controller_dev *pcdev, int id) if (id >= TPS23881_MAX_CHANS) return -ERANGE; - ret = i2c_smbus_read_word_data(client, TPS23881_REG_PW_STATUS); - if (ret < 0) - return ret; - chan = priv->port[id].chan[0]; if (chan < 4) - val = (u16)(ret | BIT(chan)); + val = BIT(chan); else - val = (u16)(ret | BIT(chan + 4)); + val = BIT(chan + 4); if (priv->port[id].is_4p) { chan = priv->port[id].chan[1]; @@ -100,15 +96,11 @@ static int tps23881_pi_disable(struct pse_controller_dev *pcdev, int id) if (id >= TPS23881_MAX_CHANS) return -ERANGE; - ret = i2c_smbus_read_word_data(client, TPS23881_REG_PW_STATUS); - if (ret < 0) - return ret; - chan = priv->port[id].chan[0]; if (chan < 4) - val = (u16)(ret | BIT(chan + 4)); + val = BIT(chan + 4); else - val = (u16)(ret | BIT(chan + 8)); + val = BIT(chan + 8); if (priv->port[id].is_4p) { chan = priv->port[id].chan[1]; From 050a4c011b0dfeb91664a5d7bd3647ff38db08ce Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Fri, 20 Dec 2024 10:15:02 +0200 Subject: [PATCH 24/50] net/mlx5: DR, select MSIX vector 0 for completion queue creation When creating a software steering completion queue (CQ), an arbitrary MSIX vector n is selected. This results in the CQ sharing the same Ethernet traffic channel n associated with the chosen vector. However, the value of n is often unpredictable, which can introduce complications for interrupt monitoring and verification tools. Moreover, SW steering uses polling rather than event-driven interrupts. Therefore, there is no need to select any MSIX vector other than the existing vector 0 for CQ creation. In light of these factors, and to enhance predictability, we modify the code to consistently select MSIX vector 0 for CQ creation. Fixes: 297cccebdc5a ("net/mlx5: DR, Expose an internal API to issue RDMA operations") Signed-off-by: Shahar Shitrit Reviewed-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index 6fa06ba2d346..f57c84e5128b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1067,7 +1067,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, int inlen, err, eqn; void *cqc, *in; __be64 *pas; - int vector; u32 i; cq = kzalloc(sizeof(*cq), GFP_KERNEL); @@ -1096,8 +1095,7 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, if (!in) goto err_cqwq; - vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev); - err = mlx5_comp_eqn_get(mdev, vector, &eqn); + err = mlx5_comp_eqn_get(mdev, 0, &eqn); if (err) { kvfree(in); goto err_cqwq; From 8c6254479b3d5bd788d2b5fefaa48fb194331ed0 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 20 Dec 2024 10:15:03 +0200 Subject: [PATCH 25/50] net/mlx5e: macsec: Maintain TX SA from encoding_sa In MACsec, it is possible to create multiple active TX SAs on a SC, but only one such SA can be used at a time for transmission. This SA is selected through the encoding_sa link parameter. When there are 2 or more active TX SAs configured (encoding_sa=0): ip macsec add macsec0 tx sa 0 pn 1 on key 00 ip macsec add macsec0 tx sa 1 pn 1 on key 00 ... the traffic should be still sent via TX SA 0 as the encoding_sa was not changed. However, the driver ignores the encoding_sa and overrides it to SA 1 by installing the flow steering id of the newly created TX SA into the SCI -> flow steering id hash map. The future packet tx descriptors will point to the incorrect flow steering rule (SA 1). This patch fixes the issue by avoiding the creation of the flow steering rule for an active TX SA that is not the encoding_sa. The driver side tx_sa object and the FW side macsec object are still created. When the encoding_sa link parameter is changed to another active TX SA, only the new flow steering rule will be created in the mlx5e_macsec_upd_txsa() handler. Fixes: 8ff0ac5be144 ("net/mlx5: Add MACsec offload Tx command support") Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Reviewed-by: Lior Nahmanson Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index cc9bcc420032..6ab02f3fc291 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -339,9 +339,13 @@ static int mlx5e_macsec_init_sa_fs(struct macsec_context *ctx, { struct mlx5e_priv *priv = macsec_netdev_priv(ctx->netdev); struct mlx5_macsec_fs *macsec_fs = priv->mdev->macsec_fs; + const struct macsec_tx_sc *tx_sc = &ctx->secy->tx_sc; struct mlx5_macsec_rule_attrs rule_attrs; union mlx5_macsec_rule *macsec_rule; + if (is_tx && tx_sc->encoding_sa != sa->assoc_num) + return 0; + rule_attrs.macsec_obj_id = sa->macsec_obj_id; rule_attrs.sci = sa->sci; rule_attrs.assoc_num = sa->assoc_num; From 5a03b368562a7ff5f5f1f63b5adf8309cbdbd5be Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 20 Dec 2024 10:15:04 +0200 Subject: [PATCH 26/50] net/mlx5e: Skip restore TC rules for vport rep without loaded flag During driver unload, unregister_netdev is called after unloading vport rep. So, the mlx5e_rep_priv is already freed while trying to get rpriv->netdev, or walk rpriv->tc_ht, which results in use-after-free. So add the checking to make sure access the data of vport rep which is still loaded. Fixes: d1569537a837 ("net/mlx5e: Modify and restore TC rules for IPSec TX rules") Signed-off-by: Jianbo Liu Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 --- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c index 5a0047bdcb51..ed977ae75fab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c @@ -150,11 +150,11 @@ void mlx5_esw_ipsec_restore_dest_uplink(struct mlx5_core_dev *mdev) unsigned long i; int err; - xa_for_each(&esw->offloads.vport_reps, i, rep) { - rpriv = rep->rep_data[REP_ETH].priv; - if (!rpriv || !rpriv->netdev) + mlx5_esw_for_each_rep(esw, i, rep) { + if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED) continue; + rpriv = rep->rep_data[REP_ETH].priv; rhashtable_walk_enter(&rpriv->tc_ht, &iter); rhashtable_walk_start(&iter); while ((flow = rhashtable_walk_next(&iter)) != NULL) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index a83d41121db6..8573d36785f4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -714,6 +714,9 @@ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw); MLX5_CAP_GEN_2((esw->dev), ec_vf_vport_base) +\ (last) - 1) +#define mlx5_esw_for_each_rep(esw, i, rep) \ + xa_for_each(&((esw)->offloads.vport_reps), i, rep) + struct mlx5_eswitch *__must_check mlx5_devlink_eswitch_get(struct devlink *devlink); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index d5b42b3a19fd..40359f320724 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -53,9 +53,6 @@ #include "lag/lag.h" #include "en/tc/post_meter.h" -#define mlx5_esw_for_each_rep(esw, i, rep) \ - xa_for_each(&((esw)->offloads.vport_reps), i, rep) - /* There are two match-all miss flows, one for unicast dst mac and * one for multicast. */ From 2a4f56fbcc473d8faeb29b73082df39efbe5893c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 20 Dec 2024 10:15:05 +0200 Subject: [PATCH 27/50] net/mlx5e: Keep netdev when leave switchdev for devlink set legacy only In the cited commit, when changing from switchdev to legacy mode, uplink representor's netdev is kept, and its profile is replaced with nic profile, so netdev is detached from old profile, then attach to new profile. During profile change, the hardware resources allocated by the old profile will be cleaned up. However, the cleanup is relying on the related kernel modules. And they may need to flush themselves first, which is triggered by netdev events, for example, NETDEV_UNREGISTER. However, netdev is kept, or netdev_register is called after the cleanup, which may cause troubles because the resources are still referred by kernel modules. The same process applies to all the caes when uplink is leaving switchdev mode, including devlink eswitch mode set legacy, driver unload and devlink reload. For the first one, it can be blocked and returns failure to users, whenever possible. But it's hard for the others. Besides, the attachment to nic profile is unnecessary as the netdev will be unregistered anyway for such cases. So in this patch, the original behavior is kept only for devlink eswitch set mode legacy. For the others, moves netdev unregistration before the profile change. Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 19 +++++++++++++++++-- .../net/ethernet/mellanox/mlx5/core/en_rep.c | 15 +++++++++++++++ .../mellanox/mlx5/core/eswitch_offloads.c | 2 ++ include/linux/mlx5/driver.h | 1 + 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dd16d73000c3..0ec17c276bdd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6542,8 +6542,23 @@ static void _mlx5e_remove(struct auxiliary_device *adev) mlx5_core_uplink_netdev_set(mdev, NULL); mlx5e_dcbnl_delete_app(priv); - unregister_netdev(priv->netdev); - _mlx5e_suspend(adev, false); + /* When unload driver, the netdev is in registered state + * if it's from legacy mode. If from switchdev mode, it + * is already unregistered before changing to NIC profile. + */ + if (priv->netdev->reg_state == NETREG_REGISTERED) { + unregister_netdev(priv->netdev); + _mlx5e_suspend(adev, false); + } else { + struct mlx5_core_dev *pos; + int i; + + if (test_bit(MLX5E_STATE_DESTROYING, &priv->state)) + mlx5_sd_for_each_dev(i, mdev, pos) + mlx5e_destroy_mdev_resources(pos); + else + _mlx5e_suspend(adev, true); + } /* Avoid cleanup if profile rollback failed. */ if (priv->profile) priv->profile->cleanup(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 554f9cb5b53f..fdff9fd8a89e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1509,6 +1509,21 @@ mlx5e_vport_uplink_rep_unload(struct mlx5e_rep_priv *rpriv) priv = netdev_priv(netdev); + /* This bit is set when using devlink to change eswitch mode from + * switchdev to legacy. As need to keep uplink netdev ifindex, we + * detach uplink representor profile and attach NIC profile only. + * The netdev will be unregistered later when unload NIC auxiliary + * driver for this case. + * We explicitly block devlink eswitch mode change if any IPSec rules + * offloaded, but can't block other cases, such as driver unload + * and devlink reload. We have to unregister netdev before profile + * change for those cases. This is to avoid resource leak because + * the offloaded rules don't have the chance to be unoffloaded before + * cleanup which is triggered by detach uplink representor profile. + */ + if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_SWITCH_LEGACY)) + unregister_netdev(netdev); + mlx5e_netdev_attach_nic_profile(priv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 40359f320724..06076dd9ec64 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3777,6 +3777,8 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, esw->eswitch_operation_in_progress = true; up_write(&esw->mode_lock); + if (mode == DEVLINK_ESWITCH_MODE_LEGACY) + esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY; mlx5_eswitch_disable_locked(esw); if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) { if (mlx5_devlink_trap_get_num_active(esw->dev)) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..8f5991168ccd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -524,6 +524,7 @@ enum { * creation/deletion on drivers rescan. Unset during device attach. */ MLX5_PRIV_FLAGS_DETACH = 1 << 2, + MLX5_PRIV_FLAGS_SWITCH_LEGACY = 1 << 3, }; struct mlx5_adev { From 542ed8145e6f9392e3d0a86a0e9027d2ffd183e4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 21 Dec 2024 00:29:20 +0100 Subject: [PATCH 28/50] netfilter: nft_set_hash: unaligned atomic read on struct nft_set_ext Access to genmask field in struct nft_set_ext results in unaligned atomic read: [ 72.130109] Unable to handle kernel paging request at virtual address ffff0000c2bb708c [ 72.131036] Mem abort info: [ 72.131213] ESR = 0x0000000096000021 [ 72.131446] EC = 0x25: DABT (current EL), IL = 32 bits [ 72.132209] SET = 0, FnV = 0 [ 72.133216] EA = 0, S1PTW = 0 [ 72.134080] FSC = 0x21: alignment fault [ 72.135593] Data abort info: [ 72.137194] ISV = 0, ISS = 0x00000021, ISS2 = 0x00000000 [ 72.142351] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 72.145989] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 72.150115] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000237d27000 [ 72.154893] [ffff0000c2bb708c] pgd=0000000000000000, p4d=180000023ffff403, pud=180000023f84b403, pmd=180000023f835403, +pte=0068000102bb7707 [ 72.163021] Internal error: Oops: 0000000096000021 [#1] SMP [...] [ 72.170041] CPU: 7 UID: 0 PID: 54 Comm: kworker/7:0 Tainted: G E 6.13.0-rc3+ #2 [ 72.170509] Tainted: [E]=UNSIGNED_MODULE [ 72.170720] Hardware name: QEMU QEMU Virtual Machine, BIOS edk2-stable202302-for-qemu 03/01/2023 [ 72.171192] Workqueue: events_power_efficient nft_rhash_gc [nf_tables] [ 72.171552] pstate: 21400005 (nzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 72.171915] pc : nft_rhash_gc+0x200/0x2d8 [nf_tables] [ 72.172166] lr : nft_rhash_gc+0x128/0x2d8 [nf_tables] [ 72.172546] sp : ffff800081f2bce0 [ 72.172724] x29: ffff800081f2bd40 x28: ffff0000c2bb708c x27: 0000000000000038 [ 72.173078] x26: ffff0000c6780ef0 x25: ffff0000c643df00 x24: ffff0000c6778f78 [ 72.173431] x23: 000000000000001a x22: ffff0000c4b1f000 x21: ffff0000c6780f78 [ 72.173782] x20: ffff0000c2bb70dc x19: ffff0000c2bb7080 x18: 0000000000000000 [ 72.174135] x17: ffff0000c0a4e1c0 x16: 0000000000003000 x15: 0000ac26d173b978 [ 72.174485] x14: ffffffffffffffff x13: 0000000000000030 x12: ffff0000c6780ef0 [ 72.174841] x11: 0000000000000000 x10: ffff800081f2bcf8 x9 : ffff0000c3000000 [ 72.175193] x8 : 00000000000004be x7 : 0000000000000000 x6 : 0000000000000000 [ 72.175544] x5 : 0000000000000040 x4 : ffff0000c3000010 x3 : 0000000000000000 [ 72.175871] x2 : 0000000000003a98 x1 : ffff0000c2bb708c x0 : 0000000000000004 [ 72.176207] Call trace: [ 72.176316] nft_rhash_gc+0x200/0x2d8 [nf_tables] (P) [ 72.176653] process_one_work+0x178/0x3d0 [ 72.176831] worker_thread+0x200/0x3f0 [ 72.176995] kthread+0xe8/0xf8 [ 72.177130] ret_from_fork+0x10/0x20 [ 72.177289] Code: 54fff984 d503201f d2800080 91003261 (f820303f) [ 72.177557] ---[ end trace 0000000000000000 ]--- Align struct nft_set_ext to word size to address this and documentation it. pahole reports that this increases the size of elements for rhash and pipapo in 8 bytes on x86_64. Fixes: 7ffc7481153b ("netfilter: nft_set_hash: skip duplicated elements pending gc run") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4afa64c81304..0027beca5cd5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -733,15 +733,18 @@ struct nft_set_ext_tmpl { /** * struct nft_set_ext - set extensions * - * @genmask: generation mask + * @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT) * @offset: offsets of individual extension types * @data: beginning of extension data + * + * This structure must be aligned to word size, otherwise atomic bitops + * on genmask field can cause alignment failure on some archs. */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; -}; +} __aligned(BITS_PER_LONG / 8); static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { From 6b830c6a023ff6e8fe05dbe47a9e5cd276df09ee Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:14 +0100 Subject: [PATCH 29/50] netlink: specs: mptcp: add missing 'server-side' attr This attribute is added with the 'created' and 'established' events, but the documentation didn't mention it. The documentation in the UAPI header has been auto-generated by: ./tools/net/ynl/ynl-regen.sh Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-1-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 6 ++++-- include/uapi/linux/mptcp_pm.h | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index dc190bf838fe..fc0603f51665 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -23,7 +23,8 @@ definitions: - name: created doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport + token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, + server-side A new MPTCP connection has been created. It is the good time to allocate memory and send ADD_ADDR if needed. Depending on the traffic-patterns it can take a long time until the @@ -31,7 +32,8 @@ definitions: - name: established doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport + token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, + server-side A MPTCP connection is established (can start new subflows). - name: closed diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 50589e5dd6a3..b34fd95b6f84 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -13,12 +13,13 @@ * enum mptcp_event_type * @MPTCP_EVENT_UNSPEC: unused event * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport A new MPTCP connection has been created. It is the good time - * to allocate memory and send ADD_ADDR if needed. Depending on the - * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED - * is sent. + * sport, dport, server-side A new MPTCP connection has been created. It is + * the good time to allocate memory and send ADD_ADDR if needed. Depending on + * the traffic-patterns it can take a long time until the + * MPTCP_EVENT_ESTABLISHED is sent. * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport A MPTCP connection is established (can start new subflows). + * sport, dport, server-side A MPTCP connection is established (can start new + * subflows). * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped. * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A * new address has been announced by the peer. From bea87657b5ee8e6f18af2833ee4b88212ef52d28 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:15 +0100 Subject: [PATCH 30/50] netlink: specs: mptcp: clearly mention attributes The rendered version of the MPTCP events [1] looked strange, because the whole content of the 'doc' was displayed in the same block. It was then not clear that the first words, not even ended by a period, were the attributes that are defined when such events are emitted. These attributes have now been moved to the end, prefixed by 'Attributes:' and ended with a period. Note that '>-' has been added after 'doc:' to allow ':' in the text below. The documentation in the UAPI header has been auto-generated by: ./tools/net/ynl/ynl-regen.sh Link: https://docs.kernel.org/networking/netlink_spec/mptcp_pm.html#event-type [1] Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-2-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 50 ++++++++++----------- include/uapi/linux/mptcp_pm.h | 53 ++++++++++++----------- 2 files changed, 52 insertions(+), 51 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index fc0603f51665..59087a230565 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -22,67 +22,67 @@ definitions: doc: unused event - name: created - doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, - server-side + doc: >- A new MPTCP connection has been created. It is the good time to allocate memory and send ADD_ADDR if needed. Depending on the traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. + Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, + dport, server-side. - name: established - doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, - server-side + doc: >- A MPTCP connection is established (can start new subflows). + Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, + dport, server-side. - name: closed - doc: - token + doc: >- A MPTCP connection has stopped. + Attribute: token. - name: announced value: 6 - doc: - token, rem_id, family, daddr4 | daddr6 [, dport] + doc: >- A new address has been announced by the peer. + Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. - name: removed - doc: - token, rem_id + doc: >- An address has been lost by the peer. + Attributes: token, rem_id. - name: sub-established value: 10 - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- A new subflow has been established. 'error' should not be set. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: sub-closed - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- A subflow has been closed. An error (copy of sk_err) could be set if an error has been detected for this subflow. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: sub-priority value: 13 - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- The priority of a subflow has changed. 'error' should not be set. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: listener-created value: 15 - doc: - family, sport, saddr4 | saddr6 + doc: >- A new PM listener is created. + Attributes: family, sport, saddr4 | saddr6. - name: listener-closed - doc: - family, sport, saddr4 | saddr6 + doc: >- A PM listener is closed. + Attributes: family, sport, saddr4 | saddr6. attribute-sets: - diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index b34fd95b6f84..84fa8a21dfd0 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -12,32 +12,33 @@ /** * enum mptcp_event_type * @MPTCP_EVENT_UNSPEC: unused event - * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side A new MPTCP connection has been created. It is - * the good time to allocate memory and send ADD_ADDR if needed. Depending on - * the traffic-patterns it can take a long time until the - * MPTCP_EVENT_ESTABLISHED is sent. - * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side A MPTCP connection is established (can start new - * subflows). - * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped. - * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A - * new address has been announced by the peer. - * @MPTCP_EVENT_REMOVED: token, rem_id An address has been lost by the peer. - * @MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error] A new - * subflow has been established. 'error' should not be set. - * @MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx [, error] A subflow has been - * closed. An error (copy of sk_err) could be set if an error has been - * detected for this subflow. - * @MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx [, error] The priority of a - * subflow has changed. 'error' should not be set. - * @MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6 A new PM - * listener is created. - * @MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6 A PM listener - * is closed. + * @MPTCP_EVENT_CREATED: A new MPTCP connection has been created. It is the + * good time to allocate memory and send ADD_ADDR if needed. Depending on the + * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED + * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, server-side. + * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new + * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, server-side. + * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. + * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. + * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. + * @MPTCP_EVENT_REMOVED: An address has been lost by the peer. Attributes: + * token, rem_id. + * @MPTCP_EVENT_SUB_ESTABLISHED: A new subflow has been established. 'error' + * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | + * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_SUB_CLOSED: A subflow has been closed. An error (copy of + * sk_err) could be set if an error has been detected for this subflow. + * Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + * daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_SUB_PRIORITY: The priority of a subflow has changed. 'error' + * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | + * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_LISTENER_CREATED: A new PM listener is created. Attributes: + * family, sport, saddr4 | saddr6. + * @MPTCP_EVENT_LISTENER_CLOSED: A PM listener is closed. Attributes: family, + * sport, saddr4 | saddr6. */ enum mptcp_event_type { MPTCP_EVENT_UNSPEC, From 4f363fe9f6b28ed9b714cd7fe5ce880171927dab Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:16 +0100 Subject: [PATCH 31/50] netlink: specs: mptcp: fix missing doc Two operations didn't have a small description. It looks like something that has been missed in the original commit introducing this file. Replace the two "todo" by a small and simple description: Create/Destroy subflow. While at it, also uniform the capital letters, avoid double spaces, and fix the "announce" event description: a new "address" has been announced, not a new "subflow". Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-3-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index 59087a230565..dfd017780d2f 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -308,8 +308,8 @@ operations: attributes: - addr - - name: flush-addrs - doc: flush addresses + name: flush-addrs + doc: Flush addresses attribute-set: endpoint dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -353,7 +353,7 @@ operations: - addr-remote - name: announce - doc: announce new sf + doc: Announce new address attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -364,7 +364,7 @@ operations: - token - name: remove - doc: announce removal + doc: Announce removal attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -375,7 +375,7 @@ operations: - loc-id - name: subflow-create - doc: todo + doc: Create subflow attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -387,7 +387,7 @@ operations: - addr-remote - name: subflow-destroy - doc: todo + doc: Destroy subflow attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] From a024e377efed31ecfb39210bed562932321345b3 Mon Sep 17 00:00:00 2001 From: Antonio Pastor Date: Tue, 24 Dec 2024 20:07:20 -0500 Subject: [PATCH 32/50] net: llc: reset skb->transport_header 802.2+LLC+SNAP frames received by napi_complete_done with GRO and DSA have skb->transport_header set two bytes short, or pointing 2 bytes before network_header & skb->data. As snap_rcv expects transport_header to point to SNAP header (OID:PID) after LLC processing advances offset over LLC header (llc_rcv & llc_fixup_skb), code doesn't find a match and packet is dropped. Between napi_complete_done and snap_rcv, transport_header is not used until __netif_receive_skb_core, where originally it was being reset. Commit fda55eca5a33 ("net: introduce skb_transport_header_was_set()") only does so if not set, on the assumption the value was set correctly by GRO (and also on assumption that "network stacks usually reset the transport header anyway"). Afterwards it is moved forward by llc_fixup_skb. Locally generated traffic shows up at __netif_receive_skb_core with no transport_header set and is processed without issue. On a setup with GRO but no DSA, transport_header and network_header are both set to point to skb->data which is also correct. As issue is LLC specific, to avoid impacting non-LLC traffic, and to follow up on original assumption made on previous code change, llc_fixup_skb to reset the offset after skb pull. llc_fixup_skb assumes the LLC header is at skb->data, and by definition SNAP header immediately follows. Fixes: fda55eca5a33 ("net: introduce skb_transport_header_was_set()") Signed-off-by: Antonio Pastor Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241225010723.2830290-1-antonio.pastor@gmail.com Signed-off-by: Jakub Kicinski --- net/llc/llc_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 51bccfb00a9c..61b0159b2fbe 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -124,8 +124,8 @@ static inline int llc_fixup_skb(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; - skb->transport_header += llc_len; skb_pull(skb, llc_len); + skb_reset_transport_header(skb); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen; s32 data_size; From b255ef45fcc2141c1bf98456796abb956d843a27 Mon Sep 17 00:00:00 2001 From: Vitalii Mordan Date: Fri, 27 Dec 2024 15:30:07 +0300 Subject: [PATCH 33/50] eth: bcmsysport: fix call balance of priv->clk handling routines Check the return value of clk_prepare_enable to ensure that priv->clk has been successfully enabled. If priv->clk was not enabled during bcm_sysport_probe, bcm_sysport_resume, or bcm_sysport_open, it must not be disabled in any subsequent execution paths. Fixes: 31bc72d97656 ("net: systemport: fetch and use clock resources") Signed-off-by: Vitalii Mordan Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20241227123007.2333397-1-mordan@ispras.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcmsysport.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 42672c63f108..bc4e1f3b3752 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1933,7 +1933,11 @@ static int bcm_sysport_open(struct net_device *dev) unsigned int i; int ret; - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + netdev_err(dev, "could not enable priv clock\n"); + return ret; + } /* Reset UniMAC */ umac_reset(priv); @@ -2591,7 +2595,11 @@ static int bcm_sysport_probe(struct platform_device *pdev) goto err_deregister_notifier; } - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + dev_err(&pdev->dev, "could not enable priv clock\n"); + goto err_deregister_netdev; + } priv->rev = topctrl_readl(priv, REV_CNTL) & REV_MASK; dev_info(&pdev->dev, @@ -2605,6 +2613,8 @@ static int bcm_sysport_probe(struct platform_device *pdev) return 0; +err_deregister_netdev: + unregister_netdev(dev); err_deregister_notifier: unregister_netdevice_notifier(&priv->netdev_notifier); err_deregister_fixed_link: @@ -2774,7 +2784,12 @@ static int __maybe_unused bcm_sysport_resume(struct device *d) if (!netif_running(dev)) return 0; - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + netdev_err(dev, "could not enable priv clock\n"); + return ret; + } + if (priv->wolopts) clk_disable_unprepare(priv->wol_clk); From fb3a9a1165cea104b5ab3753e88218e4497b01c1 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Fri, 20 Dec 2024 19:28:06 -0800 Subject: [PATCH 34/50] gve: trigger RX NAPI instead of TX NAPI in gve_xsk_wakeup Commit ba0925c34e0f ("gve: process XSK TX descriptors as part of RX NAPI") moved XSK TX processing to be part of the RX NAPI. However, that commit did not include triggering the RX NAPI in gve_xsk_wakeup. This is necessary because the TX NAPI only processes TX completions, meaning that a TX wakeup would not actually trigger XSK descriptor processing. Also, the branch on XDP_WAKEUP_TX was supposed to have been removed, as the NAPI should be scheduled whether the wakeup is for RX or TX. Fixes: ba0925c34e0f ("gve: process XSK TX descriptors as part of RX NAPI") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Link: https://patch.msgid.link/20241221032807.302244-1-pkaligineedi@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 09fb7f16f73e..8a8f6ab12a98 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1714,7 +1714,7 @@ done: static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) { struct gve_priv *priv = netdev_priv(dev); - int tx_queue_id = gve_xdp_tx_queue_id(priv, queue_id); + struct napi_struct *napi; if (!gve_get_napi_enabled(priv)) return -ENETDOWN; @@ -1722,19 +1722,12 @@ static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog) return -EINVAL; - if (flags & XDP_WAKEUP_TX) { - struct gve_tx_ring *tx = &priv->tx[tx_queue_id]; - struct napi_struct *napi = - &priv->ntfy_blocks[tx->ntfy_id].napi; - - if (!napi_if_scheduled_mark_missed(napi)) { - /* Call local_bh_enable to trigger SoftIRQ processing */ - local_bh_disable(); - napi_schedule(napi); - local_bh_enable(); - } - - tx->xdp_xsk_wakeup++; + napi = &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_id)].napi; + if (!napi_if_scheduled_mark_missed(napi)) { + /* Call local_bh_enable to trigger SoftIRQ processing */ + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); } return 0; From ad5c318086e2e23b577eca33559c5ebf89bc7eb9 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sat, 21 Dec 2024 17:14:48 +0900 Subject: [PATCH 35/50] net: mv643xx_eth: fix an OF node reference leak Current implementation of mv643xx_eth_shared_of_add_port() calls of_parse_phandle(), but does not release the refcount on error. Call of_node_put() in the error path and in mv643xx_eth_shared_of_remove(). This bug was found by an experimental verification tool that I am developing. Fixes: 76723bca2802 ("net: mv643xx_eth: add DT parsing support") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241221081448.3313163-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mv643xx_eth.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index a06048719e84..67a6ff07c83d 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2704,9 +2704,15 @@ static struct platform_device *port_platdev[3]; static void mv643xx_eth_shared_of_remove(void) { + struct mv643xx_eth_platform_data *pd; int n; for (n = 0; n < 3; n++) { + if (!port_platdev[n]) + continue; + pd = dev_get_platdata(&port_platdev[n]->dev); + if (pd) + of_node_put(pd->phy_node); platform_device_del(port_platdev[n]); port_platdev[n] = NULL; } @@ -2769,8 +2775,10 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, } ppdev = platform_device_alloc(MV643XX_ETH_NAME, dev_num); - if (!ppdev) - return -ENOMEM; + if (!ppdev) { + ret = -ENOMEM; + goto put_err; + } ppdev->dev.coherent_dma_mask = DMA_BIT_MASK(32); ppdev->dev.of_node = pnp; @@ -2792,6 +2800,8 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, port_err: platform_device_put(ppdev); +put_err: + of_node_put(ppd.phy_node); return ret; } From cbb26f7d8451fe56ccac802c6db48d16240feebd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 21 Dec 2024 09:51:46 +0100 Subject: [PATCH 36/50] mptcp: fix TCP options overflow. Syzbot reported the following splat: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 1 UID: 0 PID: 5836 Comm: sshd Not tainted 6.13.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 RIP: 0010:_compound_head include/linux/page-flags.h:242 [inline] RIP: 0010:put_page+0x23/0x260 include/linux/mm.h:1552 Code: 90 90 90 90 90 90 90 55 41 57 41 56 53 49 89 fe 48 bd 00 00 00 00 00 fc ff df e8 f8 5e 12 f8 49 8d 5e 08 48 89 d8 48 c1 e8 03 <80> 3c 28 00 74 08 48 89 df e8 8f c7 78 f8 48 8b 1b 48 89 de 48 83 RSP: 0000:ffffc90003916c90 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000008 RCX: ffff888030458000 RDX: 0000000000000100 RSI: 0000000000000000 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffffff898ca81d R09: 1ffff110054414ac R10: dffffc0000000000 R11: ffffed10054414ad R12: 0000000000000007 R13: ffff88802a20a542 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f34f496e800(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f9d6ec9ec28 CR3: 000000004d260000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_page_unref include/linux/skbuff_ref.h:43 [inline] __skb_frag_unref include/linux/skbuff_ref.h:56 [inline] skb_release_data+0x483/0x8a0 net/core/skbuff.c:1119 skb_release_all net/core/skbuff.c:1190 [inline] __kfree_skb+0x55/0x70 net/core/skbuff.c:1204 tcp_clean_rtx_queue net/ipv4/tcp_input.c:3436 [inline] tcp_ack+0x2442/0x6bc0 net/ipv4/tcp_input.c:4032 tcp_rcv_state_process+0x8eb/0x44e0 net/ipv4/tcp_input.c:6805 tcp_v4_do_rcv+0x77d/0xc70 net/ipv4/tcp_ipv4.c:1939 tcp_v4_rcv+0x2dc0/0x37f0 net/ipv4/tcp_ipv4.c:2351 ip_protocol_deliver_rcu+0x22e/0x440 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x341/0x5f0 net/ipv4/ip_input.c:233 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 __netif_receive_skb_one_core net/core/dev.c:5672 [inline] __netif_receive_skb+0x2bf/0x650 net/core/dev.c:5785 process_backlog+0x662/0x15b0 net/core/dev.c:6117 __napi_poll+0xcb/0x490 net/core/dev.c:6883 napi_poll net/core/dev.c:6952 [inline] net_rx_action+0x89b/0x1240 net/core/dev.c:7074 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 __do_softirq kernel/softirq.c:595 [inline] invoke_softirq kernel/softirq.c:435 [inline] __irq_exit_rcu+0xf7/0x220 kernel/softirq.c:662 irq_exit_rcu+0x9/0x30 kernel/softirq.c:678 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0x57/0xc0 arch/x86/kernel/apic/apic.c:1049 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702 RIP: 0033:0x7f34f4519ad5 Code: 85 d2 74 0d 0f 10 02 48 8d 54 24 20 0f 11 44 24 20 64 8b 04 25 18 00 00 00 85 c0 75 27 41 b8 08 00 00 00 b8 0f 01 00 00 0f 05 <48> 3d 00 f0 ff ff 76 75 48 8b 15 24 73 0d 00 f7 d8 64 89 02 48 83 RSP: 002b:00007ffec5b32ce0 EFLAGS: 00000246 RAX: 0000000000000001 RBX: 00000000000668a0 RCX: 00007f34f4519ad5 RDX: 00007ffec5b32d00 RSI: 0000000000000004 RDI: 0000564f4bc6cae0 RBP: 0000564f4bc6b5a0 R08: 0000000000000008 R09: 0000000000000000 R10: 00007ffec5b32de8 R11: 0000000000000246 R12: 0000564f48ea8aa4 R13: 0000000000000001 R14: 0000564f48ea93e8 R15: 00007ffec5b32d68 Eric noted a probable shinfo->nr_frags corruption, which indeed occurs. The root cause is a buggy MPTCP option len computation in some circumstances: the ADD_ADDR option should be mutually exclusive with DSS since the blamed commit. Still, mptcp_established_options_add_addr() tries to set the relevant info in mptcp_out_options, if the remaining space is large enough even when DSS is present. Since the ADD_ADDR infos and the DSS share the same union fields, adding first corrupts the latter. In the worst-case scenario, such corruption increases the DSS binary layout, exceeding the computed length and possibly overwriting the skb shared info. Address the issue by enforcing mutual exclusion in mptcp_established_options_add_addr(), too. Cc: stable@vger.kernel.org Reported-by: syzbot+38a095a81f30d82884c1@syzkaller.appspotmail.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/538 Fixes: 1bff1e43a30e ("mptcp: optimize out option generation") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/025d9df8cde3c9a557befc47e9bc08fbbe3476e5.1734771049.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1603b3702e22..a62bc874bf1e 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -667,8 +667,15 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * &echo, &drop_other_suboptions)) return false; + /* + * Later on, mptcp_write_options() will enforce mutually exclusion with + * DSS, bail out if such option is set and we can't drop it. + */ if (drop_other_suboptions) remaining += opt_size; + else if (opts->suboptions & OPTION_MPTCP_DSS) + return false; + len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); if (remaining < len) return false; From 03c8d0af2e409e15c16130b185e12b5efba0a6b9 Mon Sep 17 00:00:00 2001 From: Pascal Hambourg Date: Mon, 23 Dec 2024 17:44:01 +0100 Subject: [PATCH 37/50] sky2: Add device ID 11ab:4373 for Marvell 88E8075 A Marvell 88E8075 ethernet controller has this device ID instead of 11ab:4370 and works fine with the sky2 driver. Signed-off-by: Pascal Hambourg Cc: stable@vger.kernel.org Link: https://patch.msgid.link/10165a62-99fb-4be6-8c64-84afd6234085@plouf.fr.eu.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/sky2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 3914cd9210d4..988fa28cfb5f 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -130,6 +130,7 @@ static const struct pci_device_id sky2_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x436C) }, /* 88E8072 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x436D) }, /* 88E8055 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4370) }, /* 88E8075 */ + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4373) }, /* 88E8075 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4380) }, /* 88E8057 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4381) }, /* 88E8059 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4382) }, /* 88E8079 */ From 4f619d518db9cd1a933c3a095a5f95d0c1584ae8 Mon Sep 17 00:00:00 2001 From: Jinjian Song Date: Tue, 24 Dec 2024 12:15:52 +0800 Subject: [PATCH 38/50] net: wwan: t7xx: Fix FSM command timeout issue When driver processes the internal state change command, it use an asynchronous thread to process the command operation. If the main thread detects that the task has timed out, the asynchronous thread will panic when executing the completion notification because the main thread completion object has been released. BUG: unable to handle page fault for address: fffffffffffffff8 PGD 1f283a067 P4D 1f283a067 PUD 1f283c067 PMD 0 Oops: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:complete_all+0x3e/0xa0 [...] Call Trace: ? __die_body+0x68/0xb0 ? page_fault_oops+0x379/0x3e0 ? exc_page_fault+0x69/0xa0 ? asm_exc_page_fault+0x22/0x30 ? complete_all+0x3e/0xa0 fsm_main_thread+0xa3/0x9c0 [mtk_t7xx (HASH:1400 5)] ? __pfx_autoremove_wake_function+0x10/0x10 kthread+0xd8/0x110 ? __pfx_fsm_main_thread+0x10/0x10 [mtk_t7xx (HASH:1400 5)] ? __pfx_kthread+0x10/0x10 ret_from_fork+0x38/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 [...] CR2: fffffffffffffff8 ---[ end trace 0000000000000000 ]--- Use the reference counter to ensure safe release as Sergey suggests: https://lore.kernel.org/all/da90f64c-260a-4329-87bf-1f9ff20a5951@gmail.com/ Fixes: 13e920d93e37 ("net: wwan: t7xx: Add core components") Signed-off-by: Jinjian Song Acked-by: Sergey Ryazanov Link: https://patch.msgid.link/20241224041552.8711-1-jinjian.song@fibocom.com Signed-off-by: Jakub Kicinski --- drivers/net/wwan/t7xx/t7xx_state_monitor.c | 26 ++++++++++++++-------- drivers/net/wwan/t7xx/t7xx_state_monitor.h | 5 +++-- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/net/wwan/t7xx/t7xx_state_monitor.c b/drivers/net/wwan/t7xx/t7xx_state_monitor.c index 3931c7a13f5a..cbdbb91e8381 100644 --- a/drivers/net/wwan/t7xx/t7xx_state_monitor.c +++ b/drivers/net/wwan/t7xx/t7xx_state_monitor.c @@ -104,14 +104,21 @@ void t7xx_fsm_broadcast_state(struct t7xx_fsm_ctl *ctl, enum md_state state) fsm_state_notify(ctl->md, state); } +static void fsm_release_command(struct kref *ref) +{ + struct t7xx_fsm_command *cmd = container_of(ref, typeof(*cmd), refcnt); + + kfree(cmd); +} + static void fsm_finish_command(struct t7xx_fsm_ctl *ctl, struct t7xx_fsm_command *cmd, int result) { if (cmd->flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { - *cmd->ret = result; - complete_all(cmd->done); + cmd->result = result; + complete_all(&cmd->done); } - kfree(cmd); + kref_put(&cmd->refcnt, fsm_release_command); } static void fsm_del_kf_event(struct t7xx_fsm_event *event) @@ -475,7 +482,6 @@ static int fsm_main_thread(void *data) int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id, unsigned int flag) { - DECLARE_COMPLETION_ONSTACK(done); struct t7xx_fsm_command *cmd; unsigned long flags; int ret; @@ -487,11 +493,13 @@ int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id INIT_LIST_HEAD(&cmd->entry); cmd->cmd_id = cmd_id; cmd->flag = flag; + kref_init(&cmd->refcnt); if (flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { - cmd->done = &done; - cmd->ret = &ret; + init_completion(&cmd->done); + kref_get(&cmd->refcnt); } + kref_get(&cmd->refcnt); spin_lock_irqsave(&ctl->command_lock, flags); list_add_tail(&cmd->entry, &ctl->command_queue); spin_unlock_irqrestore(&ctl->command_lock, flags); @@ -501,11 +509,11 @@ int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id if (flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { unsigned long wait_ret; - wait_ret = wait_for_completion_timeout(&done, + wait_ret = wait_for_completion_timeout(&cmd->done, msecs_to_jiffies(FSM_CMD_TIMEOUT_MS)); - if (!wait_ret) - return -ETIMEDOUT; + ret = wait_ret ? cmd->result : -ETIMEDOUT; + kref_put(&cmd->refcnt, fsm_release_command); return ret; } diff --git a/drivers/net/wwan/t7xx/t7xx_state_monitor.h b/drivers/net/wwan/t7xx/t7xx_state_monitor.h index 7b0a9baf488c..6e0601bb752e 100644 --- a/drivers/net/wwan/t7xx/t7xx_state_monitor.h +++ b/drivers/net/wwan/t7xx/t7xx_state_monitor.h @@ -110,8 +110,9 @@ struct t7xx_fsm_command { struct list_head entry; enum t7xx_fsm_cmd_state cmd_id; unsigned int flag; - struct completion *done; - int *ret; + struct completion done; + int result; + struct kref refcnt; }; struct t7xx_fsm_notifier { From a8620de72e5676993ec3a3b975f7c10908f5f60f Mon Sep 17 00:00:00 2001 From: Liang Jie Date: Mon, 30 Dec 2024 17:37:09 +0800 Subject: [PATCH 39/50] net: sfc: Correct key_len for efx_tc_ct_zone_ht_params In efx_tc_ct_zone_ht_params, the key_len was previously set to offsetof(struct efx_tc_ct_zone, linkage). This calculation is incorrect because it includes any padding between the zone field and the linkage field due to structure alignment, which can vary between systems. This patch updates key_len to use sizeof_field(struct efx_tc_ct_zone, zone) , ensuring that the hash table correctly uses the zone as the key. This fix prevents potential hash lookup errors and improves connection tracking reliability. Fixes: c3bb5c6acd4e ("sfc: functions to register for conntrack zone offload") Signed-off-by: Liang Jie Acked-by: Edward Cree Link: https://patch.msgid.link/20241230093709.3226854-1-buaajxlj@163.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/tc_conntrack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/tc_conntrack.c b/drivers/net/ethernet/sfc/tc_conntrack.c index d90206f27161..c0603f54cec3 100644 --- a/drivers/net/ethernet/sfc/tc_conntrack.c +++ b/drivers/net/ethernet/sfc/tc_conntrack.c @@ -16,7 +16,7 @@ static int efx_tc_flow_block(enum tc_setup_type type, void *type_data, void *cb_priv); static const struct rhashtable_params efx_tc_ct_zone_ht_params = { - .key_len = offsetof(struct efx_tc_ct_zone, linkage), + .key_len = sizeof_field(struct efx_tc_ct_zone, zone), .key_offset = 0, .head_offset = offsetof(struct efx_tc_ct_zone, linkage), }; From 68e068cabd2c6c533ef934c2e5151609cf6ecc6d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 1 Jan 2025 11:47:40 -0500 Subject: [PATCH 40/50] net: reenable NETIF_F_IPV6_CSUM offload for BIG TCP packets The blamed commit disabled hardware offoad of IPv6 packets with extension headers on devices that advertise NETIF_F_IPV6_CSUM, based on the definition of that feature in skbuff.h: * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). The change causes skb_warn_bad_offload to fire for BIG TCP packets. [ 496.310233] WARNING: CPU: 13 PID: 23472 at net/core/dev.c:3129 skb_warn_bad_offload+0xc4/0xe0 [ 496.310297] ? skb_warn_bad_offload+0xc4/0xe0 [ 496.310300] skb_checksum_help+0x129/0x1f0 [ 496.310303] skb_csum_hwoffload_help+0x150/0x1b0 [ 496.310306] validate_xmit_skb+0x159/0x270 [ 496.310309] validate_xmit_skb_list+0x41/0x70 [ 496.310312] sch_direct_xmit+0x5c/0x250 [ 496.310317] __qdisc_run+0x388/0x620 BIG TCP introduced an IPV6_TLV_JUMBO IPv6 extension header to communicate packet length, as this is an IPv6 jumbogram. But, the feature is only enabled on devices that support BIG TCP TSO. The header is only present for PF_PACKET taps like tcpdump, and not transmitted by physical devices. For this specific case of extension headers that are not transmitted, return to the situation before the blamed commit and support hardware offload. ipv6_has_hopopt_jumbo() tests not only whether this header is present, but also that it is the only extension header before a terminal (L4) header. Fixes: 04c20a9356f2 ("net: skip offload for NETIF_F_IPV6_CSUM if ipv6 header contains extension") Reported-by: syzbot Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89iK1hdC3Nt8KPhOtTF8vCPc1AHDCtse_BTNki1pWxAByTQ@mail.gmail.com/ Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250101164909.1331680-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 45a8c3dd4a64..faa23042df38 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3642,8 +3642,10 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) goto sw_checksum; + switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): From 5b0af621c3f6ef9261cf6067812f2fd9943acb4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Dec 2024 16:05:27 +0000 Subject: [PATCH 41/50] net: restrict SO_REUSEPORT to inet sockets After blamed commit, crypto sockets could accidentally be destroyed from RCU call back, as spotted by zyzbot [1]. Trying to acquire a mutex in RCU callback is not allowed. Restrict SO_REUSEPORT socket option to inet sockets. v1 of this patch supported TCP, UDP and SCTP sockets, but fcnal-test.sh test needed RAW and ICMP support. [1] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:562 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 24, name: ksoftirqd/1 preempt_count: 100, expected: 0 RCU nest depth: 0, expected: 0 1 lock held by ksoftirqd/1/24: #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_do_batch kernel/rcu/tree.c:2561 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_core+0xa37/0x17a0 kernel/rcu/tree.c:2823 Preemption disabled at: [] softirq_handle_begin kernel/softirq.c:402 [inline] [] handle_softirqs+0x128/0x9b0 kernel/softirq.c:537 CPU: 1 UID: 0 PID: 24 Comm: ksoftirqd/1 Not tainted 6.13.0-rc3-syzkaller-00174-ga024e377efed #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 __might_resched+0x5d4/0x780 kernel/sched/core.c:8758 __mutex_lock_common kernel/locking/mutex.c:562 [inline] __mutex_lock+0x131/0xee0 kernel/locking/mutex.c:735 crypto_put_default_null_skcipher+0x18/0x70 crypto/crypto_null.c:179 aead_release+0x3d/0x50 crypto/algif_aead.c:489 alg_do_release crypto/af_alg.c:118 [inline] alg_sock_destruct+0x86/0xc0 crypto/af_alg.c:502 __sk_destruct+0x58/0x5f0 net/core/sock.c:2260 rcu_do_batch kernel/rcu/tree.c:2567 [inline] rcu_core+0xaaa/0x17a0 kernel/rcu/tree.c:2823 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 run_ksoftirqd+0xca/0x130 kernel/softirq.c:950 smpboot_thread_fn+0x544/0xa30 kernel/smpboot.c:164 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: 8c7138b33e5c ("net: Unpublish sk from sk_reuseport_cb before call_rcu") Reported-by: syzbot+b3e02953598f447d4d2a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772f2f4.050a0220.2f3838.04cb.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241231160527.3994168-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index 74729d20cd00..be84885f9290 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1295,7 +1295,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: - sk->sk_reuseport = valbool; + if (valbool && !sk_is_inet(sk)) + ret = -EOPNOTSUPP; + else + sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); From a7af435df0e04cfb4a4004136d597c42639a2ae7 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Sun, 29 Dec 2024 17:46:58 +0100 Subject: [PATCH 42/50] net: wwan: iosm: Properly check for valid exec stage in ipc_mmio_init() ipc_mmio_init() used the post-decrement operator in its loop continuing condition of "retries" counter being "> 0", which meant that when this condition caused loop exit "retries" counter reached -1. But the later valid exec stage failure check only tests for "retries" counter being exactly zero, so it didn't trigger in this case (but would wrongly trigger if the code reaches a valid exec stage in the very last loop iteration). Fix this by using the pre-decrement operator instead, so the loop counter is exactly zero on valid exec stage failure. Fixes: dc0514f5d828 ("net: iosm: mmio scratchpad") Signed-off-by: Maciej S. Szmigiero Link: https://patch.msgid.link/8b19125a825f9dcdd81c667c1e5c48ba28d505a6.1735490770.git.mail@maciej.szmigiero.name Signed-off-by: Jakub Kicinski --- drivers/net/wwan/iosm/iosm_ipc_mmio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wwan/iosm/iosm_ipc_mmio.c b/drivers/net/wwan/iosm/iosm_ipc_mmio.c index 63eb08c43c05..6764c13530b9 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_mmio.c +++ b/drivers/net/wwan/iosm/iosm_ipc_mmio.c @@ -104,7 +104,7 @@ struct iosm_mmio *ipc_mmio_init(void __iomem *mmio, struct device *dev) break; msleep(20); - } while (retries-- > 0); + } while (--retries > 0); if (!retries) { dev_err(ipc_mmio->dev, "invalid exec stage %X", stage); From 77ee7a6d16b6ec07b5c3ae2b6b60a24c1afbed09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:03 +0000 Subject: [PATCH 43/50] af_packet: fix vlan_get_tci() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_tci() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8da482 len:32 put:14 head:ffff88807a1d5800 data:ffff88807a1d5810 tail:0x14 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 5880 Comm: syz-executor172 Not tainted 6.13.0-rc3-syzkaller-00762-g9268abe611b0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 9e 6c 26 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 3a 5a 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc90003baf5b8 EFLAGS: 00010286 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 8565c1eec37aa000 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802616fb50 R08: ffffffff817f0a4c R09: 1ffff92000775e50 R10: dffffc0000000000 R11: fffff52000775e51 R12: 0000000000000140 R13: ffff88807a1d5800 R14: ffff88807a1d5810 R15: 0000000000000014 FS: 00007fa03261f6c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffd65753000 CR3: 0000000031720000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_tci+0x272/0x550 net/packet/af_packet.c:565 packet_recvmsg+0x13c9/0x1ef0 net/packet/af_packet.c:3616 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1066 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2814 ___sys_recvmsg net/socket.c:2856 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2951 __sys_recvmmsg net/socket.c:3025 [inline] __do_sys_recvmmsg net/socket.c:3048 [inline] __se_sys_recvmmsg net/socket.c:3041 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3041 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+8400677f3fd43f37d3bc@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c6.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 886c0dd47b66..e2e34a49e98d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -538,10 +538,8 @@ static void *packet_current_frame(struct packet_sock *po, return packet_lookup_frame(po, rb, rb->head, status); } -static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev) +static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) { - u8 *skb_orig_data = skb->data; - int skb_orig_len = skb->len; struct vlan_hdr vhdr, *vh; unsigned int header_len; @@ -562,12 +560,8 @@ static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev) else return 0; - skb_push(skb, skb->data - skb_mac_header(skb)); - vh = skb_header_pointer(skb, header_len, sizeof(vhdr), &vhdr); - if (skb_orig_data != skb->data) { - skb->data = skb_orig_data; - skb->len = skb_orig_len; - } + vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len, + sizeof(vhdr), &vhdr); if (unlikely(!vh)) return 0; From f91a5b8089389eb408501af2762f168c3aaa7b79 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:04 +0000 Subject: [PATCH 44/50] af_packet: fix vlan_get_protocol_dgram() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_protocol_dgram() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8ccd05 len:29 put:14 head:ffff88807fc8e400 data:ffff88807fc8e3f4 tail:0x11 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5892 Comm: syz-executor883 Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 86 d5 25 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 5a 69 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc900038d7638 EFLAGS: 00010282 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 609ffd18ea660600 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802483c8d0 R08: ffffffff817f0a8c R09: 1ffff9200071ae60 R10: dffffc0000000000 R11: fffff5200071ae61 R12: 0000000000000140 R13: ffff88807fc8e400 R14: ffff88807fc8e3f4 R15: 0000000000000011 FS: 00007fbac5e006c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbac5e00d58 CR3: 000000001238e000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_protocol_dgram+0x165/0x290 net/packet/af_packet.c:585 packet_recvmsg+0x948/0x1ef0 net/packet/af_packet.c:3552 sock_recvmsg_nosec net/socket.c:1033 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1055 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2803 ___sys_recvmsg net/socket.c:2845 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2940 __sys_recvmmsg net/socket.c:3014 [inline] __do_sys_recvmmsg net/socket.c:3037 [inline] __se_sys_recvmmsg net/socket.c:3030 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3030 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+74f70bb1cb968bf09e4f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c5.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 16 +++++++++++++--- net/packet/af_packet.c | 16 ++++------------ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed9..d65b5d71b93b 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -585,13 +585,16 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol + * @mac_offset: MAC offset * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, - int *depth) +static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb, + __be16 type, + int mac_offset, + int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; @@ -610,7 +613,8 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, do { struct vlan_hdr vhdr, *vh; - vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + vh = skb_header_pointer(skb, mac_offset + vlan_depth, + sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; @@ -625,6 +629,12 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, return type; } +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, + int *depth) +{ + return __vlan_get_protocol_offset(skb, type, 0, depth); +} + /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e2e34a49e98d..2d73769d67f4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -568,21 +568,13 @@ static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) return ntohs(vh->h_vlan_TCI); } -static __be16 vlan_get_protocol_dgram(struct sk_buff *skb) +static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) { __be16 proto = skb->protocol; - if (unlikely(eth_type_vlan(proto))) { - u8 *skb_orig_data = skb->data; - int skb_orig_len = skb->len; - - skb_push(skb, skb->data - skb_mac_header(skb)); - proto = __vlan_get_protocol(skb, proto, NULL); - if (skb_orig_data != skb->data) { - skb->data = skb_orig_data; - skb->len = skb_orig_len; - } - } + if (unlikely(eth_type_vlan(proto))) + proto = __vlan_get_protocol_offset(skb, proto, + skb_mac_offset(skb), NULL); return proto; } From 260466b576bca0081a7d4acecc8e93687aa22d0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:28:49 +0000 Subject: [PATCH 45/50] ila: serialize calls to nf_register_net_hooks() syzbot found a race in ila_add_mapping() [1] commit 031ae72825ce ("ila: call nf_unregister_net_hooks() sooner") attempted to fix a similar issue. Looking at the syzbot repro, we have concurrent ILA_CMD_ADD commands. Add a mutex to make sure at most one thread is calling nf_register_net_hooks(). [1] BUG: KASAN: slab-use-after-free in rht_key_hashfn include/linux/rhashtable.h:159 [inline] BUG: KASAN: slab-use-after-free in __rhashtable_lookup.constprop.0+0x426/0x550 include/linux/rhashtable.h:604 Read of size 4 at addr ffff888028f40008 by task dhcpcd/5501 CPU: 1 UID: 0 PID: 5501 Comm: dhcpcd Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xc3/0x620 mm/kasan/report.c:489 kasan_report+0xd9/0x110 mm/kasan/report.c:602 rht_key_hashfn include/linux/rhashtable.h:159 [inline] __rhashtable_lookup.constprop.0+0x426/0x550 include/linux/rhashtable.h:604 rhashtable_lookup include/linux/rhashtable.h:646 [inline] rhashtable_lookup_fast include/linux/rhashtable.h:672 [inline] ila_lookup_wildcards net/ipv6/ila/ila_xlat.c:127 [inline] ila_xlat_addr net/ipv6/ila/ila_xlat.c:652 [inline] ila_nf_input+0x1ee/0x620 net/ipv6/ila/ila_xlat.c:185 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xbb/0x200 net/netfilter/core.c:626 nf_hook.constprop.0+0x42e/0x750 include/linux/netfilter.h:269 NF_HOOK include/linux/netfilter.h:312 [inline] ipv6_rcv+0xa4/0x680 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core+0x12e/0x1e0 net/core/dev.c:5672 __netif_receive_skb+0x1d/0x160 net/core/dev.c:5785 process_backlog+0x443/0x15f0 net/core/dev.c:6117 __napi_poll.constprop.0+0xb7/0x550 net/core/dev.c:6883 napi_poll net/core/dev.c:6952 [inline] net_rx_action+0xa94/0x1010 net/core/dev.c:7074 handle_softirqs+0x213/0x8f0 kernel/softirq.c:561 __do_softirq kernel/softirq.c:595 [inline] invoke_softirq kernel/softirq.c:435 [inline] __irq_exit_rcu+0x109/0x170 kernel/softirq.c:662 irq_exit_rcu+0x9/0x30 kernel/softirq.c:678 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0xa4/0xc0 arch/x86/kernel/apic/apic.c:1049 Fixes: 7f00feaf1076 ("ila: Add generic ILA translation facility") Reported-by: syzbot+47e761d22ecf745f72b9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c9ae.050a0220.2f3838.04c7.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Tom Herbert Link: https://patch.msgid.link/20241230162849.2795486-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ila/ila_xlat.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 7646e401c630..1d41b2ab4884 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -195,6 +195,8 @@ static const struct nf_hook_ops ila_nf_hook_ops[] = { }, }; +static DEFINE_MUTEX(ila_mutex); + static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); @@ -202,16 +204,20 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; - if (!ilan->xlat.hooks_registered) { + if (!READ_ONCE(ilan->xlat.hooks_registered)) { /* We defer registering net hooks in the namespace until the * first mapping is added. */ - err = nf_register_net_hooks(net, ila_nf_hook_ops, - ARRAY_SIZE(ila_nf_hook_ops)); + mutex_lock(&ila_mutex); + if (!ilan->xlat.hooks_registered) { + err = nf_register_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); + if (!err) + WRITE_ONCE(ilan->xlat.hooks_registered, true); + } + mutex_unlock(&ila_mutex); if (err) return err; - - ilan->xlat.hooks_registered = true; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); From 449e6912a2522af672e99992e1201a454910864e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:30 +0100 Subject: [PATCH 46/50] mptcp: fix recvbuffer adjust on sleeping rcvmsg If the recvmsg() blocks after receiving some data - i.e. due to SO_RCVLOWAT - the MPTCP code will attempt multiple times to adjust the receive buffer size, wrongly accounting every time the cumulative of received data - instead of accounting only for the delta. Address the issue moving mptcp_rcv_space_adjust just after the data reception and passing it only the just received bytes. This also removes an unneeded difference between the TCP and MPTCP RX code path implementation. Fixes: 581302298524 ("mptcp: error out earlier on disconnect") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-1-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 08a72242428c..27afdb7e2071 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1939,6 +1939,8 @@ do_error: goto out; } +static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); + static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, @@ -1992,6 +1994,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, break; } + mptcp_rcv_space_adjust(msk, copied); return copied; } @@ -2268,7 +2271,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); - mptcp_rcv_space_adjust(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2276,8 +2278,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } - mptcp_rcv_space_adjust(msk, copied); - out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) From 551844f26da2a9f76c0a698baaffa631d1178645 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:31 +0100 Subject: [PATCH 47/50] mptcp: don't always assume copied data in mptcp_cleanup_rbuf() Under some corner cases the MPTCP protocol can end-up invoking mptcp_cleanup_rbuf() when no data has been copied, but such helper assumes the opposite condition. Explicitly drop such assumption and performs the costly call only when strictly needed - before releasing the msk socket lock. Fixes: fd8976790a6c ("mptcp: be careful on MPTCP-level ack.") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-2-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 27afdb7e2071..5307fff9d995 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -528,13 +528,13 @@ static void mptcp_send_ack(struct mptcp_sock *msk) mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } -static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) +static void mptcp_subflow_cleanup_rbuf(struct sock *ssk, int copied) { bool slow; slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) - tcp_cleanup_rbuf(ssk, 1); + tcp_cleanup_rbuf(ssk, copied); unlock_sock_fast(ssk, slow); } @@ -551,7 +551,7 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } -static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) +static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) { int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; @@ -559,14 +559,14 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) int space = __mptcp_space(sk); bool cleanup, rx_empty; - cleanup = (space > 0) && (space >= (old_space << 1)); - rx_empty = !__mptcp_rmem(sk); + cleanup = (space > 0) && (space >= (old_space << 1)) && copied; + rx_empty = !__mptcp_rmem(sk) && copied; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) - mptcp_subflow_cleanup_rbuf(ssk); + mptcp_subflow_cleanup_rbuf(ssk, copied); } } @@ -2220,9 +2220,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - /* be sure to advertise window change */ - mptcp_cleanup_rbuf(msk); - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; @@ -2271,6 +2268,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); + mptcp_cleanup_rbuf(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2278,6 +2276,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } + mptcp_cleanup_rbuf(msk, copied); + out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) From 56b824eb49d6258aa0bad09a406ceac3f643cdae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:32 +0100 Subject: [PATCH 48/50] mptcp: prevent excessive coalescing on receive Currently the skb size after coalescing is only limited by the skb layout (the skb must not carry frag_list). A single coalesced skb covering several MSS can potentially fill completely the receive buffer. In such a case, the snd win will zero until the receive buffer will be empty again, affecting tput badly. Fixes: 8268ed4c9d19 ("mptcp: introduce and use mptcp_try_coalesce()") Cc: stable@vger.kernel.org # please delay 2 weeks after 6.13-final release Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-3-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5307fff9d995..1b2e7cbb577f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -136,6 +136,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, int delta; if (MPTCP_SKB_CB(from)->offset || + ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || !skb_try_coalesce(to, from, &fragstolen, &delta)) return false; From 9facce84f4062f782ebde18daa7006a23d40b607 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Mon, 23 Dec 2024 20:45:49 +0530 Subject: [PATCH 49/50] net: ti: icssg-prueth: Fix firmware load sequence. Timesync related operations are ran in PRU0 cores for both ICSSG SLICE0 and SLICE1. Currently whenever any ICSSG interface comes up we load the respective firmwares to PRU cores and whenever interface goes down, we stop the resective cores. Due to this, when SLICE0 goes down while SLICE1 is still active, PRU0 firmwares are unloaded and PRU0 core is stopped. This results in clock jump for SLICE1 interface as the timesync related operations are no longer running. As there are interdependencies between SLICE0 and SLICE1 firmwares, fix this by running both PRU0 and PRU1 firmwares as long as at least 1 ICSSG interface is up. Add new flag in prueth struct to check if all firmwares are running and remove the old flag (fw_running). Use emacs_initialized as reference count to load the firmwares for the first and last interface up/down. Moving init_emac_mode and fw_offload_mode API outside of icssg_config to icssg_common_start API as they need to be called only once per firmware boot. Change prueth_emac_restart() to return error code and add error prints inside the caller of this functions in case of any failures. Move prueth_emac_stop() from common to sr1 driver. sr1 and sr2 drivers have different logic handling for stopping the firmwares. While sr1 driver is dependent on emac structure to stop the corresponding pru cores for that slice, for sr2 all the pru cores of both the slices are stopped and is not dependent on emac. So the prueth_emac_stop() function is no longer common and can be moved to sr1 driver. Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") Signed-off-by: MD Danish Anwar Signed-off-by: Meghana Malladi Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/icssg/icssg_common.c | 25 -- drivers/net/ethernet/ti/icssg/icssg_config.c | 41 ++- drivers/net/ethernet/ti/icssg/icssg_config.h | 1 + drivers/net/ethernet/ti/icssg/icssg_prueth.c | 281 ++++++++++++------ drivers/net/ethernet/ti/icssg/icssg_prueth.h | 5 +- .../net/ethernet/ti/icssg/icssg_prueth_sr1.c | 24 +- 6 files changed, 246 insertions(+), 131 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_common.c b/drivers/net/ethernet/ti/icssg/icssg_common.c index fdebeb2f84e0..74f0f200a89d 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_common.c +++ b/drivers/net/ethernet/ti/icssg/icssg_common.c @@ -855,31 +855,6 @@ irqreturn_t prueth_rx_irq(int irq, void *dev_id) } EXPORT_SYMBOL_GPL(prueth_rx_irq); -void prueth_emac_stop(struct prueth_emac *emac) -{ - struct prueth *prueth = emac->prueth; - int slice; - - switch (emac->port_id) { - case PRUETH_PORT_MII0: - slice = ICSS_SLICE0; - break; - case PRUETH_PORT_MII1: - slice = ICSS_SLICE1; - break; - default: - netdev_err(emac->ndev, "invalid port\n"); - return; - } - - emac->fw_running = 0; - if (!emac->is_sr1) - rproc_shutdown(prueth->txpru[slice]); - rproc_shutdown(prueth->rtu[slice]); - rproc_shutdown(prueth->pru[slice]); -} -EXPORT_SYMBOL_GPL(prueth_emac_stop); - void prueth_cleanup_tx_ts(struct prueth_emac *emac) { int i; diff --git a/drivers/net/ethernet/ti/icssg/icssg_config.c b/drivers/net/ethernet/ti/icssg/icssg_config.c index 5d2491c2943a..ddfd1c02a885 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_config.c +++ b/drivers/net/ethernet/ti/icssg/icssg_config.c @@ -397,7 +397,7 @@ static int prueth_emac_buffer_setup(struct prueth_emac *emac) return 0; } -static void icssg_init_emac_mode(struct prueth *prueth) +void icssg_init_emac_mode(struct prueth *prueth) { /* When the device is configured as a bridge and it is being brought * back to the emac mode, the host mac address has to be set as 0. @@ -406,9 +406,6 @@ static void icssg_init_emac_mode(struct prueth *prueth) int i; u8 mac[ETH_ALEN] = { 0 }; - if (prueth->emacs_initialized) - return; - /* Set VLAN TABLE address base */ regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, SMEM_VLAN_OFFSET_MASK, addr << SMEM_VLAN_OFFSET); @@ -423,15 +420,13 @@ static void icssg_init_emac_mode(struct prueth *prueth) /* Clear host MAC address */ icssg_class_set_host_mac_addr(prueth->miig_rt, mac); } +EXPORT_SYMBOL_GPL(icssg_init_emac_mode); -static void icssg_init_fw_offload_mode(struct prueth *prueth) +void icssg_init_fw_offload_mode(struct prueth *prueth) { u32 addr = prueth->shram.pa + EMAC_ICSSG_SWITCH_DEFAULT_VLAN_TABLE_OFFSET; int i; - if (prueth->emacs_initialized) - return; - /* Set VLAN TABLE address base */ regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, SMEM_VLAN_OFFSET_MASK, addr << SMEM_VLAN_OFFSET); @@ -448,6 +443,7 @@ static void icssg_init_fw_offload_mode(struct prueth *prueth) icssg_class_set_host_mac_addr(prueth->miig_rt, prueth->hw_bridge_dev->dev_addr); icssg_set_pvid(prueth, prueth->default_vlan, PRUETH_PORT_HOST); } +EXPORT_SYMBOL_GPL(icssg_init_fw_offload_mode); int icssg_config(struct prueth *prueth, struct prueth_emac *emac, int slice) { @@ -455,11 +451,6 @@ int icssg_config(struct prueth *prueth, struct prueth_emac *emac, int slice) struct icssg_flow_cfg __iomem *flow_cfg; int ret; - if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) - icssg_init_fw_offload_mode(prueth); - else - icssg_init_emac_mode(prueth); - memset_io(config, 0, TAS_GATE_MASK_LIST0); icssg_miig_queues_init(prueth, slice); @@ -786,3 +777,27 @@ void icssg_set_pvid(struct prueth *prueth, u8 vid, u8 port) writel(pvid, prueth->shram.va + EMAC_ICSSG_SWITCH_PORT0_DEFAULT_VLAN_OFFSET); } EXPORT_SYMBOL_GPL(icssg_set_pvid); + +int emac_fdb_flow_id_updated(struct prueth_emac *emac) +{ + struct mgmt_cmd_rsp fdb_cmd_rsp = { 0 }; + int slice = prueth_emac_slice(emac); + struct mgmt_cmd fdb_cmd = { 0 }; + int ret; + + fdb_cmd.header = ICSSG_FW_MGMT_CMD_HEADER; + fdb_cmd.type = ICSSG_FW_MGMT_FDB_CMD_TYPE_RX_FLOW; + fdb_cmd.seqnum = ++(emac->prueth->icssg_hwcmdseq); + fdb_cmd.param = 0; + + fdb_cmd.param |= (slice << 4); + fdb_cmd.cmd_args[0] = 0; + + ret = icssg_send_fdb_msg(emac, &fdb_cmd, &fdb_cmd_rsp); + if (ret) + return ret; + + WARN_ON(fdb_cmd.seqnum != fdb_cmd_rsp.seqnum); + return fdb_cmd_rsp.status == 1 ? 0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(emac_fdb_flow_id_updated); diff --git a/drivers/net/ethernet/ti/icssg/icssg_config.h b/drivers/net/ethernet/ti/icssg/icssg_config.h index 92c2deaa3068..c884e9fa099e 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_config.h +++ b/drivers/net/ethernet/ti/icssg/icssg_config.h @@ -55,6 +55,7 @@ struct icssg_rxq_ctx { #define ICSSG_FW_MGMT_FDB_CMD_TYPE 0x03 #define ICSSG_FW_MGMT_CMD_TYPE 0x04 #define ICSSG_FW_MGMT_PKT 0x80000000 +#define ICSSG_FW_MGMT_FDB_CMD_TYPE_RX_FLOW 0x05 struct icssg_r30_cmd { u32 cmd[4]; diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index c568c84a032b..d76fe6d05e10 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -164,11 +164,26 @@ static struct icssg_firmwares icssg_emac_firmwares[] = { } }; -static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) +static int prueth_start(struct rproc *rproc, const char *fw_name) +{ + int ret; + + ret = rproc_set_firmware(rproc, fw_name); + if (ret) + return ret; + return rproc_boot(rproc); +} + +static void prueth_shutdown(struct rproc *rproc) +{ + rproc_shutdown(rproc); +} + +static int prueth_emac_start(struct prueth *prueth) { struct icssg_firmwares *firmwares; struct device *dev = prueth->dev; - int slice, ret; + int ret, slice; if (prueth->is_switch_mode) firmwares = icssg_switch_firmwares; @@ -177,49 +192,126 @@ static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) else firmwares = icssg_emac_firmwares; - slice = prueth_emac_slice(emac); - if (slice < 0) { - netdev_err(emac->ndev, "invalid port\n"); - return -EINVAL; + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + ret = prueth_start(prueth->pru[slice], firmwares[slice].pru); + if (ret) { + dev_err(dev, "failed to boot PRU%d: %d\n", slice, ret); + goto unwind_slices; + } + + ret = prueth_start(prueth->rtu[slice], firmwares[slice].rtu); + if (ret) { + dev_err(dev, "failed to boot RTU%d: %d\n", slice, ret); + rproc_shutdown(prueth->pru[slice]); + goto unwind_slices; + } + + ret = prueth_start(prueth->txpru[slice], firmwares[slice].txpru); + if (ret) { + dev_err(dev, "failed to boot TX_PRU%d: %d\n", slice, ret); + rproc_shutdown(prueth->rtu[slice]); + rproc_shutdown(prueth->pru[slice]); + goto unwind_slices; + } } - ret = icssg_config(prueth, emac, slice); - if (ret) - return ret; - - ret = rproc_set_firmware(prueth->pru[slice], firmwares[slice].pru); - ret = rproc_boot(prueth->pru[slice]); - if (ret) { - dev_err(dev, "failed to boot PRU%d: %d\n", slice, ret); - return -EINVAL; - } - - ret = rproc_set_firmware(prueth->rtu[slice], firmwares[slice].rtu); - ret = rproc_boot(prueth->rtu[slice]); - if (ret) { - dev_err(dev, "failed to boot RTU%d: %d\n", slice, ret); - goto halt_pru; - } - - ret = rproc_set_firmware(prueth->txpru[slice], firmwares[slice].txpru); - ret = rproc_boot(prueth->txpru[slice]); - if (ret) { - dev_err(dev, "failed to boot TX_PRU%d: %d\n", slice, ret); - goto halt_rtu; - } - - emac->fw_running = 1; return 0; -halt_rtu: - rproc_shutdown(prueth->rtu[slice]); - -halt_pru: - rproc_shutdown(prueth->pru[slice]); +unwind_slices: + while (--slice >= 0) { + prueth_shutdown(prueth->txpru[slice]); + prueth_shutdown(prueth->rtu[slice]); + prueth_shutdown(prueth->pru[slice]); + } return ret; } +static void prueth_emac_stop(struct prueth *prueth) +{ + int slice; + + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + prueth_shutdown(prueth->txpru[slice]); + prueth_shutdown(prueth->rtu[slice]); + prueth_shutdown(prueth->pru[slice]); + } +} + +static int prueth_emac_common_start(struct prueth *prueth) +{ + struct prueth_emac *emac; + int ret = 0; + int slice; + + if (!prueth->emac[ICSS_SLICE0] && !prueth->emac[ICSS_SLICE1]) + return -EINVAL; + + /* clear SMEM and MSMC settings for all slices */ + memset_io(prueth->msmcram.va, 0, prueth->msmcram.size); + memset_io(prueth->shram.va, 0, ICSSG_CONFIG_OFFSET_SLICE1 * PRUETH_NUM_MACS); + + icssg_class_default(prueth->miig_rt, ICSS_SLICE0, 0, false); + icssg_class_default(prueth->miig_rt, ICSS_SLICE1, 0, false); + + if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) + icssg_init_fw_offload_mode(prueth); + else + icssg_init_emac_mode(prueth); + + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + emac = prueth->emac[slice]; + if (!emac) + continue; + ret = icssg_config(prueth, emac, slice); + if (ret) + goto disable_class; + } + + ret = prueth_emac_start(prueth); + if (ret) + goto disable_class; + + emac = prueth->emac[ICSS_SLICE0] ? prueth->emac[ICSS_SLICE0] : + prueth->emac[ICSS_SLICE1]; + ret = icss_iep_init(emac->iep, &prueth_iep_clockops, + emac, IEP_DEFAULT_CYCLE_TIME_NS); + if (ret) { + dev_err(prueth->dev, "Failed to initialize IEP module\n"); + goto stop_pruss; + } + + return 0; + +stop_pruss: + prueth_emac_stop(prueth); + +disable_class: + icssg_class_disable(prueth->miig_rt, ICSS_SLICE0); + icssg_class_disable(prueth->miig_rt, ICSS_SLICE1); + + return ret; +} + +static int prueth_emac_common_stop(struct prueth *prueth) +{ + struct prueth_emac *emac; + + if (!prueth->emac[ICSS_SLICE0] && !prueth->emac[ICSS_SLICE1]) + return -EINVAL; + + icssg_class_disable(prueth->miig_rt, ICSS_SLICE0); + icssg_class_disable(prueth->miig_rt, ICSS_SLICE1); + + prueth_emac_stop(prueth); + + emac = prueth->emac[ICSS_SLICE0] ? prueth->emac[ICSS_SLICE0] : + prueth->emac[ICSS_SLICE1]; + icss_iep_exit(emac->iep); + + return 0; +} + /* called back by PHY layer if there is change in link state of hw port*/ static void emac_adjust_link(struct net_device *ndev) { @@ -374,9 +466,6 @@ static void prueth_iep_settime(void *clockops_data, u64 ns) u32 cycletime; int timeout; - if (!emac->fw_running) - return; - sc_descp = emac->prueth->shram.va + TIMESYNC_FW_WC_SETCLOCK_DESC_OFFSET; cycletime = IEP_DEFAULT_CYCLE_TIME_NS; @@ -543,23 +632,17 @@ static int emac_ndo_open(struct net_device *ndev) { struct prueth_emac *emac = netdev_priv(ndev); int ret, i, num_data_chn = emac->tx_ch_num; + struct icssg_flow_cfg __iomem *flow_cfg; struct prueth *prueth = emac->prueth; int slice = prueth_emac_slice(emac); struct device *dev = prueth->dev; int max_rx_flows; int rx_flow; - /* clear SMEM and MSMC settings for all slices */ - if (!prueth->emacs_initialized) { - memset_io(prueth->msmcram.va, 0, prueth->msmcram.size); - memset_io(prueth->shram.va, 0, ICSSG_CONFIG_OFFSET_SLICE1 * PRUETH_NUM_MACS); - } - /* set h/w MAC as user might have re-configured */ ether_addr_copy(emac->mac_addr, ndev->dev_addr); icssg_class_set_mac_addr(prueth->miig_rt, slice, emac->mac_addr); - icssg_class_default(prueth->miig_rt, slice, 0, false); icssg_ft1_set_mac_addr(prueth->miig_rt, slice, emac->mac_addr); /* Notify the stack of the actual queue counts. */ @@ -597,18 +680,23 @@ static int emac_ndo_open(struct net_device *ndev) goto cleanup_napi; } - /* reset and start PRU firmware */ - ret = prueth_emac_start(prueth, emac); - if (ret) - goto free_rx_irq; + if (!prueth->emacs_initialized) { + ret = prueth_emac_common_start(prueth); + if (ret) + goto free_rx_irq; + } + + flow_cfg = emac->dram.va + ICSSG_CONFIG_OFFSET + PSI_L_REGULAR_FLOW_ID_BASE_OFFSET; + writew(emac->rx_flow_id_base, &flow_cfg->rx_base_flow); + ret = emac_fdb_flow_id_updated(emac); + + if (ret) { + netdev_err(ndev, "Failed to update Rx Flow ID %d", ret); + goto stop; + } icssg_mii_update_mtu(prueth->mii_rt, slice, ndev->max_mtu); - if (!prueth->emacs_initialized) { - ret = icss_iep_init(emac->iep, &prueth_iep_clockops, - emac, IEP_DEFAULT_CYCLE_TIME_NS); - } - ret = request_threaded_irq(emac->tx_ts_irq, NULL, prueth_tx_ts_irq, IRQF_ONESHOT, dev_name(dev), emac); if (ret) @@ -653,7 +741,8 @@ reset_rx_chn: free_tx_ts_irq: free_irq(emac->tx_ts_irq, emac); stop: - prueth_emac_stop(emac); + if (!prueth->emacs_initialized) + prueth_emac_common_stop(prueth); free_rx_irq: free_irq(emac->rx_chns.irq[rx_flow], emac); cleanup_napi: @@ -689,8 +778,6 @@ static int emac_ndo_stop(struct net_device *ndev) if (ndev->phydev) phy_stop(ndev->phydev); - icssg_class_disable(prueth->miig_rt, prueth_emac_slice(emac)); - if (emac->prueth->is_hsr_offload_mode) __dev_mc_unsync(ndev, icssg_prueth_hsr_del_mcast); else @@ -728,11 +815,9 @@ static int emac_ndo_stop(struct net_device *ndev) /* Destroying the queued work in ndo_stop() */ cancel_delayed_work_sync(&emac->stats_work); - if (prueth->emacs_initialized == 1) - icss_iep_exit(emac->iep); - /* stop PRUs */ - prueth_emac_stop(emac); + if (prueth->emacs_initialized == 1) + prueth_emac_common_stop(prueth); free_irq(emac->tx_ts_irq, emac); @@ -1053,10 +1138,11 @@ static void prueth_offload_fwd_mark_update(struct prueth *prueth) } } -static void prueth_emac_restart(struct prueth *prueth) +static int prueth_emac_restart(struct prueth *prueth) { struct prueth_emac *emac0 = prueth->emac[PRUETH_MAC0]; struct prueth_emac *emac1 = prueth->emac[PRUETH_MAC1]; + int ret; /* Detach the net_device for both PRUeth ports*/ if (netif_running(emac0->ndev)) @@ -1065,36 +1151,46 @@ static void prueth_emac_restart(struct prueth *prueth) netif_device_detach(emac1->ndev); /* Disable both PRUeth ports */ - icssg_set_port_state(emac0, ICSSG_EMAC_PORT_DISABLE); - icssg_set_port_state(emac1, ICSSG_EMAC_PORT_DISABLE); + ret = icssg_set_port_state(emac0, ICSSG_EMAC_PORT_DISABLE); + ret |= icssg_set_port_state(emac1, ICSSG_EMAC_PORT_DISABLE); + if (ret) + return ret; /* Stop both pru cores for both PRUeth ports*/ - prueth_emac_stop(emac0); - prueth->emacs_initialized--; - prueth_emac_stop(emac1); - prueth->emacs_initialized--; + ret = prueth_emac_common_stop(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to stop the firmwares"); + return ret; + } /* Start both pru cores for both PRUeth ports */ - prueth_emac_start(prueth, emac0); - prueth->emacs_initialized++; - prueth_emac_start(prueth, emac1); - prueth->emacs_initialized++; + ret = prueth_emac_common_start(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to start the firmwares"); + return ret; + } /* Enable forwarding for both PRUeth ports */ - icssg_set_port_state(emac0, ICSSG_EMAC_PORT_FORWARD); - icssg_set_port_state(emac1, ICSSG_EMAC_PORT_FORWARD); + ret = icssg_set_port_state(emac0, ICSSG_EMAC_PORT_FORWARD); + ret |= icssg_set_port_state(emac1, ICSSG_EMAC_PORT_FORWARD); /* Attache net_device for both PRUeth ports */ netif_device_attach(emac0->ndev); netif_device_attach(emac1->ndev); + + return ret; } static void icssg_change_mode(struct prueth *prueth) { struct prueth_emac *emac; - int mac; + int mac, ret; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) { emac = prueth->emac[mac]; @@ -1173,13 +1269,18 @@ static void prueth_netdevice_port_unlink(struct net_device *ndev) { struct prueth_emac *emac = netdev_priv(ndev); struct prueth *prueth = emac->prueth; + int ret; prueth->br_members &= ~BIT(emac->port_id); if (prueth->is_switch_mode) { prueth->is_switch_mode = false; emac->port_vlan = 0; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } } prueth_offload_fwd_mark_update(prueth); @@ -1228,6 +1329,7 @@ static void prueth_hsr_port_unlink(struct net_device *ndev) struct prueth *prueth = emac->prueth; struct prueth_emac *emac0; struct prueth_emac *emac1; + int ret; emac0 = prueth->emac[PRUETH_MAC0]; emac1 = prueth->emac[PRUETH_MAC1]; @@ -1238,7 +1340,11 @@ static void prueth_hsr_port_unlink(struct net_device *ndev) emac0->port_vlan = 0; emac1->port_vlan = 0; prueth->hsr_dev = NULL; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } netdev_dbg(ndev, "Disabling HSR Offload mode\n"); } } @@ -1413,13 +1519,10 @@ static int prueth_probe(struct platform_device *pdev) prueth->pa_stats = NULL; } - if (eth0_node) { + if (eth0_node || eth1_node) { ret = prueth_get_cores(prueth, ICSS_SLICE0, false); if (ret) goto put_cores; - } - - if (eth1_node) { ret = prueth_get_cores(prueth, ICSS_SLICE1, false); if (ret) goto put_cores; @@ -1618,14 +1721,12 @@ put_pruss: pruss_put(prueth->pruss); put_cores: - if (eth1_node) { - prueth_put_cores(prueth, ICSS_SLICE1); - of_node_put(eth1_node); - } - - if (eth0_node) { + if (eth0_node || eth1_node) { prueth_put_cores(prueth, ICSS_SLICE0); of_node_put(eth0_node); + + prueth_put_cores(prueth, ICSS_SLICE1); + of_node_put(eth1_node); } return ret; diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h index f5c1d473e9f9..5473315ea204 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h @@ -140,7 +140,6 @@ struct prueth_rx_chn { /* data for each emac port */ struct prueth_emac { bool is_sr1; - bool fw_running; struct prueth *prueth; struct net_device *ndev; u8 mac_addr[6]; @@ -361,6 +360,8 @@ int icssg_set_port_state(struct prueth_emac *emac, enum icssg_port_state_cmd state); void icssg_config_set_speed(struct prueth_emac *emac); void icssg_config_half_duplex(struct prueth_emac *emac); +void icssg_init_emac_mode(struct prueth *prueth); +void icssg_init_fw_offload_mode(struct prueth *prueth); /* Buffer queue helpers */ int icssg_queue_pop(struct prueth *prueth, u8 queue); @@ -377,6 +378,7 @@ void icssg_vtbl_modify(struct prueth_emac *emac, u8 vid, u8 port_mask, u8 untag_mask, bool add); u16 icssg_get_pvid(struct prueth_emac *emac); void icssg_set_pvid(struct prueth *prueth, u8 vid, u8 port); +int emac_fdb_flow_id_updated(struct prueth_emac *emac); #define prueth_napi_to_tx_chn(pnapi) \ container_of(pnapi, struct prueth_tx_chn, napi_tx) @@ -407,7 +409,6 @@ void emac_rx_timestamp(struct prueth_emac *emac, struct sk_buff *skb, u32 *psdata); enum netdev_tx icssg_ndo_start_xmit(struct sk_buff *skb, struct net_device *ndev); irqreturn_t prueth_rx_irq(int irq, void *dev_id); -void prueth_emac_stop(struct prueth_emac *emac); void prueth_cleanup_tx_ts(struct prueth_emac *emac); int icssg_napi_rx_poll(struct napi_struct *napi_rx, int budget); int prueth_prepare_rx_chan(struct prueth_emac *emac, diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c b/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c index 5024f0647a0d..3dc86397c367 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c @@ -440,7 +440,6 @@ static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) goto halt_pru; } - emac->fw_running = 1; return 0; halt_pru: @@ -449,6 +448,29 @@ halt_pru: return ret; } +static void prueth_emac_stop(struct prueth_emac *emac) +{ + struct prueth *prueth = emac->prueth; + int slice; + + switch (emac->port_id) { + case PRUETH_PORT_MII0: + slice = ICSS_SLICE0; + break; + case PRUETH_PORT_MII1: + slice = ICSS_SLICE1; + break; + default: + netdev_err(emac->ndev, "invalid port\n"); + return; + } + + if (!emac->is_sr1) + rproc_shutdown(prueth->txpru[slice]); + rproc_shutdown(prueth->rtu[slice]); + rproc_shutdown(prueth->pru[slice]); +} + /** * emac_ndo_open - EMAC device open * @ndev: network adapter device From 9b115361248dc6cce182a2dc030c1c70b0a9639e Mon Sep 17 00:00:00 2001 From: Meghana Malladi Date: Mon, 23 Dec 2024 20:45:50 +0530 Subject: [PATCH 50/50] net: ti: icssg-prueth: Fix clearing of IEP_CMP_CFG registers during iep_init When ICSSG interfaces are brought down and brought up again, the pru cores are shut down and booted again, flushing out all the memories and start again in a clean state. Hence it is expected that the IEP_CMP_CFG register needs to be flushed during iep_init() to ensure that the existing residual configuration doesn't cause any unusual behavior. If the register is not cleared, existing IEP_CMP_CFG set for CMP1 will result in SYNC0_OUT signal based on the SYNC_OUT register values. After bringing the interface up, calling PPS enable doesn't work as the driver believes PPS is already enabled, (iep->pps_enabled is not cleared during interface bring down) and driver will just return true even though there is no signal. Fix this by disabling pps and perout. Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") Signed-off-by: Meghana Malladi Reviewed-by: Roger Quadros Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/icssg/icss_iep.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c index 5d6d1cf78e93..768578c0d958 100644 --- a/drivers/net/ethernet/ti/icssg/icss_iep.c +++ b/drivers/net/ethernet/ti/icssg/icss_iep.c @@ -215,6 +215,9 @@ static void icss_iep_enable_shadow_mode(struct icss_iep *iep) for (cmp = IEP_MIN_CMP; cmp < IEP_MAX_CMP; cmp++) { regmap_update_bits(iep->map, ICSS_IEP_CMP_STAT_REG, IEP_CMP_STATUS(cmp), IEP_CMP_STATUS(cmp)); + + regmap_update_bits(iep->map, ICSS_IEP_CMP_CFG_REG, + IEP_CMP_CFG_CMP_EN(cmp), 0); } /* enable reset counter on CMP0 event */ @@ -780,6 +783,11 @@ int icss_iep_exit(struct icss_iep *iep) } icss_iep_disable(iep); + if (iep->pps_enabled) + icss_iep_pps_enable(iep, false); + else if (iep->perout_enabled) + icss_iep_perout_enable(iep, NULL, false); + return 0; } EXPORT_SYMBOL_GPL(icss_iep_exit);