From b1cb8a71f1eaec4eb77051590f7f561f25b15e32 Mon Sep 17 00:00:00 2001
From: Shigeru Yoshida <syoshida@redhat.com>
Date: Sat, 20 Aug 2022 12:25:16 +0900
Subject: [PATCH 01/83] batman-adv: Fix hang up with small MTU hard-interface

The system hangs up when batman-adv soft-interface is created on
hard-interface with small MTU.  For example, the following commands
create batman-adv soft-interface on dummy interface with zero MTU:

  # ip link add name dummy0 type dummy
  # ip link set mtu 0 dev dummy0
  # ip link set up dev dummy0
  # ip link add name bat0 type batadv
  # ip link set dev dummy0 master bat0

These commands cause the system hang up with the following messages:

  [   90.578925][ T6689] batman_adv: bat0: Adding interface: dummy0
  [   90.580884][ T6689] batman_adv: bat0: The MTU of interface dummy0 is too small (0) to handle the transport of batman-adv packets. Packets going over this interface will be fragmented on layer2 which could impact the performance. Setting the MTU to 1560 would solve the problem.
  [   90.586264][ T6689] batman_adv: bat0: Interface activated: dummy0
  [   90.590061][ T6689] batman_adv: bat0: Forced to purge local tt entries to fit new maximum fragment MTU (-320)
  [   90.595517][ T6689] batman_adv: bat0: Forced to purge local tt entries to fit new maximum fragment MTU (-320)
  [   90.598499][ T6689] batman_adv: bat0: Forced to purge local tt entries to fit new maximum fragment MTU (-320)

This patch fixes this issue by returning error when enabling
hard-interface with small MTU size.

Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol")
Signed-off-by: Shigeru Yoshida <syoshida@redhat.com>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/hard-interface.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index b8f8da7ee3de..41c1ad33d009 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -10,6 +10,7 @@
 #include <linux/atomic.h>
 #include <linux/byteorder/generic.h>
 #include <linux/container_of.h>
+#include <linux/errno.h>
 #include <linux/gfp.h>
 #include <linux/if.h>
 #include <linux/if_arp.h>
@@ -700,6 +701,9 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
 	int max_header_len = batadv_max_header_len();
 	int ret;
 
+	if (hard_iface->net_dev->mtu < ETH_MIN_MTU + max_header_len)
+		return -EINVAL;
+
 	if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
 		goto out;
 

From 4d8421f2dd88583cc7a4d6c2a5532c35e816a52a Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Mon, 5 Sep 2022 19:22:46 +0200
Subject: [PATCH 02/83] wifi: iwlwifi: don't spam logs with NSS>2 messages

I get a log line like this every 4 seconds when connected to my AP:

[15650.221468] iwlwifi 0000:09:00.0: Got NSS = 4 - trimming to 2

Looking at the code, this seems to be related to a hardware limitation,
and there's nothing to be done. In an effort to keep my dmesg
manageable, downgrade this error to "debug" rather than "info".

Cc: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20220905172246.105383-1-Jason@zx2c4.com
---
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 5eb28f8ee87e..11536f115198 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -1833,8 +1833,8 @@ static void iwl_mvm_parse_ppe(struct iwl_mvm *mvm,
 	* If nss < MAX: we can set zeros in other streams
 	*/
 	if (nss > MAX_HE_SUPP_NSS) {
-		IWL_INFO(mvm, "Got NSS = %d - trimming to %d\n", nss,
-			 MAX_HE_SUPP_NSS);
+		IWL_DEBUG_INFO(mvm, "Got NSS = %d - trimming to %d\n", nss,
+			       MAX_HE_SUPP_NSS);
 		nss = MAX_HE_SUPP_NSS;
 	}
 

From 39aebedeaaa95757f5c1f2ddb5f43fdddbf478ca Mon Sep 17 00:00:00 2001
From: Igor Ryzhov <iryzhov@nfware.com>
Date: Wed, 5 Jun 2019 12:32:40 +0300
Subject: [PATCH 03/83] netfilter: nf_conntrack_sip: fix ct_sip_walk_headers

ct_sip_next_header and ct_sip_get_header return an absolute
value of matchoff, not a shift from current dataoff.
So dataoff should be assigned matchoff, not incremented by it.

This issue can be seen in the scenario when there are multiple
Contact headers and the first one is using a hostname and other headers
use IP addresses. In this case, ct_sip_walk_headers will work as follows:

The first ct_sip_get_header call to will find the first Contact header
but will return -1 as the header uses a hostname. But matchoff will
be changed to the offset of this header. After that, dataoff should be
set to matchoff, so that the next ct_sip_get_header call find the next
Contact header. But instead of assigning dataoff to matchoff, it is
incremented by it, which is not correct, as matchoff is an absolute
value of the offset. So on the next call to the ct_sip_get_header,
dataoff will be incorrect, and the next Contact header may not be
found at all.

Fixes: 05e3ced297fe ("[NETFILTER]: nf_conntrack_sip: introduce SIP-URI parsing helper")
Signed-off-by: Igor Ryzhov <iryzhov@nfware.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_conntrack_sip.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index daf06f71d31c..77f5e82d8e3f 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -477,7 +477,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
 				return ret;
 			if (ret == 0)
 				break;
-			dataoff += *matchoff;
+			dataoff = *matchoff;
 		}
 		*in_header = 0;
 	}
@@ -489,7 +489,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
 			break;
 		if (ret == 0)
 			return ret;
-		dataoff += *matchoff;
+		dataoff = *matchoff;
 	}
 
 	if (in_header)

From 25b327d4f818b9d41265ea3eea26d805216589b8 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 31 Aug 2022 15:12:45 +0200
Subject: [PATCH 04/83] selftests: nft_concat_range: add socat support

There are different flavors of 'nc' around, this script fails on
my test vm because 'nc' is 'nmap-ncat' which isn't 100% compatible.

Add socat support and use it if available.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 .../selftests/netfilter/nft_concat_range.sh   | 65 +++++++++++++++----
 1 file changed, 53 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh
index a6991877e50c..e908009576c7 100755
--- a/tools/testing/selftests/netfilter/nft_concat_range.sh
+++ b/tools/testing/selftests/netfilter/nft_concat_range.sh
@@ -91,7 +91,7 @@ src
 start		1
 count		5
 src_delta	2000
-tools		sendip nc bash
+tools		sendip socat nc bash
 proto		udp
 
 race_repeat	3
@@ -116,7 +116,7 @@ src
 start		10
 count		5
 src_delta	2000
-tools		sendip nc bash
+tools		sendip socat nc bash
 proto		udp6
 
 race_repeat	3
@@ -141,7 +141,7 @@ src
 start		1
 count		5
 src_delta	2000
-tools		sendip nc bash
+tools		sendip socat nc bash
 proto		udp
 
 race_repeat	0
@@ -163,7 +163,7 @@ src		mac
 start		10
 count		5
 src_delta	2000
-tools		sendip nc bash
+tools		sendip socat nc bash
 proto		udp6
 
 race_repeat	0
@@ -185,7 +185,7 @@ src		mac proto
 start		10
 count		5
 src_delta	2000
-tools		sendip nc bash
+tools		sendip socat nc bash
 proto		udp6
 
 race_repeat	0
@@ -207,7 +207,7 @@ src		addr4
 start		1
 count		5
 src_delta	2000
-tools		sendip nc bash
+tools		sendip socat nc bash
 proto		udp
 
 race_repeat	3
@@ -227,7 +227,7 @@ src		addr6 port
 start		10
 count		5
 src_delta	2000
-tools		sendip nc
+tools		sendip socat nc
 proto		udp6
 
 race_repeat	3
@@ -247,7 +247,7 @@ src		mac proto addr4
 start		1
 count		5
 src_delta	2000
-tools		sendip nc bash
+tools		sendip socat nc bash
 proto		udp
 
 race_repeat	0
@@ -264,7 +264,7 @@ src		mac
 start		1
 count		5
 src_delta	2000
-tools		sendip nc bash
+tools		sendip socat nc bash
 proto		udp
 
 race_repeat	0
@@ -286,7 +286,7 @@ src		mac addr4
 start		1
 count		5
 src_delta	2000
-tools		sendip nc bash
+tools		sendip socat nc bash
 proto		udp
 
 race_repeat	0
@@ -337,7 +337,7 @@ src		addr4
 start		1
 count		5
 src_delta	2000
-tools		sendip nc
+tools		sendip socat nc
 proto		udp
 
 race_repeat	3
@@ -363,7 +363,7 @@ src		mac
 start		1
 count		1
 src_delta	2000
-tools		sendip nc bash
+tools		sendip socat nc bash
 proto		udp
 
 race_repeat	0
@@ -541,6 +541,24 @@ setup_send_udp() {
 			dst_port=
 			src_addr4=
 		}
+	elif command -v socat -v >/dev/null; then
+		send_udp() {
+			if [ -n "${src_addr4}" ]; then
+				B ip addr add "${src_addr4}" dev veth_b
+				__socatbind=",bind=${src_addr4}"
+				if [ -n "${src_port}" ];then
+					__socatbind="${__socatbind}:${src_port}"
+				fi
+			fi
+
+			ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+			[ -z "${dst_port}" ] && dst_port=12345
+
+			echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:${dst_addr4}:${dst_port}"${__socatbind}"
+
+			src_addr4=
+			src_port=
+		}
 	elif command -v nc >/dev/null; then
 		if nc -u -w0 1.1.1.1 1 2>/dev/null; then
 			# OpenBSD netcat
@@ -606,6 +624,29 @@ setup_send_udp6() {
 			dst_port=
 			src_addr6=
 		}
+	elif command -v socat -v >/dev/null; then
+		send_udp6() {
+			ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+				2>/dev/null
+
+			__socatbind6=
+
+			if [ -n "${src_addr6}" ]; then
+				if [ -n "${src_addr6} != "${src_addr6_added} ]; then
+					B ip addr add "${src_addr6}" dev veth_b nodad
+
+					src_addr6_added=${src_addr6}
+				fi
+
+				__socatbind6=",bind=[${src_addr6}]"
+
+				if [ -n "${src_port}" ] ;then
+					__socatbind6="${__socatbind6}:${src_port}"
+				fi
+			fi
+
+			echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:[${dst_addr6}]:${dst_port}"${__socatbind6}"
+		}
 	elif command -v nc >/dev/null && nc -u -w0 1.1.1.1 1 2>/dev/null; then
 		# GNU netcat might not work with IPv6, try next tool
 		send_udp6() {

From e8d5dfd1d8747b56077d02664a8838c71ced948e Mon Sep 17 00:00:00 2001
From: David Leadbeater <dgl@dgl.cx>
Date: Fri, 26 Aug 2022 14:56:57 +1000
Subject: [PATCH 05/83] netfilter: nf_conntrack_irc: Tighten matching on DCC
 message

CTCP messages should only be at the start of an IRC message, not
anywhere within it.

While the helper only decodes packes in the ORIGINAL direction, its
possible to make a client send a CTCP message back by empedding one into
a PING request.  As-is, thats enough to make the helper believe that it
saw a CTCP message.

Fixes: 869f37d8e48f ("[NETFILTER]: nf_conntrack/nf_nat: add IRC helper port")
Signed-off-by: David Leadbeater <dgl@dgl.cx>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_conntrack_irc.c | 34 ++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 992decbcaa5c..5703846bea3b 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -157,15 +157,37 @@ static int help(struct sk_buff *skb, unsigned int protoff,
 	data = ib_ptr;
 	data_limit = ib_ptr + datalen;
 
-	/* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
-	 * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */
-	while (data < data_limit - (19 + MINMATCHLEN)) {
-		if (memcmp(data, "\1DCC ", 5)) {
+	/* Skip any whitespace */
+	while (data < data_limit - 10) {
+		if (*data == ' ' || *data == '\r' || *data == '\n')
+			data++;
+		else
+			break;
+	}
+
+	/* strlen("PRIVMSG x ")=10 */
+	if (data < data_limit - 10) {
+		if (strncasecmp("PRIVMSG ", data, 8))
+			goto out;
+		data += 8;
+	}
+
+	/* strlen(" :\1DCC SENT t AAAAAAAA P\1\n")=26
+	 * 7+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=26
+	 */
+	while (data < data_limit - (21 + MINMATCHLEN)) {
+		/* Find first " :", the start of message */
+		if (memcmp(data, " :", 2)) {
 			data++;
 			continue;
 		}
+		data += 2;
+
+		/* then check that place only for the DCC command */
+		if (memcmp(data, "\1DCC ", 5))
+			goto out;
 		data += 5;
-		/* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
+		/* we have at least (21+MINMATCHLEN)-(2+5) bytes valid data left */
 
 		iph = ip_hdr(skb);
 		pr_debug("DCC found in master %pI4:%u %pI4:%u\n",
@@ -181,7 +203,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
 			pr_debug("DCC %s detected\n", dccprotos[i]);
 
 			/* we have at least
-			 * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
+			 * (21+MINMATCHLEN)-7-dccprotos[i].matchlen bytes valid
 			 * data left (== 14/13 bytes) */
 			if (parse_dcc(data, data_limit, &dcc_ip,
 				       &dcc_port, &addr_beg_p, &addr_end_p)) {

From 559c36c5a8d730c49ef805a72b213d3bba155cc8 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 7 Sep 2022 10:26:18 +0200
Subject: [PATCH 06/83] netfilter: nfnetlink_osf: fix possible bogus match in
 nf_osf_find()

nf_osf_find() incorrectly returns true on mismatch, this leads to
copying uninitialized memory area in nft_osf which can be used to leak
stale kernel stack data to userspace.

Fixes: 22c7652cdaa8 ("netfilter: nft_osf: Add version option support")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nfnetlink_osf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 0fa2e2030427..ee6840bd5933 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -269,6 +269,7 @@ bool nf_osf_find(const struct sk_buff *skb,
 	struct nf_osf_hdr_ctx ctx;
 	const struct tcphdr *tcp;
 	struct tcphdr _tcph;
+	bool found = false;
 
 	memset(&ctx, 0, sizeof(ctx));
 
@@ -283,10 +284,11 @@ bool nf_osf_find(const struct sk_buff *skb,
 
 		data->genre = f->genre;
 		data->version = f->version;
+		found = true;
 		break;
 	}
 
-	return true;
+	return found;
 }
 EXPORT_SYMBOL_GPL(nf_osf_find);
 

From 23c619190318376769ad7b61504c2ea0703fb783 Mon Sep 17 00:00:00 2001
From: Dave Ertman <david.m.ertman@intel.com>
Date: Tue, 9 Aug 2022 10:24:23 -0700
Subject: [PATCH 07/83] ice: Don't double unplug aux on peer initiated reset

In the IDC callback that is accessed when the aux drivers request a reset,
the function to unplug the aux devices is called.  This function is also
called in the ice_prepare_for_reset function. This double call is causing
a "scheduling while atomic" BUG.

[  662.676430] ice 0000:4c:00.0 rocep76s0: cqp opcode = 0x1 maj_err_code = 0xffff min_err_code = 0x8003

[  662.676609] ice 0000:4c:00.0 rocep76s0: [Modify QP Cmd Error][op_code=8] status=-29 waiting=1 completion_err=1 maj=0xffff min=0x8003

[  662.815006] ice 0000:4c:00.0 rocep76s0: ICE OICR event notification: oicr = 0x10000003

[  662.815014] ice 0000:4c:00.0 rocep76s0: critical PE Error, GLPE_CRITERR=0x00011424

[  662.815017] ice 0000:4c:00.0 rocep76s0: Requesting a reset

[  662.815475] BUG: scheduling while atomic: swapper/37/0/0x00010002

[  662.815475] BUG: scheduling while atomic: swapper/37/0/0x00010002
[  662.815477] Modules linked in: rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs rfkill 8021q garp mrp stp llc vfat fat rpcrdma intel_rapl_msr intel_rapl_common sunrpc i10nm_edac rdma_ucm nfit ib_srpt libnvdimm ib_isert iscsi_target_mod x86_pkg_temp_thermal intel_powerclamp coretemp target_core_mod snd_hda_intel ib_iser snd_intel_dspcfg libiscsi snd_intel_sdw_acpi scsi_transport_iscsi kvm_intel iTCO_wdt rdma_cm snd_hda_codec kvm iw_cm ipmi_ssif iTCO_vendor_support snd_hda_core irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hwdep snd_seq snd_seq_device rapl snd_pcm snd_timer isst_if_mbox_pci pcspkr isst_if_mmio irdma intel_uncore idxd acpi_ipmi joydev isst_if_common snd mei_me idxd_bus ipmi_si soundcore i2c_i801 mei ipmi_devintf i2c_smbus i2c_ismt ipmi_msghandler acpi_power_meter acpi_pad rv(OE) ib_uverbs ib_cm ib_core xfs libcrc32c ast i2c_algo_bit drm_vram_helper drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm_ttm_helpe
 r ttm
[  662.815546]  nvme nvme_core ice drm crc32c_intel i40e t10_pi wmi pinctrl_emmitsburg dm_mirror dm_region_hash dm_log dm_mod fuse
[  662.815557] Preemption disabled at:
[  662.815558] [<0000000000000000>] 0x0
[  662.815563] CPU: 37 PID: 0 Comm: swapper/37 Kdump: loaded Tainted: G S         OE     5.17.1 #2
[  662.815566] Hardware name: Intel Corporation D50DNP/D50DNP, BIOS SE5C6301.86B.6624.D18.2111021741 11/02/2021
[  662.815568] Call Trace:
[  662.815572]  <IRQ>
[  662.815574]  dump_stack_lvl+0x33/0x42
[  662.815581]  __schedule_bug.cold.147+0x7d/0x8a
[  662.815588]  __schedule+0x798/0x990
[  662.815595]  schedule+0x44/0xc0
[  662.815597]  schedule_preempt_disabled+0x14/0x20
[  662.815600]  __mutex_lock.isra.11+0x46c/0x490
[  662.815603]  ? __ibdev_printk+0x76/0xc0 [ib_core]
[  662.815633]  device_del+0x37/0x3d0
[  662.815639]  ice_unplug_aux_dev+0x1a/0x40 [ice]
[  662.815674]  ice_schedule_reset+0x3c/0xd0 [ice]
[  662.815693]  irdma_iidc_event_handler.cold.7+0xb6/0xd3 [irdma]
[  662.815712]  ? bitmap_find_next_zero_area_off+0x45/0xa0
[  662.815719]  ice_send_event_to_aux+0x54/0x70 [ice]
[  662.815741]  ice_misc_intr+0x21d/0x2d0 [ice]
[  662.815756]  __handle_irq_event_percpu+0x4c/0x180
[  662.815762]  handle_irq_event_percpu+0xf/0x40
[  662.815764]  handle_irq_event+0x34/0x60
[  662.815766]  handle_edge_irq+0x9a/0x1c0
[  662.815770]  __common_interrupt+0x62/0x100
[  662.815774]  common_interrupt+0xb4/0xd0
[  662.815779]  </IRQ>
[  662.815780]  <TASK>
[  662.815780]  asm_common_interrupt+0x1e/0x40
[  662.815785] RIP: 0010:cpuidle_enter_state+0xd6/0x380
[  662.815789] Code: 49 89 c4 0f 1f 44 00 00 31 ff e8 65 d7 95 ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 64 02 00 00 31 ff e8 ae c5 9c ff fb 45 85 f6 <0f> 88 12 01 00 00 49 63 d6 4c 2b 24 24 48 8d 04 52 48 8d 04 82 49
[  662.815791] RSP: 0018:ff2c2c4f18edbe80 EFLAGS: 00000202
[  662.815793] RAX: ff280805df140000 RBX: 0000000000000002 RCX: 000000000000001f
[  662.815795] RDX: 0000009a52da2d08 RSI: ffffffff93f8240b RDI: ffffffff93f53ee7
[  662.815796] RBP: ff5e2bd11ff41928 R08: 0000000000000000 R09: 000000000002f8c0
[  662.815797] R10: 0000010c3f18e2cf R11: 000000000000000f R12: 0000009a52da2d08
[  662.815798] R13: ffffffff94ad7e20 R14: 0000000000000002 R15: 0000000000000000
[  662.815801]  cpuidle_enter+0x29/0x40
[  662.815803]  do_idle+0x261/0x2b0
[  662.815807]  cpu_startup_entry+0x19/0x20
[  662.815809]  start_secondary+0x114/0x150
[  662.815813]  secondary_startup_64_no_verify+0xd5/0xdb
[  662.815818]  </TASK>
[  662.815846] bad: scheduling from the idle thread!
[  662.815849] CPU: 37 PID: 0 Comm: swapper/37 Kdump: loaded Tainted: G S      W  OE     5.17.1 #2
[  662.815852] Hardware name: Intel Corporation D50DNP/D50DNP, BIOS SE5C6301.86B.6624.D18.2111021741 11/02/2021
[  662.815853] Call Trace:
[  662.815855]  <IRQ>
[  662.815856]  dump_stack_lvl+0x33/0x42
[  662.815860]  dequeue_task_idle+0x20/0x30
[  662.815863]  __schedule+0x1c3/0x990
[  662.815868]  schedule+0x44/0xc0
[  662.815871]  schedule_preempt_disabled+0x14/0x20
[  662.815873]  __mutex_lock.isra.11+0x3a8/0x490
[  662.815876]  ? __ibdev_printk+0x76/0xc0 [ib_core]
[  662.815904]  device_del+0x37/0x3d0
[  662.815909]  ice_unplug_aux_dev+0x1a/0x40 [ice]
[  662.815937]  ice_schedule_reset+0x3c/0xd0 [ice]
[  662.815961]  irdma_iidc_event_handler.cold.7+0xb6/0xd3 [irdma]
[  662.815979]  ? bitmap_find_next_zero_area_off+0x45/0xa0
[  662.815985]  ice_send_event_to_aux+0x54/0x70 [ice]
[  662.816011]  ice_misc_intr+0x21d/0x2d0 [ice]
[  662.816033]  __handle_irq_event_percpu+0x4c/0x180
[  662.816037]  handle_irq_event_percpu+0xf/0x40
[  662.816039]  handle_irq_event+0x34/0x60
[  662.816042]  handle_edge_irq+0x9a/0x1c0
[  662.816045]  __common_interrupt+0x62/0x100
[  662.816048]  common_interrupt+0xb4/0xd0
[  662.816052]  </IRQ>
[  662.816053]  <TASK>
[  662.816054]  asm_common_interrupt+0x1e/0x40
[  662.816057] RIP: 0010:cpuidle_enter_state+0xd6/0x380
[  662.816060] Code: 49 89 c4 0f 1f 44 00 00 31 ff e8 65 d7 95 ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 64 02 00 00 31 ff e8 ae c5 9c ff fb 45 85 f6 <0f> 88 12 01 00 00 49 63 d6 4c 2b 24 24 48 8d 04 52 48 8d 04 82 49
[  662.816063] RSP: 0018:ff2c2c4f18edbe80 EFLAGS: 00000202
[  662.816065] RAX: ff280805df140000 RBX: 0000000000000002 RCX: 000000000000001f
[  662.816067] RDX: 0000009a52da2d08 RSI: ffffffff93f8240b RDI: ffffffff93f53ee7
[  662.816068] RBP: ff5e2bd11ff41928 R08: 0000000000000000 R09: 000000000002f8c0
[  662.816070] R10: 0000010c3f18e2cf R11: 000000000000000f R12: 0000009a52da2d08
[  662.816071] R13: ffffffff94ad7e20 R14: 0000000000000002 R15: 0000000000000000
[  662.816075]  cpuidle_enter+0x29/0x40
[  662.816077]  do_idle+0x261/0x2b0
[  662.816080]  cpu_startup_entry+0x19/0x20
[  662.816083]  start_secondary+0x114/0x150
[  662.816087]  secondary_startup_64_no_verify+0xd5/0xdb
[  662.816091]  </TASK>
[  662.816169] bad: scheduling from the idle thread!

The correct place to unplug the aux devices for a reset is in the
prepare_for_reset function, as this is a common place for all reset flows.
It also has built in protection from being called twice in a single reset
instance before the aux devices are replugged.

Fixes: f9f5301e7e2d4 ("ice: Register auxiliary device to provide RDMA")
Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Tested-by: Helena Anna Dubel <helena.anna.dubel@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_main.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 8c30eea61b6d..04836bbaf7d5 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -2399,8 +2399,6 @@ int ice_schedule_reset(struct ice_pf *pf, enum ice_reset_req reset)
 		return -EBUSY;
 	}
 
-	ice_unplug_aux_dev(pf);
-
 	switch (reset) {
 	case ICE_RESET_PFR:
 		set_bit(ICE_PFR_REQ, pf->state);

From a509702cac95a8b450228a037c8542f57e538e5b Mon Sep 17 00:00:00 2001
From: Ding Hui <dinghui@sangfor.com.cn>
Date: Wed, 17 Aug 2022 18:53:18 +0800
Subject: [PATCH 08/83] ice: Fix crash by keep old cfg when update TCs more
 than queues

There are problems if allocated queues less than Traffic Classes.

Commit a632b2a4c920 ("ice: ethtool: Prohibit improper channel config
for DCB") already disallow setting less queues than TCs.

Another case is if we first set less queues, and later update more TCs
config due to LLDP, ice_vsi_cfg_tc() will failed but left dirty
num_txq/rxq and tc_cfg in vsi, that will cause invalid pointer access.

[   95.968089] ice 0000:3b:00.1: More TCs defined than queues/rings allocated.
[   95.968092] ice 0000:3b:00.1: Trying to use more Rx queues (8), than were allocated (1)!
[   95.968093] ice 0000:3b:00.1: Failed to config TC for VSI index: 0
[   95.969621] general protection fault: 0000 [#1] SMP NOPTI
[   95.969705] CPU: 1 PID: 58405 Comm: lldpad Kdump: loaded Tainted: G     U  W  O     --------- -t - 4.18.0 #1
[   95.969867] Hardware name: O.E.M/BC11SPSCB10, BIOS 8.23 12/30/2021
[   95.969992] RIP: 0010:devm_kmalloc+0xa/0x60
[   95.970052] Code: 5c ff ff ff 31 c0 5b 5d 41 5c c3 b8 f4 ff ff ff eb f4 0f 1f 40 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 89 d1 <8b> 97 60 02 00 00 48 8d 7e 18 48 39 f7 72 3f 55 89 ce 53 48 8b 4c
[   95.970344] RSP: 0018:ffffc9003f553888 EFLAGS: 00010206
[   95.970425] RAX: dead000000000200 RBX: ffffea003c425b00 RCX: 00000000006080c0
[   95.970536] RDX: 00000000006080c0 RSI: 0000000000000200 RDI: dead000000000200
[   95.970648] RBP: dead000000000200 R08: 00000000000463c0 R09: ffff888ffa900000
[   95.970760] R10: 0000000000000000 R11: 0000000000000002 R12: ffff888ff6b40100
[   95.970870] R13: ffff888ff6a55018 R14: 0000000000000000 R15: ffff888ff6a55460
[   95.970981] FS:  00007f51b7d24700(0000) GS:ffff88903ee80000(0000) knlGS:0000000000000000
[   95.971108] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   95.971197] CR2: 00007fac5410d710 CR3: 0000000f2c1de002 CR4: 00000000007606e0
[   95.971309] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   95.971419] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   95.971530] PKRU: 55555554
[   95.971573] Call Trace:
[   95.971622]  ice_setup_rx_ring+0x39/0x110 [ice]
[   95.971695]  ice_vsi_setup_rx_rings+0x54/0x90 [ice]
[   95.971774]  ice_vsi_open+0x25/0x120 [ice]
[   95.971843]  ice_open_internal+0xb8/0x1f0 [ice]
[   95.971919]  ice_ena_vsi+0x4f/0xd0 [ice]
[   95.971987]  ice_dcb_ena_dis_vsi.constprop.5+0x29/0x90 [ice]
[   95.972082]  ice_pf_dcb_cfg+0x29a/0x380 [ice]
[   95.972154]  ice_dcbnl_setets+0x174/0x1b0 [ice]
[   95.972220]  dcbnl_ieee_set+0x89/0x230
[   95.972279]  ? dcbnl_ieee_del+0x150/0x150
[   95.972341]  dcb_doit+0x124/0x1b0
[   95.972392]  rtnetlink_rcv_msg+0x243/0x2f0
[   95.972457]  ? dcb_doit+0x14d/0x1b0
[   95.972510]  ? __kmalloc_node_track_caller+0x1d3/0x280
[   95.972591]  ? rtnl_calcit.isra.31+0x100/0x100
[   95.972661]  netlink_rcv_skb+0xcf/0xf0
[   95.972720]  netlink_unicast+0x16d/0x220
[   95.972781]  netlink_sendmsg+0x2ba/0x3a0
[   95.975891]  sock_sendmsg+0x4c/0x50
[   95.979032]  ___sys_sendmsg+0x2e4/0x300
[   95.982147]  ? kmem_cache_alloc+0x13e/0x190
[   95.985242]  ? __wake_up_common_lock+0x79/0x90
[   95.988338]  ? __check_object_size+0xac/0x1b0
[   95.991440]  ? _copy_to_user+0x22/0x30
[   95.994539]  ? move_addr_to_user+0xbb/0xd0
[   95.997619]  ? __sys_sendmsg+0x53/0x80
[   96.000664]  __sys_sendmsg+0x53/0x80
[   96.003747]  do_syscall_64+0x5b/0x1d0
[   96.006862]  entry_SYSCALL_64_after_hwframe+0x65/0xca

Only update num_txq/rxq when passed check, and restore tc_cfg if setup
queue map failed.

Fixes: a632b2a4c920 ("ice: ethtool: Prohibit improper channel config for DCB")
Signed-off-by: Ding Hui <dinghui@sangfor.com.cn>
Reviewed-by: Anatolii Gerasymenko <anatolii.gerasymenko@intel.com>
Tested-by: Arpana Arland <arpanax.arland@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_lib.c | 46 ++++++++++++++----------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 0c4ec9264071..58d483e2f539 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -914,7 +914,7 @@ static void ice_set_dflt_vsi_ctx(struct ice_hw *hw, struct ice_vsi_ctx *ctxt)
  */
 static int ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt)
 {
-	u16 offset = 0, qmap = 0, tx_count = 0, pow = 0;
+	u16 offset = 0, qmap = 0, tx_count = 0, rx_count = 0, pow = 0;
 	u16 num_txq_per_tc, num_rxq_per_tc;
 	u16 qcount_tx = vsi->alloc_txq;
 	u16 qcount_rx = vsi->alloc_rxq;
@@ -981,22 +981,24 @@ static int ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt)
 	 * at least 1)
 	 */
 	if (offset)
-		vsi->num_rxq = offset;
+		rx_count = offset;
 	else
-		vsi->num_rxq = num_rxq_per_tc;
+		rx_count = num_rxq_per_tc;
 
-	if (vsi->num_rxq > vsi->alloc_rxq) {
+	if (rx_count > vsi->alloc_rxq) {
 		dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Rx queues (%u), than were allocated (%u)!\n",
-			vsi->num_rxq, vsi->alloc_rxq);
+			rx_count, vsi->alloc_rxq);
+		return -EINVAL;
+	}
+
+	if (tx_count > vsi->alloc_txq) {
+		dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Tx queues (%u), than were allocated (%u)!\n",
+			tx_count, vsi->alloc_txq);
 		return -EINVAL;
 	}
 
 	vsi->num_txq = tx_count;
-	if (vsi->num_txq > vsi->alloc_txq) {
-		dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Tx queues (%u), than were allocated (%u)!\n",
-			vsi->num_txq, vsi->alloc_txq);
-		return -EINVAL;
-	}
+	vsi->num_rxq = rx_count;
 
 	if (vsi->type == ICE_VSI_VF && vsi->num_txq != vsi->num_rxq) {
 		dev_dbg(ice_pf_to_dev(vsi->back), "VF VSI should have same number of Tx and Rx queues. Hence making them equal\n");
@@ -3490,6 +3492,7 @@ ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt,
 	u16 pow, offset = 0, qcount_tx = 0, qcount_rx = 0, qmap;
 	u16 tc0_offset = vsi->mqprio_qopt.qopt.offset[0];
 	int tc0_qcount = vsi->mqprio_qopt.qopt.count[0];
+	u16 new_txq, new_rxq;
 	u8 netdev_tc = 0;
 	int i;
 
@@ -3530,21 +3533,24 @@ ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt,
 		}
 	}
 
-	/* Set actual Tx/Rx queue pairs */
-	vsi->num_txq = offset + qcount_tx;
-	if (vsi->num_txq > vsi->alloc_txq) {
+	new_txq = offset + qcount_tx;
+	if (new_txq > vsi->alloc_txq) {
 		dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Tx queues (%u), than were allocated (%u)!\n",
-			vsi->num_txq, vsi->alloc_txq);
+			new_txq, vsi->alloc_txq);
 		return -EINVAL;
 	}
 
-	vsi->num_rxq = offset + qcount_rx;
-	if (vsi->num_rxq > vsi->alloc_rxq) {
+	new_rxq = offset + qcount_rx;
+	if (new_rxq > vsi->alloc_rxq) {
 		dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Rx queues (%u), than were allocated (%u)!\n",
-			vsi->num_rxq, vsi->alloc_rxq);
+			new_rxq, vsi->alloc_rxq);
 		return -EINVAL;
 	}
 
+	/* Set actual Tx/Rx queue pairs */
+	vsi->num_txq = new_txq;
+	vsi->num_rxq = new_rxq;
+
 	/* Setup queue TC[0].qmap for given VSI context */
 	ctxt->info.tc_mapping[0] = cpu_to_le16(qmap);
 	ctxt->info.q_mapping[0] = cpu_to_le16(vsi->rxq_map[0]);
@@ -3576,6 +3582,7 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc)
 {
 	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
 	struct ice_pf *pf = vsi->back;
+	struct ice_tc_cfg old_tc_cfg;
 	struct ice_vsi_ctx *ctx;
 	struct device *dev;
 	int i, ret = 0;
@@ -3600,6 +3607,7 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc)
 			max_txqs[i] = vsi->num_txq;
 	}
 
+	memcpy(&old_tc_cfg, &vsi->tc_cfg, sizeof(old_tc_cfg));
 	vsi->tc_cfg.ena_tc = ena_tc;
 	vsi->tc_cfg.numtc = num_tc;
 
@@ -3616,8 +3624,10 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc)
 	else
 		ret = ice_vsi_setup_q_map(vsi, ctx);
 
-	if (ret)
+	if (ret) {
+		memcpy(&vsi->tc_cfg, &old_tc_cfg, sizeof(vsi->tc_cfg));
 		goto out;
+	}
 
 	/* must to indicate which section of VSI context are being modified */
 	ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_RXQ_MAP_VALID);

From f66b98c868f2c1ffcb1139ea19d6f20983f1f9dc Mon Sep 17 00:00:00 2001
From: Sylwester Dziedziuch <sylwesterx.dziedziuch@intel.com>
Date: Thu, 1 Sep 2022 16:32:06 +0200
Subject: [PATCH 09/83] iavf: Fix change VF's mac address

Previously changing mac address gives false negative because
ip link set <interface> address <MAC> return with
RTNLINK: Permission denied.
In iavf_set_mac was check if PF handled our mac set request,
even before filter was added to list.
Because this check returns always true and it never waits for
PF's response.

Move iavf_is_mac_handled to wait_event_interruptible_timeout
instead of false. Now it will wait for PF's response and then
check if address was added or rejected.

Fixes: 35a2443d0910 ("iavf: Add waiting for response from PF in set mac")
Signed-off-by: Sylwester Dziedziuch <sylwesterx.dziedziuch@intel.com>
Co-developed-by: Norbert Zulinski <norbertx.zulinski@intel.com>
Signed-off-by: Norbert Zulinski <norbertx.zulinski@intel.com>
Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_main.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 10aa99dfdcdb..0c89f16bf1e2 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -1077,7 +1077,6 @@ static int iavf_set_mac(struct net_device *netdev, void *p)
 {
 	struct iavf_adapter *adapter = netdev_priv(netdev);
 	struct sockaddr *addr = p;
-	bool handle_mac = iavf_is_mac_set_handled(netdev, addr->sa_data);
 	int ret;
 
 	if (!is_valid_ether_addr(addr->sa_data))
@@ -1094,10 +1093,9 @@ static int iavf_set_mac(struct net_device *netdev, void *p)
 		return 0;
 	}
 
-	if (handle_mac)
-		goto done;
-
-	ret = wait_event_interruptible_timeout(adapter->vc_waitqueue, false, msecs_to_jiffies(2500));
+	ret = wait_event_interruptible_timeout(adapter->vc_waitqueue,
+					       iavf_is_mac_set_handled(netdev, addr->sa_data),
+					       msecs_to_jiffies(2500));
 
 	/* If ret < 0 then it means wait was interrupted.
 	 * If ret == 0 then it means we got a timeout.
@@ -1111,7 +1109,6 @@ static int iavf_set_mac(struct net_device *netdev, void *p)
 	if (!ret)
 		return -EAGAIN;
 
-done:
 	if (!ether_addr_equal(netdev->dev_addr, addr->sa_data))
 		return -EACCES;
 

From 809f23c0423a43266e47a7dc67e95b5cb4d1cbfc Mon Sep 17 00:00:00 2001
From: Brett Creeley <brett.creeley@intel.com>
Date: Thu, 1 Sep 2022 16:34:40 +0200
Subject: [PATCH 10/83] iavf: Fix cached head and tail value for
 iavf_get_tx_pending

The underlying hardware may or may not allow reading of the head or tail
registers and it really makes no difference if we use the software
cached values. So, always used the software cached values.

Fixes: 9c6c12595b73 ("i40e: Detection and recovery of TX queue hung logic moved to service_task from tx_timeout")
Signed-off-by: Brett Creeley <brett.creeley@intel.com>
Co-developed-by: Norbert Zulinski <norbertx.zulinski@intel.com>
Signed-off-by: Norbert Zulinski <norbertx.zulinski@intel.com>
Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_txrx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index 06d18797d25a..4c3f3f419110 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -114,8 +114,11 @@ u32 iavf_get_tx_pending(struct iavf_ring *ring, bool in_sw)
 {
 	u32 head, tail;
 
+	/* underlying hardware might not allow access and/or always return
+	 * 0 for the head/tail registers so just use the cached values
+	 */
 	head = ring->next_to_clean;
-	tail = readl(ring->tail);
+	tail = ring->next_to_use;
 
 	if (head != tail)
 		return (head < tail) ?

From 81225b2ea161af48e093f58e8dfee6d705b16af4 Mon Sep 17 00:00:00 2001
From: Lu Wei <luwei32@huawei.com>
Date: Wed, 7 Sep 2022 18:12:04 +0800
Subject: [PATCH 11/83] ipvlan: Fix out-of-bound bugs caused by unset
 skb->mac_header

If an AF_PACKET socket is used to send packets through ipvlan and the
default xmit function of the AF_PACKET socket is changed from
dev_queue_xmit() to packet_direct_xmit() via setsockopt() with the option
name of PACKET_QDISC_BYPASS, the skb->mac_header may not be reset and
remains as the initial value of 65535, this may trigger slab-out-of-bounds
bugs as following:

=================================================================
UG: KASAN: slab-out-of-bounds in ipvlan_xmit_mode_l2+0xdb/0x330 [ipvlan]
PU: 2 PID: 1768 Comm: raw_send Kdump: loaded Not tainted 6.0.0-rc4+ #6
ardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33
all Trace:
print_address_description.constprop.0+0x1d/0x160
print_report.cold+0x4f/0x112
kasan_report+0xa3/0x130
ipvlan_xmit_mode_l2+0xdb/0x330 [ipvlan]
ipvlan_start_xmit+0x29/0xa0 [ipvlan]
__dev_direct_xmit+0x2e2/0x380
packet_direct_xmit+0x22/0x60
packet_snd+0x7c9/0xc40
sock_sendmsg+0x9a/0xa0
__sys_sendto+0x18a/0x230
__x64_sys_sendto+0x74/0x90
do_syscall_64+0x3b/0x90
entry_SYSCALL_64_after_hwframe+0x63/0xcd

The root cause is:
  1. packet_snd() only reset skb->mac_header when sock->type is SOCK_RAW
     and skb->protocol is not specified as in packet_parse_headers()

  2. packet_direct_xmit() doesn't reset skb->mac_header as dev_queue_xmit()

In this case, skb->mac_header is 65535 when ipvlan_xmit_mode_l2() is
called. So when ipvlan_xmit_mode_l2() gets mac header with eth_hdr() which
use "skb->head + skb->mac_header", out-of-bound access occurs.

This patch replaces eth_hdr() with skb_eth_hdr() in ipvlan_xmit_mode_l2()
and reset mac header in multicast to solve this out-of-bound bug.

Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.")
Signed-off-by: Lu Wei <luwei32@huawei.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ipvlan/ipvlan_core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index dfeb5b392e64..bb1c298c1e78 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -495,7 +495,6 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb)
 
 static int ipvlan_process_outbound(struct sk_buff *skb)
 {
-	struct ethhdr *ethh = eth_hdr(skb);
 	int ret = NET_XMIT_DROP;
 
 	/* The ipvlan is a pseudo-L2 device, so the packets that we receive
@@ -505,6 +504,8 @@ static int ipvlan_process_outbound(struct sk_buff *skb)
 	if (skb_mac_header_was_set(skb)) {
 		/* In this mode we dont care about
 		 * multicast and broadcast traffic */
+		struct ethhdr *ethh = eth_hdr(skb);
+
 		if (is_multicast_ether_addr(ethh->h_dest)) {
 			pr_debug_ratelimited(
 				"Dropped {multi|broad}cast of type=[%x]\n",
@@ -589,7 +590,7 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
 static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
 {
 	const struct ipvl_dev *ipvlan = netdev_priv(dev);
-	struct ethhdr *eth = eth_hdr(skb);
+	struct ethhdr *eth = skb_eth_hdr(skb);
 	struct ipvl_addr *addr;
 	void *lyr3h;
 	int addr_type;
@@ -619,6 +620,7 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
 		return dev_forward_skb(ipvlan->phy_dev, skb);
 
 	} else if (is_multicast_ether_addr(eth->h_dest)) {
+		skb_reset_mac_header(skb);
 		ipvlan_skb_crossing_ns(skb, NULL);
 		ipvlan_multicast_enqueue(ipvlan->port, skb, true);
 		return NET_XMIT_SUCCESS;

From 64ae13ed478428135cddc2f1113dff162d8112d4 Mon Sep 17 00:00:00 2001
From: Ludovic Cintrat <ludovic.cintrat@gatewatcher.com>
Date: Wed, 7 Sep 2022 12:08:13 +0200
Subject: [PATCH 12/83] net: core: fix flow symmetric hash

__flow_hash_consistentify() wrongly swaps ipv4 addresses in few cases.
This function is indirectly used by __skb_get_hash_symmetric(), which is
used to fanout packets in AF_PACKET.
Intrusion detection systems may be impacted by this issue.

__flow_hash_consistentify() computes the addresses difference then swaps
them if the difference is negative. In few cases src - dst and dst - src
are both negative.

The following snippet mimics __flow_hash_consistentify():

```
 #include <stdio.h>
 #include <stdint.h>

 int main(int argc, char** argv) {

     int diffs_d, diffd_s;
     uint32_t dst  = 0xb225a8c0; /* 178.37.168.192 --> 192.168.37.178 */
     uint32_t src  = 0x3225a8c0; /*  50.37.168.192 --> 192.168.37.50  */
     uint32_t dst2 = 0x3325a8c0; /*  51.37.168.192 --> 192.168.37.51  */

     diffs_d = src - dst;
     diffd_s = dst - src;

     printf("src:%08x dst:%08x, diff(s-d)=%d(0x%x) diff(d-s)=%d(0x%x)\n",
             src, dst, diffs_d, diffs_d, diffd_s, diffd_s);

     diffs_d = src - dst2;
     diffd_s = dst2 - src;

     printf("src:%08x dst:%08x, diff(s-d)=%d(0x%x) diff(d-s)=%d(0x%x)\n",
             src, dst2, diffs_d, diffs_d, diffd_s, diffd_s);

     return 0;
 }
```

Results:

src:3225a8c0 dst:b225a8c0, \
    diff(s-d)=-2147483648(0x80000000) \
    diff(d-s)=-2147483648(0x80000000)

src:3225a8c0 dst:3325a8c0, \
    diff(s-d)=-16777216(0xff000000) \
    diff(d-s)=16777216(0x1000000)

In the first case the addresses differences are always < 0, therefore
__flow_hash_consistentify() always swaps, thus dst->src and src->dst
packets have differents hashes.

Fixes: c3f8324188fa8 ("net: Add full IPv6 addresses to flow_keys")
Signed-off-by: Ludovic Cintrat <ludovic.cintrat@gatewatcher.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 764c4cb3fe8f..5dc3860e9fc7 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1611,9 +1611,8 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys)
 
 	switch (keys->control.addr_type) {
 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
-		addr_diff = (__force u32)keys->addrs.v4addrs.dst -
-			    (__force u32)keys->addrs.v4addrs.src;
-		if (addr_diff < 0)
+		if ((__force u32)keys->addrs.v4addrs.dst <
+		    (__force u32)keys->addrs.v4addrs.src)
 			swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
 
 		if ((__force u16)keys->ports.dst <

From 35e60f1aadf6c02d77fdf42180fbf205aec7e8fc Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 8 Sep 2022 13:57:50 -0700
Subject: [PATCH 13/83] Bluetooth: Fix HCIGETDEVINFO regression

Recent changes breaks HCIGETDEVINFO since it changes the size of
hci_dev_info.

Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections")
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sock.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/net/bluetooth/hci_sock.h b/include/net/bluetooth/hci_sock.h
index 0520e21ab698..9949870f7d78 100644
--- a/include/net/bluetooth/hci_sock.h
+++ b/include/net/bluetooth/hci_sock.h
@@ -124,8 +124,6 @@ struct hci_dev_info {
 	__u16 acl_pkts;
 	__u16 sco_mtu;
 	__u16 sco_pkts;
-	__u16 iso_mtu;
-	__u16 iso_pkts;
 
 	struct hci_dev_stats stat;
 };

From 8997f5c8a62760db69fd5c56116705796322c8ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Wed, 7 Sep 2022 15:44:50 +0200
Subject: [PATCH 14/83] wifi: iwlwifi: Mark IWLMEI as broken
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The iwlmei driver breaks iwlwifi when returning from suspend. The interface
ends up in the 'down' state after coming back from suspend. And iwd doesn't
touch the interface state, but wpa_supplicant does, so the bug only happens on
iwd.

The bug report[0] has been open for four months now, and no fix seems to be
forthcoming. Since just disabling the iwlmei driver works as a workaround,
let's mark the config option as broken until it can be fixed properly.

[0] https://bugzilla.kernel.org/show_bug.cgi?id=215937

Fixes: 2da4366f9e2c ("iwlwifi: mei: add the driver to allow cooperation with CSME")
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20220907134450.1183045-1-toke@toke.dk
---
 drivers/net/wireless/intel/iwlwifi/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/intel/iwlwifi/Kconfig b/drivers/net/wireless/intel/iwlwifi/Kconfig
index a647a406b87b..b20409f8c13a 100644
--- a/drivers/net/wireless/intel/iwlwifi/Kconfig
+++ b/drivers/net/wireless/intel/iwlwifi/Kconfig
@@ -140,6 +140,7 @@ config IWLMEI
 	depends on INTEL_MEI
 	depends on PM
 	depends on CFG80211
+	depends on BROKEN
 	help
 	  Enables the iwlmei kernel module.
 

From c3a510e2b53785df31d882a773c4c0780b4c825f Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 26 Aug 2022 20:23:29 +0200
Subject: [PATCH 15/83] wifi: mt76: fix reading current per-tid starting
 sequence number for aggregation

The code was accidentally shifting register values down by tid % 32 instead of
(tid * field_size) % 32.

Cc: stable@vger.kernel.org
Fixes: a28bef561a5c ("mt76: mt7615: re-enable offloading of sequence number assignment")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20220826182329.18155-1-nbd@nbd.name
---
 drivers/net/wireless/mediatek/mt76/mt7615/mac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
index ad6c7d632eed..d6aae60c440d 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c
@@ -1088,7 +1088,7 @@ u32 mt7615_mac_get_sta_tid_sn(struct mt7615_dev *dev, int wcid, u8 tid)
 	offset %= 32;
 
 	val = mt76_rr(dev, addr);
-	val >>= (tid % 32);
+	val >>= offset;
 
 	if (offset > 20) {
 		addr += 4;

From 781b80f452fcc1cfc16ee41f12556626a9ced049 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 7 Sep 2022 11:52:28 +0200
Subject: [PATCH 16/83] wifi: mt76: fix 5 GHz connection regression on
 mt76x0/mt76x2

Some users have reported being unable to connect to MT76x0 APs running mt76
after a commit enabling the VHT extneded NSS BW feature.
Fix this regression by ensuring that this feature only gets enabled on drivers
that support it

Cc: stable@vger.kernel.org
Fixes: d9fcfc1424aa ("mt76: enable the VHT extended NSS BW feature")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20220907095228.82072-1-nbd@nbd.name
---
 drivers/net/wireless/mediatek/mt76/mac80211.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 253cbc1956d1..6de13d641438 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -267,7 +267,8 @@ static void mt76_init_stream_cap(struct mt76_phy *phy,
 	}
 	vht_cap->vht_mcs.rx_mcs_map = cpu_to_le16(mcs_map);
 	vht_cap->vht_mcs.tx_mcs_map = cpu_to_le16(mcs_map);
-	vht_cap->vht_mcs.tx_highest |=
+	if (ieee80211_hw_check(phy->hw, SUPPORTS_VHT_EXT_NSS_BW))
+		vht_cap->vht_mcs.tx_highest |=
 				cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE);
 }
 

From ca2dccdeeb49a7e408112d681bf447984c845292 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Tue, 6 Sep 2022 16:04:51 +0300
Subject: [PATCH 17/83] net: phy: aquantia: wait for the suspend/resume
 operations to finish

The Aquantia datasheet notes that after issuing a Processor-Intensive
MDIO operation, like changing the low-power state of the device, the
driver should wait for the operation to finish before issuing a new MDIO
command.

The new aqr107_wait_processor_intensive_op() function is added which can
be used after these kind of MDIO operations. At the moment, we are only
adding it at the end of the suspend/resume calls.

The issue was identified on a board featuring the AQR113C PHY, on
which commands like 'ip link (..) up / down' issued without any delays
between them would render the link on the PHY to remain down.
The issue was easy to reproduce with a one-liner:
 $ ip link set dev ethX down; ip link set dev ethX up; \
 ip link set dev ethX down; ip link set dev ethX up;

Fixes: ac9e81c230eb ("net: phy: aquantia: add suspend / resume callbacks for AQR107 family")
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20220906130451.1483448-1-ioana.ciornei@nxp.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/phy/aquantia_main.c | 53 ++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 4 deletions(-)

diff --git a/drivers/net/phy/aquantia_main.c b/drivers/net/phy/aquantia_main.c
index 8b7a46db30e0..7111e2e958e9 100644
--- a/drivers/net/phy/aquantia_main.c
+++ b/drivers/net/phy/aquantia_main.c
@@ -91,6 +91,9 @@
 #define VEND1_GLOBAL_FW_ID_MAJOR		GENMASK(15, 8)
 #define VEND1_GLOBAL_FW_ID_MINOR		GENMASK(7, 0)
 
+#define VEND1_GLOBAL_GEN_STAT2			0xc831
+#define VEND1_GLOBAL_GEN_STAT2_OP_IN_PROG	BIT(15)
+
 #define VEND1_GLOBAL_RSVD_STAT1			0xc885
 #define VEND1_GLOBAL_RSVD_STAT1_FW_BUILD_ID	GENMASK(7, 4)
 #define VEND1_GLOBAL_RSVD_STAT1_PROV_ID		GENMASK(3, 0)
@@ -125,6 +128,12 @@
 #define VEND1_GLOBAL_INT_VEND_MASK_GLOBAL2	BIT(1)
 #define VEND1_GLOBAL_INT_VEND_MASK_GLOBAL3	BIT(0)
 
+/* Sleep and timeout for checking if the Processor-Intensive
+ * MDIO operation is finished
+ */
+#define AQR107_OP_IN_PROG_SLEEP		1000
+#define AQR107_OP_IN_PROG_TIMEOUT	100000
+
 struct aqr107_hw_stat {
 	const char *name;
 	int reg;
@@ -597,16 +606,52 @@ static void aqr107_link_change_notify(struct phy_device *phydev)
 		phydev_info(phydev, "Aquantia 1000Base-T2 mode active\n");
 }
 
+static int aqr107_wait_processor_intensive_op(struct phy_device *phydev)
+{
+	int val, err;
+
+	/* The datasheet notes to wait at least 1ms after issuing a
+	 * processor intensive operation before checking.
+	 * We cannot use the 'sleep_before_read' parameter of read_poll_timeout
+	 * because that just determines the maximum time slept, not the minimum.
+	 */
+	usleep_range(1000, 5000);
+
+	err = phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1,
+					VEND1_GLOBAL_GEN_STAT2, val,
+					!(val & VEND1_GLOBAL_GEN_STAT2_OP_IN_PROG),
+					AQR107_OP_IN_PROG_SLEEP,
+					AQR107_OP_IN_PROG_TIMEOUT, false);
+	if (err) {
+		phydev_err(phydev, "timeout: processor-intensive MDIO operation\n");
+		return err;
+	}
+
+	return 0;
+}
+
 static int aqr107_suspend(struct phy_device *phydev)
 {
-	return phy_set_bits_mmd(phydev, MDIO_MMD_VEND1, MDIO_CTRL1,
-				MDIO_CTRL1_LPOWER);
+	int err;
+
+	err = phy_set_bits_mmd(phydev, MDIO_MMD_VEND1, MDIO_CTRL1,
+			       MDIO_CTRL1_LPOWER);
+	if (err)
+		return err;
+
+	return aqr107_wait_processor_intensive_op(phydev);
 }
 
 static int aqr107_resume(struct phy_device *phydev)
 {
-	return phy_clear_bits_mmd(phydev, MDIO_MMD_VEND1, MDIO_CTRL1,
-				  MDIO_CTRL1_LPOWER);
+	int err;
+
+	err = phy_clear_bits_mmd(phydev, MDIO_MMD_VEND1, MDIO_CTRL1,
+				 MDIO_CTRL1_LPOWER);
+	if (err)
+		return err;
+
+	return aqr107_wait_processor_intensive_op(phydev);
 }
 
 static int aqr107_probe(struct phy_device *phydev)

From 7288ff6ec795804b1b4632e535403d52bd2b3ce1 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 6 Sep 2022 20:04:01 +0200
Subject: [PATCH 18/83] mptcp: fix fwd memory accounting on coalesce

The intel bot reported a memory accounting related splat:

[  240.473094] ------------[ cut here ]------------
[  240.478507] page_counter underflow: -4294828518 nr_pages=4294967290
[  240.485500] WARNING: CPU: 2 PID: 14986 at mm/page_counter.c:56 page_counter_cancel+0x96/0xc0
[  240.570849] CPU: 2 PID: 14986 Comm: mptcp_connect Tainted: G S                5.19.0-rc4-00739-gd24141fe7b48 #1
[  240.581637] Hardware name: HP HP Z240 SFF Workstation/802E, BIOS N51 Ver. 01.63 10/05/2017
[  240.590600] RIP: 0010:page_counter_cancel+0x96/0xc0
[  240.596179] Code: 00 00 00 45 31 c0 48 89 ef 5d 4c 89 c6 41 5c e9 40 fd ff ff 4c 89 e2 48 c7 c7 20 73 39 84 c6 05 d5 b1 52 04 01 e8 e7 95 f3
01 <0f> 0b eb a9 48 89 ef e8 1e 25 fc ff eb c3 66 66 2e 0f 1f 84 00 00
[  240.615639] RSP: 0018:ffffc9000496f7c8 EFLAGS: 00010082
[  240.621569] RAX: 0000000000000000 RBX: ffff88819c9c0120 RCX: 0000000000000000
[  240.629404] RDX: 0000000000000027 RSI: 0000000000000004 RDI: fffff5200092deeb
[  240.637239] RBP: ffff88819c9c0120 R08: 0000000000000001 R09: ffff888366527a2b
[  240.645069] R10: ffffed106cca4f45 R11: 0000000000000001 R12: 00000000fffffffa
[  240.652903] R13: ffff888366536118 R14: 00000000fffffffa R15: ffff88819c9c0000
[  240.660738] FS:  00007f3786e72540(0000) GS:ffff888366500000(0000) knlGS:0000000000000000
[  240.669529] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  240.675974] CR2: 00007f966b346000 CR3: 0000000168cea002 CR4: 00000000003706e0
[  240.683807] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  240.691641] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  240.699468] Call Trace:
[  240.702613]  <TASK>
[  240.705413]  page_counter_uncharge+0x29/0x80
[  240.710389]  drain_stock+0xd0/0x180
[  240.714585]  refill_stock+0x278/0x580
[  240.718951]  __sk_mem_reduce_allocated+0x222/0x5c0
[  240.729248]  __mptcp_update_rmem+0x235/0x2c0
[  240.734228]  __mptcp_move_skbs+0x194/0x6c0
[  240.749764]  mptcp_recvmsg+0xdfa/0x1340
[  240.763153]  inet_recvmsg+0x37f/0x500
[  240.782109]  sock_read_iter+0x24a/0x380
[  240.805353]  new_sync_read+0x420/0x540
[  240.838552]  vfs_read+0x37f/0x4c0
[  240.842582]  ksys_read+0x170/0x200
[  240.864039]  do_syscall_64+0x5c/0x80
[  240.872770]  entry_SYSCALL_64_after_hwframe+0x46/0xb0
[  240.878526] RIP: 0033:0x7f3786d9ae8e
[  240.882805] Code: c0 e9 b6 fe ff ff 50 48 8d 3d 6e 18 0a 00 e8 89 e8 01 00 66 0f 1f 84 00 00 00 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 5a c3 66 0f 1f 84 00 00 00 00 00 48 83 ec 28
[  240.902259] RSP: 002b:00007fff7be81e08 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[  240.910533] RAX: ffffffffffffffda RBX: 0000000000002000 RCX: 00007f3786d9ae8e
[  240.918368] RDX: 0000000000002000 RSI: 00007fff7be87ec0 RDI: 0000000000000005
[  240.926206] RBP: 0000000000000005 R08: 00007f3786e6a230 R09: 00007f3786e6a240
[  240.934046] R10: fffffffffffff288 R11: 0000000000000246 R12: 0000000000002000
[  240.941884] R13: 00007fff7be87ec0 R14: 00007fff7be87ec0 R15: 0000000000002000
[  240.949741]  </TASK>
[  240.952632] irq event stamp: 27367
[  240.956735] hardirqs last  enabled at (27366): [<ffffffff81ba50ea>] mem_cgroup_uncharge_skmem+0x6a/0x80
[  240.966848] hardirqs last disabled at (27367): [<ffffffff81b8fd42>] refill_stock+0x282/0x580
[  240.976017] softirqs last  enabled at (27360): [<ffffffff83a4d8ef>] mptcp_recvmsg+0xaf/0x1340
[  240.985273] softirqs last disabled at (27364): [<ffffffff83a4d30c>] __mptcp_move_skbs+0x18c/0x6c0
[  240.994872] ---[ end trace 0000000000000000 ]---

After commit d24141fe7b48 ("mptcp: drop SK_RECLAIM_* macros"),
if rmem_fwd_alloc become negative, mptcp_rmem_uncharge() can
try to reclaim a negative amount of pages, since the expression:

	reclaimable >= PAGE_SIZE

will evaluate to true for any negative value of the int
'reclaimable': 'PAGE_SIZE' is an unsigned long and
the negative integer will be promoted to a (very large)
unsigned long value.

Still after the mentioned commit, kfree_skb_partial()
in mptcp_try_coalesce() will reclaim most of just released fwd
memory, so that following charging of the skb delta size will
lead to negative fwd memory values.

At that point a racing recvmsg() can trigger the splat.

Address the issue switching the order of the memory accounting
operations. The fwd memory can still transiently reach negative
values, but that will happen in an atomic scope and no code
path could touch/use such value.

Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: d24141fe7b48 ("mptcp: drop SK_RECLAIM_* macros")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Link: https://lore.kernel.org/r/20220906180404.1255873-1-matthieu.baerts@tessares.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/protocol.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index d398f3810662..969b33a9dd64 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -150,9 +150,15 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
 		 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
 		 to->len, MPTCP_SKB_CB(from)->end_seq);
 	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
-	kfree_skb_partial(from, fragstolen);
+
+	/* note the fwd memory can reach a negative value after accounting
+	 * for the delta, but the later skb free will restore a non
+	 * negative one
+	 */
 	atomic_add(delta, &sk->sk_rmem_alloc);
 	mptcp_rmem_charge(sk, delta);
+	kfree_skb_partial(from, fragstolen);
+
 	return true;
 }
 

From 0727a9a5fbc1151fcaebfa9772e9f68f5e38ba9e Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matthieu.baerts@tessares.net>
Date: Tue, 6 Sep 2022 20:04:02 +0200
Subject: [PATCH 19/83] Documentation: mptcp: fix pm_type formatting

When looking at the rendered HTML version, we can see 'pm_type' is not
displayed with a bold font:

  https://docs.kernel.org/5.19/networking/mptcp-sysctl.html

The empty line under 'pm_type' is then removed to have the same style as
the others.

Fixes: 6bb63ccc25d4 ("mptcp: Add a per-namespace sysctl to set the default path manager type")
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Link: https://lore.kernel.org/r/20220906180404.1255873-2-matthieu.baerts@tessares.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/networking/mptcp-sysctl.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst
index e263dfcc4b40..213510698014 100644
--- a/Documentation/networking/mptcp-sysctl.rst
+++ b/Documentation/networking/mptcp-sysctl.rst
@@ -47,7 +47,6 @@ allow_join_initial_addr_port - BOOLEAN
 	Default: 1
 
 pm_type - INTEGER
-
 	Set the default path manager type to use for each new MPTCP
 	socket. In-kernel path management will control subflow
 	connections and address advertisements according to

From 94160108a70c8af17fa1484a37e05181c0e094af Mon Sep 17 00:00:00 2001
From: Haimin Zhang <tcs.kernel@gmail.com>
Date: Thu, 8 Sep 2022 20:19:27 +0800
Subject: [PATCH 20/83] net/ieee802154: fix uninit value bug in dgram_sendmsg

There is uninit value bug in dgram_sendmsg function in
net/ieee802154/socket.c when the length of valid data pointed by the
msg->msg_name isn't verified.

We introducing a helper function ieee802154_sockaddr_check_size to
check namelen. First we check there is addr_type in ieee802154_addr_sa.
Then, we check namelen according to addr_type.

Also fixed in raw_bind, dgram_bind, dgram_connect.

Signed-off-by: Haimin Zhang <tcs_kernel@tencent.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ieee802154_netdev.h | 37 +++++++++++++++++++++++++++++
 net/ieee802154/socket.c         | 42 ++++++++++++++++++---------------
 2 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h
index d0d188c3294b..a8994f307fc3 100644
--- a/include/net/ieee802154_netdev.h
+++ b/include/net/ieee802154_netdev.h
@@ -15,6 +15,22 @@
 #ifndef IEEE802154_NETDEVICE_H
 #define IEEE802154_NETDEVICE_H
 
+#define IEEE802154_REQUIRED_SIZE(struct_type, member) \
+	(offsetof(typeof(struct_type), member) + \
+	sizeof(((typeof(struct_type) *)(NULL))->member))
+
+#define IEEE802154_ADDR_OFFSET \
+	offsetof(typeof(struct sockaddr_ieee802154), addr)
+
+#define IEEE802154_MIN_NAMELEN (IEEE802154_ADDR_OFFSET + \
+	IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, addr_type))
+
+#define IEEE802154_NAMELEN_SHORT (IEEE802154_ADDR_OFFSET + \
+	IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, short_addr))
+
+#define IEEE802154_NAMELEN_LONG (IEEE802154_ADDR_OFFSET + \
+	IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, hwaddr))
+
 #include <net/af_ieee802154.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
@@ -165,6 +181,27 @@ static inline void ieee802154_devaddr_to_raw(void *raw, __le64 addr)
 	memcpy(raw, &temp, IEEE802154_ADDR_LEN);
 }
 
+static inline int
+ieee802154_sockaddr_check_size(struct sockaddr_ieee802154 *daddr, int len)
+{
+	struct ieee802154_addr_sa *sa;
+
+	sa = &daddr->addr;
+	if (len < IEEE802154_MIN_NAMELEN)
+		return -EINVAL;
+	switch (sa->addr_type) {
+	case IEEE802154_ADDR_SHORT:
+		if (len < IEEE802154_NAMELEN_SHORT)
+			return -EINVAL;
+		break;
+	case IEEE802154_ADDR_LONG:
+		if (len < IEEE802154_NAMELEN_LONG)
+			return -EINVAL;
+		break;
+	}
+	return 0;
+}
+
 static inline void ieee802154_addr_from_sa(struct ieee802154_addr *a,
 					   const struct ieee802154_addr_sa *sa)
 {
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index 718fb77bb372..7889e1ef7fad 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -200,8 +200,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len)
 	int err = 0;
 	struct net_device *dev = NULL;
 
-	if (len < sizeof(*uaddr))
-		return -EINVAL;
+	err = ieee802154_sockaddr_check_size(uaddr, len);
+	if (err < 0)
+		return err;
 
 	uaddr = (struct sockaddr_ieee802154 *)_uaddr;
 	if (uaddr->family != AF_IEEE802154)
@@ -493,7 +494,8 @@ static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len)
 
 	ro->bound = 0;
 
-	if (len < sizeof(*addr))
+	err = ieee802154_sockaddr_check_size(addr, len);
+	if (err < 0)
 		goto out;
 
 	if (addr->family != AF_IEEE802154)
@@ -564,8 +566,9 @@ static int dgram_connect(struct sock *sk, struct sockaddr *uaddr,
 	struct dgram_sock *ro = dgram_sk(sk);
 	int err = 0;
 
-	if (len < sizeof(*addr))
-		return -EINVAL;
+	err = ieee802154_sockaddr_check_size(addr, len);
+	if (err < 0)
+		return err;
 
 	if (addr->family != AF_IEEE802154)
 		return -EINVAL;
@@ -604,6 +607,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	struct ieee802154_mac_cb *cb;
 	struct dgram_sock *ro = dgram_sk(sk);
 	struct ieee802154_addr dst_addr;
+	DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name);
 	int hlen, tlen;
 	int err;
 
@@ -612,10 +616,20 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 		return -EOPNOTSUPP;
 	}
 
-	if (!ro->connected && !msg->msg_name)
-		return -EDESTADDRREQ;
-	else if (ro->connected && msg->msg_name)
-		return -EISCONN;
+	if (msg->msg_name) {
+		if (ro->connected)
+			return -EISCONN;
+		if (msg->msg_namelen < IEEE802154_MIN_NAMELEN)
+			return -EINVAL;
+		err = ieee802154_sockaddr_check_size(daddr, msg->msg_namelen);
+		if (err < 0)
+			return err;
+		ieee802154_addr_from_sa(&dst_addr, &daddr->addr);
+	} else {
+		if (!ro->connected)
+			return -EDESTADDRREQ;
+		dst_addr = ro->dst_addr;
+	}
 
 	if (!ro->bound)
 		dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154);
@@ -651,16 +665,6 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	cb = mac_cb_init(skb);
 	cb->type = IEEE802154_FC_TYPE_DATA;
 	cb->ackreq = ro->want_ack;
-
-	if (msg->msg_name) {
-		DECLARE_SOCKADDR(struct sockaddr_ieee802154*,
-				 daddr, msg->msg_name);
-
-		ieee802154_addr_from_sa(&dst_addr, &daddr->addr);
-	} else {
-		dst_addr = ro->dst_addr;
-	}
-
 	cb->secen = ro->secen;
 	cb->secen_override = ro->secen_override;
 	cb->seclevel = ro->seclevel;

From 9124dbcc2dd6c51e81f97f63f7807118c4eb140a Mon Sep 17 00:00:00 2001
From: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Date: Thu, 8 Sep 2022 16:14:46 +0300
Subject: [PATCH 21/83] net: marvell: prestera: add support for for Aldrin2

Aldrin2 (98DX8525) is a Marvell Prestera PP, with 100G support.

Signed-off-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>

V2:
  - retarget to net tree instead of net-next;
  - fix missed colon in patch subject ('net marvell' vs 'net: mavell');
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/prestera/prestera_pci.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera_pci.c b/drivers/net/ethernet/marvell/prestera/prestera_pci.c
index f538a749ebd4..59470d99f522 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_pci.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_pci.c
@@ -872,6 +872,7 @@ static void prestera_pci_remove(struct pci_dev *pdev)
 static const struct pci_device_id prestera_pci_devices[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0xC804) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0xC80C) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0xCC1E) },
 	{ }
 };
 MODULE_DEVICE_TABLE(pci, prestera_pci_devices);

From 1d9a143ee3408349700f44a9197b7ae0e4faae5d Mon Sep 17 00:00:00 2001
From: Benjamin Poirier <bpoirier@nvidia.com>
Date: Wed, 7 Sep 2022 16:56:39 +0900
Subject: [PATCH 22/83] net: bonding: Share lacpdu_mcast_addr definition

There are already a few definitions of arrays containing
MULTICAST_LACPDU_ADDR and the next patch will add one more use. These all
contain the same constant data so define one common instance for all
bonding code.

Signed-off-by: Benjamin Poirier <bpoirier@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_3ad.c  |  5 +++--
 drivers/net/bonding/bond_main.c | 16 ++++------------
 include/net/bond_3ad.h          |  2 --
 include/net/bonding.h           |  3 +++
 4 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 184608bd8999..e58a1e0cadd2 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -88,8 +88,9 @@ static const u8 null_mac_addr[ETH_ALEN + 2] __long_aligned = {
 static const u16 ad_ticks_per_sec = 1000 / AD_TIMER_INTERVAL;
 static const int ad_delta_in_ticks = (AD_TIMER_INTERVAL * HZ) / 1000;
 
-static const u8 lacpdu_mcast_addr[ETH_ALEN + 2] __long_aligned =
-	MULTICAST_LACPDU_ADDR;
+const u8 lacpdu_mcast_addr[ETH_ALEN + 2] __long_aligned = {
+	0x01, 0x80, 0xC2, 0x00, 0x00, 0x02
+};
 
 /* ================= main 802.3ad protocol functions ================== */
 static int ad_lacpdu_send(struct port *port);
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5c2febe94428..faced8ae4edd 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -865,12 +865,8 @@ static void bond_hw_addr_flush(struct net_device *bond_dev,
 	dev_uc_unsync(slave_dev, bond_dev);
 	dev_mc_unsync(slave_dev, bond_dev);
 
-	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
-		/* del lacpdu mc addr from mc list */
-		u8 lacpdu_multicast[ETH_ALEN] = MULTICAST_LACPDU_ADDR;
-
-		dev_mc_del(slave_dev, lacpdu_multicast);
-	}
+	if (BOND_MODE(bond) == BOND_MODE_8023AD)
+		dev_mc_del(slave_dev, lacpdu_mcast_addr);
 }
 
 /*--------------------------- Active slave change ---------------------------*/
@@ -2171,12 +2167,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 		dev_uc_sync_multiple(slave_dev, bond_dev);
 		netif_addr_unlock_bh(bond_dev);
 
-		if (BOND_MODE(bond) == BOND_MODE_8023AD) {
-			/* add lacpdu mc addr to mc list */
-			u8 lacpdu_multicast[ETH_ALEN] = MULTICAST_LACPDU_ADDR;
-
-			dev_mc_add(slave_dev, lacpdu_multicast);
-		}
+		if (BOND_MODE(bond) == BOND_MODE_8023AD)
+			dev_mc_add(slave_dev, lacpdu_mcast_addr);
 	}
 
 	bond->slave_cnt++;
diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h
index be2992e6de5d..a016f275cb01 100644
--- a/include/net/bond_3ad.h
+++ b/include/net/bond_3ad.h
@@ -15,8 +15,6 @@
 #define PKT_TYPE_LACPDU         cpu_to_be16(ETH_P_SLOW)
 #define AD_TIMER_INTERVAL       100 /*msec*/
 
-#define MULTICAST_LACPDU_ADDR    {0x01, 0x80, 0xC2, 0x00, 0x00, 0x02}
-
 #define AD_LACP_SLOW 0
 #define AD_LACP_FAST 1
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index afd606df149a..e999f851738b 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -786,6 +786,9 @@ extern struct rtnl_link_ops bond_link_ops;
 /* exported from bond_sysfs_slave.c */
 extern const struct sysfs_ops slave_sysfs_ops;
 
+/* exported from bond_3ad.c */
+extern const u8 lacpdu_mcast_addr[];
+
 static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb)
 {
 	dev_core_stats_tx_dropped_inc(dev);

From 86247aba599e5b07d7e828e6edaaebb0ef2b1158 Mon Sep 17 00:00:00 2001
From: Benjamin Poirier <bpoirier@nvidia.com>
Date: Wed, 7 Sep 2022 16:56:40 +0900
Subject: [PATCH 23/83] net: bonding: Unsync device addresses on ndo_stop

Netdev drivers are expected to call dev_{uc,mc}_sync() in their
ndo_set_rx_mode method and dev_{uc,mc}_unsync() in their ndo_stop method.
This is mentioned in the kerneldoc for those dev_* functions.

The bonding driver calls dev_{uc,mc}_unsync() during ndo_uninit instead of
ndo_stop. This is ineffective because address lists (dev->{uc,mc}) have
already been emptied in unregister_netdevice_many() before ndo_uninit is
called. This mistake can result in addresses being leftover on former bond
slaves after a bond has been deleted; see test_LAG_cleanup() in the last
patch in this series.

Add unsync calls, via bond_hw_addr_flush(), at their expected location,
bond_close().
Add dev_mc_add() call to bond_open() to match the above change.

v3:
* When adding or deleting a slave, only sync/unsync, add/del addresses if
  the bond is up. In other cases, it is taken care of at the right time by
  ndo_open/ndo_set_rx_mode/ndo_stop.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Benjamin Poirier <bpoirier@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 47 ++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index faced8ae4edd..bc6d8b0aa6fb 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -886,7 +886,8 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active,
 		if (bond->dev->flags & IFF_ALLMULTI)
 			dev_set_allmulti(old_active->dev, -1);
 
-		bond_hw_addr_flush(bond->dev, old_active->dev);
+		if (bond->dev->flags & IFF_UP)
+			bond_hw_addr_flush(bond->dev, old_active->dev);
 	}
 
 	if (new_active) {
@@ -897,10 +898,12 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active,
 		if (bond->dev->flags & IFF_ALLMULTI)
 			dev_set_allmulti(new_active->dev, 1);
 
-		netif_addr_lock_bh(bond->dev);
-		dev_uc_sync(new_active->dev, bond->dev);
-		dev_mc_sync(new_active->dev, bond->dev);
-		netif_addr_unlock_bh(bond->dev);
+		if (bond->dev->flags & IFF_UP) {
+			netif_addr_lock_bh(bond->dev);
+			dev_uc_sync(new_active->dev, bond->dev);
+			dev_mc_sync(new_active->dev, bond->dev);
+			netif_addr_unlock_bh(bond->dev);
+		}
 	}
 }
 
@@ -2162,13 +2165,15 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 			}
 		}
 
-		netif_addr_lock_bh(bond_dev);
-		dev_mc_sync_multiple(slave_dev, bond_dev);
-		dev_uc_sync_multiple(slave_dev, bond_dev);
-		netif_addr_unlock_bh(bond_dev);
+		if (bond_dev->flags & IFF_UP) {
+			netif_addr_lock_bh(bond_dev);
+			dev_mc_sync_multiple(slave_dev, bond_dev);
+			dev_uc_sync_multiple(slave_dev, bond_dev);
+			netif_addr_unlock_bh(bond_dev);
 
-		if (BOND_MODE(bond) == BOND_MODE_8023AD)
-			dev_mc_add(slave_dev, lacpdu_mcast_addr);
+			if (BOND_MODE(bond) == BOND_MODE_8023AD)
+				dev_mc_add(slave_dev, lacpdu_mcast_addr);
+		}
 	}
 
 	bond->slave_cnt++;
@@ -2439,7 +2444,8 @@ static int __bond_release_one(struct net_device *bond_dev,
 		if (old_flags & IFF_ALLMULTI)
 			dev_set_allmulti(slave_dev, -1);
 
-		bond_hw_addr_flush(bond_dev, slave_dev);
+		if (old_flags & IFF_UP)
+			bond_hw_addr_flush(bond_dev, slave_dev);
 	}
 
 	slave_disable_netpoll(slave);
@@ -4213,6 +4219,9 @@ static int bond_open(struct net_device *bond_dev)
 		/* register to receive LACPDUs */
 		bond->recv_probe = bond_3ad_lacpdu_recv;
 		bond_3ad_initiate_agg_selection(bond, 1);
+
+		bond_for_each_slave(bond, slave, iter)
+			dev_mc_add(slave->dev, lacpdu_mcast_addr);
 	}
 
 	if (bond_mode_can_use_xmit_hash(bond))
@@ -4224,6 +4233,7 @@ static int bond_open(struct net_device *bond_dev)
 static int bond_close(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave;
 
 	bond_work_cancel_all(bond);
 	bond->send_peer_notif = 0;
@@ -4231,6 +4241,19 @@ static int bond_close(struct net_device *bond_dev)
 		bond_alb_deinitialize(bond);
 	bond->recv_probe = NULL;
 
+	if (bond_uses_primary(bond)) {
+		rcu_read_lock();
+		slave = rcu_dereference(bond->curr_active_slave);
+		if (slave)
+			bond_hw_addr_flush(bond_dev, slave->dev);
+		rcu_read_unlock();
+	} else {
+		struct list_head *iter;
+
+		bond_for_each_slave(bond, slave, iter)
+			bond_hw_addr_flush(bond_dev, slave->dev);
+	}
+
 	return 0;
 }
 

From bd60234222b2fd5573526da7bcd422801f271f5f Mon Sep 17 00:00:00 2001
From: Benjamin Poirier <bpoirier@nvidia.com>
Date: Wed, 7 Sep 2022 16:56:41 +0900
Subject: [PATCH 24/83] net: team: Unsync device addresses on ndo_stop

Netdev drivers are expected to call dev_{uc,mc}_sync() in their
ndo_set_rx_mode method and dev_{uc,mc}_unsync() in their ndo_stop method.
This is mentioned in the kerneldoc for those dev_* functions.

The team driver calls dev_{uc,mc}_unsync() during ndo_uninit instead of
ndo_stop. This is ineffective because address lists (dev->{uc,mc}) have
already been emptied in unregister_netdevice_many() before ndo_uninit is
called. This mistake can result in addresses being leftover on former team
ports after a team device has been deleted; see test_LAG_cleanup() in the
last patch in this series.

Add unsync calls at their expected location, team_close().

v3:
* When adding or deleting a port, only sync/unsync addresses if the team
  device is up. In other cases, it is taken care of at the right time by
  ndo_open/ndo_set_rx_mode/ndo_stop.

Fixes: 3d249d4ca7d0 ("net: introduce ethernet teaming device")
Signed-off-by: Benjamin Poirier <bpoirier@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index aac133a1e27a..154a3c0a6dfd 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1275,10 +1275,12 @@ static int team_port_add(struct team *team, struct net_device *port_dev,
 		}
 	}
 
-	netif_addr_lock_bh(dev);
-	dev_uc_sync_multiple(port_dev, dev);
-	dev_mc_sync_multiple(port_dev, dev);
-	netif_addr_unlock_bh(dev);
+	if (dev->flags & IFF_UP) {
+		netif_addr_lock_bh(dev);
+		dev_uc_sync_multiple(port_dev, dev);
+		dev_mc_sync_multiple(port_dev, dev);
+		netif_addr_unlock_bh(dev);
+	}
 
 	port->index = -1;
 	list_add_tail_rcu(&port->list, &team->port_list);
@@ -1349,8 +1351,10 @@ static int team_port_del(struct team *team, struct net_device *port_dev)
 	netdev_rx_handler_unregister(port_dev);
 	team_port_disable_netpoll(port);
 	vlan_vids_del_by_dev(port_dev, dev);
-	dev_uc_unsync(port_dev, dev);
-	dev_mc_unsync(port_dev, dev);
+	if (dev->flags & IFF_UP) {
+		dev_uc_unsync(port_dev, dev);
+		dev_mc_unsync(port_dev, dev);
+	}
 	dev_close(port_dev);
 	team_port_leave(team, port);
 
@@ -1700,6 +1704,14 @@ static int team_open(struct net_device *dev)
 
 static int team_close(struct net_device *dev)
 {
+	struct team *team = netdev_priv(dev);
+	struct team_port *port;
+
+	list_for_each_entry(port, &team->port_list, list) {
+		dev_uc_unsync(port->dev, dev);
+		dev_mc_unsync(port->dev, dev);
+	}
+
 	return 0;
 }
 

From bbb774d921e273ca262944c94011bc2cc888ebeb Mon Sep 17 00:00:00 2001
From: Benjamin Poirier <bpoirier@nvidia.com>
Date: Wed, 7 Sep 2022 16:56:42 +0900
Subject: [PATCH 25/83] net: Add tests for bonding and team address list
 management

Test that the bonding and team drivers clean up an underlying device's
address lists (dev->uc, dev->mc) when the aggregated device is deleted.

Test addition and removal of the LACPDU multicast address on underlying
devices by the bonding driver.

v2:
* add lag_lib.sh to TEST_FILES

v3:
* extend bond_listen_lacpdu_multicast test to init_state up and down cases
* remove some superfluous shell syntax and 'set dev ... up' commands

Signed-off-by: Benjamin Poirier <bpoirier@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                                   |   1 +
 tools/testing/selftests/Makefile              |   1 +
 .../selftests/drivers/net/bonding/Makefile    |   5 +-
 .../selftests/drivers/net/bonding/config      |   1 +
 .../drivers/net/bonding/dev_addr_lists.sh     | 109 ++++++++++++++++++
 .../selftests/drivers/net/bonding/lag_lib.sh  |  61 ++++++++++
 .../selftests/drivers/net/team/Makefile       |   6 +
 .../testing/selftests/drivers/net/team/config |   3 +
 .../drivers/net/team/dev_addr_lists.sh        |  51 ++++++++
 9 files changed, 237 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/drivers/net/bonding/dev_addr_lists.sh
 create mode 100644 tools/testing/selftests/drivers/net/bonding/lag_lib.sh
 create mode 100644 tools/testing/selftests/drivers/net/team/Makefile
 create mode 100644 tools/testing/selftests/drivers/net/team/config
 create mode 100755 tools/testing/selftests/drivers/net/team/dev_addr_lists.sh

diff --git a/MAINTAINERS b/MAINTAINERS
index f1390b8270b2..4007b99b1eb0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19948,6 +19948,7 @@ S:	Supported
 F:	drivers/net/team/
 F:	include/linux/if_team.h
 F:	include/uapi/linux/if_team.h
+F:	tools/testing/selftests/net/team/
 
 TECHNOLOGIC SYSTEMS TS-5500 PLATFORM SUPPORT
 M:	"Savoir-faire Linux Inc." <kernel@savoirfairelinux.com>
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index c2064a35688b..1fc89b8ef433 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -13,6 +13,7 @@ TARGETS += damon
 TARGETS += drivers/dma-buf
 TARGETS += drivers/s390x/uvdevice
 TARGETS += drivers/net/bonding
+TARGETS += drivers/net/team
 TARGETS += efivarfs
 TARGETS += exec
 TARGETS += filesystems
diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile
index ab6c54b12098..0f9659407969 100644
--- a/tools/testing/selftests/drivers/net/bonding/Makefile
+++ b/tools/testing/selftests/drivers/net/bonding/Makefile
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for net selftests
 
-TEST_PROGS := bond-break-lacpdu-tx.sh
+TEST_PROGS := bond-break-lacpdu-tx.sh \
+	      dev_addr_lists.sh
+
+TEST_FILES := lag_lib.sh
 
 include ../../../lib.mk
diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config
index dc1c22de3c92..70638fa50b2c 100644
--- a/tools/testing/selftests/drivers/net/bonding/config
+++ b/tools/testing/selftests/drivers/net/bonding/config
@@ -1 +1,2 @@
 CONFIG_BONDING=y
+CONFIG_MACVLAN=y
diff --git a/tools/testing/selftests/drivers/net/bonding/dev_addr_lists.sh b/tools/testing/selftests/drivers/net/bonding/dev_addr_lists.sh
new file mode 100755
index 000000000000..e6fa24eded5b
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/bonding/dev_addr_lists.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test bond device handling of addr lists (dev->uc, mc)
+#
+
+ALL_TESTS="
+	bond_cleanup_mode1
+	bond_cleanup_mode4
+	bond_listen_lacpdu_multicast_case_down
+	bond_listen_lacpdu_multicast_case_up
+"
+
+REQUIRE_MZ=no
+NUM_NETIFS=0
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/forwarding/lib.sh
+
+source "$lib_dir"/lag_lib.sh
+
+
+destroy()
+{
+	local ifnames=(dummy1 dummy2 bond1 mv0)
+	local ifname
+
+	for ifname in "${ifnames[@]}"; do
+		ip link del "$ifname" &>/dev/null
+	done
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	destroy
+}
+
+
+# bond driver control paths vary between modes that have a primary slave
+# (bond_uses_primary()) and others. Test both kinds of modes.
+
+bond_cleanup_mode1()
+{
+	RET=0
+
+	test_LAG_cleanup "bonding" "active-backup"
+}
+
+bond_cleanup_mode4() {
+	RET=0
+
+	test_LAG_cleanup "bonding" "802.3ad"
+}
+
+bond_listen_lacpdu_multicast()
+{
+	# Initial state of bond device, up | down
+	local init_state=$1
+	local lacpdu_mc="01:80:c2:00:00:02"
+
+	ip link add dummy1 type dummy
+	ip link add bond1 "$init_state" type bond mode 802.3ad
+	ip link set dev dummy1 master bond1
+	if [ "$init_state" = "down" ]; then
+		ip link set dev bond1 up
+	fi
+
+	grep_bridge_fdb "$lacpdu_mc" bridge fdb show brport dummy1 >/dev/null
+	check_err $? "LACPDU multicast address not present on slave (1)"
+
+	ip link set dev bond1 down
+
+	not grep_bridge_fdb "$lacpdu_mc" bridge fdb show brport dummy1 >/dev/null
+	check_err $? "LACPDU multicast address still present on slave"
+
+	ip link set dev bond1 up
+
+	grep_bridge_fdb "$lacpdu_mc" bridge fdb show brport dummy1 >/dev/null
+	check_err $? "LACPDU multicast address not present on slave (2)"
+
+	cleanup
+
+	log_test "bonding LACPDU multicast address to slave (from bond $init_state)"
+}
+
+# The LACPDU mc addr is added by different paths depending on the initial state
+# of the bond when enslaving a device. Test both cases.
+
+bond_listen_lacpdu_multicast_case_down()
+{
+	RET=0
+
+	bond_listen_lacpdu_multicast "down"
+}
+
+bond_listen_lacpdu_multicast_case_up()
+{
+	RET=0
+
+	bond_listen_lacpdu_multicast "up"
+}
+
+
+trap cleanup EXIT
+
+tests_run
+
+exit "$EXIT_STATUS"
diff --git a/tools/testing/selftests/drivers/net/bonding/lag_lib.sh b/tools/testing/selftests/drivers/net/bonding/lag_lib.sh
new file mode 100644
index 000000000000..16c7fb858ac1
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/bonding/lag_lib.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test that a link aggregation device (bonding, team) removes the hardware
+# addresses that it adds on its underlying devices.
+test_LAG_cleanup()
+{
+	local driver=$1
+	local mode=$2
+	local ucaddr="02:00:00:12:34:56"
+	local addr6="fe80::78:9abc/64"
+	local mcaddr="33:33:ff:78:9a:bc"
+	local name
+
+	ip link add dummy1 type dummy
+	ip link add dummy2 type dummy
+	if [ "$driver" = "bonding" ]; then
+		name="bond1"
+		ip link add "$name" up type bond mode "$mode"
+		ip link set dev dummy1 master "$name"
+		ip link set dev dummy2 master "$name"
+	elif [ "$driver" = "team" ]; then
+		name="team0"
+		teamd -d -c '
+			{
+				"device": "'"$name"'",
+				"runner": {
+					"name": "'"$mode"'"
+				},
+				"ports": {
+					"dummy1":
+						{},
+					"dummy2":
+						{}
+				}
+			}
+		'
+		ip link set dev "$name" up
+	else
+		check_err 1
+		log_test test_LAG_cleanup ": unknown driver \"$driver\""
+		return
+	fi
+
+	# Used to test dev->uc handling
+	ip link add mv0 link "$name" up address "$ucaddr" type macvlan
+	# Used to test dev->mc handling
+	ip address add "$addr6" dev "$name"
+	ip link set dev "$name" down
+	ip link del "$name"
+
+	not grep_bridge_fdb "$ucaddr" bridge fdb show >/dev/null
+	check_err $? "macvlan unicast address still present on a slave"
+
+	not grep_bridge_fdb "$mcaddr" bridge fdb show >/dev/null
+	check_err $? "IPv6 solicited-node multicast mac address still present on a slave"
+
+	cleanup
+
+	log_test "$driver cleanup mode $mode"
+}
diff --git a/tools/testing/selftests/drivers/net/team/Makefile b/tools/testing/selftests/drivers/net/team/Makefile
new file mode 100644
index 000000000000..642d8df1c137
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/team/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for net selftests
+
+TEST_PROGS := dev_addr_lists.sh
+
+include ../../../lib.mk
diff --git a/tools/testing/selftests/drivers/net/team/config b/tools/testing/selftests/drivers/net/team/config
new file mode 100644
index 000000000000..265b6882cc21
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/team/config
@@ -0,0 +1,3 @@
+CONFIG_NET_TEAM=y
+CONFIG_NET_TEAM_MODE_LOADBALANCE=y
+CONFIG_MACVLAN=y
diff --git a/tools/testing/selftests/drivers/net/team/dev_addr_lists.sh b/tools/testing/selftests/drivers/net/team/dev_addr_lists.sh
new file mode 100755
index 000000000000..debda7262956
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/team/dev_addr_lists.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test team device handling of addr lists (dev->uc, mc)
+#
+
+ALL_TESTS="
+	team_cleanup
+"
+
+REQUIRE_MZ=no
+NUM_NETIFS=0
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/forwarding/lib.sh
+
+source "$lib_dir"/../bonding/lag_lib.sh
+
+
+destroy()
+{
+	local ifnames=(dummy0 dummy1 team0 mv0)
+	local ifname
+
+	for ifname in "${ifnames[@]}"; do
+		ip link del "$ifname" &>/dev/null
+	done
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	destroy
+}
+
+
+team_cleanup()
+{
+	RET=0
+
+	test_LAG_cleanup "team" "lacp"
+}
+
+
+require_command teamd
+
+trap cleanup EXIT
+
+tests_run
+
+exit "$EXIT_STATUS"

From 96628951869c0dedf0377adca01c8675172d8639 Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Thu, 8 Sep 2022 16:15:23 -0700
Subject: [PATCH 26/83] tcp: Use WARN_ON_ONCE() in tcp_read_skb()

Prevent tcp_read_skb() from flooding the syslog.

Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6cdfce6f2867..3488388eea5d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1766,7 +1766,7 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 		return 0;
 
 	__skb_unlink(skb, &sk->sk_receive_queue);
-	WARN_ON(!skb_set_owner_sk_safe(skb, sk));
+	WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
 	copied = recv_actor(sk, skb);
 	if (copied >= 0) {
 		seq += copied;

From 66039eb9015eee4f7ff0c99b83c65c7ecb3c8190 Mon Sep 17 00:00:00 2001
From: Norbert Zulinski <norbertx.zulinski@intel.com>
Date: Wed, 14 Sep 2022 15:39:13 +0200
Subject: [PATCH 27/83] iavf: Fix bad page state

Fix bad page state, free inappropriate page in handling dummy
descriptor. iavf_build_skb now has to check not only if rx_buffer is
NULL but also if size is zero, same thing in iavf_clean_rx_irq.
Without this patch driver would free page that will be used
by napi_build_skb.

Fixes: a9f49e006030 ("iavf: Fix handling of dummy receive descriptors")
Signed-off-by: Norbert Zulinski <norbertx.zulinski@intel.com>
Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_txrx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index 4c3f3f419110..18b6a702a1d6 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -1393,7 +1393,7 @@ static struct sk_buff *iavf_build_skb(struct iavf_ring *rx_ring,
 #endif
 	struct sk_buff *skb;
 
-	if (!rx_buffer)
+	if (!rx_buffer || !size)
 		return NULL;
 	/* prefetch first cache line of first page */
 	va = page_address(rx_buffer->page) + rx_buffer->page_offset;
@@ -1551,7 +1551,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
 		/* exit if we failed to retrieve a buffer */
 		if (!skb) {
 			rx_ring->rx_stats.alloc_buff_failed++;
-			if (rx_buffer)
+			if (rx_buffer && size)
 				rx_buffer->pagecnt_bias++;
 			break;
 		}

From 182447b12144b7be9b63a273d27c5a11bd54960a Mon Sep 17 00:00:00 2001
From: David Thompson <davthompson@nvidia.com>
Date: Fri, 2 Sep 2022 12:42:47 -0400
Subject: [PATCH 28/83] mlxbf_gige: clear MDIO gateway lock after read

The MDIO gateway (GW) lock in BlueField-2 GIGE logic is
set after read.  This patch adds logic to make sure the
lock is always cleared at the end of each MDIO transaction.

Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver")
Reviewed-by: Asmaa Mnebhi <asmaa@nvidia.com>
Signed-off-by: David Thompson <davthompson@nvidia.com>
Link: https://lore.kernel.org/r/20220902164247.19862-1-davthompson@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c
index 85155cd9405c..4aeb927c3715 100644
--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c
+++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c
@@ -179,6 +179,9 @@ static int mlxbf_gige_mdio_read(struct mii_bus *bus, int phy_add, int phy_reg)
 	/* Only return ad bits of the gw register */
 	ret &= MLXBF_GIGE_MDIO_GW_AD_MASK;
 
+	/* The MDIO lock is set on read. To release it, clear gw register */
+	writel(0, priv->mdio_io + MLXBF_GIGE_MDIO_GW_OFFSET);
+
 	return ret;
 }
 
@@ -203,6 +206,9 @@ static int mlxbf_gige_mdio_write(struct mii_bus *bus, int phy_add,
 					temp, !(temp & MLXBF_GIGE_MDIO_GW_BUSY_MASK),
 					5, 1000000);
 
+	/* The MDIO lock is set on read. To release it, clear gw register */
+	writel(0, priv->mdio_io + MLXBF_GIGE_MDIO_GW_OFFSET);
+
 	return ret;
 }
 

From 399c98c4dc50b7eb7e9f24da7ffdda6f025676ef Mon Sep 17 00:00:00 2001
From: Michal Jaron <michalx.jaron@intel.com>
Date: Tue, 13 Sep 2022 15:38:35 +0200
Subject: [PATCH 29/83] iavf: Fix set max MTU size with port VLAN and jumbo
 frames

After setting port VLAN and MTU to 9000 on VF with ice driver there
was an iavf error
"PF returned error -5 (IAVF_ERR_PARAM) to our request 6".

During queue configuration, VF's max packet size was set to
IAVF_MAX_RXBUFFER but on ice max frame size was smaller by VLAN_HLEN
due to making some space for port VLAN as VF is not aware whether it's
in a port VLAN. This mismatch in sizes caused ice to reject queue
configuration with ERR_PARAM error. Proper max_mtu is sent from ice PF
to VF with GET_VF_RESOURCES msg but VF does not look at this.

In iavf change max_frame from IAVF_MAX_RXBUFFER to max_mtu
received from pf with GET_VF_RESOURCES msg to make vf's
max_frame_size dependent from pf. Add check if received max_mtu is
not in eligible range then set it to IAVF_MAX_RXBUFFER.

Fixes: dab86afdbbd1 ("i40e/i40evf: Change the way we limit the maximum frame size for Rx")
Signed-off-by: Michal Jaron <michalx.jaron@intel.com>
Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
index 15ee85dc33bd..5a9e6563923e 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
@@ -269,11 +269,14 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter)
 void iavf_configure_queues(struct iavf_adapter *adapter)
 {
 	struct virtchnl_vsi_queue_config_info *vqci;
-	struct virtchnl_queue_pair_info *vqpi;
+	int i, max_frame = adapter->vf_res->max_mtu;
 	int pairs = adapter->num_active_queues;
-	int i, max_frame = IAVF_MAX_RXBUFFER;
+	struct virtchnl_queue_pair_info *vqpi;
 	size_t len;
 
+	if (max_frame > IAVF_MAX_RXBUFFER || !max_frame)
+		max_frame = IAVF_MAX_RXBUFFER;
+
 	if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) {
 		/* bail because we already have a command pending */
 		dev_err(&adapter->pdev->dev, "Cannot configure queues, command %d pending\n",

From 372539def2824c43b6afe2403045b140f65c5acc Mon Sep 17 00:00:00 2001
From: Michal Jaron <michalx.jaron@intel.com>
Date: Tue, 13 Sep 2022 15:38:36 +0200
Subject: [PATCH 30/83] i40e: Fix VF set max MTU size

Max MTU sent to VF is set to 0 during memory allocation. It cause
that max MTU on VF is changed to IAVF_MAX_RXBUFFER and does not
depend on data from HW.

Set max_mtu field in virtchnl_vf_resource struct to inform
VF in GET_VF_RESOURCES msg what size should be max frame.

Fixes: dab86afdbbd1 ("i40e/i40evf: Change the way we limit the maximum frame size for Rx")
Signed-off-by: Michal Jaron <michalx.jaron@intel.com>
Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 .../ethernet/intel/i40e/i40e_virtchnl_pf.c    | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 4f184c50f6e8..7e9f6a69eb10 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -2038,6 +2038,25 @@ static void i40e_del_qch(struct i40e_vf *vf)
 	}
 }
 
+/**
+ * i40e_vc_get_max_frame_size
+ * @vf: pointer to the VF
+ *
+ * Max frame size is determined based on the current port's max frame size and
+ * whether a port VLAN is configured on this VF. The VF is not aware whether
+ * it's in a port VLAN so the PF needs to account for this in max frame size
+ * checks and sending the max frame size to the VF.
+ **/
+static u16 i40e_vc_get_max_frame_size(struct i40e_vf *vf)
+{
+	u16 max_frame_size = vf->pf->hw.phy.link_info.max_frame_size;
+
+	if (vf->port_vlan_id)
+		max_frame_size -= VLAN_HLEN;
+
+	return max_frame_size;
+}
+
 /**
  * i40e_vc_get_vf_resources_msg
  * @vf: pointer to the VF info
@@ -2139,6 +2158,7 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg)
 	vfres->max_vectors = pf->hw.func_caps.num_msix_vectors_vf;
 	vfres->rss_key_size = I40E_HKEY_ARRAY_SIZE;
 	vfres->rss_lut_size = I40E_VF_HLUT_ARRAY_SIZE;
+	vfres->max_mtu = i40e_vc_get_max_frame_size(vf);
 
 	if (vf->lan_vsi_idx) {
 		vfres->vsi_res[0].vsi_id = vf->lan_vsi_id;

From 198eb7e1b81d8ba676d0f4f120c092032ae69a8e Mon Sep 17 00:00:00 2001
From: Michal Jaron <michalx.jaron@intel.com>
Date: Thu, 1 Sep 2022 09:49:33 +0200
Subject: [PATCH 31/83] i40e: Fix set max_tx_rate when it is lower than 1 Mbps

While converting max_tx_rate from bytes to Mbps, this value was set to 0,
if the original value was lower than 125000 bytes (1 Mbps). This would
cause no transmission rate limiting to occur. This happened due to lack of
check of max_tx_rate against the 1 Mbps value for max_tx_rate and the
following division by 125000. Fix this issue by adding a helper
i40e_bw_bytes_to_mbits() which sets max_tx_rate to minimum usable value of
50 Mbps, if its value is less than 1 Mbps, otherwise do the required
conversion by dividing by 125000.

Fixes: 5ecae4120a6b ("i40e: Refactor VF BW rate limiting")
Signed-off-by: Michal Jaron <michalx.jaron@intel.com>
Signed-off-by: Andrii Staikov <andrii.staikov@intel.com>
Tested-by: Bharathi Sreenivas <bharathi.sreenivas@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 32 +++++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 10c1e1ea83a1..e3d9804aeb25 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5908,6 +5908,26 @@ static int i40e_get_link_speed(struct i40e_vsi *vsi)
 	}
 }
 
+/**
+ * i40e_bw_bytes_to_mbits - Convert max_tx_rate from bytes to mbits
+ * @vsi: Pointer to vsi structure
+ * @max_tx_rate: max TX rate in bytes to be converted into Mbits
+ *
+ * Helper function to convert units before send to set BW limit
+ **/
+static u64 i40e_bw_bytes_to_mbits(struct i40e_vsi *vsi, u64 max_tx_rate)
+{
+	if (max_tx_rate < I40E_BW_MBPS_DIVISOR) {
+		dev_warn(&vsi->back->pdev->dev,
+			 "Setting max tx rate to minimum usable value of 50Mbps.\n");
+		max_tx_rate = I40E_BW_CREDIT_DIVISOR;
+	} else {
+		do_div(max_tx_rate, I40E_BW_MBPS_DIVISOR);
+	}
+
+	return max_tx_rate;
+}
+
 /**
  * i40e_set_bw_limit - setup BW limit for Tx traffic based on max_tx_rate
  * @vsi: VSI to be configured
@@ -5930,10 +5950,10 @@ int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate)
 			max_tx_rate, seid);
 		return -EINVAL;
 	}
-	if (max_tx_rate && max_tx_rate < 50) {
+	if (max_tx_rate && max_tx_rate < I40E_BW_CREDIT_DIVISOR) {
 		dev_warn(&pf->pdev->dev,
 			 "Setting max tx rate to minimum usable value of 50Mbps.\n");
-		max_tx_rate = 50;
+		max_tx_rate = I40E_BW_CREDIT_DIVISOR;
 	}
 
 	/* Tx rate credits are in values of 50Mbps, 0 is disabled */
@@ -8224,9 +8244,9 @@ static int i40e_setup_tc(struct net_device *netdev, void *type_data)
 
 	if (i40e_is_tc_mqprio_enabled(pf)) {
 		if (vsi->mqprio_qopt.max_rate[0]) {
-			u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0];
+			u64 max_tx_rate = i40e_bw_bytes_to_mbits(vsi,
+						  vsi->mqprio_qopt.max_rate[0]);
 
-			do_div(max_tx_rate, I40E_BW_MBPS_DIVISOR);
 			ret = i40e_set_bw_limit(vsi, vsi->seid, max_tx_rate);
 			if (!ret) {
 				u64 credits = max_tx_rate;
@@ -10971,10 +10991,10 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired)
 	}
 
 	if (vsi->mqprio_qopt.max_rate[0]) {
-		u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0];
+		u64 max_tx_rate = i40e_bw_bytes_to_mbits(vsi,
+						  vsi->mqprio_qopt.max_rate[0]);
 		u64 credits = 0;
 
-		do_div(max_tx_rate, I40E_BW_MBPS_DIVISOR);
 		ret = i40e_set_bw_limit(vsi, vsi->seid, max_tx_rate);
 		if (ret)
 			goto end_unlock;

From 34513ada53eb3e3f711250d8dbc2de4de493d510 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Fri, 9 Sep 2022 18:38:30 +0300
Subject: [PATCH 32/83] netdevsim: Fix hwstats debugfs file permissions

The hwstats debugfs files are only writeable, but they are created with
read and write permissions, causing certain selftests to fail [1].

Fix by creating the files with write permission only.

[1]
 # ./test_offload.py
 Test destruction of generic XDP...
 Traceback (most recent call last):
   File "/home/idosch/code/linux/tools/testing/selftests/bpf/./test_offload.py", line 810, in <module>
     simdev = NetdevSimDev()
 [...]
 Exception: Command failed: cat /sys/kernel/debug/netdevsim/netdevsim0//ports/0/dev/hwstats/l3/disable_ifindex

 cat: /sys/kernel/debug/netdevsim/netdevsim0//ports/0/dev/hwstats/l3/disable_ifindex: Invalid argument

Fixes: 1a6d7ae7d63c ("netdevsim: Introduce support for L3 offload xstats")
Reported-by: Jie2x Zhou <jie2x.zhou@intel.com>
Tested-by: Jie2x Zhou <jie2x.zhou@intel.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://lore.kernel.org/r/20220909153830.3732504-1-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/netdevsim/hwstats.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/netdevsim/hwstats.c b/drivers/net/netdevsim/hwstats.c
index 605a38e16db0..0e58aa7f0374 100644
--- a/drivers/net/netdevsim/hwstats.c
+++ b/drivers/net/netdevsim/hwstats.c
@@ -433,11 +433,11 @@ int nsim_dev_hwstats_init(struct nsim_dev *nsim_dev)
 		goto err_remove_hwstats_recursive;
 	}
 
-	debugfs_create_file("enable_ifindex", 0600, hwstats->l3_ddir, hwstats,
+	debugfs_create_file("enable_ifindex", 0200, hwstats->l3_ddir, hwstats,
 			    &nsim_dev_hwstats_l3_enable_fops.fops);
-	debugfs_create_file("disable_ifindex", 0600, hwstats->l3_ddir, hwstats,
+	debugfs_create_file("disable_ifindex", 0200, hwstats->l3_ddir, hwstats,
 			    &nsim_dev_hwstats_l3_disable_fops.fops);
-	debugfs_create_file("fail_next_enable", 0600, hwstats->l3_ddir, hwstats,
+	debugfs_create_file("fail_next_enable", 0200, hwstats->l3_ddir, hwstats,
 			    &nsim_dev_hwstats_l3_fail_fops.fops);
 
 	INIT_DELAYED_WORK(&hwstats->traffic_dw,

From 6fb2dbdb26893b6423cbf419a56a85d824619fd8 Mon Sep 17 00:00:00 2001
From: Jeroen de Borst <jeroendb@google.com>
Date: Tue, 13 Sep 2022 11:53:19 -0700
Subject: [PATCH 33/83] MAINTAINERS: gve: update developers

Updating active developers.

Signed-off-by: Jeroen de Borst <jeroendb@google.com>
Link: https://lore.kernel.org/r/20220913185319.1061909-1-jeroendb@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 4007b99b1eb0..faf65a953e54 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8652,8 +8652,8 @@ F:	drivers/input/touchscreen/goodix*
 
 GOOGLE ETHERNET DRIVERS
 M:	Jeroen de Borst <jeroendb@google.com>
-R:	Catherine Sullivan <csully@google.com>
-R:	David Awogbemila <awogbemila@google.com>
+M:	Catherine Sullivan <csully@google.com>
+R:	Shailend Chand <shailend@google.com>
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	Documentation/networking/device_drivers/ethernet/google/gve.rst

From 6fd2c68da55c552f86e401ebe40c4a619025ef69 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Sun, 11 Sep 2022 13:40:05 -0700
Subject: [PATCH 34/83] net: mana: Add rmb after checking owner bits

Per GDMA spec, rmb is necessary after checking owner_bits, before
reading EQ or CQ entries.

Add rmb in these two places to comply with the specs.

Cc: stable@vger.kernel.org
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Reported-by: Sinan Kaya <Sinan.Kaya@microsoft.com>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Dexuan Cui <decui@microsoft.com>
Link: https://lore.kernel.org/r/1662928805-15861-1-git-send-email-haiyangz@microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microsoft/mana/gdma_main.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 5f9240182351..e10b9f8f89e1 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -397,6 +397,11 @@ static void mana_gd_process_eq_events(void *arg)
 			break;
 		}
 
+		/* Per GDMA spec, rmb is necessary after checking owner_bits, before
+		 * reading eqe.
+		 */
+		rmb();
+
 		mana_gd_process_eqe(eq);
 
 		eq->head++;
@@ -1134,6 +1139,11 @@ static int mana_gd_read_cqe(struct gdma_queue *cq, struct gdma_comp *comp)
 	if (WARN_ON_ONCE(owner_bits != new_bits))
 		return -1;
 
+	/* Per GDMA spec, rmb is necessary after checking owner_bits, before
+	 * reading completion info
+	 */
+	rmb();
+
 	comp->wq_num = cqe->cqe_info.wq_num;
 	comp->is_sq = cqe->cqe_info.is_sq;
 	memcpy(comp->cqe_data, cqe->cqe_data, GDMA_COMP_DATA_SIZE);

From 5e69163d3b9931098922b3fc2f8e786af8c1f37e Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 13 Sep 2022 15:03:05 +0200
Subject: [PATCH 35/83] net: ethernet: mtk_eth_soc: enable XDP support just for
 MT7986 SoC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Disable page_pool/XDP support for MT7621 SoC in order fix a regression
introduce adding XDP for MT7986 SoC. There is no a real use case for XDP
on MT7621 since it is a low-end cpu. Moreover this patch reduces the
memory footprint.

Tested-by: Sergio Paracuellos <sergio.paracuellos@gmail.com>
Tested-by: Arınç ÜNAL <arinc.unal@arinc9.com>
Fixes: 23233e577ef9 ("net: ethernet: mtk_eth_soc: rely on page_pool for single page buffers")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/2bf31e27b888c43228b0d84dd2ef5033338269e2.1663074002.git.lorenzo@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 5ace4609de47..b344632beadd 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1458,7 +1458,7 @@ static void mtk_update_rx_cpu_idx(struct mtk_eth *eth)
 
 static bool mtk_page_pool_enabled(struct mtk_eth *eth)
 {
-	return !eth->hwlro;
+	return MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2);
 }
 
 static struct page_pool *mtk_create_page_pool(struct mtk_eth *eth,

From f232af4295653afa4ade3230462b3be15ad16419 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= <ihuguet@redhat.com>
Date: Wed, 14 Sep 2022 12:36:48 +0200
Subject: [PATCH 36/83] sfc: fix TX channel offset when using legacy interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In legacy interrupt mode the tx_channel_offset was hardcoded to 1, but
that's not correct if efx_sepparate_tx_channels is false. In that case,
the offset is 0 because the tx queues are in the single existing channel
at index 0, together with the rx queue.

Without this fix, as soon as you try to send any traffic, it tries to
get the tx queues from an uninitialized channel getting these errors:
  WARNING: CPU: 1 PID: 0 at drivers/net/ethernet/sfc/tx.c:540 efx_hard_start_xmit+0x12e/0x170 [sfc]
  [...]
  RIP: 0010:efx_hard_start_xmit+0x12e/0x170 [sfc]
  [...]
  Call Trace:
   <IRQ>
   dev_hard_start_xmit+0xd7/0x230
   sch_direct_xmit+0x9f/0x360
   __dev_queue_xmit+0x890/0xa40
  [...]
  BUG: unable to handle kernel NULL pointer dereference at 0000000000000020
  [...]
  RIP: 0010:efx_hard_start_xmit+0x153/0x170 [sfc]
  [...]
  Call Trace:
   <IRQ>
   dev_hard_start_xmit+0xd7/0x230
   sch_direct_xmit+0x9f/0x360
   __dev_queue_xmit+0x890/0xa40
  [...]

Fixes: c308dfd1b43e ("sfc: fix wrong tx channel offset with efx_separate_tx_channels")
Reported-by: Tianhao Zhao <tizhao@redhat.com>
Signed-off-by: Íñigo Huguet <ihuguet@redhat.com>
Acked-by: Edward Cree <ecree.xilinx@gmail.com>
Link: https://lore.kernel.org/r/20220914103648.16902-1-ihuguet@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/sfc/efx_channels.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c
index 032b8c0bd788..5b4d661ab986 100644
--- a/drivers/net/ethernet/sfc/efx_channels.c
+++ b/drivers/net/ethernet/sfc/efx_channels.c
@@ -319,7 +319,7 @@ int efx_probe_interrupts(struct efx_nic *efx)
 		efx->n_channels = 1 + (efx_separate_tx_channels ? 1 : 0);
 		efx->n_rx_channels = 1;
 		efx->n_tx_channels = 1;
-		efx->tx_channel_offset = 1;
+		efx->tx_channel_offset = efx_separate_tx_channels ? 1 : 0;
 		efx->n_xdp_channels = 0;
 		efx->xdp_channel_offset = efx->n_channels;
 		efx->legacy_irq = efx->pci_dev->irq;

From 0a242eb2913a4aa3d6fbdb86559f27628e9466f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= <ihuguet@redhat.com>
Date: Wed, 14 Sep 2022 13:11:35 +0200
Subject: [PATCH 37/83] sfc: fix null pointer dereference in
 efx_hard_start_xmit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Trying to get the channel from the tx_queue variable here is wrong
because we can only be here if tx_queue is NULL, so we shouldn't
dereference it. As the above comment in the code says, this is very
unlikely to happen, but it's wrong anyway so let's fix it.

I hit this issue because of a different bug that caused tx_queue to be
NULL. If that happens, this is the error message that we get here:
  BUG: unable to handle kernel NULL pointer dereference at 0000000000000020
  [...]
  RIP: 0010:efx_hard_start_xmit+0x153/0x170 [sfc]

Fixes: 12804793b17c ("sfc: decouple TXQ type from label")
Reported-by: Tianhao Zhao <tizhao@redhat.com>
Signed-off-by: Íñigo Huguet <ihuguet@redhat.com>
Acked-by: Edward Cree <ecree.xilinx@gmail.com>
Link: https://lore.kernel.org/r/20220914111135.21038-1-ihuguet@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/sfc/tx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index d12474042c84..c5f88f7a7a04 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -549,7 +549,7 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
 		 * previous packets out.
 		 */
 		if (!netdev_xmit_more())
-			efx_tx_send_pending(tx_queue->channel);
+			efx_tx_send_pending(efx_get_tx_channel(efx, index));
 		return NETDEV_TX_OK;
 	}
 

From ae8ffba8baad651af706538e8c47d0a049d406c6 Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vfedorenko@novek.ru>
Date: Fri, 16 Sep 2022 02:49:32 +0300
Subject: [PATCH 38/83] bnxt_en: fix flags to check for supported fw version

The warning message of unsupported FW appears every time RX timestamps
are disabled on the interface. The patch fixes the flags to correct set
for the check.

Fixes: 66ed81dcedc6 ("bnxt_en: Enable packet timestamping for all RX packets")
Cc: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Vadim Fedorenko <vfedorenko@novek.ru>
Reviewed-by: Andy Gospodarek <gospo@broadcom.com>
Reviewed-by: Michael Chan <michael.chan@broadcom.com>
Link: https://lore.kernel.org/r/20220915234932.25497-1-vfedorenko@novek.ru
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
index 7f3c0875b6f5..8e316367f6ce 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
@@ -317,9 +317,9 @@ void bnxt_ptp_cfg_tstamp_filters(struct bnxt *bp)
 
 	if (!(bp->fw_cap & BNXT_FW_CAP_RX_ALL_PKT_TS) && (ptp->tstamp_filters &
 	    (PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_ENABLE |
-	     PORT_MAC_CFG_REQ_FLAGS_PTP_RX_TS_CAPTURE_DISABLE))) {
+	     PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_DISABLE))) {
 		ptp->tstamp_filters &= ~(PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_ENABLE |
-					 PORT_MAC_CFG_REQ_FLAGS_PTP_RX_TS_CAPTURE_DISABLE);
+					 PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_DISABLE);
 		netdev_warn(bp->dev, "Unsupported FW for all RX pkts timestamp filter\n");
 	}
 

From 8ccac4edc8da764389d4fc18b1df740892006557 Mon Sep 17 00:00:00 2001
From: Shailend Chand <shailend@google.com>
Date: Mon, 12 Sep 2022 17:09:01 -0700
Subject: [PATCH 39/83] gve: Fix GFP flags when allocing pages

Use GFP_ATOMIC when allocating pages out of the hotpath,
continue to use GFP_KERNEL when allocating pages during setup.

GFP_KERNEL will allow blocking which allows it to succeed
more often in a low memory enviornment but in the hotpath we do
not want to allow the allocation to block.

Fixes: 9b8dd5e5ea48b ("gve: DQO: Add RX path")
Signed-off-by: Shailend Chand <shailend@google.com>
Signed-off-by: Jeroen de Borst <jeroendb@google.com>
Link: https://lore.kernel.org/r/20220913000901.959546-1-jeroendb@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/google/gve/gve_rx_dqo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c
index 8c939628e2d8..2e6461b0ea8b 100644
--- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c
+++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c
@@ -157,7 +157,7 @@ static int gve_alloc_page_dqo(struct gve_priv *priv,
 	int err;
 
 	err = gve_alloc_page(priv, &priv->pdev->dev, &buf_state->page_info.page,
-			     &buf_state->addr, DMA_FROM_DEVICE, GFP_KERNEL);
+			     &buf_state->addr, DMA_FROM_DEVICE, GFP_ATOMIC);
 	if (err)
 		return err;
 

From 807e5eda2078327d1adaf213cf7d6b3862cad1f4 Mon Sep 17 00:00:00 2001
From: Rakesh Sankaranarayanan <rakesh.sankaranarayanan@microchip.com>
Date: Mon, 12 Sep 2022 10:42:28 +0530
Subject: [PATCH 40/83] net: dsa: microchip: lan937x: fix maximum frame length
 check

Maximum frame length check is enabled in lan937x switch on POR, But it
is found to be disabled on driver during port setup operation. Due to
this, packets are not dropped when transmitted with greater than configured
value. For testing, setup made for lan1->lan2 transmission and configured
lan1 interface with a frame length (less than 1500 as mentioned in
documentation) and transmitted packets with greater than configured value.
Expected no packets at lan2 end, but packets observed at lan2.

Based on the documentation, packets should get discarded if the actual
packet length doesn't match the frame length configured. Frame length check
should be disabled only for cascaded ports due to tailtags.

This feature was disabled on ksz9477 series due to ptp issue, which is
not in lan937x series. But since lan937x took ksz9477 as base, frame
length check disabled here as well. Patch added to remove this portion
from port setup so that maximum frame length check will be active for
normal ports.

Fixes: 55ab6ffaf378 ("net: dsa: microchip: add DSA support for microchip LAN937x")
Signed-off-by: Rakesh Sankaranarayanan <rakesh.sankaranarayanan@microchip.com>
Link: https://lore.kernel.org/r/20220912051228.1306074-1-rakesh.sankaranarayanan@microchip.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/dsa/microchip/lan937x_main.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/dsa/microchip/lan937x_main.c b/drivers/net/dsa/microchip/lan937x_main.c
index daedd2bf20c1..5579644e8fde 100644
--- a/drivers/net/dsa/microchip/lan937x_main.c
+++ b/drivers/net/dsa/microchip/lan937x_main.c
@@ -244,10 +244,6 @@ void lan937x_port_setup(struct ksz_device *dev, int port, bool cpu_port)
 		lan937x_port_cfg(dev, port, REG_PORT_CTRL_0,
 				 PORT_TAIL_TAG_ENABLE, true);
 
-	/* disable frame check length field */
-	lan937x_port_cfg(dev, port, REG_PORT_MAC_CTRL_0, PORT_CHECK_LENGTH,
-			 false);
-
 	/* set back pressure for half duplex */
 	lan937x_port_cfg(dev, port, REG_PORT_MAC_CTRL_1, PORT_BACK_PRESSURE,
 			 true);

From 7b15515fc1ca3b320fe37793aaaf6f56cc964ef4 Mon Sep 17 00:00:00 2001
From: Francesco Dolcini <francesco.dolcini@toradex.com>
Date: Mon, 12 Sep 2022 09:01:42 +0200
Subject: [PATCH 41/83] Revert "fec: Restart PPS after link state change"

This reverts commit f79959220fa5fbda939592bf91c7a9ea90419040, this is
creating multiple issues, just not ready to be merged yet.

Link: https://lore.kernel.org/all/20220905180542.GA3685102@roeck-us.net/
Link: https://lore.kernel.org/all/CAHk-=wj1obPoTu1AHj9Bd_BGYjdjDyPP+vT5WMj8eheb3A9WHw@mail.gmail.com/
Fixes: f79959220fa5 ("fec: Restart PPS after link state change")
Signed-off-by: Francesco Dolcini <francesco.dolcini@toradex.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/freescale/fec.h      | 10 ------
 drivers/net/ethernet/freescale/fec_main.c | 42 +++--------------------
 drivers/net/ethernet/freescale/fec_ptp.c  | 29 ----------------
 3 files changed, 4 insertions(+), 77 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index d77ee8936c6a..dcfe63f9be06 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -638,13 +638,6 @@ struct fec_enet_private {
 	int pps_enable;
 	unsigned int next_counter;
 
-	struct {
-		struct timespec64 ts_phc;
-		u64 ns_sys;
-		u32 at_corr;
-		u8 at_inc_corr;
-	} ptp_saved_state;
-
 	u64 ethtool_stats[];
 };
 
@@ -655,8 +648,5 @@ void fec_ptp_disable_hwts(struct net_device *ndev);
 int fec_ptp_set(struct net_device *ndev, struct ifreq *ifr);
 int fec_ptp_get(struct net_device *ndev, struct ifreq *ifr);
 
-void fec_ptp_save_state(struct fec_enet_private *fep);
-int fec_ptp_restore_state(struct fec_enet_private *fep);
-
 /****************************************************************************/
 #endif /* FEC_H */
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 6152f6dbf1bc..4ccd74af09ae 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -286,11 +286,8 @@ MODULE_PARM_DESC(macaddr, "FEC Ethernet MAC address");
 #define FEC_MMFR_TA		(2 << 16)
 #define FEC_MMFR_DATA(v)	(v & 0xffff)
 /* FEC ECR bits definition */
-#define FEC_ECR_RESET   BIT(0)
-#define FEC_ECR_ETHEREN BIT(1)
-#define FEC_ECR_MAGICEN BIT(2)
-#define FEC_ECR_SLEEP   BIT(3)
-#define FEC_ECR_EN1588  BIT(4)
+#define FEC_ECR_MAGICEN		(1 << 2)
+#define FEC_ECR_SLEEP		(1 << 3)
 
 #define FEC_MII_TIMEOUT		30000 /* us */
 
@@ -986,9 +983,6 @@ fec_restart(struct net_device *ndev)
 	u32 temp_mac[2];
 	u32 rcntl = OPT_FRAME_SIZE | 0x04;
 	u32 ecntl = 0x2; /* ETHEREN */
-	struct ptp_clock_request ptp_rq = { .type = PTP_CLK_REQ_PPS };
-
-	fec_ptp_save_state(fep);
 
 	/* Whack a reset.  We should wait for this.
 	 * For i.MX6SX SOC, enet use AXI bus, we use disable MAC
@@ -1142,7 +1136,7 @@ fec_restart(struct net_device *ndev)
 	}
 
 	if (fep->bufdesc_ex)
-		ecntl |= FEC_ECR_EN1588;
+		ecntl |= (1 << 4);
 
 	if (fep->quirks & FEC_QUIRK_DELAYED_CLKS_SUPPORT &&
 	    fep->rgmii_txc_dly)
@@ -1163,14 +1157,6 @@ fec_restart(struct net_device *ndev)
 	if (fep->bufdesc_ex)
 		fec_ptp_start_cyclecounter(ndev);
 
-	/* Restart PPS if needed */
-	if (fep->pps_enable) {
-		/* Clear flag so fec_ptp_enable_pps() doesn't return immediately */
-		fep->pps_enable = 0;
-		fec_ptp_restore_state(fep);
-		fep->ptp_caps.enable(&fep->ptp_caps, &ptp_rq, 1);
-	}
-
 	/* Enable interrupts we wish to service */
 	if (fep->link)
 		writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
@@ -1221,8 +1207,6 @@ fec_stop(struct net_device *ndev)
 	struct fec_enet_private *fep = netdev_priv(ndev);
 	u32 rmii_mode = readl(fep->hwp + FEC_R_CNTRL) & (1 << 8);
 	u32 val;
-	struct ptp_clock_request ptp_rq = { .type = PTP_CLK_REQ_PPS };
-	u32 ecntl = 0;
 
 	/* We cannot expect a graceful transmit stop without link !!! */
 	if (fep->link) {
@@ -1232,8 +1216,6 @@ fec_stop(struct net_device *ndev)
 			netdev_err(ndev, "Graceful transmit stop did not complete!\n");
 	}
 
-	fec_ptp_save_state(fep);
-
 	/* Whack a reset.  We should wait for this.
 	 * For i.MX6SX SOC, enet use AXI bus, we use disable MAC
 	 * instead of reset MAC itself.
@@ -1253,28 +1235,12 @@ fec_stop(struct net_device *ndev)
 	writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED);
 	writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
 
-	if (fep->bufdesc_ex)
-		ecntl |= FEC_ECR_EN1588;
-
 	/* We have to keep ENET enabled to have MII interrupt stay working */
 	if (fep->quirks & FEC_QUIRK_ENET_MAC &&
 		!(fep->wol_flag & FEC_WOL_FLAG_SLEEP_ON)) {
-		ecntl |= FEC_ECR_ETHEREN;
+		writel(2, fep->hwp + FEC_ECNTRL);
 		writel(rmii_mode, fep->hwp + FEC_R_CNTRL);
 	}
-
-	writel(ecntl, fep->hwp + FEC_ECNTRL);
-
-	if (fep->bufdesc_ex)
-		fec_ptp_start_cyclecounter(ndev);
-
-	/* Restart PPS if needed */
-	if (fep->pps_enable) {
-		/* Clear flag so fec_ptp_enable_pps() doesn't return immediately */
-		fep->pps_enable = 0;
-		fec_ptp_restore_state(fep);
-		fep->ptp_caps.enable(&fep->ptp_caps, &ptp_rq, 1);
-	}
 }
 
 
diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c
index 8dd5a2615a89..af20aa237964 100644
--- a/drivers/net/ethernet/freescale/fec_ptp.c
+++ b/drivers/net/ethernet/freescale/fec_ptp.c
@@ -625,36 +625,7 @@ void fec_ptp_stop(struct platform_device *pdev)
 	struct net_device *ndev = platform_get_drvdata(pdev);
 	struct fec_enet_private *fep = netdev_priv(ndev);
 
-	if (fep->pps_enable)
-		fec_ptp_enable_pps(fep, 0);
-
 	cancel_delayed_work_sync(&fep->time_keep);
 	if (fep->ptp_clock)
 		ptp_clock_unregister(fep->ptp_clock);
 }
-
-void fec_ptp_save_state(struct fec_enet_private *fep)
-{
-	u32 atime_inc_corr;
-
-	fec_ptp_gettime(&fep->ptp_caps, &fep->ptp_saved_state.ts_phc);
-	fep->ptp_saved_state.ns_sys = ktime_get_ns();
-
-	fep->ptp_saved_state.at_corr = readl(fep->hwp + FEC_ATIME_CORR);
-	atime_inc_corr = readl(fep->hwp + FEC_ATIME_INC) & FEC_T_INC_CORR_MASK;
-	fep->ptp_saved_state.at_inc_corr = (u8)(atime_inc_corr >> FEC_T_INC_CORR_OFFSET);
-}
-
-int fec_ptp_restore_state(struct fec_enet_private *fep)
-{
-	u32 atime_inc = readl(fep->hwp + FEC_ATIME_INC) & FEC_T_INC_MASK;
-	u64 ns_sys;
-
-	writel(fep->ptp_saved_state.at_corr, fep->hwp + FEC_ATIME_CORR);
-	atime_inc |= ((u32)fep->ptp_saved_state.at_inc_corr) << FEC_T_INC_CORR_OFFSET;
-	writel(atime_inc, fep->hwp + FEC_ATIME_INC);
-
-	ns_sys = ktime_get_ns() - fep->ptp_saved_state.ns_sys;
-	timespec64_add_ns(&fep->ptp_saved_state.ts_phc, ns_sys);
-	return fec_ptp_settime(&fep->ptp_caps, &fep->ptp_saved_state.ts_phc);
-}

From 01b825f997ac28f1e20309bafbf2068cd77c50a4 Mon Sep 17 00:00:00 2001
From: Francesco Dolcini <francesco.dolcini@toradex.com>
Date: Mon, 12 Sep 2022 09:01:43 +0200
Subject: [PATCH 42/83] Revert "net: fec: Use a spinlock to guard
 `fep->ptp_clk_on`"

This reverts commit b353b241f1eb9b6265358ffbe2632fdcb563354f, this is
creating multiple issues, just not ready to be merged yet.

Link: https://lore.kernel.org/all/CAHk-=wj1obPoTu1AHj9Bd_BGYjdjDyPP+vT5WMj8eheb3A9WHw@mail.gmail.com/
Link: https://lore.kernel.org/all/20220907143915.5w65kainpykfobte@pengutronix.de/
Fixes: b353b241f1eb ("net: fec: Use a spinlock to guard `fep->ptp_clk_on`")
Signed-off-by: Francesco Dolcini <francesco.dolcini@toradex.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/freescale/fec.h      |  1 +
 drivers/net/ethernet/freescale/fec_main.c | 17 +++++++-------
 drivers/net/ethernet/freescale/fec_ptp.c  | 28 +++++++++++++++--------
 3 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index dcfe63f9be06..a5fed00cb971 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -561,6 +561,7 @@ struct fec_enet_private {
 	struct clk *clk_2x_txclk;
 
 	bool ptp_clk_on;
+	struct mutex ptp_clk_mutex;
 	unsigned int num_tx_queues;
 	unsigned int num_rx_queues;
 
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 4ccd74af09ae..92c55e1a5507 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -1995,7 +1995,6 @@ static void fec_enet_phy_reset_after_clk_enable(struct net_device *ndev)
 static int fec_enet_clk_enable(struct net_device *ndev, bool enable)
 {
 	struct fec_enet_private *fep = netdev_priv(ndev);
-	unsigned long flags;
 	int ret;
 
 	if (enable) {
@@ -2004,15 +2003,15 @@ static int fec_enet_clk_enable(struct net_device *ndev, bool enable)
 			return ret;
 
 		if (fep->clk_ptp) {
-			spin_lock_irqsave(&fep->tmreg_lock, flags);
+			mutex_lock(&fep->ptp_clk_mutex);
 			ret = clk_prepare_enable(fep->clk_ptp);
 			if (ret) {
-				spin_unlock_irqrestore(&fep->tmreg_lock, flags);
+				mutex_unlock(&fep->ptp_clk_mutex);
 				goto failed_clk_ptp;
 			} else {
 				fep->ptp_clk_on = true;
 			}
-			spin_unlock_irqrestore(&fep->tmreg_lock, flags);
+			mutex_unlock(&fep->ptp_clk_mutex);
 		}
 
 		ret = clk_prepare_enable(fep->clk_ref);
@@ -2027,10 +2026,10 @@ static int fec_enet_clk_enable(struct net_device *ndev, bool enable)
 	} else {
 		clk_disable_unprepare(fep->clk_enet_out);
 		if (fep->clk_ptp) {
-			spin_lock_irqsave(&fep->tmreg_lock, flags);
+			mutex_lock(&fep->ptp_clk_mutex);
 			clk_disable_unprepare(fep->clk_ptp);
 			fep->ptp_clk_on = false;
-			spin_unlock_irqrestore(&fep->tmreg_lock, flags);
+			mutex_unlock(&fep->ptp_clk_mutex);
 		}
 		clk_disable_unprepare(fep->clk_ref);
 		clk_disable_unprepare(fep->clk_2x_txclk);
@@ -2043,10 +2042,10 @@ static int fec_enet_clk_enable(struct net_device *ndev, bool enable)
 		clk_disable_unprepare(fep->clk_ref);
 failed_clk_ref:
 	if (fep->clk_ptp) {
-		spin_lock_irqsave(&fep->tmreg_lock, flags);
+		mutex_lock(&fep->ptp_clk_mutex);
 		clk_disable_unprepare(fep->clk_ptp);
 		fep->ptp_clk_on = false;
-		spin_unlock_irqrestore(&fep->tmreg_lock, flags);
+		mutex_unlock(&fep->ptp_clk_mutex);
 	}
 failed_clk_ptp:
 	clk_disable_unprepare(fep->clk_enet_out);
@@ -3881,7 +3880,7 @@ fec_probe(struct platform_device *pdev)
 	}
 
 	fep->ptp_clk_on = false;
-	spin_lock_init(&fep->tmreg_lock);
+	mutex_init(&fep->ptp_clk_mutex);
 
 	/* clk_ref is optional, depends on board */
 	fep->clk_ref = devm_clk_get_optional(&pdev->dev, "enet_clk_ref");
diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c
index af20aa237964..3dc3c0b626c2 100644
--- a/drivers/net/ethernet/freescale/fec_ptp.c
+++ b/drivers/net/ethernet/freescale/fec_ptp.c
@@ -365,19 +365,21 @@ static int fec_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
  */
 static int fec_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
 {
-	struct fec_enet_private *fep =
+	struct fec_enet_private *adapter =
 	    container_of(ptp, struct fec_enet_private, ptp_caps);
 	u64 ns;
 	unsigned long flags;
 
-	spin_lock_irqsave(&fep->tmreg_lock, flags);
+	mutex_lock(&adapter->ptp_clk_mutex);
 	/* Check the ptp clock */
-	if (!fep->ptp_clk_on) {
-		spin_unlock_irqrestore(&fep->tmreg_lock, flags);
+	if (!adapter->ptp_clk_on) {
+		mutex_unlock(&adapter->ptp_clk_mutex);
 		return -EINVAL;
 	}
-	ns = timecounter_read(&fep->tc);
-	spin_unlock_irqrestore(&fep->tmreg_lock, flags);
+	spin_lock_irqsave(&adapter->tmreg_lock, flags);
+	ns = timecounter_read(&adapter->tc);
+	spin_unlock_irqrestore(&adapter->tmreg_lock, flags);
+	mutex_unlock(&adapter->ptp_clk_mutex);
 
 	*ts = ns_to_timespec64(ns);
 
@@ -402,10 +404,10 @@ static int fec_ptp_settime(struct ptp_clock_info *ptp,
 	unsigned long flags;
 	u32 counter;
 
-	spin_lock_irqsave(&fep->tmreg_lock, flags);
+	mutex_lock(&fep->ptp_clk_mutex);
 	/* Check the ptp clock */
 	if (!fep->ptp_clk_on) {
-		spin_unlock_irqrestore(&fep->tmreg_lock, flags);
+		mutex_unlock(&fep->ptp_clk_mutex);
 		return -EINVAL;
 	}
 
@@ -415,9 +417,11 @@ static int fec_ptp_settime(struct ptp_clock_info *ptp,
 	 */
 	counter = ns & fep->cc.mask;
 
+	spin_lock_irqsave(&fep->tmreg_lock, flags);
 	writel(counter, fep->hwp + FEC_ATIME);
 	timecounter_init(&fep->tc, &fep->cc, ns);
 	spin_unlock_irqrestore(&fep->tmreg_lock, flags);
+	mutex_unlock(&fep->ptp_clk_mutex);
 	return 0;
 }
 
@@ -514,11 +518,13 @@ static void fec_time_keep(struct work_struct *work)
 	struct fec_enet_private *fep = container_of(dwork, struct fec_enet_private, time_keep);
 	unsigned long flags;
 
-	spin_lock_irqsave(&fep->tmreg_lock, flags);
+	mutex_lock(&fep->ptp_clk_mutex);
 	if (fep->ptp_clk_on) {
+		spin_lock_irqsave(&fep->tmreg_lock, flags);
 		timecounter_read(&fep->tc);
+		spin_unlock_irqrestore(&fep->tmreg_lock, flags);
 	}
-	spin_unlock_irqrestore(&fep->tmreg_lock, flags);
+	mutex_unlock(&fep->ptp_clk_mutex);
 
 	schedule_delayed_work(&fep->time_keep, HZ);
 }
@@ -593,6 +599,8 @@ void fec_ptp_init(struct platform_device *pdev, int irq_idx)
 	}
 	fep->ptp_inc = NSEC_PER_SEC / fep->cycle_speed;
 
+	spin_lock_init(&fep->tmreg_lock);
+
 	fec_ptp_start_cyclecounter(ndev);
 
 	INIT_DELAYED_WORK(&fep->time_keep, fec_time_keep);

From db4192a754ebd52300a28abe1a50dd18eae0eb12 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Mon, 12 Sep 2022 10:35:53 -0700
Subject: [PATCH 43/83] tcp: read multiple skbs in tcp_read_skb()

Before we switched to ->read_skb(), ->read_sock() was passed with
desc.count=1, which technically indicates we only read one skb per
->sk_data_ready() call. However, for TCP, this is not true.

TCP at least has sk_rcvlowat which intentionally holds skb's in
receive queue until this watermark is reached. This means when
->sk_data_ready() is invoked there could be multiple skb's in the
queue, therefore we have to read multiple skbs in tcp_read_skb()
instead of one.

Fixes: 965b57b469a5 ("net: Introduce a new proto_ops ->read_skb()")
Reported-by: Peilin Ye <peilin.ye@bytedance.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Link: https://lore.kernel.org/r/20220912173553.235838-1-xiyou.wangcong@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/ipv4/tcp.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3488388eea5d..e373dde1f46f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1761,19 +1761,28 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 	if (sk->sk_state == TCP_LISTEN)
 		return -ENOTCONN;
 
-	skb = tcp_recv_skb(sk, seq, &offset);
-	if (!skb)
-		return 0;
+	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+		u8 tcp_flags;
+		int used;
 
-	__skb_unlink(skb, &sk->sk_receive_queue);
-	WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
-	copied = recv_actor(sk, skb);
-	if (copied >= 0) {
-		seq += copied;
-		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+		__skb_unlink(skb, &sk->sk_receive_queue);
+		WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
+		tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
+		used = recv_actor(sk, skb);
+		consume_skb(skb);
+		if (used < 0) {
+			if (!copied)
+				copied = used;
+			break;
+		}
+		seq += used;
+		copied += used;
+
+		if (tcp_flags & TCPHDR_FIN) {
 			++seq;
+			break;
+		}
 	}
-	consume_skb(skb);
 	WRITE_ONCE(tp->copied_seq, seq);
 
 	tcp_rcv_space_adjust(sk);

From 1c48709e6d9d353acaaac1d8e33474756b121d78 Mon Sep 17 00:00:00 2001
From: Liang He <windhl@126.com>
Date: Tue, 13 Sep 2022 20:56:59 +0800
Subject: [PATCH 44/83] of: mdio: Add of_node_put() when breaking out of
 for_each_xx

In of_mdiobus_register(), we should call of_node_put() for 'child'
escaped out of for_each_available_child_of_node().

Fixes: 66bdede495c7 ("of_mdio: Fix broken PHY IRQ in case of probe deferral")
Co-developed-by: Miaoqian Lin <linmq006@gmail.com>
Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Signed-off-by: Liang He <windhl@126.com>
Link: https://lore.kernel.org/r/20220913125659.3331969-1-windhl@126.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/mdio/of_mdio.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c
index 9e3c815a070f..796e9c7857d0 100644
--- a/drivers/net/mdio/of_mdio.c
+++ b/drivers/net/mdio/of_mdio.c
@@ -231,6 +231,7 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
 	return 0;
 
 unregister:
+	of_node_put(child);
 	mdiobus_unregister(mdio);
 	return rc;
 }

From cf412ec333250cb82bafe57169204e14a9f1c2ac Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Tue, 13 Sep 2022 15:46:02 -0500
Subject: [PATCH 45/83] net: ipa: properly limit modem routing table use

IPA can route packets between IPA-connected entities.  The AP and
modem are currently the only such entities supported, and no routing
is required to transfer packets between them.

The number of entries in each routing table is fixed, and defined at
initialization time.  Some of these entries are designated for use
by the modem, and the rest are available for the AP to use.  The AP
sends a QMI message to the modem which describes (among other
things) information about routing table memory available for the
modem to use.

Currently the QMI initialization packet gives wrong information in
its description of routing tables.  What *should* be supplied is the
maximum index that the modem can use for the routing table memory
located at a given location.  The current code instead supplies the
total *number* of routing table entries.  Furthermore, the modem is
granted the entire table, not just the subset it's supposed to use.

This patch fixes this.  First, the ipa_mem_bounds structure is
generalized so its "end" field can be interpreted either as a final
byte offset, or a final array index.  Second, the IPv4 and IPv6
(non-hashed and hashed) table information fields in the QMI
ipa_init_modem_driver_req structure are changed to be ipa_mem_bounds
rather than ipa_mem_array structures.  Third, we set the "end" value
for each routing table to be the last index, rather than setting the
"count" to be the number of indices.  Finally, instead of allowing
the modem to use all of a routing table's memory, it is limited to
just the portion meant to be used by the modem.  In all versions of
IPA currently supported, that is IPA_ROUTE_MODEM_COUNT (8) entries.

Update a few comments for clarity.

Fixes: 530f9216a9537 ("soc: qcom: ipa: AP/modem communications")
Signed-off-by: Alex Elder <elder@linaro.org>
Link: https://lore.kernel.org/r/20220913204602.1803004-1-elder@linaro.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ipa/ipa_qmi.c     |  8 ++++----
 drivers/net/ipa/ipa_qmi_msg.c |  8 ++++----
 drivers/net/ipa/ipa_qmi_msg.h | 37 ++++++++++++++++++++---------------
 drivers/net/ipa/ipa_table.c   |  2 --
 drivers/net/ipa/ipa_table.h   |  3 +++
 5 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ipa/ipa_qmi.c b/drivers/net/ipa/ipa_qmi.c
index ec010cf2e816..6f874f99b910 100644
--- a/drivers/net/ipa/ipa_qmi.c
+++ b/drivers/net/ipa/ipa_qmi.c
@@ -308,12 +308,12 @@ init_modem_driver_req(struct ipa_qmi *ipa_qmi)
 	mem = ipa_mem_find(ipa, IPA_MEM_V4_ROUTE);
 	req.v4_route_tbl_info_valid = 1;
 	req.v4_route_tbl_info.start = ipa->mem_offset + mem->offset;
-	req.v4_route_tbl_info.count = mem->size / sizeof(__le64);
+	req.v4_route_tbl_info.end = IPA_ROUTE_MODEM_COUNT - 1;
 
 	mem = ipa_mem_find(ipa, IPA_MEM_V6_ROUTE);
 	req.v6_route_tbl_info_valid = 1;
 	req.v6_route_tbl_info.start = ipa->mem_offset + mem->offset;
-	req.v6_route_tbl_info.count = mem->size / sizeof(__le64);
+	req.v6_route_tbl_info.end = IPA_ROUTE_MODEM_COUNT - 1;
 
 	mem = ipa_mem_find(ipa, IPA_MEM_V4_FILTER);
 	req.v4_filter_tbl_start_valid = 1;
@@ -352,7 +352,7 @@ init_modem_driver_req(struct ipa_qmi *ipa_qmi)
 		req.v4_hash_route_tbl_info_valid = 1;
 		req.v4_hash_route_tbl_info.start =
 				ipa->mem_offset + mem->offset;
-		req.v4_hash_route_tbl_info.count = mem->size / sizeof(__le64);
+		req.v4_hash_route_tbl_info.end = IPA_ROUTE_MODEM_COUNT - 1;
 	}
 
 	mem = ipa_mem_find(ipa, IPA_MEM_V6_ROUTE_HASHED);
@@ -360,7 +360,7 @@ init_modem_driver_req(struct ipa_qmi *ipa_qmi)
 		req.v6_hash_route_tbl_info_valid = 1;
 		req.v6_hash_route_tbl_info.start =
 			ipa->mem_offset + mem->offset;
-		req.v6_hash_route_tbl_info.count = mem->size / sizeof(__le64);
+		req.v6_hash_route_tbl_info.end = IPA_ROUTE_MODEM_COUNT - 1;
 	}
 
 	mem = ipa_mem_find(ipa, IPA_MEM_V4_FILTER_HASHED);
diff --git a/drivers/net/ipa/ipa_qmi_msg.c b/drivers/net/ipa/ipa_qmi_msg.c
index 6838e8065072..75d3fc0092e9 100644
--- a/drivers/net/ipa/ipa_qmi_msg.c
+++ b/drivers/net/ipa/ipa_qmi_msg.c
@@ -311,7 +311,7 @@ struct qmi_elem_info ipa_init_modem_driver_req_ei[] = {
 		.tlv_type	= 0x12,
 		.offset		= offsetof(struct ipa_init_modem_driver_req,
 					   v4_route_tbl_info),
-		.ei_array	= ipa_mem_array_ei,
+		.ei_array	= ipa_mem_bounds_ei,
 	},
 	{
 		.data_type	= QMI_OPT_FLAG,
@@ -332,7 +332,7 @@ struct qmi_elem_info ipa_init_modem_driver_req_ei[] = {
 		.tlv_type	= 0x13,
 		.offset		= offsetof(struct ipa_init_modem_driver_req,
 					   v6_route_tbl_info),
-		.ei_array	= ipa_mem_array_ei,
+		.ei_array	= ipa_mem_bounds_ei,
 	},
 	{
 		.data_type	= QMI_OPT_FLAG,
@@ -496,7 +496,7 @@ struct qmi_elem_info ipa_init_modem_driver_req_ei[] = {
 		.tlv_type	= 0x1b,
 		.offset		= offsetof(struct ipa_init_modem_driver_req,
 					   v4_hash_route_tbl_info),
-		.ei_array	= ipa_mem_array_ei,
+		.ei_array	= ipa_mem_bounds_ei,
 	},
 	{
 		.data_type	= QMI_OPT_FLAG,
@@ -517,7 +517,7 @@ struct qmi_elem_info ipa_init_modem_driver_req_ei[] = {
 		.tlv_type	= 0x1c,
 		.offset		= offsetof(struct ipa_init_modem_driver_req,
 					   v6_hash_route_tbl_info),
-		.ei_array	= ipa_mem_array_ei,
+		.ei_array	= ipa_mem_bounds_ei,
 	},
 	{
 		.data_type	= QMI_OPT_FLAG,
diff --git a/drivers/net/ipa/ipa_qmi_msg.h b/drivers/net/ipa/ipa_qmi_msg.h
index 495e85abe50b..9651aa59b596 100644
--- a/drivers/net/ipa/ipa_qmi_msg.h
+++ b/drivers/net/ipa/ipa_qmi_msg.h
@@ -86,9 +86,11 @@ enum ipa_platform_type {
 	IPA_QMI_PLATFORM_TYPE_MSM_QNX_V01	= 0x5,	/* QNX MSM */
 };
 
-/* This defines the start and end offset of a range of memory.  Both
- * fields are offsets relative to the start of IPA shared memory.
- * The end value is the last addressable byte *within* the range.
+/* This defines the start and end offset of a range of memory.  The start
+ * value is a byte offset relative to the start of IPA shared memory.  The
+ * end value is the last addressable unit *within* the range.  Typically
+ * the end value is in units of bytes, however it can also be a maximum
+ * array index value.
  */
 struct ipa_mem_bounds {
 	u32 start;
@@ -129,18 +131,19 @@ struct ipa_init_modem_driver_req {
 	u8			hdr_tbl_info_valid;
 	struct ipa_mem_bounds	hdr_tbl_info;
 
-	/* Routing table information.  These define the location and size of
-	 * non-hashable IPv4 and IPv6 filter tables.  The start values are
-	 * offsets relative to the start of IPA shared memory.
+	/* Routing table information.  These define the location and maximum
+	 * *index* (not byte) for the modem portion of non-hashable IPv4 and
+	 * IPv6 routing tables.  The start values are byte offsets relative
+	 * to the start of IPA shared memory.
 	 */
 	u8			v4_route_tbl_info_valid;
-	struct ipa_mem_array	v4_route_tbl_info;
+	struct ipa_mem_bounds	v4_route_tbl_info;
 	u8			v6_route_tbl_info_valid;
-	struct ipa_mem_array	v6_route_tbl_info;
+	struct ipa_mem_bounds	v6_route_tbl_info;
 
 	/* Filter table information.  These define the location of the
 	 * non-hashable IPv4 and IPv6 filter tables.  The start values are
-	 * offsets relative to the start of IPA shared memory.
+	 * byte offsets relative to the start of IPA shared memory.
 	 */
 	u8			v4_filter_tbl_start_valid;
 	u32			v4_filter_tbl_start;
@@ -181,18 +184,20 @@ struct ipa_init_modem_driver_req {
 	u8			zip_tbl_info_valid;
 	struct ipa_mem_bounds	zip_tbl_info;
 
-	/* Routing table information.  These define the location and size
-	 * of hashable IPv4 and IPv6 filter tables.  The start values are
-	 * offsets relative to the start of IPA shared memory.
+	/* Routing table information.  These define the location and maximum
+	 * *index* (not byte) for the modem portion of hashable IPv4 and IPv6
+	 * routing tables (if supported by hardware).  The start values are
+	 * byte offsets relative to the start of IPA shared memory.
 	 */
 	u8			v4_hash_route_tbl_info_valid;
-	struct ipa_mem_array	v4_hash_route_tbl_info;
+	struct ipa_mem_bounds	v4_hash_route_tbl_info;
 	u8			v6_hash_route_tbl_info_valid;
-	struct ipa_mem_array	v6_hash_route_tbl_info;
+	struct ipa_mem_bounds	v6_hash_route_tbl_info;
 
 	/* Filter table information.  These define the location and size
-	 * of hashable IPv4 and IPv6 filter tables.  The start values are
-	 * offsets relative to the start of IPA shared memory.
+	 * of hashable IPv4 and IPv6 filter tables (if supported by hardware).
+	 * The start values are byte offsets relative to the start of IPA
+	 * shared memory.
 	 */
 	u8			v4_hash_filter_tbl_start_valid;
 	u32			v4_hash_filter_tbl_start;
diff --git a/drivers/net/ipa/ipa_table.c b/drivers/net/ipa/ipa_table.c
index 2f5a58bfc529..69efe672ca52 100644
--- a/drivers/net/ipa/ipa_table.c
+++ b/drivers/net/ipa/ipa_table.c
@@ -108,8 +108,6 @@
 
 /* Assignment of route table entries to the modem and AP */
 #define IPA_ROUTE_MODEM_MIN		0
-#define IPA_ROUTE_MODEM_COUNT		8
-
 #define IPA_ROUTE_AP_MIN		IPA_ROUTE_MODEM_COUNT
 #define IPA_ROUTE_AP_COUNT \
 		(IPA_ROUTE_COUNT_MAX - IPA_ROUTE_MODEM_COUNT)
diff --git a/drivers/net/ipa/ipa_table.h b/drivers/net/ipa/ipa_table.h
index b6a9a0d79d68..1538e2e1732f 100644
--- a/drivers/net/ipa/ipa_table.h
+++ b/drivers/net/ipa/ipa_table.h
@@ -13,6 +13,9 @@ struct ipa;
 /* The maximum number of filter table entries (IPv4, IPv6; hashed or not) */
 #define IPA_FILTER_COUNT_MAX	14
 
+/* The number of route table entries allotted to the modem */
+#define IPA_ROUTE_MODEM_COUNT	8
+
 /* The maximum number of route table entries (IPv4, IPv6; hashed or not) */
 #define IPA_ROUTE_COUNT_MAX	15
 

From b07a9b26e2b1aa3711fd6935eccb08a463b1fb11 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 14 Sep 2022 10:53:38 +0300
Subject: [PATCH 46/83] ipmr: Always call ip{,6}_mr_forward() from RCU
 read-side critical section

These functions expect to be called from RCU read-side critical section,
but this only happens when invoked from the data path via
ip{,6}_mr_input(). They can also be invoked from process context in
response to user space adding a multicast route which resolves a cache
entry with queued packets [1][2].

Fix by adding missing rcu_read_lock() / rcu_read_unlock() in these call
paths.

[1]
WARNING: suspicious RCU usage
6.0.0-rc3-custom-15969-g049d233c8bcc-dirty #1387 Not tainted
-----------------------------
net/ipv4/ipmr.c:84 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 2, debug_locks = 1
1 lock held by smcrouted/246:
 #0: ffffffff862389b0 (rtnl_mutex){+.+.}-{3:3}, at: ip_mroute_setsockopt+0x11c/0x1420

stack backtrace:
CPU: 0 PID: 246 Comm: smcrouted Not tainted 6.0.0-rc3-custom-15969-g049d233c8bcc-dirty #1387
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x91/0xb9
 vif_dev_read+0xbf/0xd0
 ipmr_queue_xmit+0x135/0x1ab0
 ip_mr_forward+0xe7b/0x13d0
 ipmr_mfc_add+0x1a06/0x2ad0
 ip_mroute_setsockopt+0x5c1/0x1420
 do_ip_setsockopt+0x23d/0x37f0
 ip_setsockopt+0x56/0x80
 raw_setsockopt+0x219/0x290
 __sys_setsockopt+0x236/0x4d0
 __x64_sys_setsockopt+0xbe/0x160
 do_syscall_64+0x34/0x80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd

[2]
WARNING: suspicious RCU usage
6.0.0-rc3-custom-15969-g049d233c8bcc-dirty #1387 Not tainted
-----------------------------
net/ipv6/ip6mr.c:69 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 2, debug_locks = 1
1 lock held by smcrouted/246:
 #0: ffffffff862389b0 (rtnl_mutex){+.+.}-{3:3}, at: ip6_mroute_setsockopt+0x6b9/0x2630

stack backtrace:
CPU: 1 PID: 246 Comm: smcrouted Not tainted 6.0.0-rc3-custom-15969-g049d233c8bcc-dirty #1387
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x91/0xb9
 vif_dev_read+0xbf/0xd0
 ip6mr_forward2.isra.0+0xc9/0x1160
 ip6_mr_forward+0xef0/0x13f0
 ip6mr_mfc_add+0x1ff2/0x31f0
 ip6_mroute_setsockopt+0x1825/0x2630
 do_ipv6_setsockopt+0x462/0x4440
 ipv6_setsockopt+0x105/0x140
 rawv6_setsockopt+0xd8/0x690
 __sys_setsockopt+0x236/0x4d0
 __x64_sys_setsockopt+0xbe/0x160
 do_syscall_64+0x34/0x80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd

Fixes: ebc3197963fc ("ipmr: add rcu protection over (struct vif_device)->dev")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/ipmr.c  | 2 ++
 net/ipv6/ip6mr.c | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 73651d17e51f..e11d6b0b62b7 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1004,7 +1004,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
 
 			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
 		} else {
+			rcu_read_lock();
 			ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
+			rcu_read_unlock();
 		}
 	}
 }
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index a9ba41648e36..858fd8a28b5b 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1028,8 +1028,11 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
 				((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE;
 			}
 			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
-		} else
+		} else {
+			rcu_read_lock();
 			ip6_mr_forward(net, mrt, skb->dev, skb, c);
+			rcu_read_unlock();
+		}
 	}
 }
 

From 2b5a8c8f59d9dff49f273bafbde57d5a7dc2706a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 14 Sep 2022 10:53:39 +0300
Subject: [PATCH 47/83] selftests: forwarding: Add test cases for unresolved
 multicast routes

Add IPv4 and IPv6 test cases for unresolved multicast routes, testing
that queued packets are forwarded after installing a matching (S, G)
route.

The test cases can be used to reproduce the bugs fixed in "ipmr: Always
call ip{,6}_mr_forward() from RCU read-side critical section".

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/forwarding/router_multicast.sh        | 92 ++++++++++++++++++-
 1 file changed, 91 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/forwarding/router_multicast.sh b/tools/testing/selftests/net/forwarding/router_multicast.sh
index 57e90c873a2c..5a58b1ec8aef 100755
--- a/tools/testing/selftests/net/forwarding/router_multicast.sh
+++ b/tools/testing/selftests/net/forwarding/router_multicast.sh
@@ -28,7 +28,7 @@
 # +------------------+       +------------------+
 #
 
-ALL_TESTS="mcast_v4 mcast_v6 rpf_v4 rpf_v6"
+ALL_TESTS="mcast_v4 mcast_v6 rpf_v4 rpf_v6 unres_v4 unres_v6"
 NUM_NETIFS=6
 source lib.sh
 source tc_common.sh
@@ -406,6 +406,96 @@ rpf_v6()
 	log_test "RPF IPv6"
 }
 
+unres_v4()
+{
+	# Send a multicast packet not corresponding to an installed route,
+	# causing the kernel to queue the packet for resolution and emit an
+	# IGMPMSG_NOCACHE notification. smcrouted will react to this
+	# notification by consulting its (*, G) list and installing an (S, G)
+	# route, which will be used to forward the queued packet.
+
+	RET=0
+
+	tc filter add dev $h2 ingress protocol ip pref 1 handle 1 flower \
+		dst_ip 225.1.2.3 ip_proto udp dst_port 12345 action drop
+	tc filter add dev $h3 ingress protocol ip pref 1 handle 1 flower \
+		dst_ip 225.1.2.3 ip_proto udp dst_port 12345 action drop
+
+	# Forwarding should fail before installing a matching (*, G).
+	$MZ $h1 -c 1 -p 128 -t udp "ttl=10,sp=54321,dp=12345" \
+		-a 00:11:22:33:44:55 -b 01:00:5e:01:02:03 \
+		-A 198.51.100.2 -B 225.1.2.3 -q
+
+	tc_check_packets "dev $h2 ingress" 1 0
+	check_err $? "Multicast received on first host when should not"
+	tc_check_packets "dev $h3 ingress" 1 0
+	check_err $? "Multicast received on second host when should not"
+
+	# Create (*, G). Will not be installed in the kernel.
+	create_mcast_sg $rp1 0.0.0.0 225.1.2.3 $rp2 $rp3
+
+	$MZ $h1 -c 1 -p 128 -t udp "ttl=10,sp=54321,dp=12345" \
+		-a 00:11:22:33:44:55 -b 01:00:5e:01:02:03 \
+		-A 198.51.100.2 -B 225.1.2.3 -q
+
+	tc_check_packets "dev $h2 ingress" 1 1
+	check_err $? "Multicast not received on first host"
+	tc_check_packets "dev $h3 ingress" 1 1
+	check_err $? "Multicast not received on second host"
+
+	delete_mcast_sg $rp1 0.0.0.0 225.1.2.3 $rp2 $rp3
+
+	tc filter del dev $h3 ingress protocol ip pref 1 handle 1 flower
+	tc filter del dev $h2 ingress protocol ip pref 1 handle 1 flower
+
+	log_test "Unresolved queue IPv4"
+}
+
+unres_v6()
+{
+	# Send a multicast packet not corresponding to an installed route,
+	# causing the kernel to queue the packet for resolution and emit an
+	# MRT6MSG_NOCACHE notification. smcrouted will react to this
+	# notification by consulting its (*, G) list and installing an (S, G)
+	# route, which will be used to forward the queued packet.
+
+	RET=0
+
+	tc filter add dev $h2 ingress protocol ipv6 pref 1 handle 1 flower \
+		dst_ip ff0e::3 ip_proto udp dst_port 12345 action drop
+	tc filter add dev $h3 ingress protocol ipv6 pref 1 handle 1 flower \
+		dst_ip ff0e::3 ip_proto udp dst_port 12345 action drop
+
+	# Forwarding should fail before installing a matching (*, G).
+	$MZ $h1 -6 -c 1 -p 128 -t udp "ttl=10,sp=54321,dp=12345" \
+		-a 00:11:22:33:44:55 -b 33:33:00:00:00:03 \
+		-A 2001:db8:1::2 -B ff0e::3 -q
+
+	tc_check_packets "dev $h2 ingress" 1 0
+	check_err $? "Multicast received on first host when should not"
+	tc_check_packets "dev $h3 ingress" 1 0
+	check_err $? "Multicast received on second host when should not"
+
+	# Create (*, G). Will not be installed in the kernel.
+	create_mcast_sg $rp1 :: ff0e::3 $rp2 $rp3
+
+	$MZ $h1 -6 -c 1 -p 128 -t udp "ttl=10,sp=54321,dp=12345" \
+		-a 00:11:22:33:44:55 -b 33:33:00:00:00:03 \
+		-A 2001:db8:1::2 -B ff0e::3 -q
+
+	tc_check_packets "dev $h2 ingress" 1 1
+	check_err $? "Multicast not received on first host"
+	tc_check_packets "dev $h3 ingress" 1 1
+	check_err $? "Multicast not received on second host"
+
+	delete_mcast_sg $rp1 :: ff0e::3 $rp2 $rp3
+
+	tc filter del dev $h3 ingress protocol ipv6 pref 1 handle 1 flower
+	tc filter del dev $h2 ingress protocol ipv6 pref 1 handle 1 flower
+
+	log_test "Unresolved queue IPv6"
+}
+
 trap cleanup EXIT
 
 setup_prepare

From d547c1b717fc5a1e062307e1efdf81590ba9f6c1 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 14 Sep 2022 18:51:54 +0900
Subject: [PATCH 48/83] net: clear msg_get_inq in __get_compat_msghdr()

syzbot is still complaining uninit-value in tcp_recvmsg(), for
commit 1228b34c8d0ecf6d ("net: clear msg_get_inq in __sys_recvfrom() and
__copy_msghdr_from_user()") missed that __get_compat_msghdr() is called
instead of copy_msghdr_from_user() when MSG_CMSG_COMPAT is specified.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Fixes: 1228b34c8d0ecf6d ("net: clear msg_get_inq in __sys_recvfrom() and __copy_msghdr_from_user()")
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/d06d0f7f-696c-83b4-b2d5-70b5f2730a37@I-love.SAKURA.ne.jp
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/compat.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/compat.c b/net/compat.c
index fe9be3c56ef7..385f04a6be2f 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -52,6 +52,7 @@ int __get_compat_msghdr(struct msghdr *kmsg,
 		kmsg->msg_namelen = sizeof(struct sockaddr_storage);
 
 	kmsg->msg_control_is_user = true;
+	kmsg->msg_get_inq = 0;
 	kmsg->msg_control_user = compat_ptr(msg->msg_control);
 	kmsg->msg_controllen = msg->msg_controllen;
 

From 974bb793aded499491246f6f9826e26c2b127320 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= <ihuguet@redhat.com>
Date: Thu, 15 Sep 2022 16:16:53 +0200
Subject: [PATCH 49/83] sfc/siena: fix TX channel offset when using legacy
 interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As in previous commit for sfc, fix TX channels offset when
efx_siena_separate_tx_channels is false (the default)

Fixes: 25bde571b4a8 ("sfc/siena: fix wrong tx channel offset with efx_separate_tx_channels")
Reported-by: Tianhao Zhao <tizhao@redhat.com>
Signed-off-by: Íñigo Huguet <ihuguet@redhat.com>
Link: https://lore.kernel.org/r/20220915141653.15504-1-ihuguet@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/sfc/siena/efx_channels.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sfc/siena/efx_channels.c b/drivers/net/ethernet/sfc/siena/efx_channels.c
index 017212a40df3..f54ebd007286 100644
--- a/drivers/net/ethernet/sfc/siena/efx_channels.c
+++ b/drivers/net/ethernet/sfc/siena/efx_channels.c
@@ -320,7 +320,7 @@ int efx_siena_probe_interrupts(struct efx_nic *efx)
 		efx->n_channels = 1 + (efx_siena_separate_tx_channels ? 1 : 0);
 		efx->n_rx_channels = 1;
 		efx->n_tx_channels = 1;
-		efx->tx_channel_offset = 1;
+		efx->tx_channel_offset = efx_siena_separate_tx_channels ? 1 : 0;
 		efx->n_xdp_channels = 0;
 		efx->xdp_channel_offset = efx->n_channels;
 		efx->legacy_irq = efx->pci_dev->irq;

From 589c6eded10c77a12b7b2cf235b6b19a2bdb91fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= <ihuguet@redhat.com>
Date: Thu, 15 Sep 2022 16:19:58 +0200
Subject: [PATCH 50/83] sfc/siena: fix null pointer dereference in
 efx_hard_start_xmit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Like in previous patch for sfc, prevent potential (but unlikely) NULL
pointer dereference.

Fixes: 12804793b17c ("sfc: decouple TXQ type from label")
Reported-by: Tianhao Zhao <tizhao@redhat.com>
Signed-off-by: Íñigo Huguet <ihuguet@redhat.com>
Link: https://lore.kernel.org/r/20220915141958.16458-1-ihuguet@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/sfc/siena/tx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sfc/siena/tx.c b/drivers/net/ethernet/sfc/siena/tx.c
index e166dcb9b99c..91e87594ed1e 100644
--- a/drivers/net/ethernet/sfc/siena/tx.c
+++ b/drivers/net/ethernet/sfc/siena/tx.c
@@ -336,7 +336,7 @@ netdev_tx_t efx_siena_hard_start_xmit(struct sk_buff *skb,
 		 * previous packets out.
 		 */
 		if (!netdev_xmit_more())
-			efx_tx_send_pending(tx_queue->channel);
+			efx_tx_send_pending(efx_get_tx_channel(efx, index));
 		return NETDEV_TX_OK;
 	}
 

From 684dec3cf45da2b0848298efae4adf3b2aeafeda Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 16 Sep 2022 15:37:38 +0100
Subject: [PATCH 51/83] wireguard: ratelimiter: disable timings test by default

A previous commit tried to make the ratelimiter timings test more
reliable but in the process made it less reliable on other
configurations. This is an impossible problem to solve without
increasingly ridiculous heuristics. And it's not even a problem that
actually needs to be solved in any comprehensive way, since this is only
ever used during development. So just cordon this off with a DEBUG_
ifdef, just like we do for the trie's randomized tests, so it can be
enabled while hacking on the code, and otherwise disabled in CI. In the
process we also revert 151c8e499f47.

Fixes: 151c8e499f47 ("wireguard: ratelimiter: use hrtimer in selftest")
Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/wireguard/selftest/ratelimiter.c | 25 ++++++++------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireguard/selftest/ratelimiter.c b/drivers/net/wireguard/selftest/ratelimiter.c
index ba87d294604f..d4bb40a695ab 100644
--- a/drivers/net/wireguard/selftest/ratelimiter.c
+++ b/drivers/net/wireguard/selftest/ratelimiter.c
@@ -6,29 +6,28 @@
 #ifdef DEBUG
 
 #include <linux/jiffies.h>
-#include <linux/hrtimer.h>
 
 static const struct {
 	bool result;
-	u64 nsec_to_sleep_before;
+	unsigned int msec_to_sleep_before;
 } expected_results[] __initconst = {
 	[0 ... PACKETS_BURSTABLE - 1] = { true, 0 },
 	[PACKETS_BURSTABLE] = { false, 0 },
-	[PACKETS_BURSTABLE + 1] = { true, NSEC_PER_SEC / PACKETS_PER_SECOND },
+	[PACKETS_BURSTABLE + 1] = { true, MSEC_PER_SEC / PACKETS_PER_SECOND },
 	[PACKETS_BURSTABLE + 2] = { false, 0 },
-	[PACKETS_BURSTABLE + 3] = { true, (NSEC_PER_SEC / PACKETS_PER_SECOND) * 2 },
+	[PACKETS_BURSTABLE + 3] = { true, (MSEC_PER_SEC / PACKETS_PER_SECOND) * 2 },
 	[PACKETS_BURSTABLE + 4] = { true, 0 },
 	[PACKETS_BURSTABLE + 5] = { false, 0 }
 };
 
 static __init unsigned int maximum_jiffies_at_index(int index)
 {
-	u64 total_nsecs = 2 * NSEC_PER_SEC / PACKETS_PER_SECOND / 3;
+	unsigned int total_msecs = 2 * MSEC_PER_SEC / PACKETS_PER_SECOND / 3;
 	int i;
 
 	for (i = 0; i <= index; ++i)
-		total_nsecs += expected_results[i].nsec_to_sleep_before;
-	return nsecs_to_jiffies(total_nsecs);
+		total_msecs += expected_results[i].msec_to_sleep_before;
+	return msecs_to_jiffies(total_msecs);
 }
 
 static __init int timings_test(struct sk_buff *skb4, struct iphdr *hdr4,
@@ -43,12 +42,8 @@ static __init int timings_test(struct sk_buff *skb4, struct iphdr *hdr4,
 	loop_start_time = jiffies;
 
 	for (i = 0; i < ARRAY_SIZE(expected_results); ++i) {
-		if (expected_results[i].nsec_to_sleep_before) {
-			ktime_t timeout = ktime_add(ktime_add_ns(ktime_get_coarse_boottime(), TICK_NSEC * 4 / 3),
-						    ns_to_ktime(expected_results[i].nsec_to_sleep_before));
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			schedule_hrtimeout_range_clock(&timeout, 0, HRTIMER_MODE_ABS, CLOCK_BOOTTIME);
-		}
+		if (expected_results[i].msec_to_sleep_before)
+			msleep(expected_results[i].msec_to_sleep_before);
 
 		if (time_is_before_jiffies(loop_start_time +
 					   maximum_jiffies_at_index(i)))
@@ -132,7 +127,7 @@ bool __init wg_ratelimiter_selftest(void)
 	if (IS_ENABLED(CONFIG_KASAN) || IS_ENABLED(CONFIG_UBSAN))
 		return true;
 
-	BUILD_BUG_ON(NSEC_PER_SEC % PACKETS_PER_SECOND != 0);
+	BUILD_BUG_ON(MSEC_PER_SEC % PACKETS_PER_SECOND != 0);
 
 	if (wg_ratelimiter_init())
 		goto out;
@@ -172,7 +167,7 @@ bool __init wg_ratelimiter_selftest(void)
 	++test;
 #endif
 
-	for (trials = TRIALS_BEFORE_GIVING_UP;;) {
+	for (trials = TRIALS_BEFORE_GIVING_UP; IS_ENABLED(DEBUG_RATELIMITER_TIMINGS);) {
 		int test_count = 0, ret;
 
 		ret = timings_test(skb4, hdr4, skb6, hdr6, &test_count);

From 8e25c02b8cce7063ae9f08cad51d246a60370bc9 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 16 Sep 2022 15:37:39 +0100
Subject: [PATCH 52/83] wireguard: selftests: do not install headers on UML

Since 1b620d539ccc ("kbuild: disable header exports for UML in a
straightforward way"), installing headers fails on UML, so just disable
installing them, since they're not needed anyway on the architecture.

Fixes: b438b3b8d6e6 ("wireguard: selftests: support UML")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/wireguard/qemu/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile
index fda76282d34b..e95bd56b332f 100644
--- a/tools/testing/selftests/wireguard/qemu/Makefile
+++ b/tools/testing/selftests/wireguard/qemu/Makefile
@@ -343,8 +343,10 @@ $(KERNEL_BZIMAGE): $(TOOLCHAIN_PATH)/.installed $(KERNEL_BUILD_PATH)/.config $(B
 .PHONY: $(KERNEL_BZIMAGE)
 
 $(TOOLCHAIN_PATH)/$(CHOST)/include/linux/.installed: | $(KERNEL_BUILD_PATH)/.config $(TOOLCHAIN_PATH)/.installed
+ifneq ($(ARCH),um)
 	rm -rf $(TOOLCHAIN_PATH)/$(CHOST)/include/linux
 	$(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) INSTALL_HDR_PATH=$(TOOLCHAIN_PATH)/$(CHOST) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) headers_install
+endif
 	touch $@
 
 $(TOOLCHAIN_PATH)/.installed: $(TOOLCHAIN_TAR)

From 26c013108c12b94bc023bf19198a4300596c98b1 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 16 Sep 2022 15:37:40 +0100
Subject: [PATCH 53/83] wireguard: netlink: avoid variable-sized memcpy on
 sockaddr

Doing a variable-sized memcpy is slower, and the compiler isn't smart
enough to turn this into a constant-size assignment.

Further, Kees' latest fortified memcpy will actually bark, because the
destination pointer is type sockaddr, not explicitly sockaddr_in or
sockaddr_in6, so it thinks there's an overflow:

    memcpy: detected field-spanning write (size 28) of single field
    "&endpoint.addr" at drivers/net/wireguard/netlink.c:446 (size 16)

Fix this by just assigning by using explicit casts for each checked
case.

Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reported-by: syzbot+a448cda4dba2dac50de5@syzkaller.appspotmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/wireguard/netlink.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireguard/netlink.c b/drivers/net/wireguard/netlink.c
index d0f3b6d7f408..5c804bcabfe6 100644
--- a/drivers/net/wireguard/netlink.c
+++ b/drivers/net/wireguard/netlink.c
@@ -436,14 +436,13 @@ static int set_peer(struct wg_device *wg, struct nlattr **attrs)
 	if (attrs[WGPEER_A_ENDPOINT]) {
 		struct sockaddr *addr = nla_data(attrs[WGPEER_A_ENDPOINT]);
 		size_t len = nla_len(attrs[WGPEER_A_ENDPOINT]);
+		struct endpoint endpoint = { { { 0 } } };
 
-		if ((len == sizeof(struct sockaddr_in) &&
-		     addr->sa_family == AF_INET) ||
-		    (len == sizeof(struct sockaddr_in6) &&
-		     addr->sa_family == AF_INET6)) {
-			struct endpoint endpoint = { { { 0 } } };
-
-			memcpy(&endpoint.addr, addr, len);
+		if (len == sizeof(struct sockaddr_in) && addr->sa_family == AF_INET) {
+			endpoint.addr4 = *(struct sockaddr_in *)addr;
+			wg_socket_set_peer_endpoint(peer, &endpoint);
+		} else if (len == sizeof(struct sockaddr_in6) && addr->sa_family == AF_INET6) {
+			endpoint.addr6 = *(struct sockaddr_in6 *)addr;
 			wg_socket_set_peer_endpoint(peer, &endpoint);
 		}
 	}

From fed38e64d9b99d65a36c0dbadc3d3f8ddd9ea030 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 16 Sep 2022 16:32:08 +0300
Subject: [PATCH 54/83] net: enetc: move enetc_set_psfp() out of the common
 enetc_set_features()

The VF netdev driver shouldn't respond to changes in the NETIF_F_HW_TC
flag; only PFs should. Moreover, TSN-specific code should go to
enetc_qos.c, which should not be included in the VF driver.

Fixes: 79e499829f3f ("net: enetc: add hw tc hw offload features for PSPF capability")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://lore.kernel.org/r/20220916133209.3351399-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/freescale/enetc/enetc.c  | 32 +------------------
 drivers/net/ethernet/freescale/enetc/enetc.h  |  9 ++++--
 .../net/ethernet/freescale/enetc/enetc_pf.c   | 11 ++++++-
 .../net/ethernet/freescale/enetc/enetc_qos.c  | 23 +++++++++++++
 .../net/ethernet/freescale/enetc/enetc_vf.c   |  4 ++-
 5 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
index 4470a4a3e4c3..3df099f6cbe0 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -2600,29 +2600,6 @@ static int enetc_set_rss(struct net_device *ndev, int en)
 	return 0;
 }
 
-static int enetc_set_psfp(struct net_device *ndev, int en)
-{
-	struct enetc_ndev_priv *priv = netdev_priv(ndev);
-	int err;
-
-	if (en) {
-		err = enetc_psfp_enable(priv);
-		if (err)
-			return err;
-
-		priv->active_offloads |= ENETC_F_QCI;
-		return 0;
-	}
-
-	err = enetc_psfp_disable(priv);
-	if (err)
-		return err;
-
-	priv->active_offloads &= ~ENETC_F_QCI;
-
-	return 0;
-}
-
 static void enetc_enable_rxvlan(struct net_device *ndev, bool en)
 {
 	struct enetc_ndev_priv *priv = netdev_priv(ndev);
@@ -2641,11 +2618,9 @@ static void enetc_enable_txvlan(struct net_device *ndev, bool en)
 		enetc_bdr_enable_txvlan(&priv->si->hw, i, en);
 }
 
-int enetc_set_features(struct net_device *ndev,
-		       netdev_features_t features)
+void enetc_set_features(struct net_device *ndev, netdev_features_t features)
 {
 	netdev_features_t changed = ndev->features ^ features;
-	int err = 0;
 
 	if (changed & NETIF_F_RXHASH)
 		enetc_set_rss(ndev, !!(features & NETIF_F_RXHASH));
@@ -2657,11 +2632,6 @@ int enetc_set_features(struct net_device *ndev,
 	if (changed & NETIF_F_HW_VLAN_CTAG_TX)
 		enetc_enable_txvlan(ndev,
 				    !!(features & NETIF_F_HW_VLAN_CTAG_TX));
-
-	if (changed & NETIF_F_HW_TC)
-		err = enetc_set_psfp(ndev, !!(features & NETIF_F_HW_TC));
-
-	return err;
 }
 
 #ifdef CONFIG_FSL_ENETC_PTP_CLOCK
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h
index 29922c20531f..caa12509d06b 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.h
+++ b/drivers/net/ethernet/freescale/enetc/enetc.h
@@ -393,8 +393,7 @@ void enetc_start(struct net_device *ndev);
 void enetc_stop(struct net_device *ndev);
 netdev_tx_t enetc_xmit(struct sk_buff *skb, struct net_device *ndev);
 struct net_device_stats *enetc_get_stats(struct net_device *ndev);
-int enetc_set_features(struct net_device *ndev,
-		       netdev_features_t features);
+void enetc_set_features(struct net_device *ndev, netdev_features_t features);
 int enetc_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd);
 int enetc_setup_tc(struct net_device *ndev, enum tc_setup_type type,
 		   void *type_data);
@@ -465,6 +464,7 @@ int enetc_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 int enetc_setup_tc_psfp(struct net_device *ndev, void *type_data);
 int enetc_psfp_init(struct enetc_ndev_priv *priv);
 int enetc_psfp_clean(struct enetc_ndev_priv *priv);
+int enetc_set_psfp(struct net_device *ndev, bool en);
 
 static inline void enetc_get_max_cap(struct enetc_ndev_priv *priv)
 {
@@ -540,4 +540,9 @@ static inline int enetc_psfp_disable(struct enetc_ndev_priv *priv)
 {
 	return 0;
 }
+
+static inline int enetc_set_psfp(struct net_device *ndev, bool en)
+{
+	return 0;
+}
 #endif
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
index c4a0e836d4f0..201b5f3f634e 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
@@ -709,6 +709,13 @@ static int enetc_pf_set_features(struct net_device *ndev,
 {
 	netdev_features_t changed = ndev->features ^ features;
 	struct enetc_ndev_priv *priv = netdev_priv(ndev);
+	int err;
+
+	if (changed & NETIF_F_HW_TC) {
+		err = enetc_set_psfp(ndev, !!(features & NETIF_F_HW_TC));
+		if (err)
+			return err;
+	}
 
 	if (changed & NETIF_F_HW_VLAN_CTAG_FILTER) {
 		struct enetc_pf *pf = enetc_si_priv(priv->si);
@@ -722,7 +729,9 @@ static int enetc_pf_set_features(struct net_device *ndev,
 	if (changed & NETIF_F_LOOPBACK)
 		enetc_set_loopback(ndev, !!(features & NETIF_F_LOOPBACK));
 
-	return enetc_set_features(ndev, features);
+	enetc_set_features(ndev, features);
+
+	return 0;
 }
 
 static const struct net_device_ops enetc_ndev_ops = {
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_qos.c b/drivers/net/ethernet/freescale/enetc/enetc_qos.c
index 582a663ed0ba..f8a2f02ce22d 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_qos.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_qos.c
@@ -1517,6 +1517,29 @@ int enetc_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 	}
 }
 
+int enetc_set_psfp(struct net_device *ndev, bool en)
+{
+	struct enetc_ndev_priv *priv = netdev_priv(ndev);
+	int err;
+
+	if (en) {
+		err = enetc_psfp_enable(priv);
+		if (err)
+			return err;
+
+		priv->active_offloads |= ENETC_F_QCI;
+		return 0;
+	}
+
+	err = enetc_psfp_disable(priv);
+	if (err)
+		return err;
+
+	priv->active_offloads &= ~ENETC_F_QCI;
+
+	return 0;
+}
+
 int enetc_psfp_init(struct enetc_ndev_priv *priv)
 {
 	if (epsfp.psfp_sfi_bitmap)
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_vf.c b/drivers/net/ethernet/freescale/enetc/enetc_vf.c
index 17924305afa2..4048101c42be 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_vf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_vf.c
@@ -88,7 +88,9 @@ static int enetc_vf_set_mac_addr(struct net_device *ndev, void *addr)
 static int enetc_vf_set_features(struct net_device *ndev,
 				 netdev_features_t features)
 {
-	return enetc_set_features(ndev, features);
+	enetc_set_features(ndev, features);
+
+	return 0;
 }
 
 /* Probing/ Init */

From 5641c751fe2f92d3d9e8a8e03c1263ac8caa0b42 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 16 Sep 2022 16:32:09 +0300
Subject: [PATCH 55/83] net: enetc: deny offload of tc-based TSN features on VF
 interfaces

TSN features on the ENETC (taprio, cbs, gate, police) are configured
through a mix of command BD ring messages and port registers:
enetc_port_rd(), enetc_port_wr().

Port registers are a region of the ENETC memory map which are only
accessible from the PCIe Physical Function. They are not accessible from
the Virtual Functions.

Moreover, attempting to access these registers crashes the kernel:

$ echo 1 > /sys/bus/pci/devices/0000\:00\:00.0/sriov_numvfs
pci 0000:00:01.0: [1957:ef00] type 00 class 0x020001
fsl_enetc_vf 0000:00:01.0: Adding to iommu group 15
fsl_enetc_vf 0000:00:01.0: enabling device (0000 -> 0002)
fsl_enetc_vf 0000:00:01.0 eno0vf0: renamed from eth0
$ tc qdisc replace dev eno0vf0 root taprio num_tc 8 map 0 1 2 3 4 5 6 7 \
	queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 base-time 0 \
	sched-entry S 0x7f 900000 sched-entry S 0x80 100000 flags 0x2
Unable to handle kernel paging request at virtual address ffff800009551a08
Internal error: Oops: 96000007 [#1] PREEMPT SMP
pc : enetc_setup_tc_taprio+0x170/0x47c
lr : enetc_setup_tc_taprio+0x16c/0x47c
Call trace:
 enetc_setup_tc_taprio+0x170/0x47c
 enetc_setup_tc+0x38/0x2dc
 taprio_change+0x43c/0x970
 taprio_init+0x188/0x1e0
 qdisc_create+0x114/0x470
 tc_modify_qdisc+0x1fc/0x6c0
 rtnetlink_rcv_msg+0x12c/0x390

Split enetc_setup_tc() into separate functions for the PF and for the
VF drivers. Also remove enetc_qos.o from being included into
enetc-vf.ko, since it serves absolutely no purpose there.

Fixes: 34c6adf1977b ("enetc: Configure the Time-Aware Scheduler via tc-taprio offload")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://lore.kernel.org/r/20220916133209.3351399-2-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/freescale/enetc/Makefile |  1 -
 drivers/net/ethernet/freescale/enetc/enetc.c  | 21 +------------------
 drivers/net/ethernet/freescale/enetc/enetc.h  |  3 +--
 .../net/ethernet/freescale/enetc/enetc_pf.c   | 21 ++++++++++++++++++-
 .../net/ethernet/freescale/enetc/enetc_vf.c   | 13 +++++++++++-
 5 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/freescale/enetc/Makefile b/drivers/net/ethernet/freescale/enetc/Makefile
index a139f2e9d59f..e0e8dfd13793 100644
--- a/drivers/net/ethernet/freescale/enetc/Makefile
+++ b/drivers/net/ethernet/freescale/enetc/Makefile
@@ -9,7 +9,6 @@ fsl-enetc-$(CONFIG_FSL_ENETC_QOS) += enetc_qos.o
 
 obj-$(CONFIG_FSL_ENETC_VF) += fsl-enetc-vf.o
 fsl-enetc-vf-y := enetc_vf.o $(common-objs)
-fsl-enetc-vf-$(CONFIG_FSL_ENETC_QOS) += enetc_qos.o
 
 obj-$(CONFIG_FSL_ENETC_IERB) += fsl-enetc-ierb.o
 fsl-enetc-ierb-y := enetc_ierb.o
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
index 3df099f6cbe0..9f5b921039bd 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -2432,7 +2432,7 @@ int enetc_close(struct net_device *ndev)
 	return 0;
 }
 
-static int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data)
+int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data)
 {
 	struct enetc_ndev_priv *priv = netdev_priv(ndev);
 	struct tc_mqprio_qopt *mqprio = type_data;
@@ -2486,25 +2486,6 @@ static int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data)
 	return 0;
 }
 
-int enetc_setup_tc(struct net_device *ndev, enum tc_setup_type type,
-		   void *type_data)
-{
-	switch (type) {
-	case TC_SETUP_QDISC_MQPRIO:
-		return enetc_setup_tc_mqprio(ndev, type_data);
-	case TC_SETUP_QDISC_TAPRIO:
-		return enetc_setup_tc_taprio(ndev, type_data);
-	case TC_SETUP_QDISC_CBS:
-		return enetc_setup_tc_cbs(ndev, type_data);
-	case TC_SETUP_QDISC_ETF:
-		return enetc_setup_tc_txtime(ndev, type_data);
-	case TC_SETUP_BLOCK:
-		return enetc_setup_tc_psfp(ndev, type_data);
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
 static int enetc_setup_xdp_prog(struct net_device *dev, struct bpf_prog *prog,
 				struct netlink_ext_ack *extack)
 {
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h
index caa12509d06b..2cfe6944ebd3 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.h
+++ b/drivers/net/ethernet/freescale/enetc/enetc.h
@@ -395,8 +395,7 @@ netdev_tx_t enetc_xmit(struct sk_buff *skb, struct net_device *ndev);
 struct net_device_stats *enetc_get_stats(struct net_device *ndev);
 void enetc_set_features(struct net_device *ndev, netdev_features_t features);
 int enetc_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd);
-int enetc_setup_tc(struct net_device *ndev, enum tc_setup_type type,
-		   void *type_data);
+int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data);
 int enetc_setup_bpf(struct net_device *dev, struct netdev_bpf *xdp);
 int enetc_xdp_xmit(struct net_device *ndev, int num_frames,
 		   struct xdp_frame **frames, u32 flags);
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
index 201b5f3f634e..bb7750222691 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
@@ -734,6 +734,25 @@ static int enetc_pf_set_features(struct net_device *ndev,
 	return 0;
 }
 
+static int enetc_pf_setup_tc(struct net_device *ndev, enum tc_setup_type type,
+			     void *type_data)
+{
+	switch (type) {
+	case TC_SETUP_QDISC_MQPRIO:
+		return enetc_setup_tc_mqprio(ndev, type_data);
+	case TC_SETUP_QDISC_TAPRIO:
+		return enetc_setup_tc_taprio(ndev, type_data);
+	case TC_SETUP_QDISC_CBS:
+		return enetc_setup_tc_cbs(ndev, type_data);
+	case TC_SETUP_QDISC_ETF:
+		return enetc_setup_tc_txtime(ndev, type_data);
+	case TC_SETUP_BLOCK:
+		return enetc_setup_tc_psfp(ndev, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static const struct net_device_ops enetc_ndev_ops = {
 	.ndo_open		= enetc_open,
 	.ndo_stop		= enetc_close,
@@ -748,7 +767,7 @@ static const struct net_device_ops enetc_ndev_ops = {
 	.ndo_set_vf_spoofchk	= enetc_pf_set_vf_spoofchk,
 	.ndo_set_features	= enetc_pf_set_features,
 	.ndo_eth_ioctl		= enetc_ioctl,
-	.ndo_setup_tc		= enetc_setup_tc,
+	.ndo_setup_tc		= enetc_pf_setup_tc,
 	.ndo_bpf		= enetc_setup_bpf,
 	.ndo_xdp_xmit		= enetc_xdp_xmit,
 };
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_vf.c b/drivers/net/ethernet/freescale/enetc/enetc_vf.c
index 4048101c42be..dfcaac302e24 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_vf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_vf.c
@@ -93,6 +93,17 @@ static int enetc_vf_set_features(struct net_device *ndev,
 	return 0;
 }
 
+static int enetc_vf_setup_tc(struct net_device *ndev, enum tc_setup_type type,
+			     void *type_data)
+{
+	switch (type) {
+	case TC_SETUP_QDISC_MQPRIO:
+		return enetc_setup_tc_mqprio(ndev, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 /* Probing/ Init */
 static const struct net_device_ops enetc_ndev_ops = {
 	.ndo_open		= enetc_open,
@@ -102,7 +113,7 @@ static const struct net_device_ops enetc_ndev_ops = {
 	.ndo_set_mac_address	= enetc_vf_set_mac_addr,
 	.ndo_set_features	= enetc_vf_set_features,
 	.ndo_eth_ioctl		= enetc_ioctl,
-	.ndo_setup_tc		= enetc_setup_tc,
+	.ndo_setup_tc		= enetc_vf_setup_tc,
 };
 
 static void enetc_vf_netdev_setup(struct enetc_si *si, struct net_device *ndev,

From 76dd07281338da6951fdab3432ced843fa87839c Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Fri, 16 Sep 2022 11:48:21 +0300
Subject: [PATCH 56/83] ipv6: Fix crash when IPv6 is administratively disabled

The global 'raw_v6_hashinfo' variable can be accessed even when IPv6 is
administratively disabled via the 'ipv6.disable=1' kernel command line
option, leading to a crash [1].

Fix by restoring the original behavior and always initializing the
variable, regardless of IPv6 support being administratively disabled or
not.

[1]
 BUG: unable to handle page fault for address: ffffffffffffffc8
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 173e18067 P4D 173e18067 PUD 173e1a067 PMD 0
 Oops: 0000 [#1] PREEMPT SMP KASAN
 CPU: 3 PID: 271 Comm: ss Not tainted 6.0.0-rc4-custom-00136-g0727a9a5fbc1 #1396
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014
 RIP: 0010:raw_diag_dump+0x310/0x7f0
 [...]
 Call Trace:
  <TASK>
  __inet_diag_dump+0x10f/0x2e0
  netlink_dump+0x575/0xfd0
  __netlink_dump_start+0x67b/0x940
  inet_diag_handler_cmd+0x273/0x2d0
  sock_diag_rcv_msg+0x317/0x440
  netlink_rcv_skb+0x15e/0x430
  sock_diag_rcv+0x2b/0x40
  netlink_unicast+0x53b/0x800
  netlink_sendmsg+0x945/0xe60
  ____sys_sendmsg+0x747/0x960
  ___sys_sendmsg+0x13a/0x1e0
  __sys_sendmsg+0x118/0x1e0
  do_syscall_64+0x34/0x80
  entry_SYSCALL_64_after_hwframe+0x63/0xcd

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Fixes: 0daf07e52709 ("raw: convert raw sockets to RCU")
Reported-by: Roberto Ricci <rroberto2r@gmail.com>
Tested-by: Roberto Ricci <rroberto2r@gmail.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20220916084821.229287-1-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv6/af_inet6.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2ce0c44d0081..dbb1430d6cc2 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1070,13 +1070,13 @@ static int __init inet6_init(void)
 	for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
 		INIT_LIST_HEAD(r);
 
+	raw_hashinfo_init(&raw_v6_hashinfo);
+
 	if (disable_ipv6_mod) {
 		pr_info("Loaded, but administratively disabled, reboot required to enable\n");
 		goto out;
 	}
 
-	raw_hashinfo_init(&raw_v6_hashinfo);
-
 	err = proto_register(&tcpv6_prot, 1);
 	if (err)
 		goto out;

From db46e3a88a09c5cf7e505664d01da7238cd56c92 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 15 Sep 2022 13:08:01 +0300
Subject: [PATCH 57/83] net/sched: taprio: avoid disabling offload when it was
 never enabled

In an incredibly strange API design decision, qdisc->destroy() gets
called even if qdisc->init() never succeeded, not exclusively since
commit 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation"),
but apparently also earlier (in the case of qdisc_create_dflt()).

The taprio qdisc does not fully acknowledge this when it attempts full
offload, because it starts off with q->flags = TAPRIO_FLAGS_INVALID in
taprio_init(), then it replaces q->flags with TCA_TAPRIO_ATTR_FLAGS
parsed from netlink (in taprio_change(), tail called from taprio_init()).

But in taprio_destroy(), we call taprio_disable_offload(), and this
determines what to do based on FULL_OFFLOAD_IS_ENABLED(q->flags).

But looking at the implementation of FULL_OFFLOAD_IS_ENABLED()
(a bitwise check of bit 1 in q->flags), it is invalid to call this macro
on q->flags when it contains TAPRIO_FLAGS_INVALID, because that is set
to U32_MAX, and therefore FULL_OFFLOAD_IS_ENABLED() will return true on
an invalid set of flags.

As a result, it is possible to crash the kernel if user space forces an
error between setting q->flags = TAPRIO_FLAGS_INVALID, and the calling
of taprio_enable_offload(). This is because drivers do not expect the
offload to be disabled when it was never enabled.

The error that we force here is to attach taprio as a non-root qdisc,
but instead as child of an mqprio root qdisc:

$ tc qdisc add dev swp0 root handle 1: \
	mqprio num_tc 8 map 0 1 2 3 4 5 6 7 \
	queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 hw 0
$ tc qdisc replace dev swp0 parent 1:1 \
	taprio num_tc 8 map 0 1 2 3 4 5 6 7 \
	queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 base-time 0 \
	sched-entry S 0x7f 990000 sched-entry S 0x80 100000 \
	flags 0x0 clockid CLOCK_TAI
Unable to handle kernel paging request at virtual address fffffffffffffff8
[fffffffffffffff8] pgd=0000000000000000, p4d=0000000000000000
Internal error: Oops: 96000004 [#1] PREEMPT SMP
Call trace:
 taprio_dump+0x27c/0x310
 vsc9959_port_setup_tc+0x1f4/0x460
 felix_port_setup_tc+0x24/0x3c
 dsa_slave_setup_tc+0x54/0x27c
 taprio_disable_offload.isra.0+0x58/0xe0
 taprio_destroy+0x80/0x104
 qdisc_create+0x240/0x470
 tc_modify_qdisc+0x1fc/0x6b0
 rtnetlink_rcv_msg+0x12c/0x390
 netlink_rcv_skb+0x5c/0x130
 rtnetlink_rcv+0x1c/0x2c

Fix this by keeping track of the operations we made, and undo the
offload only if we actually did it.

I've added "bool offloaded" inside a 4 byte hole between "int clockid"
and "atomic64_t picos_per_byte". Now the first cache line looks like
below:

$ pahole -C taprio_sched net/sched/sch_taprio.o
struct taprio_sched {
        struct Qdisc * *           qdiscs;               /*     0     8 */
        struct Qdisc *             root;                 /*     8     8 */
        u32                        flags;                /*    16     4 */
        enum tk_offsets            tk_offset;            /*    20     4 */
        int                        clockid;              /*    24     4 */
        bool                       offloaded;            /*    28     1 */

        /* XXX 3 bytes hole, try to pack */

        atomic64_t                 picos_per_byte;       /*    32     0 */

        /* XXX 8 bytes hole, try to pack */

        spinlock_t                 current_entry_lock;   /*    40     0 */

        /* XXX 8 bytes hole, try to pack */

        struct sched_entry *       current_entry;        /*    48     8 */
        struct sched_gate_list *   oper_sched;           /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */

Fixes: 9c66d1564676 ("taprio: Add support for hardware offloading")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/sched/sch_taprio.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 0b941dd63d26..9bec73019f94 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -67,6 +67,7 @@ struct taprio_sched {
 	u32 flags;
 	enum tk_offsets tk_offset;
 	int clockid;
+	bool offloaded;
 	atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
 				    * speeds it's sub-nanoseconds per byte
 				    */
@@ -1279,6 +1280,8 @@ static int taprio_enable_offload(struct net_device *dev,
 		goto done;
 	}
 
+	q->offloaded = true;
+
 done:
 	taprio_offload_free(offload);
 
@@ -1293,12 +1296,9 @@ static int taprio_disable_offload(struct net_device *dev,
 	struct tc_taprio_qopt_offload *offload;
 	int err;
 
-	if (!FULL_OFFLOAD_IS_ENABLED(q->flags))
+	if (!q->offloaded)
 		return 0;
 
-	if (!ops->ndo_setup_tc)
-		return -EOPNOTSUPP;
-
 	offload = taprio_offload_alloc(0);
 	if (!offload) {
 		NL_SET_ERR_MSG(extack,
@@ -1314,6 +1314,8 @@ static int taprio_disable_offload(struct net_device *dev,
 		goto out;
 	}
 
+	q->offloaded = false;
+
 out:
 	taprio_offload_free(offload);
 

From 1461d212ab277d8bba1a753d33e9afe03d81f9d4 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 15 Sep 2022 13:08:02 +0300
Subject: [PATCH 58/83] net/sched: taprio: make qdisc_leaf() see the
 per-netdev-queue pfifo child qdiscs

taprio can only operate as root qdisc, and to that end, there exists the
following check in taprio_init(), just as in mqprio:

	if (sch->parent != TC_H_ROOT)
		return -EOPNOTSUPP;

And indeed, when we try to attach taprio to an mqprio child, it fails as
expected:

$ tc qdisc add dev swp0 root handle 1: mqprio num_tc 8 \
	map 0 1 2 3 4 5 6 7 \
	queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 hw 0
$ tc qdisc replace dev swp0 parent 1:2 taprio num_tc 8 \
	map 0 1 2 3 4 5 6 7 \
	queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \
	base-time 0 sched-entry S 0x7f 990000 sched-entry S 0x80 100000 \
	flags 0x0 clockid CLOCK_TAI
Error: sch_taprio: Can only be attached as root qdisc.

(extack message added by me)

But when we try to attach a taprio child to a taprio root qdisc,
surprisingly it doesn't fail:

$ tc qdisc replace dev swp0 root handle 1: taprio num_tc 8 \
	map 0 1 2 3 4 5 6 7 queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \
	base-time 0 sched-entry S 0x7f 990000 sched-entry S 0x80 100000 \
	flags 0x0 clockid CLOCK_TAI
$ tc qdisc replace dev swp0 parent 1:2 taprio num_tc 8 \
	map 0 1 2 3 4 5 6 7 \
	queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \
	base-time 0 sched-entry S 0x7f 990000 sched-entry S 0x80 100000 \
	flags 0x0 clockid CLOCK_TAI

This is because tc_modify_qdisc() behaves differently when mqprio is
root, vs when taprio is root.

In the mqprio case, it finds the parent qdisc through
p = qdisc_lookup(dev, TC_H_MAJ(clid)), and then the child qdisc through
q = qdisc_leaf(p, clid). This leaf qdisc q has handle 0, so it is
ignored according to the comment right below ("It may be default qdisc,
ignore it"). As a result, tc_modify_qdisc() goes through the
qdisc_create() code path, and this gives taprio_init() a chance to check
for sch_parent != TC_H_ROOT and error out.

Whereas in the taprio case, the returned q = qdisc_leaf(p, clid) is
different. It is not the default qdisc created for each netdev queue
(both taprio and mqprio call qdisc_create_dflt() and keep them in
a private q->qdiscs[], or priv->qdiscs[], respectively). Instead, taprio
makes qdisc_leaf() return the _root_ qdisc, aka itself.

When taprio does that, tc_modify_qdisc() goes through the qdisc_change()
code path, because the qdisc layer never finds out about the child qdisc
of the root. And through the ->change() ops, taprio has no reason to
check whether its parent is root or not, just through ->init(), which is
not called.

The problem is the taprio_leaf() implementation. Even though code wise,
it does the exact same thing as mqprio_leaf() which it is copied from,
it works with different input data. This is because mqprio does not
attach itself (the root) to each device TX queue, but one of the default
qdiscs from its private array.

In fact, since commit 13511704f8d7 ("net: taprio offload: enforce qdisc
to netdev queue mapping"), taprio does this too, but just for the full
offload case. So if we tried to attach a taprio child to a fully
offloaded taprio root qdisc, it would properly fail too; just not to a
software root taprio.

To fix the problem, stop looking at the Qdisc that's attached to the TX
queue, and instead, always return the default qdiscs that we've
allocated (and to which we privately enqueue and dequeue, in software
scheduling mode).

Since Qdisc_class_ops :: leaf  is only called from tc_modify_qdisc(),
the risk of unforeseen side effects introduced by this change is
minimal.

Fixes: 5a781ccbd19e ("tc: Add support for configuring the taprio scheduler")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/sched/sch_taprio.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 9bec73019f94..86675a79da1e 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -1951,12 +1951,14 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 
 static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
 {
-	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned int ntx = cl - 1;
 
-	if (!dev_queue)
+	if (ntx >= dev->num_tx_queues)
 		return NULL;
 
-	return dev_queue->qdisc_sleeping;
+	return q->qdiscs[ntx];
 }
 
 static unsigned long taprio_find(struct Qdisc *sch, u32 classid)

From 122045ca770492660005c22379995506f13efea8 Mon Sep 17 00:00:00 2001
From: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Date: Mon, 8 Aug 2022 11:58:54 +0200
Subject: [PATCH 59/83] ice: config netdev tc before setting queues number

After lowering number of tx queues the warning appears:
"Number of in use tx queues changed invalidating tc mappings. Priority
traffic classification disabled!"
Example command to reproduce:
ethtool -L enp24s0f0 tx 36 rx 36

Fix this by setting correct tc mapping before setting real number of
queues on netdev.

Fixes: 0754d65bd4be5 ("ice: Add infrastructure for mqprio support via ndo_setup_tc")
Signed-off-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Tested-by: Gurucharan <gurucharanx.g@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 04836bbaf7d5..3042a76b848d 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -6858,6 +6858,8 @@ int ice_vsi_open(struct ice_vsi *vsi)
 	if (err)
 		goto err_setup_rx;
 
+	ice_vsi_cfg_netdev_tc(vsi, vsi->tc_cfg.ena_tc);
+
 	if (vsi->type == ICE_VSI_PF) {
 		/* Notify the stack of the actual queue counts. */
 		err = netif_set_real_num_tx_queues(vsi->netdev, vsi->num_txq);

From 8ac7132704f3fbd2095abb9459e5303ce8c9e559 Mon Sep 17 00:00:00 2001
From: Mateusz Palczewski <mateusz.palczewski@intel.com>
Date: Fri, 26 Aug 2022 10:31:23 +0200
Subject: [PATCH 60/83] ice: Fix interface being down after reset with
 link-down-on-close flag on

When performing a reset on ice driver with link-down-on-close flag on
interface would always stay down. Fix this by moving a check of this
flag to ice_stop() that is called only when user wants to bring
interface down.

Fixes: ab4ab73fc1ec ("ice: Add ethtool private flag to make forcing link down optional")
Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com>
Tested-by: Petr Oros <poros@redhat.com>
Tested-by: Gurucharan <gurucharanx.g@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_main.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 3042a76b848d..e109cb93886b 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -6649,7 +6649,7 @@ static void ice_napi_disable_all(struct ice_vsi *vsi)
  */
 int ice_down(struct ice_vsi *vsi)
 {
-	int i, tx_err, rx_err, link_err = 0, vlan_err = 0;
+	int i, tx_err, rx_err, vlan_err = 0;
 
 	WARN_ON(!test_bit(ICE_VSI_DOWN, vsi->state));
 
@@ -6683,20 +6683,13 @@ int ice_down(struct ice_vsi *vsi)
 
 	ice_napi_disable_all(vsi);
 
-	if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags)) {
-		link_err = ice_force_phys_link_state(vsi, false);
-		if (link_err)
-			netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n",
-				   vsi->vsi_num, link_err);
-	}
-
 	ice_for_each_txq(vsi, i)
 		ice_clean_tx_ring(vsi->tx_rings[i]);
 
 	ice_for_each_rxq(vsi, i)
 		ice_clean_rx_ring(vsi->rx_rings[i]);
 
-	if (tx_err || rx_err || link_err || vlan_err) {
+	if (tx_err || rx_err || vlan_err) {
 		netdev_err(vsi->netdev, "Failed to close VSI 0x%04X on switch 0x%04X\n",
 			   vsi->vsi_num, vsi->vsw->sw_id);
 		return -EIO;
@@ -8892,6 +8885,16 @@ int ice_stop(struct net_device *netdev)
 		return -EBUSY;
 	}
 
+	if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags)) {
+		int link_err = ice_force_phys_link_state(vsi, false);
+
+		if (link_err) {
+			netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n",
+				   vsi->vsi_num, link_err);
+			return -EIO;
+		}
+	}
+
 	ice_vsi_close(vsi);
 
 	return 0;

From 603ccb3aca717d04a4b1a04e3a7bc3b91eba33e8 Mon Sep 17 00:00:00 2001
From: Bhupesh Sharma <bhupesh.sharma@linaro.org>
Date: Thu, 15 Sep 2022 16:58:04 +0530
Subject: [PATCH 61/83] MAINTAINERS: Add myself as a reviewer for Qualcomm
 ETHQOS Ethernet driver

As suggested by Vinod, adding myself as the reviewer
for the Qualcomm ETHQOS Ethernet driver.

Recently I have enabled this driver on a few Qualcomm
SoCs / boards and hence trying to keep a close eye on
it.

Signed-off-by: Bhupesh Sharma <bhupesh.sharma@linaro.org>
Acked-by: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20220915112804.3950680-1-bhupesh.sharma@linaro.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index faf65a953e54..f0fb5ab1ffcc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16857,6 +16857,7 @@ F:	drivers/net/ethernet/qualcomm/emac/
 
 QUALCOMM ETHQOS ETHERNET DRIVER
 M:	Vinod Koul <vkoul@kernel.org>
+R:	Bhupesh Sharma <bhupesh.sharma@linaro.org>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	Documentation/devicetree/bindings/net/qcom,ethqos.txt

From 76b907ee00c4a5cdd5d0adbacdaa1c1989385615 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 9 Sep 2022 12:42:11 +0200
Subject: [PATCH 62/83] netfilter: conntrack: remove nf_conntrack_helper
 documentation

This toggle has been already remove by b118509076b3 ("netfilter: remove
nf_conntrack_helper sysctl and modparam toggles").

Remove the documentation entry for this toggle too.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 Documentation/networking/nf_conntrack-sysctl.rst | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 834945ebc4cd..1120d71f28d7 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -70,15 +70,6 @@ nf_conntrack_generic_timeout - INTEGER (seconds)
 	Default for generic timeout.  This refers to layer 4 unknown/unsupported
 	protocols.
 
-nf_conntrack_helper - BOOLEAN
-	- 0 - disabled (default)
-	- not 0 - enabled
-
-	Enable automatic conntrack helper assignment.
-	If disabled it is required to set up iptables rules to assign
-	helpers to connections.  See the CT target description in the
-	iptables-extensions(8) man page for further information.
-
 nf_conntrack_icmp_timeout - INTEGER (seconds)
 	default 30
 

From 921ebde3c0d22c8cba74ce8eb3cc4626abff1ccd Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 12 Sep 2022 21:41:00 +0900
Subject: [PATCH 63/83] netfilter: nf_tables: fix nft_counters_enabled
 underflow at nf_tables_addchain()

syzbot is reporting underflow of nft_counters_enabled counter at
nf_tables_addchain() [1], for commit 43eb8949cfdffa76 ("netfilter:
nf_tables: do not leave chain stats enabled on error") missed that
nf_tables_chain_destroy() after nft_basechain_init() in the error path of
nf_tables_addchain() decrements the counter because nft_basechain_init()
makes nft_is_base_chain() return true by setting NFT_CHAIN_BASE flag.

Increment the counter immediately after returning from
nft_basechain_init().

Link:  https://syzkaller.appspot.com/bug?extid=b5d82a651b71cd8a75ab [1]
Reported-by: syzbot <syzbot+b5d82a651b71cd8a75ab@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot <syzbot+b5d82a651b71cd8a75ab@syzkaller.appspotmail.com>
Fixes: 43eb8949cfdffa76 ("netfilter: nf_tables: do not leave chain stats enabled on error")
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_tables_api.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 816052089b33..e062754dc6cc 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2197,7 +2197,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 			      struct netlink_ext_ack *extack)
 {
 	const struct nlattr * const *nla = ctx->nla;
-	struct nft_stats __percpu *stats = NULL;
 	struct nft_table *table = ctx->table;
 	struct nft_base_chain *basechain;
 	struct net *net = ctx->net;
@@ -2212,6 +2211,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		return -EOVERFLOW;
 
 	if (nla[NFTA_CHAIN_HOOK]) {
+		struct nft_stats __percpu *stats = NULL;
 		struct nft_chain_hook hook;
 
 		if (flags & NFT_CHAIN_BINDING)
@@ -2245,6 +2245,8 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 			kfree(basechain);
 			return err;
 		}
+		if (stats)
+			static_branch_inc(&nft_counters_enabled);
 	} else {
 		if (flags & NFT_CHAIN_BASE)
 			return -EINVAL;
@@ -2319,9 +2321,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		goto err_unregister_hook;
 	}
 
-	if (stats)
-		static_branch_inc(&nft_counters_enabled);
-
 	table->use++;
 
 	return 0;

From 9a4d6dd554b86e65581ef6b6638a39ae079b17ac Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 12 Sep 2022 22:58:51 +0900
Subject: [PATCH 64/83] netfilter: nf_tables: fix percpu memory leak at
 nf_tables_addchain()

It seems to me that percpu memory for chain stats started leaking since
commit 3bc158f8d0330f0a ("netfilter: nf_tables: map basechain priority to
hardware priority") when nft_chain_offload_priority() returned an error.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Fixes: 3bc158f8d0330f0a ("netfilter: nf_tables: map basechain priority to hardware priority")
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_tables_api.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index e062754dc6cc..63c70141b3e5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2243,6 +2243,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		if (err < 0) {
 			nft_chain_release_hook(&hook);
 			kfree(basechain);
+			free_percpu(stats);
 			return err;
 		}
 		if (stats)

From 62ce44c4fff947eebdf10bb582267e686e6835c9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 20 Sep 2022 14:20:17 +0200
Subject: [PATCH 65/83] netfilter: ebtables: fix memory leak when blob is
 malformed

The bug fix was incomplete, it "replaced" crash with a memory leak.
The old code had an assignment to "ret" embedded into the conditional,
restore this.

Fixes: 7997eff82828 ("netfilter: ebtables: reject blobs that don't provide all entry points")
Reported-and-tested-by: syzbot+a24c5252f3e3ab733464@syzkaller.appspotmail.com
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/bridge/netfilter/ebtables.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 9a0ae59cdc50..4f385d52a1c4 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1040,8 +1040,10 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
 		goto free_iterate;
 	}
 
-	if (repl->valid_hooks != t->valid_hooks)
+	if (repl->valid_hooks != t->valid_hooks) {
+		ret = -EINVAL;
 		goto free_unlock;
+	}
 
 	if (repl->num_counters && repl->num_counters != t->private->nentries) {
 		ret = -EINVAL;

From d25088932227680988a6b794221e031a7232f137 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 20 Sep 2022 18:31:30 +0200
Subject: [PATCH 66/83] netfilter: nf_ct_ftp: fix deadlock when nat rewrite is
 needed

We can't use ct->lock, this is already used by the seqadj internals.
When using ftp helper + nat, seqadj will attempt to acquire ct->lock
again.

Revert back to a global lock for now.

Fixes: c783a29c7e59 ("netfilter: nf_ct_ftp: prefer skb_linearize")
Reported-by: Bruno de Paula Larini <bruno.larini@riosoft.com.br>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_conntrack_ftp.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 0d9332e9cf71..617f744a2e3a 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -33,6 +33,7 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
 MODULE_DESCRIPTION("ftp connection tracking helper");
 MODULE_ALIAS("ip_conntrack_ftp");
 MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
+static DEFINE_SPINLOCK(nf_ftp_lock);
 
 #define MAX_PORTS 8
 static u_int16_t ports[MAX_PORTS];
@@ -409,7 +410,8 @@ static int help(struct sk_buff *skb,
 	}
 	datalen = skb->len - dataoff;
 
-	spin_lock_bh(&ct->lock);
+	/* seqadj (nat) uses ct->lock internally, nf_nat_ftp would cause deadlock */
+	spin_lock_bh(&nf_ftp_lock);
 	fb_ptr = skb->data + dataoff;
 
 	ends_in_nl = (fb_ptr[datalen - 1] == '\n');
@@ -538,7 +540,7 @@ static int help(struct sk_buff *skb,
 	if (ends_in_nl)
 		update_nl_seq(ct, seq, ct_ftp_info, dir, skb);
  out:
-	spin_unlock_bh(&ct->lock);
+	spin_unlock_bh(&nf_ftp_lock);
 	return ret;
 }
 

From 4924c0cdce75575295f8fa682851fb8e5d619dd2 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 19 Sep 2022 16:48:00 +0200
Subject: [PATCH 67/83] net: ravb: Fix PHY state warning splat during system
 resume

Since commit 744d23c71af39c7d ("net: phy: Warn about incorrect
mdio_bus_phy_resume() state"), a warning splat is printed during system
resume with Wake-on-LAN disabled:

        WARNING: CPU: 0 PID: 1197 at drivers/net/phy/phy_device.c:323 mdio_bus_phy_resume+0xbc/0xc8

As the Renesas Ethernet AVB driver already calls phy_{stop,start}() in
its suspend/resume callbacks, it is sufficient to just mark the MAC
responsible for managing the power state of the PHY.

Fixes: fba863b816049b03 ("net: phy: make PHY PM ops a no-op if MAC driver manages PHY PM")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Link: https://lore.kernel.org/r/8ec796f47620980fdd0403e21bd8b7200b4fa1d4.1663598796.git.geert+renesas@glider.be
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/renesas/ravb_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index b357ac4c56c5..7e32b04eb0c7 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -1449,6 +1449,8 @@ static int ravb_phy_init(struct net_device *ndev)
 		phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_100baseT_Half_BIT);
 	}
 
+	/* Indicate that the MAC is responsible for managing PHY PM */
+	phydev->mac_managed_pm = true;
 	phy_attached_info(phydev);
 
 	return 0;

From 6a1dbfefdae4f7809b3e277cc76785dac0ac1cd0 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 19 Sep 2022 16:48:28 +0200
Subject: [PATCH 68/83] net: sh_eth: Fix PHY state warning splat during system
 resume

Since commit 744d23c71af39c7d ("net: phy: Warn about incorrect
mdio_bus_phy_resume() state"), a warning splat is printed during system
resume with Wake-on-LAN disabled:

	WARNING: CPU: 0 PID: 626 at drivers/net/phy/phy_device.c:323 mdio_bus_phy_resume+0xbc/0xe4

As the Renesas SuperH Ethernet driver already calls phy_{stop,start}()
in its suspend/resume callbacks, it is sufficient to just mark the MAC
responsible for managing the power state of the PHY.

Fixes: fba863b816049b03 ("net: phy: make PHY PM ops a no-op if MAC driver manages PHY PM")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Link: https://lore.kernel.org/r/c6e1331b9bef61225fa4c09db3ba3e2e7214ba2d.1663598886.git.geert+renesas@glider.be
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/renesas/sh_eth.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 67ade78fb767..7fd8828d3a84 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -2029,6 +2029,8 @@ static int sh_eth_phy_init(struct net_device *ndev)
 	if (mdp->cd->register_type != SH_ETH_REG_GIGABIT)
 		phy_set_max_speed(phydev, SPEED_100);
 
+	/* Indicate that the MAC is responsible for managing PHY PM */
+	phydev->mac_managed_pm = true;
 	phy_attached_info(phydev);
 
 	return 0;

From a09721dd47c8468b3f2fdd73f40422699ffe26dd Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Thu, 11 Aug 2022 10:25:44 +0200
Subject: [PATCH 69/83] can: flexcan: flexcan_mailbox_read() fix return value
 for drop = true
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The following happened on an i.MX25 using flexcan with many packets on
the bus:

The rx-offload queue reached a length more than skb_queue_len_max. In
can_rx_offload_offload_one() the drop variable was set to true which
made the call to .mailbox_read() (here: flexcan_mailbox_read()) to
_always_ return ERR_PTR(-ENOBUFS) and drop the rx'ed CAN frame. So
can_rx_offload_offload_one() returned ERR_PTR(-ENOBUFS), too.

can_rx_offload_irq_offload_fifo() looks as follows:

| 	while (1) {
| 		skb = can_rx_offload_offload_one(offload, 0);
| 		if (IS_ERR(skb))
| 			continue;
| 		if (!skb)
| 			break;
| 		...
| 	}

The flexcan driver wrongly always returns ERR_PTR(-ENOBUFS) if drop is
requested, even if there is no CAN frame pending. As the i.MX25 is a
single core CPU, while the rx-offload processing is active, there is
no thread to process packets from the offload queue. So the queue
doesn't get any shorter and this results is a tight loop.

Instead of always returning ERR_PTR(-ENOBUFS) if drop is requested,
return NULL if no CAN frame is pending.

Changes since v1: https://lore.kernel.org/all/20220810144536.389237-1-u.kleine-koenig@pengutronix.de
- don't break in can_rx_offload_irq_offload_fifo() in case of an error,
  return NULL in flexcan_mailbox_read() in case of no pending CAN frame
  instead

Fixes: 4e9c9484b085 ("can: rx-offload: Prepare for CAN FD support")
Link: https://lore.kernel.org/all/20220811094254.1864367-1-mkl@pengutronix.de
Cc: stable@vger.kernel.org # v5.5
Suggested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Tested-by: Thorsten Scherer <t.scherer@eckelmann.de>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/flexcan/flexcan-core.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/can/flexcan/flexcan-core.c b/drivers/net/can/flexcan/flexcan-core.c
index f857968efed7..ccb438eca517 100644
--- a/drivers/net/can/flexcan/flexcan-core.c
+++ b/drivers/net/can/flexcan/flexcan-core.c
@@ -941,11 +941,6 @@ static struct sk_buff *flexcan_mailbox_read(struct can_rx_offload *offload,
 	u32 reg_ctrl, reg_id, reg_iflag1;
 	int i;
 
-	if (unlikely(drop)) {
-		skb = ERR_PTR(-ENOBUFS);
-		goto mark_as_read;
-	}
-
 	mb = flexcan_get_mb(priv, n);
 
 	if (priv->devtype_data.quirks & FLEXCAN_QUIRK_USE_RX_MAILBOX) {
@@ -974,6 +969,11 @@ static struct sk_buff *flexcan_mailbox_read(struct can_rx_offload *offload,
 		reg_ctrl = priv->read(&mb->can_ctrl);
 	}
 
+	if (unlikely(drop)) {
+		skb = ERR_PTR(-ENOBUFS);
+		goto mark_as_read;
+	}
+
 	if (reg_ctrl & FLEXCAN_MB_CNT_EDL)
 		skb = alloc_canfd_skb(offload->dev, &cfd);
 	else

From 5440428b3da65408dba0241985acb7a05258b85e Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Tue, 20 Sep 2022 11:40:56 +0200
Subject: [PATCH 70/83] can: gs_usb: gs_can_open(): fix race dev->can.state
 condition

The dev->can.state is set to CAN_STATE_ERROR_ACTIVE, after the device
has been started. On busy networks the CAN controller might receive
CAN frame between and go into an error state before the dev->can.state
is assigned.

Assign dev->can.state before starting the controller to close the race
window.

Fixes: d08e973a77d1 ("can: gs_usb: Added support for the GS_USB CAN devices")
Link: https://lore.kernel.org/all/20220920195216.232481-1-mkl@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/usb/gs_usb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index baf749c8cda3..553e2c7b0b12 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -824,6 +824,7 @@ static int gs_can_open(struct net_device *netdev)
 		flags |= GS_CAN_MODE_TRIPLE_SAMPLE;
 
 	/* finally start device */
+	dev->can.state = CAN_STATE_ERROR_ACTIVE;
 	dm->mode = cpu_to_le32(GS_CAN_MODE_START);
 	dm->flags = cpu_to_le32(flags);
 	rc = usb_control_msg(interface_to_usbdev(dev->iface),
@@ -835,13 +836,12 @@ static int gs_can_open(struct net_device *netdev)
 	if (rc < 0) {
 		netdev_err(netdev, "Couldn't start device (err=%d)\n", rc);
 		kfree(dm);
+		dev->can.state = CAN_STATE_STOPPED;
 		return rc;
 	}
 
 	kfree(dm);
 
-	dev->can.state = CAN_STATE_ERROR_ACTIVE;
-
 	parent->active_channels++;
 	if (!(dev->can.ctrlmode & CAN_CTRLMODE_LISTENONLY))
 		netif_start_queue(netdev);

From 0f2211f1cf58876f00d2fd8839e0fdadf0786894 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Thu, 18 Aug 2022 14:10:08 +0200
Subject: [PATCH 71/83] can: gs_usb: gs_usb_set_phys_id(): return with error if
 identify is not supported

Until commit 409c188c57cd ("can: tree-wide: advertise software
timestamping capabilities") the ethtool_ops was only assigned for
devices which support the GS_CAN_FEATURE_IDENTIFY feature. That commit
assigns ethtool_ops unconditionally.

This results on controllers without GS_CAN_FEATURE_IDENTIFY support
for the following ethtool error:

| $ ethtool -p can0 1
| Cannot identify NIC: Broken pipe

Restore the correct error value by checking for
GS_CAN_FEATURE_IDENTIFY in the gs_usb_set_phys_id() function.

| $ ethtool -p can0 1
| Cannot identify NIC: Operation not supported

While there use the variable "netdev" for the "struct net_device"
pointer and "dev" for the "struct gs_can" pointer as in the rest of
the driver.

Fixes: 409c188c57cd ("can: tree-wide: advertise software timestamping capabilities")
Link: http://lore.kernel.org/all/20220818143853.2671854-1-mkl@pengutronix.de
Cc: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/usb/gs_usb.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index 553e2c7b0b12..c1ff3c046d62 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -925,17 +925,21 @@ static int gs_usb_set_identify(struct net_device *netdev, bool do_identify)
 }
 
 /* blink LED's for finding the this interface */
-static int gs_usb_set_phys_id(struct net_device *dev,
+static int gs_usb_set_phys_id(struct net_device *netdev,
 			      enum ethtool_phys_id_state state)
 {
+	const struct gs_can *dev = netdev_priv(netdev);
 	int rc = 0;
 
+	if (!(dev->feature & GS_CAN_FEATURE_IDENTIFY))
+		return -EOPNOTSUPP;
+
 	switch (state) {
 	case ETHTOOL_ID_ACTIVE:
-		rc = gs_usb_set_identify(dev, GS_CAN_IDENTIFY_ON);
+		rc = gs_usb_set_identify(netdev, GS_CAN_IDENTIFY_ON);
 		break;
 	case ETHTOOL_ID_INACTIVE:
-		rc = gs_usb_set_identify(dev, GS_CAN_IDENTIFY_OFF);
+		rc = gs_usb_set_identify(netdev, GS_CAN_IDENTIFY_OFF);
 		break;
 	default:
 		break;
@@ -1072,9 +1076,10 @@ static struct gs_can *gs_make_candev(unsigned int channel,
 		dev->feature |= GS_CAN_FEATURE_REQ_USB_QUIRK_LPC546XX |
 			GS_CAN_FEATURE_QUIRK_BREQ_CANTACT_PRO;
 
-	if (le32_to_cpu(dconf->sw_version) > 1)
-		if (feature & GS_CAN_FEATURE_IDENTIFY)
-			netdev->ethtool_ops = &gs_usb_ethtool_ops;
+	/* GS_CAN_FEATURE_IDENTIFY is only supported for sw_version > 1 */
+	if (!(le32_to_cpu(dconf->sw_version) > 1 &&
+	      feature & GS_CAN_FEATURE_IDENTIFY))
+		dev->feature &= ~GS_CAN_FEATURE_IDENTIFY;
 
 	kfree(bt_const);
 

From 65e5d27df61283e5390f04b09dc79cd832f95607 Mon Sep 17 00:00:00 2001
From: Jianglei Nie <niejianglei2021@163.com>
Date: Wed, 14 Sep 2022 09:42:38 +0800
Subject: [PATCH 72/83] net: atlantic: fix potential memory leak in
 aq_ndev_close()

If aq_nic_stop() fails, aq_ndev_close() returns err without calling
aq_nic_deinit() to release the relevant memory and resource, which
will lead to a memory leak.

We can fix it by deleting the if condition judgment and goto statement to
call aq_nic_deinit() directly after aq_nic_stop() to fix the memory leak.

Signed-off-by: Jianglei Nie <niejianglei2021@163.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/aquantia/atlantic/aq_main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_main.c b/drivers/net/ethernet/aquantia/atlantic/aq_main.c
index 88595863d8bc..8a0af371e7dc 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_main.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_main.c
@@ -94,11 +94,8 @@ static int aq_ndev_close(struct net_device *ndev)
 	int err = 0;
 
 	err = aq_nic_stop(aq_nic);
-	if (err < 0)
-		goto err_exit;
 	aq_nic_deinit(aq_nic, true);
 
-err_exit:
 	return err;
 }
 

From 114f398d48c571bb628187a7b2dd42695156781f Mon Sep 17 00:00:00 2001
From: Larysa Zaremba <larysa.zaremba@intel.com>
Date: Mon, 19 Sep 2022 15:43:46 +0200
Subject: [PATCH 73/83] ice: Fix ice_xdp_xmit() when XDP TX queue number is not
 sufficient

The original patch added the static branch to handle the situation,
when assigning an XDP TX queue to every CPU is not possible,
so they have to be shared.

However, in the XDP transmit handler ice_xdp_xmit(), an error was
returned in such cases even before static condition was checked,
thus making queue sharing still impossible.

Fixes: 22bf877e528f ("ice: introduce XDP_TX fallback path")
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Reviewed-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Link: https://lore.kernel.org/r/20220919134346.25030-1-larysa.zaremba@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/intel/ice/ice_txrx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 836dce840712..97453d1dfafe 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -610,7 +610,7 @@ ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	if (test_bit(ICE_VSI_DOWN, vsi->state))
 		return -ENETDOWN;
 
-	if (!ice_is_xdp_ena_vsi(vsi) || queue_index >= vsi->num_xdp_txq)
+	if (!ice_is_xdp_ena_vsi(vsi))
 		return -ENXIO;
 
 	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
@@ -621,6 +621,9 @@ ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 		xdp_ring = vsi->xdp_rings[queue_index];
 		spin_lock(&xdp_ring->tx_lock);
 	} else {
+		/* Generally, should not happen */
+		if (unlikely(queue_index >= vsi->num_xdp_txq))
+			return -ENXIO;
 		xdp_ring = vsi->xdp_rings[queue_index];
 	}
 

From e738455b2c6dcdab03e45d97de36476f93f557d2 Mon Sep 17 00:00:00 2001
From: Wen Gu <guwen@linux.alibaba.com>
Date: Tue, 20 Sep 2022 14:43:09 +0800
Subject: [PATCH 74/83] net/smc: Stop the CLC flow if no link to map buffers on

There might be a potential race between SMC-R buffer map and
link group termination.

smc_smcr_terminate_all()     | smc_connect_rdma()
--------------------------------------------------------------
                             | smc_conn_create()
for links in smcibdev        |
        schedule links down  |
                             | smc_buf_create()
                             |  \- smcr_buf_map_usable_links()
                             |      \- no usable links found,
                             |         (rmb->mr = NULL)
                             |
                             | smc_clc_send_confirm()
                             |  \- access conn->rmb_desc->mr[]->rkey
                             |     (panic)

During reboot and IB device module remove, all links will be set
down and no usable links remain in link groups. In such situation
smcr_buf_map_usable_links() should return an error and stop the
CLC flow accessing to uninitialized mr.

Fixes: b9247544c1bc ("net/smc: convert static link ID instances to support multiple links")
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Link: https://lore.kernel.org/r/1663656189-32090-1-git-send-email-guwen@linux.alibaba.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/smc/smc_core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ebf56cdf17db..df89c2e08cbf 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -2239,7 +2239,7 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
 static int smcr_buf_map_usable_links(struct smc_link_group *lgr,
 				     struct smc_buf_desc *buf_desc, bool is_rmb)
 {
-	int i, rc = 0;
+	int i, rc = 0, cnt = 0;
 
 	/* protect against parallel link reconfiguration */
 	mutex_lock(&lgr->llc_conf_mutex);
@@ -2252,9 +2252,12 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr,
 			rc = -ENOMEM;
 			goto out;
 		}
+		cnt++;
 	}
 out:
 	mutex_unlock(&lgr->llc_conf_mutex);
+	if (!rc && !cnt)
+		rc = -EINVAL;
 	return rc;
 }
 

From 2002fbac743b6e2391b4ed50ad9eb626768dd78a Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Tue, 20 Sep 2022 16:16:19 +0200
Subject: [PATCH 75/83] net: phy: micrel: fix shared interrupt on LAN8814

Since commit ece19502834d ("net: phy: micrel: 1588 support for LAN8814
phy") the handler always returns IRQ_HANDLED, except in an error case.
Before that commit, the interrupt status register was checked and if
it was empty, IRQ_NONE was returned. Restore that behavior to play nice
with the interrupt line being shared with others.

Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy")
Signed-off-by: Michael Walle <michael@walle.cc>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Reviewed-by: Divya Koppera <Divya.Koppera@microchip.com>
Link: https://lore.kernel.org/r/20220920141619.808117-1-michael@walle.cc
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/micrel.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 6f52b4fb6888..38234d7e14c5 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -2679,16 +2679,19 @@ static int lan8804_config_init(struct phy_device *phydev)
 static irqreturn_t lan8814_handle_interrupt(struct phy_device *phydev)
 {
 	int irq_status, tsu_irq_status;
+	int ret = IRQ_NONE;
 
 	irq_status = phy_read(phydev, LAN8814_INTS);
-	if (irq_status > 0 && (irq_status & LAN8814_INT_LINK))
-		phy_trigger_machine(phydev);
-
 	if (irq_status < 0) {
 		phy_error(phydev);
 		return IRQ_NONE;
 	}
 
+	if (irq_status & LAN8814_INT_LINK) {
+		phy_trigger_machine(phydev);
+		ret = IRQ_HANDLED;
+	}
+
 	while (1) {
 		tsu_irq_status = lanphy_read_page_reg(phydev, 4,
 						      LAN8814_INTR_STS_REG);
@@ -2697,12 +2700,15 @@ static irqreturn_t lan8814_handle_interrupt(struct phy_device *phydev)
 		    (tsu_irq_status & (LAN8814_INTR_STS_REG_1588_TSU0_ |
 				       LAN8814_INTR_STS_REG_1588_TSU1_ |
 				       LAN8814_INTR_STS_REG_1588_TSU2_ |
-				       LAN8814_INTR_STS_REG_1588_TSU3_)))
+				       LAN8814_INTR_STS_REG_1588_TSU3_))) {
 			lan8814_handle_ptp_interrupt(phydev);
-		else
+			ret = IRQ_HANDLED;
+		} else {
 			break;
+		}
 	}
-	return IRQ_HANDLED;
+
+	return ret;
 }
 
 static int lan8814_ack_interrupt(struct phy_device *phydev)

From 0e400d602f46360752e4b32ce842dba3808e15e6 Mon Sep 17 00:00:00 2001
From: Jonathan Toppins <jtoppins@redhat.com>
Date: Tue, 20 Sep 2022 13:45:52 -0400
Subject: [PATCH 76/83] bonding: fix NULL deref in bond_rr_gen_slave_id

Fix a NULL dereference of the struct bonding.rr_tx_counter member because
if a bond is initially created with an initial mode != zero (Round Robin)
the memory required for the counter is never created and when the mode is
changed there is never any attempt to verify the memory is allocated upon
switching modes.

This causes the following Oops on an aarch64 machine:
    [  334.686773] Unable to handle kernel paging request at virtual address ffff2c91ac905000
    [  334.694703] Mem abort info:
    [  334.697486]   ESR = 0x0000000096000004
    [  334.701234]   EC = 0x25: DABT (current EL), IL = 32 bits
    [  334.706536]   SET = 0, FnV = 0
    [  334.709579]   EA = 0, S1PTW = 0
    [  334.712719]   FSC = 0x04: level 0 translation fault
    [  334.717586] Data abort info:
    [  334.720454]   ISV = 0, ISS = 0x00000004
    [  334.724288]   CM = 0, WnR = 0
    [  334.727244] swapper pgtable: 4k pages, 48-bit VAs, pgdp=000008044d662000
    [  334.733944] [ffff2c91ac905000] pgd=0000000000000000, p4d=0000000000000000
    [  334.740734] Internal error: Oops: 96000004 [#1] SMP
    [  334.745602] Modules linked in: bonding tls veth rfkill sunrpc arm_spe_pmu vfat fat acpi_ipmi ipmi_ssif ixgbe igb i40e mdio ipmi_devintf ipmi_msghandler arm_cmn arm_dsu_pmu cppc_cpufreq acpi_tad fuse zram crct10dif_ce ast ghash_ce sbsa_gwdt nvme drm_vram_helper drm_ttm_helper nvme_core ttm xgene_hwmon
    [  334.772217] CPU: 7 PID: 2214 Comm: ping Not tainted 6.0.0-rc4-00133-g64ae13ed4784 #4
    [  334.779950] Hardware name: GIGABYTE R272-P31-00/MP32-AR1-00, BIOS F18v (SCP: 1.08.20211002) 12/01/2021
    [  334.789244] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
    [  334.796196] pc : bond_rr_gen_slave_id+0x40/0x124 [bonding]
    [  334.801691] lr : bond_xmit_roundrobin_slave_get+0x38/0xdc [bonding]
    [  334.807962] sp : ffff8000221733e0
    [  334.811265] x29: ffff8000221733e0 x28: ffffdbac8572d198 x27: ffff80002217357c
    [  334.818392] x26: 000000000000002a x25: ffffdbacb33ee000 x24: ffff07ff980fa000
    [  334.825519] x23: ffffdbacb2e398ba x22: ffff07ff98102000 x21: ffff07ff981029c0
    [  334.832646] x20: 0000000000000001 x19: ffff07ff981029c0 x18: 0000000000000014
    [  334.839773] x17: 0000000000000000 x16: ffffdbacb1004364 x15: 0000aaaabe2f5a62
    [  334.846899] x14: ffff07ff8e55d968 x13: ffff07ff8e55db30 x12: 0000000000000000
    [  334.854026] x11: ffffdbacb21532e8 x10: 0000000000000001 x9 : ffffdbac857178ec
    [  334.861153] x8 : ffff07ff9f6e5a28 x7 : 0000000000000000 x6 : 000000007c2b3742
    [  334.868279] x5 : ffff2c91ac905000 x4 : ffff2c91ac905000 x3 : ffff07ff9f554400
    [  334.875406] x2 : ffff2c91ac905000 x1 : 0000000000000001 x0 : ffff07ff981029c0
    [  334.882532] Call trace:
    [  334.884967]  bond_rr_gen_slave_id+0x40/0x124 [bonding]
    [  334.890109]  bond_xmit_roundrobin_slave_get+0x38/0xdc [bonding]
    [  334.896033]  __bond_start_xmit+0x128/0x3a0 [bonding]
    [  334.901001]  bond_start_xmit+0x54/0xb0 [bonding]
    [  334.905622]  dev_hard_start_xmit+0xb4/0x220
    [  334.909798]  __dev_queue_xmit+0x1a0/0x720
    [  334.913799]  arp_xmit+0x3c/0xbc
    [  334.916932]  arp_send_dst+0x98/0xd0
    [  334.920410]  arp_solicit+0xe8/0x230
    [  334.923888]  neigh_probe+0x60/0xb0
    [  334.927279]  __neigh_event_send+0x3b0/0x470
    [  334.931453]  neigh_resolve_output+0x70/0x90
    [  334.935626]  ip_finish_output2+0x158/0x514
    [  334.939714]  __ip_finish_output+0xac/0x1a4
    [  334.943800]  ip_finish_output+0x40/0xfc
    [  334.947626]  ip_output+0xf8/0x1a4
    [  334.950931]  ip_send_skb+0x5c/0x100
    [  334.954410]  ip_push_pending_frames+0x3c/0x60
    [  334.958758]  raw_sendmsg+0x458/0x6d0
    [  334.962325]  inet_sendmsg+0x50/0x80
    [  334.965805]  sock_sendmsg+0x60/0x6c
    [  334.969286]  __sys_sendto+0xc8/0x134
    [  334.972853]  __arm64_sys_sendto+0x34/0x4c
    [  334.976854]  invoke_syscall+0x78/0x100
    [  334.980594]  el0_svc_common.constprop.0+0x4c/0xf4
    [  334.985287]  do_el0_svc+0x38/0x4c
    [  334.988591]  el0_svc+0x34/0x10c
    [  334.991724]  el0t_64_sync_handler+0x11c/0x150
    [  334.996072]  el0t_64_sync+0x190/0x194
    [  334.999726] Code: b9001062 f9403c02 d53cd044 8b040042 (b8210040)
    [  335.005810] ---[ end trace 0000000000000000 ]---
    [  335.010416] Kernel panic - not syncing: Oops: Fatal exception in interrupt
    [  335.017279] SMP: stopping secondary CPUs
    [  335.021374] Kernel Offset: 0x5baca8eb0000 from 0xffff800008000000
    [  335.027456] PHYS_OFFSET: 0x80000000
    [  335.030932] CPU features: 0x0000,0085c029,19805c82
    [  335.035713] Memory Limit: none
    [  335.038756] Rebooting in 180 seconds..

The fix is to allocate the memory in bond_open() which is guaranteed
to be called before any packets are processed.

Fixes: 848ca9182a7d ("net: bonding: Use per-cpu rr_tx_counter")
CC: Jussi Maki <joamaki@gmail.com>
Signed-off-by: Jonathan Toppins <jtoppins@redhat.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bonding/bond_main.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index bc6d8b0aa6fb..86d42306aa5e 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4182,6 +4182,12 @@ static int bond_open(struct net_device *bond_dev)
 	struct list_head *iter;
 	struct slave *slave;
 
+	if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN && !bond->rr_tx_counter) {
+		bond->rr_tx_counter = alloc_percpu(u32);
+		if (!bond->rr_tx_counter)
+			return -ENOMEM;
+	}
+
 	/* reset slave->backup and slave->inactive */
 	if (bond_has_slaves(bond)) {
 		bond_for_each_slave(bond, slave, iter) {
@@ -6243,15 +6249,6 @@ static int bond_init(struct net_device *bond_dev)
 	if (!bond->wq)
 		return -ENOMEM;
 
-	if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN) {
-		bond->rr_tx_counter = alloc_percpu(u32);
-		if (!bond->rr_tx_counter) {
-			destroy_workqueue(bond->wq);
-			bond->wq = NULL;
-			return -ENOMEM;
-		}
-	}
-
 	spin_lock_init(&bond->stats_lock);
 	netdev_lockdep_set_classes(bond_dev);
 

From 2ffd57327ff1e91ad675c1efd5f02a3d786e64d1 Mon Sep 17 00:00:00 2001
From: Jonathan Toppins <jtoppins@redhat.com>
Date: Tue, 20 Sep 2022 13:45:51 -0400
Subject: [PATCH 77/83] selftests: bonding: cause oops in bond_rr_gen_slave_id

This bonding selftest used to cause a kernel oops on aarch64
and should be architectures agnostic.

Signed-off-by: Jonathan Toppins <jtoppins@redhat.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/drivers/net/bonding/Makefile    |  3 +-
 .../bonding/bond-arp-interval-causes-panic.sh | 49 +++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh

diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile
index 0f9659407969..1d866658e541 100644
--- a/tools/testing/selftests/drivers/net/bonding/Makefile
+++ b/tools/testing/selftests/drivers/net/bonding/Makefile
@@ -2,7 +2,8 @@
 # Makefile for net selftests
 
 TEST_PROGS := bond-break-lacpdu-tx.sh \
-	      dev_addr_lists.sh
+	      dev_addr_lists.sh \
+	      bond-arp-interval-causes-panic.sh
 
 TEST_FILES := lag_lib.sh
 
diff --git a/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh b/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh
new file mode 100755
index 000000000000..71c00bfafbc9
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# cause kernel oops in bond_rr_gen_slave_id
+DEBUG=${DEBUG:-0}
+
+set -e
+test ${DEBUG} -ne 0 && set -x
+
+finish()
+{
+	ip netns delete server || true
+	ip netns delete client || true
+	ip link del link1_1 || true
+}
+
+trap finish EXIT
+
+client_ip4=192.168.1.198
+server_ip4=192.168.1.254
+
+# setup kernel so it reboots after causing the panic
+echo 180 >/proc/sys/kernel/panic
+
+# build namespaces
+ip link add dev link1_1 type veth peer name link1_2
+
+ip netns add "server"
+ip link set dev link1_2 netns server up name eth0
+ip netns exec server ip addr add ${server_ip4}/24 dev eth0
+
+ip netns add "client"
+ip link set dev link1_1 netns client down name eth0
+ip netns exec client ip link add dev bond0 down type bond mode 1 \
+	miimon 100 all_slaves_active 1
+ip netns exec client ip link set dev eth0 down master bond0
+ip netns exec client ip link set dev bond0 up
+ip netns exec client ip addr add ${client_ip4}/24 dev bond0
+ip netns exec client ping -c 5 $server_ip4 >/dev/null
+
+ip netns exec client ip link set dev eth0 down nomaster
+ip netns exec client ip link set dev bond0 down
+ip netns exec client ip link set dev bond0 type bond mode 0 \
+	arp_interval 1000 arp_ip_target "+${server_ip4}"
+ip netns exec client ip link set dev eth0 down master bond0
+ip netns exec client ip link set dev bond0 up
+ip netns exec client ping -c 5 $server_ip4 >/dev/null
+
+exit 0

From db39dfdc1c3bd9da273902da12f01973792ff911 Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Tue, 20 Sep 2022 17:59:15 -0700
Subject: [PATCH 78/83] udp: Use WARN_ON_ONCE() in udp_read_skb()

Prevent udp_read_skb() from flooding the syslog.

Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Link: https://lore.kernel.org/r/20220921005915.2697-1-yepeilin.cs@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/udp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cd72158e953a..560d9eadeaa5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1821,7 +1821,7 @@ int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 			continue;
 		}
 
-		WARN_ON(!skb_set_owner_sk_safe(skb, sk));
+		WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
 		used = recv_actor(sk, skb);
 		if (used <= 0) {
 			if (!copied)

From 878e2405710aacfeeb19364c300f38b7a9abfe8f Mon Sep 17 00:00:00 2001
From: Sean Anderson <seanga2@gmail.com>
Date: Tue, 20 Sep 2022 19:50:18 -0400
Subject: [PATCH 79/83] net: sunhme: Fix packet reception for len <
 RX_COPY_THRESHOLD

There is a separate receive path for small packets (under 256 bytes).
Instead of allocating a new dma-capable skb to be used for the next packet,
this path allocates a skb and copies the data into it (reusing the existing
sbk for the next packet). There are two bytes of junk data at the beginning
of every packet. I believe these are inserted in order to allow aligned DMA
and IP headers. We skip over them using skb_reserve. Before copying over
the data, we must use a barrier to ensure we see the whole packet. The
current code only synchronizes len bytes, starting from the beginning of
the packet, including the junk bytes. However, this leaves off the final
two bytes in the packet. Synchronize the whole packet.

To reproduce this problem, ping a HME with a payload size between 17 and
214

	$ ping -s 17 <hme_address>

which will complain rather loudly about the data mismatch. Small packets
(below 60 bytes on the wire) do not have this issue. I suspect this is
related to the padding added to increase the minimum packet size.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Sean Anderson <seanga2@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20220920235018.1675956-1-seanga2@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/sun/sunhme.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c
index 8594ee839628..88aa0d310aee 100644
--- a/drivers/net/ethernet/sun/sunhme.c
+++ b/drivers/net/ethernet/sun/sunhme.c
@@ -2020,9 +2020,9 @@ static void happy_meal_rx(struct happy_meal *hp, struct net_device *dev)
 
 			skb_reserve(copy_skb, 2);
 			skb_put(copy_skb, len);
-			dma_sync_single_for_cpu(hp->dma_dev, dma_addr, len, DMA_FROM_DEVICE);
+			dma_sync_single_for_cpu(hp->dma_dev, dma_addr, len + 2, DMA_FROM_DEVICE);
 			skb_copy_from_linear_data(skb, copy_skb->data, len);
-			dma_sync_single_for_device(hp->dma_dev, dma_addr, len, DMA_FROM_DEVICE);
+			dma_sync_single_for_device(hp->dma_dev, dma_addr, len + 2, DMA_FROM_DEVICE);
 			/* Reuse original ring buffer. */
 			hme_write_rxd(hp, this,
 				      (RXFLAG_OWN|((RX_BUF_ALLOC_SIZE-RX_OFFSET)<<16)),

From c2e1cfefcac35e0eea229e148c8284088ce437b5 Mon Sep 17 00:00:00 2001
From: Hangyu Hua <hbh25y@gmail.com>
Date: Wed, 21 Sep 2022 17:27:34 +0800
Subject: [PATCH 80/83] net: sched: fix possible refcount leak in
 tc_new_tfilter()

tfilter_put need to be called to put the refount got by tp->ops->get to
avoid possible refcount leak when chain->tmplt_ops != NULL and
chain->tmplt_ops != tp->ops.

Fixes: 7d5509fa0d3d ("net: sched: extend proto ops with 'put' callback")
Signed-off-by: Hangyu Hua <hbh25y@gmail.com>
Reviewed-by: Vlad Buslov <vladbu@nvidia.com>
Link: https://lore.kernel.org/r/20220921092734.31700-1-hbh25y@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/sched/cls_api.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 790d6809be81..51d175f3fbcb 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -2137,6 +2137,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 	}
 
 	if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) {
+		tfilter_put(tp, fh);
 		NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind");
 		err = -EINVAL;
 		goto errout;

From 3aac7ada64d8c26ccc922f1ca966043c0dae88da Mon Sep 17 00:00:00 2001
From: Liang He <windhl@126.com>
Date: Wed, 21 Sep 2022 21:32:45 +0800
Subject: [PATCH 81/83] net: marvell: Fix refcounting bugs in
 prestera_port_sfp_bind()

In prestera_port_sfp_bind(), there are two refcounting bugs:
(1) we should call of_node_get() before of_find_node_by_name() as
it will automaitcally decrease the refcount of 'from' argument;
(2) we should call of_node_put() for the break of the iteration
for_each_child_of_node() as it will automatically increase and
decrease the 'child'.

Fixes: 52323ef75414 ("net: marvell: prestera: add phylink support")
Signed-off-by: Liang He <windhl@126.com>
Reviewed-by: Yevhen Orlov <yevhen.orlov@plvision.eu>
Link: https://lore.kernel.org/r/20220921133245.4111672-1-windhl@126.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/prestera/prestera_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c
index ede3e53b9790..a895862b4821 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_main.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c
@@ -368,6 +368,7 @@ static int prestera_port_sfp_bind(struct prestera_port *port)
 	if (!sw->np)
 		return 0;
 
+	of_node_get(sw->np);
 	ports = of_find_node_by_name(sw->np, "ports");
 
 	for_each_child_of_node(ports, node) {
@@ -417,6 +418,7 @@ static int prestera_port_sfp_bind(struct prestera_port *port)
 	}
 
 out:
+	of_node_put(node);
 	of_node_put(ports);
 	return err;
 }

From c31f26c8f69f776759cbbdfb38e40ea91aa0dd65 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 21 Sep 2022 13:10:05 -0700
Subject: [PATCH 82/83] bnxt: prevent skb UAF after handing over to PTP worker

When reading the timestamp is required bnxt_tx_int() hands
over the ownership of the completed skb to the PTP worker.
The skb should not be used afterwards, as the worker may
run before the rest of our code and free the skb, leading
to a use-after-free.

Since dev_kfree_skb_any() accepts NULL make the loss of
ownership more obvious and set skb to NULL.

Fixes: 83bb623c968e ("bnxt_en: Transmit and retrieve packet timestamps")
Reviewed-by: Andy Gospodarek <gospo@broadcom.com>
Reviewed-by: Michael Chan <michael.chan@broadcom.com>
Link: https://lore.kernel.org/r/20220921201005.335390-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index f46eefb5a029..96da0ba3d507 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -659,7 +659,6 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
 
 	for (i = 0; i < nr_pkts; i++) {
 		struct bnxt_sw_tx_bd *tx_buf;
-		bool compl_deferred = false;
 		struct sk_buff *skb;
 		int j, last;
 
@@ -668,6 +667,8 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
 		skb = tx_buf->skb;
 		tx_buf->skb = NULL;
 
+		tx_bytes += skb->len;
+
 		if (tx_buf->is_push) {
 			tx_buf->is_push = 0;
 			goto next_tx_int;
@@ -688,8 +689,9 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
 		}
 		if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) {
 			if (bp->flags & BNXT_FLAG_CHIP_P5) {
+				/* PTP worker takes ownership of the skb */
 				if (!bnxt_get_tx_ts_p5(bp, skb))
-					compl_deferred = true;
+					skb = NULL;
 				else
 					atomic_inc(&bp->ptp_cfg->tx_avail);
 			}
@@ -698,9 +700,7 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
 next_tx_int:
 		cons = NEXT_TX(cons);
 
-		tx_bytes += skb->len;
-		if (!compl_deferred)
-			dev_kfree_skb_any(skb);
+		dev_kfree_skb_any(skb);
 	}
 
 	netdev_tx_completed_queue(txq, nr_pkts, tx_bytes);

From 83e4b196838d90799a8879e5054a3beecf9ed256 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Thu, 22 Sep 2022 10:44:53 +0800
Subject: [PATCH 83/83] selftests: forwarding: add shebang for sch_red.sh

RHEL/Fedora RPM build checks are stricter, and complain when executable
files don't have a shebang line, e.g.

*** WARNING: ./kselftests/net/forwarding/sch_red.sh is executable but has no shebang, removing executable bit

Fix it by adding shebang line.

Fixes: 6cf0291f9517 ("selftests: forwarding: Add a RED test for SW datapath")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://lore.kernel.org/r/20220922024453.437757-1-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/forwarding/sch_red.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh
index e714bae473fb..81f31179ac88 100755
--- a/tools/testing/selftests/net/forwarding/sch_red.sh
+++ b/tools/testing/selftests/net/forwarding/sch_red.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
 # This test sends one stream of traffic from H1 through a TBF shaper, to a RED