From dc1c9fd4a8bbe1e06add9053010b652449bfe411 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 May 2023 14:20:21 +0200 Subject: [PATCH 1/7] netfilter: nf_tables: always release netdev hooks from notifier This reverts "netfilter: nf_tables: skip netdev events generated on netns removal". The problem is that when a veth device is released, the veth release callback will also queue the peer netns device for removal. Its possible that the peer netns is also slated for removal. In this case, the device memory is already released before the pre_exit hook of the peer netns runs: BUG: KASAN: slab-use-after-free in nf_hook_entry_head+0x1b8/0x1d0 Read of size 8 at addr ffff88812c0124f0 by task kworker/u8:1/45 Workqueue: netns cleanup_net Call Trace: nf_hook_entry_head+0x1b8/0x1d0 __nf_unregister_net_hook+0x76/0x510 nft_netdev_unregister_hooks+0xa0/0x220 __nft_release_hook+0x184/0x490 nf_tables_pre_exit_net+0x12f/0x1b0 .. Order is: 1. First netns is released, veth_dellink() queues peer netns device for removal 2. peer netns is queued for removal 3. peer netns device is released, unreg event is triggered 4. unreg event is ignored because netns is going down 5. pre_exit hook calls nft_netdev_unregister_hooks but device memory might be free'd already. Fixes: 68a3765c659f ("netfilter: nf_tables: skip netdev events generated on netns removal") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_chain_filter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index c3563f0be269..680fe557686e 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -344,6 +344,12 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, return; } + /* UNREGISTER events are also happening on netns exit. + * + * Although nf_tables core releases all tables/chains, only this event + * handler provides guarantee that hook->ops.dev is still accessible, + * so we cannot skip exiting net namespaces. + */ __nft_release_basechain(ctx); } @@ -362,9 +368,6 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; - if (!check_net(ctx.net)) - return NOTIFY_DONE; - nft_net = nft_pernet(ctx.net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { From e72eeab542dbf4f544e389e64fa13b82a1b6d003 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 May 2023 14:55:02 +0200 Subject: [PATCH 2/7] netfilter: conntrack: fix possible bug_on with enable_hooks=1 I received a bug report (no reproducer so far) where we trip over 712 rcu_read_lock(); 713 ct_hook = rcu_dereference(nf_ct_hook); 714 BUG_ON(ct_hook == NULL); // here In nf_conntrack_destroy(). First turn this BUG_ON into a WARN. I think it was triggered via enable_hooks=1 flag. When this flag is turned on, the conntrack hooks are registered before nf_ct_hook pointer gets assigned. This opens a short window where packets enter the conntrack machinery, can have skb->_nfct set up and a subsequent kfree_skb might occur before nf_ct_hook is set. Call nf_conntrack_init_end() to set nf_ct_hook before we register the pernet ops. Fixes: ba3fbe663635 ("netfilter: nf_conntrack: provide modparam to always register conntrack hooks") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 6 ++++-- net/netfilter/nf_conntrack_standalone.c | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index f0783e42108b..5f76ae86a656 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -711,9 +711,11 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); - BUG_ON(ct_hook == NULL); - ct_hook->destroy(nfct); + if (ct_hook) + ct_hook->destroy(nfct); rcu_read_unlock(); + + WARN_ON(!ct_hook); } EXPORT_SYMBOL(nf_conntrack_destroy); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 57f6724c99a7..169e16fc2bce 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1218,11 +1218,12 @@ static int __init nf_conntrack_standalone_init(void) nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif + nf_conntrack_init_end(); + ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out_pernet; - nf_conntrack_init_end(); return 0; out_pernet: From 0a11073e8e3362d715255d28d294aa7350c1c01f Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Thu, 4 May 2023 11:48:11 +0300 Subject: [PATCH 3/7] selftests: nft_flowtable.sh: use /proc for pid checking Some ps commands (e.g. busybox derived) have no -p option. Use /proc for pid existence check. Signed-off-by: Boris Sukholitko Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_flowtable.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 7060bae04ec8..4d8bc51b7a7b 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -288,11 +288,11 @@ test_tcp_forwarding_ip() sleep 3 - if ps -p $lpid > /dev/null;then + if test -d /proc/"$lpid"/; then kill $lpid fi - if ps -p $cpid > /dev/null;then + if test -d /proc/"$cpid"/; then kill $cpid fi From 0749d670d758099970c52ef70f4bbcfa5a15b3d3 Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Thu, 4 May 2023 11:48:12 +0300 Subject: [PATCH 4/7] selftests: nft_flowtable.sh: no need for ps -x option Some ps commands (e.g. busybox derived) have no -x option. For the purposes of hash calculation of the list of processes this option is inessential. Signed-off-by: Boris Sukholitko Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_flowtable.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 4d8bc51b7a7b..3cf20e9bd3a6 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -489,8 +489,8 @@ ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 ip -net $nsr1 addr add dead:1::1/64 dev veth0 ip -net $nsr1 link set up dev veth0 -KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1) -KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1) +KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) +KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1) SPI1=$RANDOM SPI2=$RANDOM From 1114803c2da974526ecbc0bd81e6c637bf257de5 Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Thu, 4 May 2023 11:48:13 +0300 Subject: [PATCH 5/7] selftests: nft_flowtable.sh: wait for specific nc pids Doing wait with no parameters may interfere with some of the tests having their own background processes. Although no such test is currently present, the cleanup is useful to rely on the nft_flowtable.sh for local development (e.g. running background tcpdump command during the tests). Signed-off-by: Boris Sukholitko Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_flowtable.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 3cf20e9bd3a6..92bc308bf168 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -296,7 +296,8 @@ test_tcp_forwarding_ip() kill $cpid fi - wait + wait $lpid + wait $cpid if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then lret=1 From 90ab51226d52ad61e5ff175b572e08046b1b0a99 Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Thu, 4 May 2023 11:48:14 +0300 Subject: [PATCH 6/7] selftests: nft_flowtable.sh: monitor result file sizes When running nft_flowtable.sh in VM on a busy server we've found that the time of the netcat file transfers vary wildly. Therefore replace hardcoded 3 second sleep with the loop checking for a change in the file sizes. Once no change in detected we test the results. Nice side effect is that we shave 1 second sleep in the fast case (hard-coded 3 second sleep vs two 1 second sleeps). Acked-by: Florian Westphal Signed-off-by: Boris Sukholitko Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_flowtable.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 92bc308bf168..51f986f19fee 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -286,7 +286,15 @@ test_tcp_forwarding_ip() ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" & cpid=$! - sleep 3 + sleep 1 + + prev="$(ls -l $ns1out $ns2out)" + sleep 1 + + while [[ "$prev" != "$(ls -l $ns1out $ns2out)" ]]; do + sleep 1; + prev="$(ls -l $ns1out $ns2out)" + done if test -d /proc/"$lpid"/; then kill $lpid From 3acf8f6c14d0e42b889738d63b6d9cb63348fc94 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 May 2023 16:47:24 +0200 Subject: [PATCH 7/7] selftests: nft_flowtable.sh: check ingress/egress chain too Make sure flowtable interacts correctly with ingress and egress chains, i.e. those get handled before and after flow table respectively. Adds three more tests: 1. repeat flowtable test, but with 'ip dscp set cs3' done in inet forward chain. Expect that some packets have been mangled (before flowtable offload became effective) while some pass without mangling (after offload succeeds). 2. repeat flowtable test, but with 'ip dscp set cs3' done in veth0:ingress. Expect that all packets pass with cs3 dscp field. 3. same as 2, but use veth1:egress. Expect the same outcome. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- .../selftests/netfilter/nft_flowtable.sh | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 51f986f19fee..a32f490f7539 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -188,6 +188,26 @@ if [ $? -ne 0 ]; then exit $ksft_skip fi +ip netns exec $ns2 nft -f - < /dev/null; then echo "ERROR: $ns1 cannot reach ns2" 1>&2 @@ -255,6 +275,60 @@ check_counters() fi } +check_dscp() +{ + local what=$1 + local ok=1 + + local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp3 | grep packets) + + local pc4=${counter%*bytes*} + local pc4=${pc4#*packets} + + local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp0 | grep packets) + local pc4z=${counter%*bytes*} + local pc4z=${pc4z#*packets} + + case "$what" in + "dscp_none") + if [ $pc4 -gt 0 ] || [ $pc4z -eq 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + "dscp_fwd") + if [ $pc4 -eq 0 ] || [ $pc4z -eq 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + "dscp_ingress") + if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + "dscp_egress") + if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + *) + echo "FAIL: Unknown DSCP check" 1>&2 + ret=1 + ok=0 + esac + + if [ $ok -eq 1 ] ;then + echo "PASS: $what: dscp packet counters match" + fi +} + check_transfer() { in=$1 @@ -325,6 +399,51 @@ test_tcp_forwarding() return $? } +test_tcp_forwarding_set_dscp() +{ + check_dscp "dscp_none" + +ip netns exec $nsr1 nft -f - <&2 + exit 0 +fi + if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2 ip netns exec $nsr1 nft list ruleset