mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-17 18:36:00 +00:00
netfilter pull request 24-02-29
-----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEN9lkrMBJgcdVAPub1V2XiooUIOQFAmXfx4MACgkQ1V2XiooU IOSX+g//UHBfqYJASMMQJpWdMwWe7tB2m1LRzLYI+WUdUenK/MEylS7rNp/bwGkW 42eDeGA0eov7kYNOY0rLB7lQBdUHwCpNZkdetTWFV9eHcEKA8cQ6OqcD1G8i41qg sCvObS+K/hq3f7fX9bJ9RvS5RvYoeuS1trw4mezhHwPS+1sj80v4FdqDOFCUqiT3 65BfeoV65pVVteCRmJQxeeZ4Bepd4LRXW+VVyr3uXli/H87jqQOFxsOTqyXNEXIq jMYL0jnbYs0ARbNYXRYySLYQCWmbVXpfnt4JIBRP0S1e6Prby2hqUwJBeyNcXBAu CwBTjCEdLIV5G25EWTZWBYQdihct58s0GDRX078Sj/AozQJAWTxBEn0QLhKy2gvH 2uspA0S2z1PS69hUvHfgGjDiBKw41T2O6D/12NBxI1DOYDLsk7ApE5tKqynUnUIj pOLUiolFnJd4JKnGZ/CTATpGi8KX/iSWdX8OElCpGOvKQgZyU8IXrydjcHnJz7b4 AdsIfpjjZSdz2VU6ZmzLYJrWf6ukAchO5kYL2FIJt/eFEyGqDfwGL36FIO7YGcnu NPHtIF23Ldl+GIesc9UT08k+IOsfR9LMbUduJC6Dg63FDrEkFfOv+wXA1eURW3kS tq+eWs+QjlCeWG9FgW2NHj3+rGyjQbGOe+v1yTgl1x/BhXNV1cM= =2BRo -----END PGP SIGNATURE----- Merge tag 'nf-24-02-29' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf Pablo Neira Ayuso says: ==================== Netfilter fixes for net Patch #1 restores NFPROTO_INET with nft_compat, from Ignat Korchagin. Patch #2 fixes an issue with bridge netfilter and broadcast/multicast packets. There is a day 0 bug in br_netfilter when used with connection tracking. Conntrack assumes that an nf_conn structure that is not yet added to hash table ("unconfirmed"), is only visible by the current cpu that is processing the sk_buff. For bridge this isn't true, sk_buff can get cloned in between, and clones can be processed in parallel on different cpu. This patch disables NAT and conntrack helpers for multicast packets. Patch #3 adds a selftest to cover for the br_netfilter bug. 
netfilter pull request 24-02-29 * tag 'nf-24-02-29' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf: selftests: netfilter: add bridge conntrack + multicast test case netfilter: bridge: confirm multicast packets before passing them up the stack netfilter: nf_tables: allow NFPROTO_INET in nft_(match/target)_validate() ==================== Link: https://lore.kernel.org/r/20240229000135.8780-1-pablo@netfilter.org Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
commit
b611b776a9
@ -474,6 +474,7 @@ struct nf_ct_hook {
|
||||
const struct sk_buff *);
|
||||
void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
|
||||
void (*set_closing)(struct nf_conntrack *nfct);
|
||||
int (*confirm)(struct sk_buff *skb);
|
||||
};
|
||||
extern const struct nf_ct_hook __rcu *nf_ct_hook;
|
||||
|
||||
|
@ -43,6 +43,10 @@
|
||||
#include <linux/sysctl.h>
|
||||
#endif
|
||||
|
||||
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
|
||||
#include <net/netfilter/nf_conntrack_core.h>
|
||||
#endif
|
||||
|
||||
static unsigned int brnf_net_id __read_mostly;
|
||||
|
||||
struct brnf_net {
|
||||
@ -553,6 +557,90 @@ static unsigned int br_nf_pre_routing(void *priv,
|
||||
return NF_STOLEN;
|
||||
}
|
||||
|
||||
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
|
||||
/* conntracks' nf_confirm logic cannot handle cloned skbs referencing
|
||||
* the same nf_conn entry, which will happen for multicast (broadcast)
|
||||
* Frames on bridges.
|
||||
*
|
||||
* Example:
|
||||
* macvlan0
|
||||
* br0
|
||||
* ethX ethY
|
||||
*
|
||||
* ethX (or Y) receives multicast or broadcast packet containing
|
||||
* an IP packet, not yet in conntrack table.
|
||||
*
|
||||
* 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting.
|
||||
* -> skb->_nfct now references a unconfirmed entry
|
||||
* 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
|
||||
* interface.
|
||||
* 3. skb gets passed up the stack.
|
||||
* 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
|
||||
* and schedules a work queue to send them out on the lower devices.
|
||||
*
|
||||
* The clone skb->_nfct is not a copy, it is the same entry as the
|
||||
* original skb. The macvlan rx handler then returns RX_HANDLER_PASS.
|
||||
* 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
|
||||
*
|
||||
* The Macvlan broadcast worker and normal confirm path will race.
|
||||
*
|
||||
* This race will not happen if step 2 already confirmed a clone. In that
|
||||
* case later steps perform skb_clone() with skb->_nfct already confirmed (in
|
||||
* hash table). This works fine.
|
||||
*
|
||||
* But such confirmation won't happen when eb/ip/nftables rules dropped the
|
||||
* packets before they reached the nf_confirm step in postrouting.
|
||||
*
|
||||
* Work around this problem by explicit confirmation of the entry at
|
||||
* LOCAL_IN time, before upper layer has a chance to clone the unconfirmed
|
||||
* entry.
|
||||
*
|
||||
*/
|
||||
static unsigned int br_nf_local_in(void *priv,
|
||||
struct sk_buff *skb,
|
||||
const struct nf_hook_state *state)
|
||||
{
|
||||
struct nf_conntrack *nfct = skb_nfct(skb);
|
||||
const struct nf_ct_hook *ct_hook;
|
||||
struct nf_conn *ct;
|
||||
int ret;
|
||||
|
||||
if (!nfct || skb->pkt_type == PACKET_HOST)
|
||||
return NF_ACCEPT;
|
||||
|
||||
ct = container_of(nfct, struct nf_conn, ct_general);
|
||||
if (likely(nf_ct_is_confirmed(ct)))
|
||||
return NF_ACCEPT;
|
||||
|
||||
WARN_ON_ONCE(skb_shared(skb));
|
||||
WARN_ON_ONCE(refcount_read(&nfct->use) != 1);
|
||||
|
||||
/* We can't call nf_confirm here, it would create a dependency
|
||||
* on nf_conntrack module.
|
||||
*/
|
||||
ct_hook = rcu_dereference(nf_ct_hook);
|
||||
if (!ct_hook) {
|
||||
skb->_nfct = 0ul;
|
||||
nf_conntrack_put(nfct);
|
||||
return NF_ACCEPT;
|
||||
}
|
||||
|
||||
nf_bridge_pull_encap_header(skb);
|
||||
ret = ct_hook->confirm(skb);
|
||||
switch (ret & NF_VERDICT_MASK) {
|
||||
case NF_STOLEN:
|
||||
return NF_STOLEN;
|
||||
default:
|
||||
nf_bridge_push_encap_header(skb);
|
||||
break;
|
||||
}
|
||||
|
||||
ct = container_of(nfct, struct nf_conn, ct_general);
|
||||
WARN_ON_ONCE(!nf_ct_is_confirmed(ct));
|
||||
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* PF_BRIDGE/FORWARD *************************************************/
|
||||
static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
|
||||
@ -964,6 +1052,14 @@ static const struct nf_hook_ops br_nf_ops[] = {
|
||||
.hooknum = NF_BR_PRE_ROUTING,
|
||||
.priority = NF_BR_PRI_BRNF,
|
||||
},
|
||||
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
|
||||
{
|
||||
.hook = br_nf_local_in,
|
||||
.pf = NFPROTO_BRIDGE,
|
||||
.hooknum = NF_BR_LOCAL_IN,
|
||||
.priority = NF_BR_PRI_LAST,
|
||||
},
|
||||
#endif
|
||||
{
|
||||
.hook = br_nf_forward,
|
||||
.pf = NFPROTO_BRIDGE,
|
||||
|
@ -291,6 +291,30 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
|
||||
return nf_conntrack_in(skb, &bridge_state);
|
||||
}
|
||||
|
||||
static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb,
|
||||
const struct nf_hook_state *state)
|
||||
{
|
||||
enum ip_conntrack_info ctinfo;
|
||||
struct nf_conn *ct;
|
||||
|
||||
if (skb->pkt_type == PACKET_HOST)
|
||||
return NF_ACCEPT;
|
||||
|
||||
/* nf_conntrack_confirm() cannot handle concurrent clones,
|
||||
* this happens for broad/multicast frames with e.g. macvlan on top
|
||||
* of the bridge device.
|
||||
*/
|
||||
ct = nf_ct_get(skb, &ctinfo);
|
||||
if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct))
|
||||
return NF_ACCEPT;
|
||||
|
||||
/* let inet prerouting call conntrack again */
|
||||
skb->_nfct = 0;
|
||||
nf_ct_put(ct);
|
||||
|
||||
return NF_ACCEPT;
|
||||
}
|
||||
|
||||
static void nf_ct_bridge_frag_save(struct sk_buff *skb,
|
||||
struct nf_bridge_frag_data *data)
|
||||
{
|
||||
@ -385,6 +409,12 @@ static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = {
|
||||
.hooknum = NF_BR_PRE_ROUTING,
|
||||
.priority = NF_IP_PRI_CONNTRACK,
|
||||
},
|
||||
{
|
||||
.hook = nf_ct_bridge_in,
|
||||
.pf = NFPROTO_BRIDGE,
|
||||
.hooknum = NF_BR_LOCAL_IN,
|
||||
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
|
||||
},
|
||||
{
|
||||
.hook = nf_ct_bridge_post,
|
||||
.pf = NFPROTO_BRIDGE,
|
||||
|
@ -2756,6 +2756,7 @@ static const struct nf_ct_hook nf_conntrack_hook = {
|
||||
.get_tuple_skb = nf_conntrack_get_tuple_skb,
|
||||
.attach = nf_conntrack_attach,
|
||||
.set_closing = nf_conntrack_set_closing,
|
||||
.confirm = __nf_conntrack_confirm,
|
||||
};
|
||||
|
||||
void nf_conntrack_init_end(void)
|
||||
|
@ -359,10 +359,20 @@ static int nft_target_validate(const struct nft_ctx *ctx,
|
||||
|
||||
if (ctx->family != NFPROTO_IPV4 &&
|
||||
ctx->family != NFPROTO_IPV6 &&
|
||||
ctx->family != NFPROTO_INET &&
|
||||
ctx->family != NFPROTO_BRIDGE &&
|
||||
ctx->family != NFPROTO_ARP)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
ret = nft_chain_validate_hooks(ctx->chain,
|
||||
(1 << NF_INET_PRE_ROUTING) |
|
||||
(1 << NF_INET_LOCAL_IN) |
|
||||
(1 << NF_INET_FORWARD) |
|
||||
(1 << NF_INET_LOCAL_OUT) |
|
||||
(1 << NF_INET_POST_ROUTING));
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (nft_is_base_chain(ctx->chain)) {
|
||||
const struct nft_base_chain *basechain =
|
||||
nft_base_chain(ctx->chain);
|
||||
@ -610,10 +620,20 @@ static int nft_match_validate(const struct nft_ctx *ctx,
|
||||
|
||||
if (ctx->family != NFPROTO_IPV4 &&
|
||||
ctx->family != NFPROTO_IPV6 &&
|
||||
ctx->family != NFPROTO_INET &&
|
||||
ctx->family != NFPROTO_BRIDGE &&
|
||||
ctx->family != NFPROTO_ARP)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
ret = nft_chain_validate_hooks(ctx->chain,
|
||||
(1 << NF_INET_PRE_ROUTING) |
|
||||
(1 << NF_INET_LOCAL_IN) |
|
||||
(1 << NF_INET_FORWARD) |
|
||||
(1 << NF_INET_LOCAL_OUT) |
|
||||
(1 << NF_INET_POST_ROUTING));
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (nft_is_base_chain(ctx->chain)) {
|
||||
const struct nft_base_chain *basechain =
|
||||
nft_base_chain(ctx->chain);
|
||||
|
@ -7,7 +7,8 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
|
||||
nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
|
||||
ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \
|
||||
conntrack_vrf.sh nft_synproxy.sh rpath.sh nft_audit.sh \
|
||||
conntrack_sctp_collision.sh xt_string.sh
|
||||
conntrack_sctp_collision.sh xt_string.sh \
|
||||
bridge_netfilter.sh
|
||||
|
||||
HOSTPKG_CONFIG := pkg-config
|
||||
|
||||
|
188
tools/testing/selftests/netfilter/bridge_netfilter.sh
Normal file
188
tools/testing/selftests/netfilter/bridge_netfilter.sh
Normal file
@ -0,0 +1,188 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
#
|
||||
# Test bridge netfilter + conntrack, a combination that doesn't really work,
|
||||
# with multicast/broadcast packets racing for hash table insertion.
|
||||
|
||||
# eth0 br0 eth0
|
||||
# setup is: ns1 <->,ns0 <-> ns3
|
||||
# ns2 <-' `'-> ns4
|
||||
|
||||
# Kselftest framework requirement - SKIP code is 4.
|
||||
ksft_skip=4
|
||||
ret=0
|
||||
|
||||
sfx=$(mktemp -u "XXXXXXXX")
|
||||
ns0="ns0-$sfx"
|
||||
ns1="ns1-$sfx"
|
||||
ns2="ns2-$sfx"
|
||||
ns3="ns3-$sfx"
|
||||
ns4="ns4-$sfx"
|
||||
|
||||
# Skip (rather than fail) when a required tool is missing.  The ruleset
# further down is loaded with nft, so nft is checked here as well.
if ! ebtables -V > /dev/null 2>&1; then
	echo "SKIP: Could not run test without ebtables"
	exit $ksft_skip
fi

if ! nft --version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without nft tool"
	exit $ksft_skip
fi

if ! ip -Version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without ip tool"
	exit $ksft_skip
fi
|
||||
|
||||
for i in $(seq 0 4); do
|
||||
eval ip netns add \$ns$i
|
||||
done
|
||||
|
||||
# Delete the five network namespaces ($ns0 .. $ns4) used by this test.
cleanup() {
	for i in 0 1 2 3 4; do
		eval ip netns del \$ns$i
	done
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
# do_ping <netns> <dst-ip>
# Send a single ping from <netns> to <dst-ip>.  On failure, report the
# error, dump the ns0 ruleset and mark the whole test as failed (ret=1).
do_ping()
{
	fromns="$1"
	dstip="$2"

	if ! ip netns exec $fromns ping -c 1 -q $dstip > /dev/null; then
		echo "ERROR: ping from $fromns to $dstip"
		ip netns exec ${ns0} nft list ruleset
		ret=1
	fi
}
|
||||
|
||||
# bcast_ping <netns> <broadcast-ip>
# Send 1000 single broadcast pings from <netns> to <broadcast-ip>.
# A failed ping is reported together with the ns0 ruleset; it does not
# change the test result.
bcast_ping()
{
	fromns="$1"
	dstip="$2"

	for i in {1..1000}; do
		if ! ip netns exec $fromns ping -q -f -b -c 1 -q $dstip > /dev/null 2>&1; then
			echo "ERROR: ping -b from $fromns to $dstip"
			ip netns exec ${ns0} nft list ruleset
		fi
	done
}
|
||||
|
||||
ip link add veth1 netns ${ns0} type veth peer name eth0 netns ${ns1}
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "SKIP: Can't create veth device"
|
||||
exit $ksft_skip
|
||||
fi
|
||||
|
||||
ip link add veth2 netns ${ns0} type veth peer name eth0 netns $ns2
|
||||
ip link add veth3 netns ${ns0} type veth peer name eth0 netns $ns3
|
||||
ip link add veth4 netns ${ns0} type veth peer name eth0 netns $ns4
|
||||
|
||||
ip -net ${ns0} link set lo up
|
||||
|
||||
for i in $(seq 1 4); do
|
||||
ip -net ${ns0} link set veth$i up
|
||||
done
|
||||
|
||||
ip -net ${ns0} link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "SKIP: Can't create bridge br0"
|
||||
exit $ksft_skip
|
||||
fi
|
||||
|
||||
# make veth1,2,3 part of bridge.
|
||||
for i in $(seq 1 3); do
|
||||
ip -net ${ns0} link set veth$i master br0
|
||||
done
|
||||
|
||||
# add a macvlan on top of the bridge.
|
||||
MACVLAN_ADDR=ba:f3:13:37:42:23
|
||||
ip -net ${ns0} link add link br0 name macvlan0 type macvlan mode private
|
||||
ip -net ${ns0} link set macvlan0 address ${MACVLAN_ADDR}
|
||||
ip -net ${ns0} link set macvlan0 up
|
||||
ip -net ${ns0} addr add 10.23.0.1/24 dev macvlan0
|
||||
|
||||
# add a macvlan on top of veth4.
|
||||
MACVLAN_ADDR=ba:f3:13:37:42:24
|
||||
ip -net ${ns0} link add link veth4 name macvlan4 type macvlan mode vepa
|
||||
ip -net ${ns0} link set macvlan4 address ${MACVLAN_ADDR}
|
||||
ip -net ${ns0} link set macvlan4 up
|
||||
|
||||
# make the macvlan part of the bridge.
|
||||
# veth4 is not a bridge port, only the macvlan on top of it.
|
||||
ip -net ${ns0} link set macvlan4 master br0
|
||||
|
||||
ip -net ${ns0} link set br0 up
|
||||
ip -net ${ns0} addr add 10.0.0.1/24 dev br0
|
||||
# The test needs bridge netfilter.  Skip right away when it is not
# available instead of running the remaining steps: a later do_ping
# failure would overwrite the skip code with a failure.
# The EXIT trap still removes the namespaces.
if ! ip netns exec ${ns0} sysctl -q net.bridge.bridge-nf-call-iptables=1; then
	echo "SKIP: bridge netfilter not available"
	exit $ksft_skip
fi
|
||||
|
||||
# for testing, so namespaces will reply to ping -b probes.
|
||||
ip netns exec ${ns0} sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0
|
||||
|
||||
# enable conntrack in ns0 and drop broadcast packets in forward to
|
||||
# avoid them from getting confirmed in the postrouting hook before
|
||||
# the cloned skb is passed up the stack.
|
||||
ip netns exec ${ns0} nft -f - <<EOF
|
||||
table ip filter {
|
||||
chain input {
|
||||
type filter hook input priority 1; policy accept
|
||||
iifname br0 counter
|
||||
ct state new accept
|
||||
}
|
||||
}
|
||||
|
||||
table bridge filter {
|
||||
chain forward {
|
||||
type filter hook forward priority 0; policy accept
|
||||
meta pkttype broadcast ip protocol icmp counter drop
|
||||
}
|
||||
}
|
||||
EOF
|
||||
|
||||
# place 1, 2 & 3 in same subnet, connected via ns0:br0.
|
||||
# ns4 is placed in same subnet as well, but it's not
|
||||
# part of the bridge: the corresponding veth4 is not
|
||||
# part of the bridge, only its macvlan interface.
|
||||
for i in $(seq 1 4); do
|
||||
eval ip -net \$ns$i link set lo up
|
||||
eval ip -net \$ns$i link set eth0 up
|
||||
done
|
||||
for i in $(seq 1 2); do
|
||||
eval ip -net \$ns$i addr add 10.0.0.1$i/24 dev eth0
|
||||
done
|
||||
|
||||
ip -net ${ns3} addr add 10.23.0.13/24 dev eth0
|
||||
ip -net ${ns4} addr add 10.23.0.14/24 dev eth0
|
||||
|
||||
# test basic connectivity
|
||||
do_ping ${ns1} 10.0.0.12
|
||||
do_ping ${ns3} 10.23.0.1
|
||||
do_ping ${ns4} 10.23.0.1
|
||||
|
||||
if [ $ret -eq 0 ];then
|
||||
echo "PASS: netns connectivity: ns1 can reach ns2, ns3 and ns4 can reach ns0"
|
||||
fi
|
||||
|
||||
bcast_ping ${ns1} 10.0.0.255
|
||||
|
||||
# This should deliver broadcast to macvlan0, which is on top of ns0:br0.
|
||||
bcast_ping ${ns3} 10.23.0.255
|
||||
|
||||
# same, this time via veth4:macvlan4.
|
||||
bcast_ping ${ns4} 10.23.0.255
|
||||
|
||||
read t < /proc/sys/kernel/tainted
|
||||
|
||||
if [ $t -eq 0 ];then
|
||||
echo PASS: kernel not tainted
|
||||
else
|
||||
echo ERROR: kernel is tainted
|
||||
ret=1
|
||||
fi
|
||||
|
||||
exit $ret
|
Loading…
x
Reference in New Issue
Block a user