net: ipv4: Cache pmtu for all packet paths if multipath enabled

Check the number of paths with fib_info_num_path()
and call update_or_create_fnhe() for every path.
The problem is that the pmtu is cached only for the oif
that received the icmp "need to frag" message;
other oifs will still try to use the "default" iface mtu.

An example topology showing the problem:

                    |  host1
                +---------+
                |  dummy0 | 10.179.20.18/32  mtu9000
                +---------+
        +-----------+----------------+
    +---------+                     +---------+
    | ens17f0 |  10.179.2.141/31    | ens17f1 |  10.179.2.13/31
    +---------+                     +---------+
        |    (all here have mtu 9000)    |
    +------+                         +------+
    | ro1  |  10.179.2.140/31        | ro2  |  10.179.2.12/31
    +------+                         +------+
        |                                |
---------+------------+-------------------+------
                        |
                    +-----+
                    | ro3 | 10.10.10.10  mtu1500
                    +-----+
                        |
    ========================================
                some networks
    ========================================
                        |
                    +-----+
                    | eth0| 10.10.30.30  mtu9000
                    +-----+
                        |  host2

host1 has multipath enabled and
sysctl net.ipv4.fib_multipath_hash_policy = 1:

default proto static src 10.179.20.18
        nexthop via 10.179.2.12 dev ens17f1 weight 1
        nexthop via 10.179.2.140 dev ens17f0 weight 1

When host1 tries to do pmtud from 10.179.20.18/32 to host2,
host1 receives on the ens17f1 iface an icmp packet from ro3 reporting that ro3 mtu=1500.
Host1 then caches it in the nexthop exceptions cache.

The problem is that it is cached only for the iface that received the icmp,
and there is no way that ro3 will send an icmp msg to host1 via another path.

Host1 now has these routes to host2:

ip r g 10.10.30.30 sport 30000 dport 443
10.10.30.30 via 10.179.2.12 dev ens17f1 src 10.179.20.18 uid 0
    cache expires 521sec mtu 1500

ip r g 10.10.30.30 sport 30033 dport 443
10.10.30.30 via 10.179.2.140 dev ens17f0 src 10.179.20.18 uid 0
    cache

So when host1 tries again to reach host2 with mtu>1500,
if the packet flow is lucky enough to be hashed with oif=ens17f1 it's ok,
if oif=ens17f0 it blackholes and still gets icmp msgs from ro3 on ens17f1,
until the lucky day when ro3 sends it through another flow to ens17f0.

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20241108093427.317942-1-deliran@verdict.gg
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Vladimir Vdovin 2024-11-08 09:34:24 +00:00 committed by Jakub Kicinski
parent 43271bb5bf
commit 7d3f3b4367
2 changed files with 108 additions and 17 deletions

View File

@ -1027,6 +1027,19 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
struct fib_nh_common *nhc; struct fib_nh_common *nhc;
fib_select_path(net, &res, fl4, NULL); fib_select_path(net, &res, fl4, NULL);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (fib_info_num_path(res.fi) > 1) {
int nhsel;
for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
nhc = fib_info_nhc(res.fi, nhsel);
update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
jiffies + net->ipv4.ip_rt_mtu_expires);
}
rcu_read_unlock();
return;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
nhc = FIB_RES_NHC(res); nhc = FIB_RES_NHC(res);
update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
jiffies + net->ipv4.ip_rt_mtu_expires); jiffies + net->ipv4.ip_rt_mtu_expires);

View File

@ -197,6 +197,12 @@
# #
# - pmtu_ipv6_route_change # - pmtu_ipv6_route_change
# Same as above but with IPv6 # Same as above but with IPv6
#
# - pmtu_ipv4_mp_exceptions
# Use the same topology as in pmtu_ipv4, but add routable addresses
# on host A and B on lo reachable via both routers. Host A and B
# addresses have multipath routes to each other, b_r1 mtu = 1500.
# Check that PMTU exceptions are created for both paths.
source lib.sh source lib.sh
source net_helper.sh source net_helper.sh
@ -266,7 +272,8 @@ tests="
list_flush_ipv4_exception ipv4: list and flush cached exceptions 1 list_flush_ipv4_exception ipv4: list and flush cached exceptions 1
list_flush_ipv6_exception ipv6: list and flush cached exceptions 1 list_flush_ipv6_exception ipv6: list and flush cached exceptions 1
pmtu_ipv4_route_change ipv4: PMTU exception w/route replace 1 pmtu_ipv4_route_change ipv4: PMTU exception w/route replace 1
pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1" pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1
pmtu_ipv4_mp_exceptions ipv4: PMTU multipath nh exceptions 1"
# Addressing and routing for tests with routers: four network segments, with # Addressing and routing for tests with routers: four network segments, with
# index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an # index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
@ -343,6 +350,9 @@ tunnel6_a_addr="fd00:2::a"
tunnel6_b_addr="fd00:2::b" tunnel6_b_addr="fd00:2::b"
tunnel6_mask="64" tunnel6_mask="64"
host4_a_addr="192.168.99.99"
host4_b_addr="192.168.88.88"
dummy6_0_prefix="fc00:1000::" dummy6_0_prefix="fc00:1000::"
dummy6_1_prefix="fc00:1001::" dummy6_1_prefix="fc00:1001::"
dummy6_mask="64" dummy6_mask="64"
@ -984,6 +994,52 @@ setup_ovs_bridge() {
run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2 run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
} }
# Configure multipath routes between host A and host B using nexthop
# objects ("ip nexthop"). On each host: add the host address to lo, create
# one nexthop per router-facing veth (ids 401 and 402), group them as 403,
# and route the peer's address through that group.
setup_multipath_new() {
	# Host A -> host4_b_addr
	run_cmd ${ns_a} ip addr add "${host4_a_addr}" dev lo
	run_cmd ${ns_a} ip nexthop add id 401 via "${prefix4}.${a_r1}.2" dev veth_A-R1
	run_cmd ${ns_a} ip nexthop add id 402 via "${prefix4}.${a_r2}.2" dev veth_A-R2
	run_cmd ${ns_a} ip nexthop add id 403 group 401/402
	run_cmd ${ns_a} ip route add "${host4_b_addr}" src "${host4_a_addr}" nhid 403

	# Host B -> host4_a_addr (mirror of the host A configuration)
	run_cmd ${ns_b} ip addr add "${host4_b_addr}" dev lo
	run_cmd ${ns_b} ip nexthop add id 401 via "${prefix4}.${b_r1}.2" dev veth_B-R1
	run_cmd ${ns_b} ip nexthop add id 402 via "${prefix4}.${b_r2}.2" dev veth_B-R2
	run_cmd ${ns_b} ip nexthop add id 403 group 401/402
	run_cmd ${ns_b} ip route add "${host4_a_addr}" src "${host4_b_addr}" nhid 403
}
# Configure multipath routes between host A and host B using inline
# "nexthop via ... weight 1" entries on a single route (no nexthop objects).
# On each host: add the host address to lo, then add one route to the
# peer's address with two equally weighted nexthops.
setup_multipath_old() {
	# Host A -> host4_b_addr
	run_cmd ${ns_a} ip addr add "${host4_a_addr}" dev lo
	run_cmd ${ns_a} ip route add "${host4_b_addr}" src "${host4_a_addr}" \
		nexthop via "${prefix4}.${a_r1}.2" weight 1 \
		nexthop via "${prefix4}.${a_r2}.2" weight 1

	# Host B -> host4_a_addr (mirror of the host A configuration)
	run_cmd ${ns_b} ip addr add "${host4_b_addr}" dev lo
	run_cmd ${ns_b} ip route add "${host4_a_addr}" src "${host4_b_addr}" \
		nexthop via "${prefix4}.${b_r1}.2" weight 1 \
		nexthop via "${prefix4}.${b_r2}.2" weight 1
}
# Set up multipath routing between host4_a_addr and host4_b_addr.
# The host side is configured with nexthop objects when USE_NH is "yes"
# and with inline nexthops otherwise; both routers then get a route to
# each host address.
setup_multipath() {
	case "${USE_NH}" in
	yes)
		setup_multipath_new
		;;
	*)
		setup_multipath_old
		;;
	esac

	# Routers: route each host address via the ".1" address on the
	# matching segment (presumably the host end of that veth pair;
	# verify against the routing setup).
	run_cmd ${ns_r1} ip route add "${host4_a_addr}" via "${prefix4}.${a_r1}.1"
	run_cmd ${ns_r2} ip route add "${host4_a_addr}" via "${prefix4}.${a_r2}.1"

	run_cmd ${ns_r1} ip route add "${host4_b_addr}" via "${prefix4}.${b_r1}.1"
	run_cmd ${ns_r2} ip route add "${host4_b_addr}" via "${prefix4}.${b_r2}.1"
}
setup() { setup() {
[ "$(id -u)" -ne 0 ] && echo " need to run as root" && return $ksft_skip [ "$(id -u)" -ne 0 ] && echo " need to run as root" && return $ksft_skip
@ -1076,23 +1132,15 @@ link_get_mtu() {
} }
route_get_dst_exception() { route_get_dst_exception() {
ns_cmd="${1}" ns_cmd="${1}"; shift
dst="${2}"
dsfield="${3}"
if [ -z "${dsfield}" ]; then ${ns_cmd} ip route get "$@"
dsfield=0
fi
${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
} }
route_get_dst_pmtu_from_exception() { route_get_dst_pmtu_from_exception() {
ns_cmd="${1}" ns_cmd="${1}"; shift
dst="${2}"
dsfield="${3}"
mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")" mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")"
} }
check_pmtu_value() { check_pmtu_value() {
@ -1235,10 +1283,10 @@ test_pmtu_ipv4_dscp_icmp_exception() {
run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}" run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
# Check that exceptions have been created with the correct PMTU # Check that exceptions have been created with the correct PMTU
pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")" pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1 check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")" pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1 check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
} }
@ -1285,9 +1333,9 @@ test_pmtu_ipv4_dscp_udp_exception() {
UDP:"${dst2}":50000,tos="${dsfield}" UDP:"${dst2}":50000,tos="${dsfield}"
# Check that exceptions have been created with the correct PMTU # Check that exceptions have been created with the correct PMTU
pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")" pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1 check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")" pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1 check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
} }
@ -2329,6 +2377,36 @@ test_pmtu_ipv6_route_change() {
test_pmtu_ipvX_route_change 6 test_pmtu_ipvX_route_change 6
} }
# Check that a PMTU exception is created on every nexthop of a multipath
# route. Host A reaches host4_b_addr over two paths (via R1 and via R2);
# the A-side links use MTU 2000 and the B-side links use MTU 1500. After a
# single oversized ping from host A, the route lookup for host4_b_addr must
# report PMTU 1500 for both oif veth_A-R1 and oif veth_A-R2.
# Returns $ksft_skip if setup fails, 1 if either PMTU check fails.
test_pmtu_ipv4_mp_exceptions() {
	setup namespaces routing multipath || return $ksft_skip

	trace \
		"${ns_a}"  veth_A-R1 "${ns_r1}" veth_R1-A \
		"${ns_r1}" veth_R1-B "${ns_b}"  veth_B-R1 \
		"${ns_a}"  veth_A-R2 "${ns_r2}" veth_R2-A \
		"${ns_r2}" veth_R2-B "${ns_b}"  veth_B-R2

	# Path through R1: 2000 on the A side, 1500 on the B side
	mtu "${ns_a}"  veth_A-R1 2000
	mtu "${ns_r1}" veth_R1-A 2000
	mtu "${ns_r1}" veth_R1-B 1500
	mtu "${ns_b}"  veth_B-R1 1500

	# Path through R2: same MTU layout as the R1 path
	mtu "${ns_a}"  veth_A-R2 2000
	mtu "${ns_r2}" veth_R2-A 2000
	mtu "${ns_r2}" veth_R2-B 1500
	mtu "${ns_b}"  veth_B-R2 1500

	# One ping with an 1800-byte payload: larger than the 1500 MTU on the
	# B side, smaller than the 2000 MTU on the A side
	run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${host4_b_addr}"

	# Read the cached PMTU for each output interface first, then verify
	# that both nexthops carry the 1500 exception
	pmtu_a_R1="$(route_get_dst_pmtu_from_exception \
		"${ns_a}" "${host4_b_addr}" oif veth_A-R1)"
	pmtu_a_R2="$(route_get_dst_pmtu_from_exception \
		"${ns_a}" "${host4_b_addr}" oif veth_A-R2)"

	check_pmtu_value "1500" "${pmtu_a_R1}" "exceeding MTU (veth_A-R1)" || return 1
	check_pmtu_value "1500" "${pmtu_a_R2}" "exceeding MTU (veth_A-R2)" || return 1
}
usage() { usage() {
echo echo
echo "$0 [OPTIONS] [TEST]..." echo "$0 [OPTIONS] [TEST]..."