// SPDX-License-Identifier: GPL-2.0-only
/*
 * VXLAN: Virtual eXtensible Local Area Network
 *
 * Copyright (c) 2012-2013 Vyatta Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
#include <linux/ethtool.h>
#include <net/arp.h>
#include <net/ndisc.h>
#include <net/gro.h>
#include <net/ipv6_stubs.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/tun_proto.h>
#include <net/vxlan.h>
#include <net/nexthop.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
#include <net/ip6_checksum.h>
#endif

#include "vxlan_private.h"

#define VXLAN_VERSION	"0.1"

#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
 * for compatibility with early adopters.
 */
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
MODULE_PARM_DESC(udp_port, "Destination UDP port");

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

unsigned int vxlan_net_id;

const u8 all_zeros_mac[ETH_ALEN + 2];
static struct rtnl_link_ops vxlan_link_ops;

static int vxlan_sock_add(struct vxlan_dev *vxlan);

static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);

/* salt for hash table */
static u32 vxlan_salt __read_mostly;
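
/* True if metadata should be collected for packets on this socket, either
 * via the socket's VXLAN_F_COLLECT_METADATA flag or because metadata
 * collection was enabled globally.
 */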
static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
	return vs->flags & VXLAN_F_COLLECT_METADATA ||
	       ip_tunnel_collect_metadata();
}

/* Find VXLAN socket based on network namespace, address family, UDP port,
 * enabled unshareable flags and socket device binding (see l3mdev with
 * non-default VRF).
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
					  __be16 port, u32 flags, int ifindex)
{
	struct vxlan_sock *vs;

	flags &= VXLAN_F_RCV_FLAGS;

	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
		if (inet_sk(vs->sock->sk)->inet_sport == port &&
		    vxlan_get_sk_family(vs) == family &&
		    vs->flags == flags &&
		    vs->sock->sk->sk_bound_dev_if == ifindex)
			return vs;
	}
	return NULL;
}
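
/* Map an incoming VNI to the vxlan_dev registered on this socket. Flow
 * based (collect-metadata) devices without VNI filtering receive all VNIs
 * via VNI 0; otherwise the VNI must match the device's default remote VNI
 * or pass its VNI filter.
 */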
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs,
					   int ifindex, __be32 vni,
					   struct vxlan_vni_node **vninode)
{
	struct vxlan_vni_node *vnode;
	struct vxlan_dev_node *node;

	/* For flow based devices, map all packets to VNI 0 */
	if (vs->flags & VXLAN_F_COLLECT_METADATA &&
	    !(vs->flags & VXLAN_F_VNIFILTER))
		vni = 0;

	hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
		if (!node->vxlan)
			continue;
		vnode = NULL;
		if (node->vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
			vnode = vxlan_vnifilter_lookup(node->vxlan, vni);
			if (!vnode)
				continue;
		} else if (node->vxlan->default_dst.remote_vni != vni) {
			continue;
		}

		if (IS_ENABLED(CONFIG_IPV6)) {
			const struct vxlan_config *cfg = &node->vxlan->cfg;

			if ((cfg->flags & VXLAN_F_IPV6_LINKLOCAL) &&
			    cfg->remote_ifindex != ifindex)
				continue;
		}

		if (vninode)
			*vninode = vnode;
		return node->vxlan;
	}

	return NULL;
}

/* Look up VNI in a per net namespace table */
static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
					__be32 vni, sa_family_t family,
					__be16 port, u32 flags)
{
	struct vxlan_sock *vs;

	vs = vxlan_find_sock(net, family, port, flags, ifindex);
	if (!vs)
		return NULL;

	return vxlan_vs_find_vni(vs, ifindex, vni, NULL);
}

/* Fill in neighbour message in skbuff. */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	unsigned long now = jiffies;
	struct nda_cacheinfo ci;
	bool send_ip, send_eth;
	struct nlmsghdr *nlh;
	struct nexthop *nh;
	struct ndmsg *ndm;
	int nh_family;
	u32 nh_id;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	send_eth = send_ip = true;

	rcu_read_lock();
	nh = rcu_dereference(fdb->nh);
	if (nh) {
		nh_family = nexthop_get_family(nh);
		nh_id = nh->id;
	}
	rcu_read_unlock();

	if (type == RTM_GETNEIGH) {
		if (rdst) {
			send_ip = !vxlan_addr_any(&rdst->remote_ip);
			ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
		} else if (nh) {
			ndm->ndm_family = nh_family;
		}
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
	} else
		ndm->ndm_family = AF_BRIDGE;
	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	if (rdst && rdst->offloaded)
		ndm->ndm_flags |= NTF_OFFLOADED;
	ndm->ndm_type = RTN_UNICAST;

	if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
	    nla_put_s32(skb, NDA_LINK_NETNSID,
			peernet2id(dev_net(vxlan->dev), vxlan->net)))
		goto nla_put_failure;

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;
	if (nh) {
		if (nla_put_u32(skb, NDA_NH_ID, nh_id))
			goto nla_put_failure;
	} else if (rdst) {
		if (send_ip && vxlan_nla_put_addr(skb, NDA_DST,
						  &rdst->remote_ip))
			goto nla_put_failure;

		if (rdst->remote_port &&
		    rdst->remote_port != vxlan->cfg.dst_port &&
		    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
			goto nla_put_failure;
		if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
		    nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
			goto nla_put_failure;
		if (rdst->remote_ifindex &&
		    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
			goto nla_put_failure;
	}

	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
	    nla_put_u32(skb, NDA_SRC_VNI,
			be32_to_cpu(fdb->vni)))
		goto nla_put_failure;

	ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_confirmed = 0;
	ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);
	ci.ndm_refcnt = 0;

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
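
/* Worst-case size of an FDB netlink notification, used to size the skb in
 * __vxlan_fdb_notify().
 */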
static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
		+ nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}
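
/* Build and broadcast a neighbour (RTM_*NEIGH) notification for one FDB
 * entry to the RTNLGRP_NEIGH group.
 */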
static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       struct vxlan_rdst *rd, int type)
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}

static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan,
					      const struct vxlan_fdb *fdb,
					      const struct vxlan_rdst *rd,
					      struct netlink_ext_ack *extack,
					      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	fdb_info->info.dev = vxlan->dev;
	fdb_info->info.extack = extack;
	fdb_info->remote_ip = rd->remote_ip;
	fdb_info->remote_port = rd->remote_port;
	fdb_info->remote_vni = rd->remote_vni;
	fdb_info->remote_ifindex = rd->remote_ifindex;
	memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN);
	fdb_info->vni = fdb->vni;
	fdb_info->offloaded = rd->offloaded;
	fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER;
}

static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
					      struct vxlan_fdb *fdb,
					      struct vxlan_rdst *rd,
					      bool adding,
					      struct netlink_ext_ack *extack)
{
	struct switchdev_notifier_vxlan_fdb_info info;
	enum switchdev_notifier_type notifier_type;
	int ret;

	if (WARN_ON(!rd))
		return 0;

	notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE
			       : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE;
	vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info);
	ret = call_switchdev_notifiers(notifier_type, vxlan->dev,
				       &info.info, extack);
	return notifier_to_errno(ret);
}

static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			    struct vxlan_rdst *rd, int type, bool swdev_notify,
			    struct netlink_ext_ack *extack)
{
	int err;

	if (swdev_notify && rd) {
		switch (type) {
		case RTM_NEWNEIGH:
			err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
								 true, extack);
			if (err)
				return err;
			break;
		case RTM_DELNEIGH:
			vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
							   false, extack);
			break;
		}
	}

	__vxlan_fdb_notify(vxlan, fdb, rd, type);
	return 0;
}
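
/* Notify userspace (RTM_GETNEIGH) of a destination that missed in the
 * forwarding table, so an external control plane can resolve it.
 */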
static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = {
		.remote_ip = *ipa, /* goes to NDA_DST */
		.remote_vni = cpu_to_be32(VXLAN_N_VID),
	};

	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
}

static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = { };

	memcpy(f.eth_addr, eth_addr, ETH_ALEN);

	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
}

/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
#else
	value <<= 16;
#endif
	return hash_64(value, FDB_HASH_BITS);
}

u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
	/* use 1 byte of OUI and 3 bytes of NIC */
	u32 key = get_unaligned((u32 *)(addr + 2));

	return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1);
}

u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
{
	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)
		return eth_vni_hash(mac, vni);
	else
		return eth_hash(mac);
}

/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
						const u8 *mac, __be32 vni)
{
	return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)];
}

/* Look up Ethernet address in forwarding table */
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
					  const u8 *mac, __be32 vni)
{
	struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni);
	struct vxlan_fdb *f;

	hlist_for_each_entry_rcu(f, head, hlist) {
		if (ether_addr_equal(mac, f->eth_addr)) {
			if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
				if (vni == f->vni)
					return f;
			} else {
				return f;
			}
		}
	}

	return NULL;
}

static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
					const u8 *mac, __be32 vni)
{
	struct vxlan_fdb *f;

	f = __vxlan_find_mac(vxlan, mac, vni);
	if (f && f->used != jiffies)
		f->used = jiffies;

	return f;
}

/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
					      union vxlan_addr *ip, __be16 port,
					      __be32 vni, __u32 ifindex)
{
	struct vxlan_rdst *rd;

	list_for_each_entry(rd, &f->remotes, list) {
		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
		    rd->remote_port == port &&
		    rd->remote_vni == vni &&
		    rd->remote_ifindex == ifindex)
			return rd;
	}

	return NULL;
}

int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
		      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	u8 eth_addr[ETH_ALEN + 2] = { 0 };
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	int rc = 0;

	if (is_multicast_ether_addr(mac) ||
	    is_zero_ether_addr(mac))
		return -EINVAL;

	ether_addr_copy(eth_addr, mac);

	rcu_read_lock();

	f = __vxlan_find_mac(vxlan, eth_addr, vni);
	if (!f) {
		rc = -ENOENT;
		goto out;
	}

	rdst = first_remote_rcu(f);
	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info);

out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc);

static int vxlan_fdb_notify_one(struct notifier_block *nb,
				const struct vxlan_dev *vxlan,
				const struct vxlan_fdb *f,
				const struct vxlan_rdst *rdst,
				struct netlink_ext_ack *extack)
{
	struct switchdev_notifier_vxlan_fdb_info fdb_info;
	int rc;

	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info);
	rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
			       &fdb_info);
	return notifier_to_errno(rc);
}

int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
		     struct notifier_block *nb,
		     struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;
	int rc = 0;

	if (!netif_is_vxlan(dev))
		return -EINVAL;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni == vni) {
				list_for_each_entry(rdst, &f->remotes, list) {
					rc = vxlan_fdb_notify_one(nb, vxlan,
								  f, rdst,
								  extack);
					if (rc)
						goto unlock;
				}
			}
		}
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}
	return 0;

unlock:
	spin_unlock_bh(&vxlan->hash_lock[h]);
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_replay);

void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;

	if (!netif_is_vxlan(dev))
		return;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist)
			if (f->vni == vni)
				list_for_each_entry(rdst, &f->remotes, list)
					rdst->offloaded = false;
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}
}
EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload);

/* Replace destination of unicast mac */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
			     union vxlan_addr *ip, __be16 port, __be32 vni,
			     __u32 ifindex, struct vxlan_rdst *oldrd)
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!rd)
		return 0;

	*oldrd = *rd;
	dst_cache_reset(&rd->dst_cache);
	rd->remote_ip = *ip;
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
	rd->offloaded = false;
	return 1;
}

/* Add/update destinations for multicast */
static int vxlan_fdb_append(struct vxlan_fdb *f,
			    union vxlan_addr *ip, __be16 port, __be32 vni,
			    __u32 ifindex, struct vxlan_rdst **rdp)
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOMEM;

	if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
		kfree(rd);
		return -ENOMEM;
	}

	rd->remote_ip = *ip;
	rd->remote_port = port;
	rd->offloaded = false;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;

	list_add_tail_rcu(&rd->list, &f->remotes);

	*rdp = rd;
	return 1;
}
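
/* Validate a VXLAN-GPE header and translate its next-protocol field to an
 * ethertype; returns false (drop) for OAM packets, unknown versions, or
 * protocols we cannot map.
 */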
static bool vxlan_parse_gpe_proto(const struct vxlanhdr *hdr, __be16 *protocol)
{
	const struct vxlanhdr_gpe *gpe = (const struct vxlanhdr_gpe *)hdr;

	/* Need to have Next Protocol set for interfaces in GPE mode. */
	if (!gpe->np_applied)
		return false;
	/* "The initial version is 0. If a receiver does not support the
	 * version indicated it MUST drop the packet."
	 */
	if (gpe->version != 0)
		return false;
	/* "When the O bit is set to 1, the packet is an OAM packet and OAM
	 * processing MUST occur." However, we don't implement OAM
	 * processing, thus drop the packet.
	 */
	if (gpe->oam_flag)
		return false;

	*protocol = tun_p_to_eth_p(gpe->next_protocol);
	if (!*protocol)
		return false;

	return true;
}
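
/* Handle the remote checksum offload (RCO) extension during GRO: fold the
 * checksum at the start/offset carried in the VNI field and mark the skb
 * so the work is not repeated.
 */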
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
					  unsigned int off,
					  struct vxlanhdr *vh, size_t hdrlen,
					  __be32 vni_field,
					  struct gro_remcsum *grc,
					  bool nopartial)
{
	size_t start, offset;

	if (skb->remcsum_offload)
		return vh;

	if (!NAPI_GRO_CB(skb)->csum_valid)
		return NULL;

	start = vxlan_rco_start(vni_field);
	offset = start + vxlan_rco_offset(vni_field);

	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
				     start, offset, grc, nopartial);

	skb->remcsum_offload = 1;

	return vh;
}
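
/* Common GRO setup for VXLAN and VXLAN-GPE: pull the VXLAN header, handle
 * remote checksum offload if enabled, and flag packets whose VXLAN flags
 * or VNI differ from packets already held on the GRO list.
 */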
static struct vxlanhdr *vxlan_gro_prepare_receive(struct sock *sk,
						  struct list_head *head,
						  struct sk_buff *skb,
						  struct gro_remcsum *grc)
{
	struct sk_buff *p;
	struct vxlanhdr *vh, *vh2;
	unsigned int hlen, off_vx;
	struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
	__be32 flags;

	skb_gro_remcsum_init(grc);

	off_vx = skb_gro_offset(skb);
	hlen = off_vx + sizeof(*vh);
	vh = skb_gro_header(skb, hlen, off_vx);
	if (unlikely(!vh))
		return NULL;

	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));

	flags = vh->vx_flags;

	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
		vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
				       vh->vx_vni, grc,
				       !!(vs->flags &
					  VXLAN_F_REMCSUM_NOPARTIAL));

		if (!vh)
			return NULL;
	}

	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */

	list_for_each_entry(p, head, list) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		vh2 = (struct vxlanhdr *)(p->data + off_vx);
		if (vh->vx_flags != vh2->vx_flags ||
		    vh->vx_vni != vh2->vx_vni) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

	return vh;
}

static struct sk_buff *vxlan_gro_receive(struct sock *sk,
					 struct list_head *head,
					 struct sk_buff *skb)
{
	struct sk_buff *pp = NULL;
	struct gro_remcsum grc;
	int flush = 1;

	if (vxlan_gro_prepare_receive(sk, head, skb, &grc)) {
		pp = call_gro_receive(eth_gro_receive, head, skb);
		flush = 0;
	}
	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
	return pp;
}

static struct sk_buff *vxlan_gpe_gro_receive(struct sock *sk,
					     struct list_head *head,
					     struct sk_buff *skb)
{
	const struct packet_offload *ptype;
	struct sk_buff *pp = NULL;
	struct gro_remcsum grc;
	struct vxlanhdr *vh;
	__be16 protocol;
	int flush = 1;

	vh = vxlan_gro_prepare_receive(sk, head, skb, &grc);
	if (vh) {
		if (!vxlan_parse_gpe_proto(vh, &protocol))
			goto out;
		ptype = gro_find_receive_by_type(protocol);
		if (!ptype)
			goto out;
		pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
		flush = 0;
	}
out:
	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
	return pp;
}

static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
	/* Sets 'skb->inner_mac_header' since we are always called with
	 * 'skb->encapsulation' set.
	 */
	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
}

static int vxlan_gpe_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
	struct vxlanhdr *vh = (struct vxlanhdr *)(skb->data + nhoff);
	const struct packet_offload *ptype;
	int err = -ENOSYS;
	__be16 protocol;

	if (!vxlan_parse_gpe_proto(vh, &protocol))
		return err;
	ptype = gro_find_complete_by_type(protocol);
	if (ptype)
		err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
	return err;
}
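
/* Allocate and initialise a forwarding table entry; the caller is expected
 * to link it into the hash table and send notifications.
 */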
static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac,
					 __u16 state, __be32 src_vni,
					 __u16 ndm_flags)
{
	struct vxlan_fdb *f;

	f = kmalloc(sizeof(*f), GFP_ATOMIC);
	if (!f)
		return NULL;
	f->state = state;
	f->flags = ndm_flags;
	f->updated = f->used = jiffies;
	f->vni = src_vni;
	f->nh = NULL;
	RCU_INIT_POINTER(f->vdev, vxlan);
	INIT_LIST_HEAD(&f->nh_list);
	INIT_LIST_HEAD(&f->remotes);
	memcpy(f->eth_addr, mac, ETH_ALEN);

	return f;
}

static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac,
			     __be32 src_vni, struct vxlan_fdb *f)
{
	++vxlan->addrcnt;
	hlist_add_head_rcu(&f->hlist,
			   vxlan_fdb_head(vxlan, mac, src_vni));
}
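
/* Attach an FDB entry to a nexthop group. Only FDB nexthop groups whose
 * address family matches the device's default remote are accepted; any
 * previously attached nexthop is released.
 */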
static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       u32 nhid, struct netlink_ext_ack *extack)
{
	struct nexthop *old_nh = rtnl_dereference(fdb->nh);
	struct nexthop *nh;
	int err = -EINVAL;

	if (old_nh && old_nh->id == nhid)
		return 0;

	nh = nexthop_find_by_id(vxlan->net, nhid);
	if (!nh) {
		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
		goto err_inval;
	}

	if (!nexthop_get(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
		nh = NULL;
		goto err_inval;
	}
	if (!nexthop_is_fdb(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop");
		goto err_inval;
	}

	if (!nexthop_is_multipath(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group");
		goto err_inval;
	}

	/* check nexthop group family */
	switch (vxlan->default_dst.remote_ip.sa.sa_family) {
	case AF_INET:
		if (!nexthop_has_v4(nh)) {
			err = -EAFNOSUPPORT;
			NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
			goto err_inval;
		}
		break;
	case AF_INET6:
		if (nexthop_has_v4(nh)) {
			err = -EAFNOSUPPORT;
			NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
			goto err_inval;
		}
	}

	if (old_nh) {
		list_del_rcu(&fdb->nh_list);
		nexthop_put(old_nh);
	}
	rcu_assign_pointer(fdb->nh, nh);
	list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list);
	return 1;

err_inval:
	if (nh)
		nexthop_put(nh);
	return err;
}
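
/* Allocate a new FDB entry and attach its first remote destination (or
 * nexthop). The caller is responsible for inserting the entry and sending
 * notifications.
 */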
int vxlan_fdb_create(struct vxlan_dev *vxlan,
		     const u8 *mac, union vxlan_addr *ip,
		     __u16 state, __be16 port, __be32 src_vni,
		     __be32 vni, __u32 ifindex, __u16 ndm_flags,
		     u32 nhid, struct vxlan_fdb **fdb,
		     struct netlink_ext_ack *extack)
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int rc;

	if (vxlan->cfg.addrmax &&
	    vxlan->addrcnt >= vxlan->cfg.addrmax)
		return -ENOSPC;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
	f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
	if (!f)
		return -ENOMEM;

	if (nhid)
		rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
	else
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
	if (rc < 0)
		goto errout;

	*fdb = f;

	return 0;

errout:
	kfree(f);
	return rc;
}

static void __vxlan_fdb_free(struct vxlan_fdb *f)
{
	struct vxlan_rdst *rd, *nd;
	struct nexthop *nh;

	nh = rcu_dereference_raw(f->nh);
	if (nh) {
		rcu_assign_pointer(f->nh, NULL);
		rcu_assign_pointer(f->vdev, NULL);
		nexthop_put(nh);
	}

	list_for_each_entry_safe(rd, nd, &f->remotes, list) {
		dst_cache_destroy(&rd->dst_cache);
		kfree(rd);
	}
	kfree(f);
}

static void vxlan_fdb_free(struct rcu_head *head)
{
	struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);

	__vxlan_fdb_free(f);
}

static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
			      bool do_notify, bool swdev_notify)
{
	struct vxlan_rdst *rd;

	netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;
	if (do_notify) {
		if (rcu_access_pointer(f->nh))
			vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH,
					 swdev_notify, NULL);
		else
			list_for_each_entry(rd, &f->remotes, list)
				vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
						 swdev_notify, NULL);
	}

	hlist_del_rcu(&f->hlist);
	list_del_rcu(&f->nh_list);
	call_rcu(&f->rcu, vxlan_fdb_free);
}

static void vxlan_dst_free(struct rcu_head *head)
{
	struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu);

	dst_cache_destroy(&rd->dst_cache);
	kfree(rd);
}
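
/* Update an existing FDB entry in place: state/flags changes, NLM_F_REPLACE
 * of the remote (unicast only), or NLM_F_APPEND of an extra remote for
 * multicast/all-zeros entries. Notifies on change and unwinds on notifier
 * failure.
 */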
static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
				     union vxlan_addr *ip,
				     __u16 state, __u16 flags,
				     __be16 port, __be32 vni,
				     __u32 ifindex, __u16 ndm_flags,
				     struct vxlan_fdb *f, u32 nhid,
				     bool swdev_notify,
				     struct netlink_ext_ack *extack)
{
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
	struct vxlan_rdst *rd = NULL;
	struct vxlan_rdst oldrd;
	int notify = 0;
	int rc = 0;
	int err;

	if (nhid && !rcu_access_pointer(f->nh)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot replace an existing non nexthop fdb with a nexthop");
		return -EOPNOTSUPP;
	}

	if (nhid && (flags & NLM_F_APPEND)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot append to a nexthop fdb");
		return -EOPNOTSUPP;
	}

	/* Do not allow an externally learned entry to take over an entry added
	 * by the user.
	 */
	if (!(fdb_flags & NTF_EXT_LEARNED) ||
	    !(f->flags & NTF_VXLAN_ADDED_BY_USER)) {
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
		}
		if (f->flags != fdb_flags) {
			f->flags = fdb_flags;
			f->updated = jiffies;
			notify = 1;
		}
	}

	if ((flags & NLM_F_REPLACE)) {
		/* Only change unicasts */
		if (!(is_multicast_ether_addr(f->eth_addr) ||
		      is_zero_ether_addr(f->eth_addr))) {
			if (nhid) {
				rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
				if (rc < 0)
					return rc;
			} else {
				rc = vxlan_fdb_replace(f, ip, port, vni,
						       ifindex, &oldrd);
			}
			notify |= rc;
		} else {
			NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries");
			return -EOPNOTSUPP;
		}
	}
	if ((flags & NLM_F_APPEND) &&
	    (is_multicast_ether_addr(f->eth_addr) ||
	     is_zero_ether_addr(f->eth_addr))) {
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);

		if (rc < 0)
			return rc;
		notify |= rc;
	}

	if (ndm_flags & NTF_USE)
		f->used = jiffies;

	if (notify) {
		if (rd == NULL)
			rd = first_remote_rtnl(f);

		err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH,
				       swdev_notify, extack);
		if (err)
			goto err_notify;
	}

	return 0;

err_notify:
	if (nhid)
		return err;
	if ((flags & NLM_F_REPLACE) && rc)
		*rd = oldrd;
	else if ((flags & NLM_F_APPEND) && rc) {
		list_del_rcu(&rd->list);
		call_rcu(&rd->rcu, vxlan_dst_free);
	}
	return err;
}
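
/* Create a brand new FDB entry for vxlan_fdb_update() and notify; the
 * entry is destroyed again if the switchdev/netlink notification fails.
 */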
static int vxlan_fdb_update_create(struct vxlan_dev *vxlan,
				   const u8 *mac, union vxlan_addr *ip,
				   __u16 state, __u16 flags,
				   __be16 port, __be32 src_vni, __be32 vni,
				   __u32 ifindex, __u16 ndm_flags, u32 nhid,
				   bool swdev_notify,
				   struct netlink_ext_ack *extack)
{
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
	struct vxlan_fdb *f;
	int rc;

	/* Disallow replace to add a multicast entry */
	if ((flags & NLM_F_REPLACE) &&
	    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
		return -EOPNOTSUPP;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
	rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
			      vni, ifindex, fdb_flags, nhid, &f, extack);
	if (rc < 0)
		return rc;

	vxlan_fdb_insert(vxlan, mac, src_vni, f);
	rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH,
			      swdev_notify, extack);
	if (rc)
		goto err_notify;

	return 0;

err_notify:
	vxlan_fdb_destroy(vxlan, f, false, false);
	return rc;
}

/* Add new entry to forwarding table -- assumes lock held */
int vxlan_fdb_update(struct vxlan_dev *vxlan,
		     const u8 *mac, union vxlan_addr *ip,
		     __u16 state, __u16 flags,
		     __be16 port, __be32 src_vni, __be32 vni,
		     __u32 ifindex, __u16 ndm_flags, u32 nhid,
		     bool swdev_notify,
		     struct netlink_ext_ack *extack)
{
	struct vxlan_fdb *f;

	f = __vxlan_find_mac(vxlan, mac, src_vni);
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}

		return vxlan_fdb_update_existing(vxlan, ip, state, flags, port,
						 vni, ifindex, ndm_flags, f,
						 nhid, swdev_notify, extack);
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

		return vxlan_fdb_update_create(vxlan, mac, ip, state, flags,
					       port, src_vni, vni, ifindex,
					       ndm_flags, nhid, swdev_notify,
					       extack);
	}
}

static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
				  struct vxlan_rdst *rd, bool swdev_notify)
{
	list_del_rcu(&rd->list);
	vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL);
	call_rcu(&rd->rcu, vxlan_dst_free);
}
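
/* Parse the netlink attributes of an FDB add/del request into destination
 * address, port, VNIs, ifindex and nexthop id, applying the device's
 * defaults for attributes that are absent.
 */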
2013-06-25 16:01:53 +03:00
|
|
|
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
|
2017-01-31 22:59:52 -08:00
|
|
|
union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
|
2022-05-05 17:09:58 +02:00
|
|
|
__be32 *vni, u32 *ifindex, u32 *nhid,
|
|
|
|
struct netlink_ext_ack *extack)
|
2012-10-01 12:32:35 +00:00
|
|
|
{
|
2013-03-15 04:35:51 +00:00
|
|
|
struct net *net = dev_net(vxlan->dev);
|
2013-08-31 13:44:33 +08:00
|
|
|
int err;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2022-05-20 02:36:14 +02:00
|
|
|
if (tb[NDA_NH_ID] &&
|
|
|
|
(tb[NDA_DST] || tb[NDA_VNI] || tb[NDA_IFINDEX] || tb[NDA_PORT])) {
|
|
|
|
NL_SET_ERR_MSG(extack, "DST, VNI, ifindex and port are mutually exclusive with NH_ID");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2020-05-28 22:12:35 -07:00
|
|
|
|
2013-06-25 16:01:53 +03:00
|
|
|
if (tb[NDA_DST]) {
|
2013-08-31 13:44:33 +08:00
|
|
|
err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
|
2022-05-05 17:09:58 +02:00
|
|
|
if (err) {
|
|
|
|
NL_SET_ERR_MSG(extack, "Unsupported address family");
|
2013-08-31 13:44:33 +08:00
|
|
|
return err;
|
2022-05-05 17:09:58 +02:00
|
|
|
}
|
2013-06-25 16:01:53 +03:00
|
|
|
} else {
|
2013-08-31 13:44:33 +08:00
|
|
|
union vxlan_addr *remote = &vxlan->default_dst.remote_ip;
|
2019-12-30 17:52:22 +08:00
|
|
|
|
2013-08-31 13:44:33 +08:00
|
|
|
if (remote->sa.sa_family == AF_INET) {
|
|
|
|
ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
|
|
|
|
ip->sa.sa_family = AF_INET;
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
} else {
|
|
|
|
ip->sin6.sin6_addr = in6addr_any;
|
|
|
|
ip->sa.sa_family = AF_INET6;
|
|
|
|
#endif
|
|
|
|
}
|
2013-06-25 16:01:53 +03:00
|
|
|
}
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2013-03-15 04:35:51 +00:00
|
|
|
if (tb[NDA_PORT]) {
|
2022-05-05 17:09:58 +02:00
|
|
|
if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) {
|
|
|
|
NL_SET_ERR_MSG(extack, "Invalid vxlan port");
|
2013-03-15 04:35:51 +00:00
|
|
|
return -EINVAL;
|
2022-05-05 17:09:58 +02:00
|
|
|
}
|
2013-06-25 16:01:53 +03:00
|
|
|
*port = nla_get_be16(tb[NDA_PORT]);
|
|
|
|
} else {
|
2015-07-21 10:44:02 +02:00
|
|
|
*port = vxlan->cfg.dst_port;
|
2013-06-25 16:01:53 +03:00
|
|
|
}
|
2013-03-15 04:35:51 +00:00
|
|
|
|
|
|
|
if (tb[NDA_VNI]) {
|
2022-05-05 17:09:58 +02:00
|
|
|
if (nla_len(tb[NDA_VNI]) != sizeof(u32)) {
|
|
|
|
NL_SET_ERR_MSG(extack, "Invalid vni");
|
2013-03-15 04:35:51 +00:00
|
|
|
return -EINVAL;
|
2022-05-05 17:09:58 +02:00
|
|
|
}
|
2016-02-16 21:58:58 +01:00
|
|
|
*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
|
2013-06-25 16:01:53 +03:00
|
|
|
} else {
|
|
|
|
*vni = vxlan->default_dst.remote_vni;
|
|
|
|
}
|
2013-03-15 04:35:51 +00:00
|
|
|
|
2017-01-31 22:59:52 -08:00
|
|
|
if (tb[NDA_SRC_VNI]) {
|
2022-05-05 17:09:58 +02:00
|
|
|
if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32)) {
|
|
|
|
NL_SET_ERR_MSG(extack, "Invalid src vni");
|
2017-01-31 22:59:52 -08:00
|
|
|
return -EINVAL;
|
2022-05-05 17:09:58 +02:00
|
|
|
}
|
2017-01-31 22:59:52 -08:00
|
|
|
*src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
|
|
|
|
} else {
|
|
|
|
*src_vni = vxlan->default_dst.remote_vni;
|
|
|
|
}
|
|
|
|
|
2013-03-15 04:35:51 +00:00
|
|
|
if (tb[NDA_IFINDEX]) {
|
2013-03-26 08:29:30 +00:00
|
|
|
struct net_device *tdev;
|
2013-03-15 04:35:51 +00:00
|
|
|
|
2022-05-05 17:09:58 +02:00
|
|
|
if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) {
|
|
|
|
NL_SET_ERR_MSG(extack, "Invalid ifindex");
|
2013-03-15 04:35:51 +00:00
|
|
|
return -EINVAL;
|
2022-05-05 17:09:58 +02:00
|
|
|
}
|
2013-06-25 16:01:53 +03:00
|
|
|
*ifindex = nla_get_u32(tb[NDA_IFINDEX]);
|
2014-01-15 10:23:41 +08:00
|
|
|
tdev = __dev_get_by_index(net, *ifindex);
|
2022-05-05 17:09:58 +02:00
|
|
|
if (!tdev) {
|
|
|
|
NL_SET_ERR_MSG(extack, "Device not found");
|
2013-03-15 04:35:51 +00:00
|
|
|
return -EADDRNOTAVAIL;
|
2022-05-05 17:09:58 +02:00
|
|
|
}
|
2013-06-25 16:01:53 +03:00
|
|
|
} else {
|
|
|
|
*ifindex = 0;
|
|
|
|
}
|
|
|
|
|
2024-11-08 11:41:45 +01:00
|
|
|
*nhid = nla_get_u32_default(tb[NDA_NH_ID], 0);
|
2020-05-21 22:26:14 -07:00
|
|
|
|
2013-06-25 16:01:53 +03:00
|
|
|
return 0;
|
|
|
|
}
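/* A hedged usage sketch for the parser above; device, MAC and IP values
 * are illustrative. The attributes map onto iproute2's "bridge fdb"
 * arguments roughly as NDA_DST <-> "dst", NDA_VNI <-> "vni",
 * NDA_PORT <-> "port", NDA_IFINDEX <-> "via" and NDA_NH_ID <-> "nhid":
 *
 *   bridge fdb add 00:11:22:33:44:55 dev vxlan0 dst 203.0.113.2 \
 *           vni 100 port 4789
 *   bridge fdb add 00:11:22:33:44:55 dev vxlan0 nhid 10
 *
 * As enforced at the top of this function, "nhid" cannot be combined
 * with dst/vni/port/via.
 */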
|
|
|
|
|
|
|
|
/* Add static entry (via netlink) */
|
|
|
|
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
|
|
|
|
struct net_device *dev,
|
2019-01-16 23:06:50 +00:00
|
|
|
const unsigned char *addr, u16 vid, u16 flags,
|
2024-11-14 15:09:53 +01:00
|
|
|
bool *notified, struct netlink_ext_ack *extack)
|
2013-06-25 16:01:53 +03:00
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
|
2013-08-31 13:44:33 +08:00
|
|
|
union vxlan_addr ip;
|
2013-06-25 16:01:53 +03:00
|
|
|
__be16 port;
|
2017-01-31 22:59:52 -08:00
|
|
|
__be32 src_vni, vni;
|
2020-05-21 22:26:14 -07:00
|
|
|
u32 ifindex, nhid;
|
2019-06-06 17:57:58 +08:00
|
|
|
u32 hash_index;
|
2013-06-25 16:01:53 +03:00
|
|
|
int err;
|
|
|
|
|
|
|
|
if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
|
|
|
|
pr_info("RTM_NEWNEIGH with invalid state %#x\n",
|
|
|
|
ndm->ndm_state);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2020-05-21 22:26:14 -07:00
|
|
|
if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID]))
|
2013-06-25 16:01:53 +03:00
|
|
|
return -EINVAL;
|
|
|
|
|
2020-05-21 22:26:14 -07:00
|
|
|
err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
|
2022-05-05 17:09:58 +02:00
|
|
|
&nhid, extack);
|
2013-06-25 16:01:53 +03:00
|
|
|
if (err)
|
|
|
|
return err;
|
2013-03-15 04:35:51 +00:00
|
|
|
|
2014-04-01 09:23:01 +03:00
|
|
|
if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family)
|
|
|
|
return -EAFNOSUPPORT;
|
|
|
|
|
2019-06-06 17:57:58 +08:00
|
|
|
hash_index = fdb_head_index(vxlan, addr, src_vni);
|
|
|
|
spin_lock_bh(&vxlan->hash_lock[hash_index]);
|
2018-07-04 16:46:30 -07:00
|
|
|
err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
|
2018-11-21 08:02:36 +00:00
|
|
|
port, src_vni, vni, ifindex,
|
|
|
|
ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
|
2020-05-21 22:26:14 -07:00
|
|
|
nhid, true, extack);
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2024-11-14 15:09:53 +01:00
|
|
|
if (!err)
|
|
|
|
*notified = true;
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
return err;
|
|
|
|
}
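/* A hedged example of exercising the add path from userspace (names
 * and addresses are illustrative):
 *
 *   bridge fdb add 00:11:22:33:44:55 dev vxlan0 dst 203.0.113.2
 *
 * Note the guard above: an RTM_NEWNEIGH whose ndm_state carries
 * neither NUD_PERMANENT nor NUD_REACHABLE is rejected with -EINVAL,
 * and the update itself runs under the per-bucket hash_lock.
 */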
|
|
|
|
|
2022-03-01 05:04:31 +00:00
|
|
|
int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
|
|
|
|
const unsigned char *addr, union vxlan_addr ip,
|
|
|
|
__be16 port, __be32 src_vni, __be32 vni,
|
|
|
|
u32 ifindex, bool swdev_notify)
|
2012-10-01 12:32:35 +00:00
|
|
|
{
|
2013-06-25 16:01:54 +03:00
|
|
|
struct vxlan_rdst *rd = NULL;
|
2020-05-21 22:26:14 -07:00
|
|
|
struct vxlan_fdb *f;
|
2017-01-31 22:59:52 -08:00
|
|
|
int err = -ENOENT;
|
2013-06-25 16:01:54 +03:00
|
|
|
|
2017-01-31 22:59:52 -08:00
|
|
|
f = vxlan_find_mac(vxlan, addr, src_vni);
|
2013-06-25 16:01:54 +03:00
|
|
|
if (!f)
|
2017-01-31 22:59:52 -08:00
|
|
|
return err;
|
2013-06-25 16:01:54 +03:00
|
|
|
|
2013-08-31 13:44:33 +08:00
|
|
|
if (!vxlan_addr_any(&ip)) {
|
|
|
|
rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
|
2013-06-25 16:01:54 +03:00
|
|
|
if (!rd)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* remove a destination if it's not the only one on the list,
|
|
|
|
* otherwise destroy the fdb entry
|
|
|
|
*/
|
|
|
|
if (rd && !list_is_singular(&f->remotes)) {
|
2018-11-21 08:02:35 +00:00
|
|
|
vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify);
|
2013-06-25 16:01:54 +03:00
|
|
|
goto out;
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
2013-06-25 16:01:54 +03:00
|
|
|
|
2018-11-21 08:02:35 +00:00
|
|
|
vxlan_fdb_destroy(vxlan, f, true, swdev_notify);
|
2013-06-25 16:01:54 +03:00
|
|
|
|
|
|
|
out:
|
2017-01-31 22:59:52 -08:00
|
|
|
return 0;
|
|
|
|
}
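/* Editorial sketch of the two deletion cases handled above
 * (illustrative values): naming a destination removes only that
 * remote while the entry has others; omitting it destroys the whole
 * fdb entry.
 *
 *   bridge fdb del 00:11:22:33:44:55 dev vxlan0 dst 203.0.113.2
 *   bridge fdb del 00:11:22:33:44:55 dev vxlan0
 */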
|
|
|
|
|
|
|
|
/* Delete entry (via netlink) */
|
|
|
|
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
|
|
|
|
struct net_device *dev,
|
2024-11-14 15:09:54 +01:00
|
|
|
const unsigned char *addr, u16 vid, bool *notified,
|
2022-05-20 02:36:14 +02:00
|
|
|
struct netlink_ext_ack *extack)
|
2017-01-31 22:59:52 -08:00
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
union vxlan_addr ip;
|
|
|
|
__be32 src_vni, vni;
|
2020-05-21 22:26:14 -07:00
|
|
|
u32 ifindex, nhid;
|
2019-06-06 17:57:58 +08:00
|
|
|
u32 hash_index;
|
2020-05-21 22:26:14 -07:00
|
|
|
__be16 port;
|
2017-01-31 22:59:52 -08:00
|
|
|
int err;
|
|
|
|
|
2020-05-21 22:26:14 -07:00
|
|
|
err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
|
2022-05-05 17:09:58 +02:00
|
|
|
&nhid, extack);
|
2017-01-31 22:59:52 -08:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2019-06-06 17:57:58 +08:00
|
|
|
hash_index = fdb_head_index(vxlan, addr, src_vni);
|
|
|
|
spin_lock_bh(&vxlan->hash_lock[hash_index]);
|
2018-11-21 08:02:35 +00:00
|
|
|
err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
|
|
|
|
true);
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2024-11-14 15:09:54 +01:00
|
|
|
if (!err)
|
|
|
|
*notified = true;
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Dump forwarding table */
|
|
|
|
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
|
2014-07-10 07:01:58 -04:00
|
|
|
struct net_device *dev,
|
2016-08-30 21:56:45 -07:00
|
|
|
struct net_device *filter_dev, int *idx)
|
2012-10-01 12:32:35 +00:00
|
|
|
{
|
2024-12-09 10:07:45 +00:00
|
|
|
struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
|
2012-10-01 12:32:35 +00:00
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
unsigned int h;
|
2016-08-30 21:56:45 -07:00
|
|
|
int err = 0;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
|
|
|
for (h = 0; h < FDB_HASH_SIZE; ++h) {
|
|
|
|
struct vxlan_fdb *f;
|
|
|
|
|
2020-07-29 11:34:36 +03:00
|
|
|
rcu_read_lock();
|
2013-02-27 17:06:00 -08:00
|
|
|
hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
|
2013-03-15 04:35:51 +00:00
|
|
|
struct vxlan_rdst *rd;
|
|
|
|
|
2020-05-21 22:26:14 -07:00
|
|
|
if (rcu_access_pointer(f->nh)) {
|
2024-12-09 10:07:45 +00:00
|
|
|
if (*idx < ctx->fdb_idx)
|
2020-06-24 14:02:36 -07:00
|
|
|
goto skip_nh;
|
2020-05-21 22:26:14 -07:00
|
|
|
err = vxlan_fdb_info(skb, vxlan, f,
|
|
|
|
NETLINK_CB(cb->skb).portid,
|
|
|
|
cb->nlh->nlmsg_seq,
|
|
|
|
RTM_NEWNEIGH,
|
|
|
|
NLM_F_MULTI, NULL);
|
2020-07-29 11:34:36 +03:00
|
|
|
if (err < 0) {
|
|
|
|
rcu_read_unlock();
|
2020-05-21 22:26:14 -07:00
|
|
|
goto out;
|
2020-07-29 11:34:36 +03:00
|
|
|
}
|
2020-06-24 14:02:36 -07:00
|
|
|
skip_nh:
|
|
|
|
*idx += 1;
|
2020-05-21 22:26:14 -07:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2013-06-17 14:16:12 -07:00
|
|
|
list_for_each_entry_rcu(rd, &f->remotes, list) {
|
2024-12-09 10:07:45 +00:00
|
|
|
if (*idx < ctx->fdb_idx)
|
2015-08-10 23:39:09 +09:00
|
|
|
goto skip;
|
|
|
|
|
2013-03-15 04:35:51 +00:00
|
|
|
err = vxlan_fdb_info(skb, vxlan, f,
|
|
|
|
NETLINK_CB(cb->skb).portid,
|
|
|
|
cb->nlh->nlmsg_seq,
|
|
|
|
RTM_NEWNEIGH,
|
|
|
|
NLM_F_MULTI, rd);
|
2020-07-29 11:34:36 +03:00
|
|
|
if (err < 0) {
|
|
|
|
rcu_read_unlock();
|
2013-06-17 14:16:12 -07:00
|
|
|
goto out;
|
2020-07-29 11:34:36 +03:00
|
|
|
}
|
2013-06-17 14:16:12 -07:00
|
|
|
skip:
|
2016-08-30 21:56:45 -07:00
|
|
|
*idx += 1;
|
2015-08-10 23:39:09 +09:00
|
|
|
}
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
2020-07-29 11:34:36 +03:00
|
|
|
rcu_read_unlock();
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
2013-06-17 14:16:12 -07:00
|
|
|
out:
|
2016-08-30 21:56:45 -07:00
|
|
|
return err;
|
2012-10-01 12:32:35 +00:00
|
|
|
}
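/* A hedged sketch of what the dump above produces (output format
 * abbreviated and illustrative): each remote of each entry becomes one
 * RTM_NEWNEIGH record in the NLM_F_MULTI stream, so one MAC with two
 * remotes prints two lines:
 *
 *   $ bridge fdb show dev vxlan0
 *   00:11:22:33:44:55 dst 203.0.113.2 self permanent
 *   00:11:22:33:44:55 dst 203.0.113.3 self permanent
 */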
|
|
|
|
|
2018-12-15 22:35:10 -08:00
|
|
|
static int vxlan_fdb_get(struct sk_buff *skb,
|
|
|
|
struct nlattr *tb[],
|
|
|
|
struct net_device *dev,
|
|
|
|
const unsigned char *addr,
|
|
|
|
u16 vid, u32 portid, u32 seq,
|
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
struct vxlan_fdb *f;
|
|
|
|
__be32 vni;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
if (tb[NDA_VNI])
|
|
|
|
vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
|
|
|
|
else
|
|
|
|
vni = vxlan->default_dst.remote_vni;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
|
|
|
|
f = __vxlan_find_mac(vxlan, addr, vni);
|
|
|
|
if (!f) {
|
|
|
|
NL_SET_ERR_MSG(extack, "Fdb entry not found");
|
|
|
|
err = -ENOENT;
|
|
|
|
goto errout;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = vxlan_fdb_info(skb, vxlan, f, portid, seq,
|
|
|
|
RTM_NEWNEIGH, 0, first_remote_rcu(f));
|
|
|
|
errout:
|
|
|
|
rcu_read_unlock();
|
|
|
|
return err;
|
|
|
|
}
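/* A hedged usage sketch, assuming an iproute2 recent enough to support
 * "bridge fdb get" (values illustrative): a single-entry lookup that
 * ends up here rather than in the full dump path:
 *
 *   bridge fdb get 00:11:22:33:44:55 dev vxlan0 vni 100 self
 */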
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
/* Watch incoming packets to learn the mapping between Ethernet address
|
|
|
|
* and tunnel endpoint.
|
|
|
|
*/
|
2024-10-09 10:28:24 +08:00
|
|
|
static enum skb_drop_reason vxlan_snoop(struct net_device *dev,
|
|
|
|
union vxlan_addr *src_ip,
|
|
|
|
const u8 *src_mac, u32 src_ifindex,
|
|
|
|
__be32 vni)
|
2012-10-01 12:32:35 +00:00
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
struct vxlan_fdb *f;
|
2017-06-19 10:03:59 +02:00
|
|
|
u32 ifindex = 0;
|
|
|
|
|
2024-06-03 10:59:26 +02:00
|
|
|
/* Ignore packets with an invalid source MAC address */
|
|
|
|
if (!is_valid_ether_addr(src_mac))
|
2024-10-09 10:28:24 +08:00
|
|
|
return SKB_DROP_REASON_MAC_INVALID_SOURCE;
|
2024-06-03 10:59:26 +02:00
|
|
|
|
2017-06-19 10:03:59 +02:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
if (src_ip->sa.sa_family == AF_INET6 &&
|
|
|
|
(ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
|
|
|
|
ifindex = src_ifindex;
|
|
|
|
#endif
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2017-01-31 22:59:52 -08:00
|
|
|
f = vxlan_find_mac(vxlan, src_mac, vni);
|
2012-10-01 12:32:35 +00:00
|
|
|
if (likely(f)) {
|
2013-08-04 17:17:39 -07:00
|
|
|
struct vxlan_rdst *rdst = first_remote_rcu(f);
|
2013-06-17 14:16:12 -07:00
|
|
|
|
2017-06-19 10:03:59 +02:00
|
|
|
if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
|
|
|
|
rdst->remote_ifindex == ifindex))
|
2024-10-09 10:28:24 +08:00
|
|
|
return SKB_NOT_DROPPED_YET;
|
2013-06-17 12:09:58 -07:00
|
|
|
|
|
|
|
/* Don't migrate static entries, drop packets */
|
2017-06-11 16:32:50 -07:00
|
|
|
if (f->state & (NUD_PERMANENT | NUD_NOARP))
|
2024-10-09 10:28:24 +08:00
|
|
|
return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2020-05-21 22:26:14 -07:00
|
|
|
/* Don't let a learnt entry override an fdb entry that uses a nexthop */
|
|
|
|
if (rcu_access_pointer(f->nh))
|
2024-10-09 10:28:24 +08:00
|
|
|
return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;
|
2020-05-21 22:26:14 -07:00
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
if (net_ratelimit())
|
|
|
|
netdev_info(dev,
|
2013-08-31 13:44:33 +08:00
|
|
|
"%pM migrated from %pIS to %pIS\n",
|
2015-02-07 03:17:31 +01:00
|
|
|
src_mac, &rdst->remote_ip.sa, &src_ip->sa);
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2013-08-31 13:44:33 +08:00
|
|
|
rdst->remote_ip = *src_ip;
|
2012-10-01 12:32:35 +00:00
|
|
|
f->updated = jiffies;
|
2019-01-16 23:06:54 +00:00
|
|
|
vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL);
|
2012-10-01 12:32:35 +00:00
|
|
|
} else {
|
2019-06-06 17:57:58 +08:00
|
|
|
u32 hash_index = fdb_head_index(vxlan, src_mac, vni);
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
/* learned new entry */
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_lock(&vxlan->hash_lock[hash_index]);
|
2013-06-17 12:09:57 -07:00
|
|
|
|
|
|
|
/* close off race between vxlan_flush and incoming packets */
|
|
|
|
if (netif_running(dev))
|
2018-07-04 16:46:30 -07:00
|
|
|
vxlan_fdb_update(vxlan, src_mac, src_ip,
|
2013-06-17 12:09:57 -07:00
|
|
|
NUD_REACHABLE,
|
|
|
|
NLM_F_EXCL|NLM_F_CREATE,
|
2015-07-21 10:44:02 +02:00
|
|
|
vxlan->cfg.dst_port,
|
2017-01-31 22:59:52 -08:00
|
|
|
vni,
|
2013-06-17 12:09:57 -07:00
|
|
|
vxlan->default_dst.remote_vni,
|
2020-05-21 22:26:14 -07:00
|
|
|
ifindex, NTF_SELF, 0, true, NULL);
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_unlock(&vxlan->hash_lock[hash_index]);
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
2013-06-17 12:09:58 -07:00
|
|
|
|
2024-10-09 10:28:24 +08:00
|
|
|
return SKB_NOT_DROPPED_YET;
|
2012-10-01 12:32:35 +00:00
|
|
|
}
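/* Editorial summary of the snoop decision above, in pseudocode that
 * mirrors (rather than replaces) the code:
 *
 *   if an fdb entry for src_mac exists:
 *       same remote already?              -> accept, nothing to do
 *       static (NUD_PERMANENT/NUD_NOARP)? -> drop (no migration)
 *       nexthop-backed entry?             -> drop (no migration)
 *       otherwise                         -> rewrite remote_ip, notify
 *   else:
 *       learn a new NUD_REACHABLE entry under the bucket lock
 */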
|
|
|
|
|
2016-04-09 12:46:23 +02:00
|
|
|
static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
|
2013-06-17 14:16:10 -07:00
|
|
|
{
|
2015-09-24 13:50:02 +02:00
|
|
|
struct vxlan_net *vn;
|
2013-08-19 11:23:07 -07:00
|
|
|
|
2015-09-24 13:50:02 +02:00
|
|
|
if (!vs)
|
2016-04-09 12:46:23 +02:00
|
|
|
return false;
|
2017-07-04 15:52:59 +03:00
|
|
|
if (!refcount_dec_and_test(&vs->refcnt))
|
2016-04-09 12:46:23 +02:00
|
|
|
return false;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2015-09-24 13:50:02 +02:00
|
|
|
vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
|
2013-06-17 14:16:11 -07:00
|
|
|
spin_lock(&vn->sock_lock);
|
2013-06-17 14:16:10 -07:00
|
|
|
hlist_del_rcu(&vs->hlist);
|
2016-06-16 12:20:52 -07:00
|
|
|
udp_tunnel_notify_del_rx_port(vs->sock,
|
2016-06-16 12:23:19 -07:00
|
|
|
(vs->flags & VXLAN_F_GPE) ?
|
|
|
|
UDP_TUNNEL_TYPE_VXLAN_GPE :
|
2016-06-16 12:20:52 -07:00
|
|
|
UDP_TUNNEL_TYPE_VXLAN);
|
2013-06-17 14:16:11 -07:00
|
|
|
spin_unlock(&vn->sock_lock);
|
|
|
|
|
2016-04-09 12:46:23 +02:00
|
|
|
return true;
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
2015-09-24 13:50:02 +02:00
|
|
|
static void vxlan_sock_release(struct vxlan_dev *vxlan)
|
|
|
|
{
|
2016-10-28 09:59:15 -07:00
|
|
|
struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
|
2015-09-24 13:50:02 +02:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2016-10-28 09:59:15 -07:00
|
|
|
struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
|
|
|
|
|
2017-06-07 14:36:58 +03:00
|
|
|
RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
|
2016-04-09 12:46:23 +02:00
|
|
|
#endif
|
|
|
|
|
2017-06-07 14:36:58 +03:00
|
|
|
RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
|
2016-04-09 12:46:23 +02:00
|
|
|
synchronize_net();
|
|
|
|
|
2022-03-01 05:04:36 +00:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
|
|
|
|
vxlan_vs_del_vnigrp(vxlan);
|
|
|
|
else
|
|
|
|
vxlan_vs_del_dev(vxlan);
|
2017-06-02 03:24:08 +03:00
|
|
|
|
2016-10-28 09:59:15 -07:00
|
|
|
if (__vxlan_sock_release_prep(sock4)) {
|
|
|
|
udp_tunnel_sock_release(sock4->sock);
|
|
|
|
kfree(sock4);
|
2016-04-09 12:46:23 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2016-10-28 09:59:15 -07:00
|
|
|
if (__vxlan_sock_release_prep(sock6)) {
|
|
|
|
udp_tunnel_sock_release(sock6->sock);
|
|
|
|
kfree(sock6);
|
2016-04-09 12:46:23 +02:00
|
|
|
}
|
2015-09-24 13:50:02 +02:00
|
|
|
#endif
|
|
|
|
}
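/* Editorial note on the teardown ordering above: the socket pointers
 * are cleared first (RCU_INIT_POINTER), then synchronize_net() waits
 * out in-flight readers on the receive path, and only afterwards are
 * the references dropped and the sockets freed. Freeing before the
 * grace period could let vxlan_rcv() dereference a stale vxlan_sock.
 */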
|
|
|
|
|
2024-12-05 16:40:52 +01:00
|
|
|
static enum skb_drop_reason vxlan_remcsum(struct sk_buff *skb, u32 vxflags)
|
2015-01-12 17:00:38 -08:00
|
|
|
{
|
2024-12-05 16:40:52 +01:00
|
|
|
const struct vxlanhdr *vh = vxlan_hdr(skb);
|
2024-10-09 10:28:23 +08:00
|
|
|
enum skb_drop_reason reason;
|
2016-03-21 17:50:05 +01:00
|
|
|
size_t start, offset;
|
2015-01-12 17:00:38 -08:00
|
|
|
|
2024-12-05 16:40:52 +01:00
|
|
|
if (!(vh->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
|
2024-12-05 16:40:51 +01:00
|
|
|
return SKB_NOT_DROPPED_YET;
|
2015-08-19 17:07:32 -07:00
|
|
|
|
2024-12-05 16:40:52 +01:00
|
|
|
start = vxlan_rco_start(vh->vx_vni);
|
|
|
|
offset = start + vxlan_rco_offset(vh->vx_vni);
|
2015-01-12 17:00:38 -08:00
|
|
|
|
2024-10-09 10:28:23 +08:00
|
|
|
reason = pskb_may_pull_reason(skb, offset + sizeof(u16));
|
|
|
|
if (reason)
|
|
|
|
return reason;
|
2015-01-12 17:00:38 -08:00
|
|
|
|
2016-02-16 21:58:59 +01:00
|
|
|
skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset,
|
|
|
|
!!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
|
2024-10-09 10:28:23 +08:00
|
|
|
return SKB_NOT_DROPPED_YET;
|
2015-01-12 17:00:38 -08:00
|
|
|
}
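/* A hedged sketch of the remote checksum offload (RCO) encoding
 * decoded above, per the helpers in include/net/vxlan.h: the low VNI
 * bits carry the checksum start (in two-byte units), and a flag bit
 * selects whether the checksum field is the UDP or the TCP one:
 *
 *   start  = vxlan_rco_start(vh->vx_vni);           // payload offset
 *   offset = start + vxlan_rco_offset(vh->vx_vni);  // csum field
 *
 * skb_remcsum_process() then completes the inner checksum from these
 * two offsets.
 */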
|
|
|
|
|
2024-12-05 16:40:52 +01:00
|
|
|
static void vxlan_parse_gbp_hdr(struct sk_buff *skb, u32 vxflags,
|
2016-02-23 18:02:59 +01:00
|
|
|
struct vxlan_metadata *md)
|
2016-02-16 21:59:00 +01:00
|
|
|
{
|
2024-12-05 16:40:52 +01:00
|
|
|
const struct vxlanhdr *vh = vxlan_hdr(skb);
|
|
|
|
const struct vxlanhdr_gbp *gbp;
|
2016-02-23 18:02:59 +01:00
|
|
|
struct metadata_dst *tun_dst;
|
2016-02-16 21:59:01 +01:00
|
|
|
|
2024-12-05 16:40:52 +01:00
|
|
|
gbp = (const struct vxlanhdr_gbp *)vh;
|
|
|
|
|
|
|
|
if (!(vh->vx_flags & VXLAN_HF_GBP))
|
2024-12-05 16:40:51 +01:00
|
|
|
return;
|
2016-02-16 21:59:00 +01:00
|
|
|
|
|
|
|
md->gbp = ntohs(gbp->policy_id);
|
|
|
|
|
2016-02-23 18:02:59 +01:00
|
|
|
tun_dst = (struct metadata_dst *)skb_dst(skb);
|
2016-03-08 12:34:12 -05:00
|
|
|
if (tun_dst) {
|
2024-03-27 16:23:53 +01:00
|
|
|
__set_bit(IP_TUNNEL_VXLAN_OPT_BIT,
|
|
|
|
tun_dst->u.tun_info.key.tun_flags);
|
2016-03-08 12:34:12 -05:00
|
|
|
tun_dst->u.tun_info.options_len = sizeof(*md);
|
|
|
|
}
|
2016-02-16 21:59:00 +01:00
|
|
|
if (gbp->dont_learn)
|
|
|
|
md->gbp |= VXLAN_GBP_DONT_LEARN;
|
|
|
|
|
|
|
|
if (gbp->policy_applied)
|
|
|
|
md->gbp |= VXLAN_GBP_POLICY_APPLIED;
|
2016-02-16 21:59:01 +01:00
|
|
|
|
2016-02-23 18:02:55 +01:00
|
|
|
/* In flow-based mode, GBP is carried in dst_metadata */
|
|
|
|
if (!(vxflags & VXLAN_F_COLLECT_METADATA))
|
|
|
|
skb->mark = md->gbp;
|
2016-02-16 21:59:00 +01:00
|
|
|
}
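/* A hedged configuration example for the Group Based Policy header
 * parsed above (interface name and VNI are illustrative). With GBP
 * enabled, the 16-bit policy id travels in the VXLAN header and is
 * surfaced through skb->mark, or through the metadata dst for
 * collect_md devices:
 *
 *   ip link add vxlan0 type vxlan id 100 dstport 4789 gbp
 */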
|
|
|
|
|
2024-10-09 10:28:25 +08:00
|
|
|
static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan,
|
|
|
|
struct vxlan_sock *vs,
|
|
|
|
struct sk_buff *skb, __be32 vni)
|
2015-07-21 10:44:06 +02:00
|
|
|
{
|
|
|
|
union vxlan_addr saddr;
|
2017-06-19 10:03:59 +02:00
|
|
|
u32 ifindex = skb->dev->ifindex;
|
2015-07-21 10:44:06 +02:00
|
|
|
|
|
|
|
skb_reset_mac_header(skb);
|
|
|
|
skb->protocol = eth_type_trans(skb, vxlan->dev);
|
|
|
|
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
|
|
|
|
|
|
|
|
/* Ignore packet loops (and multicast echo) */
|
|
|
|
if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
|
2024-10-09 10:28:25 +08:00
|
|
|
return SKB_DROP_REASON_LOCAL_MAC;
|
2015-07-21 10:44:06 +02:00
|
|
|
|
2016-02-23 18:02:57 +01:00
|
|
|
/* Get address from the outer IP header */
|
2015-12-07 16:29:08 +01:00
|
|
|
if (vxlan_get_sk_family(vs) == AF_INET) {
|
2016-02-23 18:02:56 +01:00
|
|
|
saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
|
2015-07-21 10:44:06 +02:00
|
|
|
saddr.sa.sa_family = AF_INET;
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
} else {
|
2016-02-23 18:02:56 +01:00
|
|
|
saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
|
2015-07-21 10:44:06 +02:00
|
|
|
saddr.sa.sa_family = AF_INET6;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2024-10-09 10:28:25 +08:00
|
|
|
if (!(vxlan->cfg.flags & VXLAN_F_LEARN))
|
|
|
|
return SKB_NOT_DROPPED_YET;
|
2016-02-23 18:02:56 +01:00
|
|
|
|
2024-10-09 10:28:25 +08:00
|
|
|
return vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source,
|
|
|
|
ifindex, vni);
|
2016-02-23 18:02:56 +01:00
|
|
|
}
|
|
|
|
|
2016-02-23 18:02:57 +01:00
|
|
|
static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
if (vxlan_get_sk_family(vs) == AF_INET)
|
|
|
|
err = IP_ECN_decapsulate(oiph, skb);
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
else
|
|
|
|
err = IP6_ECN_decapsulate(oiph, skb);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if (unlikely(err) && log_ecn_error) {
|
|
|
|
if (vxlan_get_sk_family(vs) == AF_INET)
|
|
|
|
net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
|
|
|
|
&((struct iphdr *)oiph)->saddr,
|
|
|
|
((struct iphdr *)oiph)->tos);
|
|
|
|
else
|
|
|
|
net_info_ratelimited("non-ECT from %pI6\n",
|
|
|
|
&((struct ipv6hdr *)oiph)->saddr);
|
|
|
|
}
|
|
|
|
return err <= 1;
|
|
|
|
}
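/* Note on the return convention above, following the decapsulation
 * rules in include/net/inet_ecn.h (RFC 6040): 0 means the ECN fields
 * were consistent, 1 means an inconsistency that is logged but
 * tolerated, and anything greater means the packet must be dropped,
 * hence the "err <= 1" success test.
 */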
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
/* Callback from net/ipv4/udp.c to receive packets */
|
2016-02-23 18:02:58 +01:00
|
|
|
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
|
2012-10-01 12:32:35 +00:00
|
|
|
{
|
2022-03-01 05:04:38 +00:00
|
|
|
struct vxlan_vni_node *vninode = NULL;
|
2024-12-05 16:40:53 +01:00
|
|
|
const struct vxlanhdr *vh;
|
2016-02-18 11:22:51 +01:00
|
|
|
struct vxlan_dev *vxlan;
|
2013-08-19 11:23:02 -07:00
|
|
|
struct vxlan_sock *vs;
|
2015-07-21 10:43:58 +02:00
|
|
|
struct vxlan_metadata _md;
|
|
|
|
struct vxlan_metadata *md = &_md;
|
2016-04-11 17:06:08 +02:00
|
|
|
__be16 protocol = htons(ETH_P_TEB);
|
2024-10-09 10:28:22 +08:00
|
|
|
enum skb_drop_reason reason;
|
2016-04-05 14:47:13 +02:00
|
|
|
bool raw_proto = false;
|
2016-02-23 18:02:58 +01:00
|
|
|
void *oiph;
|
2017-01-31 22:59:52 -08:00
|
|
|
__be32 vni = 0;
|
2024-04-30 18:50:13 +02:00
|
|
|
int nh;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2016-04-05 14:47:13 +02:00
|
|
|
/* Need the UDP and VXLAN headers to be present */
|
2024-10-09 10:28:22 +08:00
|
|
|
reason = pskb_may_pull_reason(skb, VXLAN_HLEN);
|
|
|
|
if (reason)
|
2016-05-19 15:58:33 +02:00
|
|
|
goto drop;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2024-12-05 16:40:53 +01:00
|
|
|
vh = vxlan_hdr(skb);
|
2016-02-16 21:59:02 +01:00
|
|
|
/* VNI flag always required to be set */
|
2024-12-05 16:40:53 +01:00
|
|
|
if (!(vh->vx_flags & VXLAN_HF_VNI)) {
|
2016-02-16 21:59:02 +01:00
|
|
|
netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
|
2024-12-05 16:40:53 +01:00
|
|
|
ntohl(vh->vx_flags), ntohl(vh->vx_vni));
|
2024-10-09 10:28:22 +08:00
|
|
|
reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
|
2016-02-16 21:59:02 +01:00
|
|
|
/* Return non-VXLAN packet */
|
2016-05-19 15:58:33 +02:00
|
|
|
goto drop;
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
2013-09-24 10:25:40 -07:00
|
|
|
vs = rcu_dereference_sk_user_data(sk);
|
2013-08-19 11:23:02 -07:00
|
|
|
if (!vs)
|
2012-10-01 12:32:35 +00:00
|
|
|
goto drop;
|
|
|
|
|
2024-12-05 16:40:53 +01:00
|
|
|
vni = vxlan_vni(vh->vx_vni);
|
2017-01-31 22:59:52 -08:00
|
|
|
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode);
|
2024-10-09 10:28:22 +08:00
|
|
|
if (!vxlan) {
|
|
|
|
reason = SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND;
|
2016-02-18 11:22:51 +01:00
|
|
|
goto drop;
|
2024-10-09 10:28:22 +08:00
|
|
|
}
|
2016-02-18 11:22:51 +01:00
|
|
|
|
2024-12-05 16:40:54 +01:00
|
|
|
if (vh->vx_flags & vxlan->cfg.reserved_bits.vx_flags ||
|
|
|
|
vh->vx_vni & vxlan->cfg.reserved_bits.vx_vni) {
|
|
|
|
/* If the header uses bits besides those enabled by the
|
|
|
|
* netdevice configuration, treat this as a malformed packet.
|
|
|
|
* This behavior diverges from the VXLAN RFC (RFC 7348), which
|
|
|
|
* stipulates that bits in reserved fields are to be
|
|
|
|
* ignored. The approach here maintains compatibility with
|
|
|
|
* previous stack code, and also is more robust and provides a
|
|
|
|
* little more security in adding extensions to VXLAN.
|
|
|
|
*/
|
|
|
|
reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
|
2024-12-05 16:40:55 +01:00
|
|
|
DEV_STATS_INC(vxlan->dev, rx_frame_errors);
|
|
|
|
DEV_STATS_INC(vxlan->dev, rx_errors);
|
|
|
|
vxlan_vnifilter_count(vxlan, vni, vninode,
|
|
|
|
VXLAN_VNI_STATS_RX_ERRORS, 0);
|
2024-12-05 16:40:54 +01:00
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
2024-12-05 16:40:50 +01:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_GPE) {
|
2024-12-05 16:40:53 +01:00
|
|
|
if (!vxlan_parse_gpe_proto(vh, &protocol))
|
2016-04-05 14:47:13 +02:00
|
|
|
goto drop;
|
|
|
|
raw_proto = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
|
2024-10-09 10:28:22 +08:00
|
|
|
!net_eq(vxlan->net, dev_net(vxlan->dev)))) {
|
|
|
|
reason = SKB_DROP_REASON_NOMEM;
|
2019-12-30 17:52:22 +08:00
|
|
|
goto drop;
|
2024-10-09 10:28:22 +08:00
|
|
|
}
|
2016-02-18 11:22:51 +01:00
|
|
|
|
2024-12-05 16:40:50 +01:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) {
|
2024-12-05 16:40:52 +01:00
|
|
|
reason = vxlan_remcsum(skb, vxlan->cfg.flags);
|
2024-10-09 10:28:23 +08:00
|
|
|
if (unlikely(reason))
|
2020-09-25 15:16:02 +02:00
|
|
|
goto drop;
|
2024-10-09 10:28:23 +08:00
|
|
|
}
|
2020-09-25 15:16:02 +02:00
|
|
|
|
2015-07-21 10:43:58 +02:00
|
|
|
if (vxlan_collect_metadata(vs)) {
|
2024-03-27 16:23:53 +01:00
|
|
|
IP_TUNNEL_DECLARE_FLAGS(flags) = { };
|
2016-02-23 18:02:59 +01:00
|
|
|
struct metadata_dst *tun_dst;
|
2016-02-18 19:19:29 +01:00
|
|
|
|
2024-03-27 16:23:53 +01:00
|
|
|
__set_bit(IP_TUNNEL_KEY_BIT, flags);
|
|
|
|
tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), flags,
|
2016-09-08 16:23:45 +03:00
|
|
|
key32_to_tunnel_id(vni), sizeof(*md));
|
2015-08-26 23:46:50 -07:00
|
|
|
|
2024-10-09 10:28:22 +08:00
|
|
|
if (!tun_dst) {
|
|
|
|
reason = SKB_DROP_REASON_NOMEM;
|
2015-07-21 10:43:58 +02:00
|
|
|
goto drop;
|
2024-10-09 10:28:22 +08:00
|
|
|
}
|
2015-07-21 10:43:58 +02:00
|
|
|
|
2015-09-04 12:49:32 +02:00
|
|
|
md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
|
2016-02-23 18:02:59 +01:00
|
|
|
|
|
|
|
skb_dst_set(skb, (struct dst_entry *)tun_dst);
|
2015-07-21 10:43:58 +02:00
|
|
|
} else {
|
|
|
|
memset(md, 0, sizeof(*md));
|
|
|
|
}
|
|
|
|
|
2024-12-05 16:40:56 +01:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_GBP)
|
2024-12-05 16:40:52 +01:00
|
|
|
vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md);
|
2016-04-05 14:47:13 +02:00
|
|
|
/* Note that GBP and GPE can never be active together. This is
|
|
|
|
* ensured in vxlan_dev_configure.
|
|
|
|
*/
|
|
|
|
|
2016-04-05 14:47:13 +02:00
|
|
|
if (!raw_proto) {
|
2024-10-09 10:28:25 +08:00
|
|
|
reason = vxlan_set_mac(vxlan, vs, skb, vni);
|
|
|
|
if (reason)
|
2016-04-05 14:47:13 +02:00
|
|
|
goto drop;
|
|
|
|
} else {
|
2016-05-13 10:48:42 +02:00
|
|
|
skb_reset_mac_header(skb);
|
2016-04-05 14:47:13 +02:00
|
|
|
skb->dev = vxlan->dev;
|
|
|
|
skb->pkt_type = PACKET_HOST;
|
|
|
|
}
|
2016-02-23 18:02:58 +01:00
|
|
|
|
2024-04-30 18:50:13 +02:00
|
|
|
/* Save offset of outer header relative to skb->head,
|
|
|
|
* because we are going to reset the network header to the inner header
|
|
|
|
* and might change skb->head.
|
|
|
|
*/
|
|
|
|
nh = skb_network_header(skb) - skb->head;
|
|
|
|
|
2016-02-23 18:02:58 +01:00
|
|
|
skb_reset_network_header(skb);
|
|
|
|
|
2024-10-09 10:28:22 +08:00
|
|
|
reason = pskb_inet_may_pull_reason(skb);
|
|
|
|
if (reason) {
|
2024-04-30 18:50:13 +02:00
|
|
|
DEV_STATS_INC(vxlan->dev, rx_length_errors);
|
|
|
|
DEV_STATS_INC(vxlan->dev, rx_errors);
|
|
|
|
vxlan_vnifilter_count(vxlan, vni, vninode,
|
|
|
|
VXLAN_VNI_STATS_RX_ERRORS, 0);
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Get the outer header. */
|
|
|
|
oiph = skb->head + nh;
|
|
|
|
|
2016-02-23 18:02:58 +01:00
|
|
|
if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
|
2024-10-09 10:28:22 +08:00
|
|
|
reason = SKB_DROP_REASON_IP_TUNNEL_ECN;
|
2024-04-26 17:27:17 +02:00
|
|
|
DEV_STATS_INC(vxlan->dev, rx_frame_errors);
|
|
|
|
DEV_STATS_INC(vxlan->dev, rx_errors);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(vxlan, vni, vninode,
|
|
|
|
VXLAN_VNI_STATS_RX_ERRORS, 0);
|
2016-02-23 18:02:58 +01:00
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
2019-03-10 10:36:40 -07:00
|
|
|
rcu_read_lock();
|
|
|
|
|
|
|
|
if (unlikely(!(vxlan->dev->flags & IFF_UP))) {
|
|
|
|
rcu_read_unlock();
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_rx_dropped(vxlan->dev);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(vxlan, vni, vninode,
|
|
|
|
VXLAN_VNI_STATS_RX_DROPS, 0);
|
2024-10-09 10:28:22 +08:00
|
|
|
reason = SKB_DROP_REASON_DEV_READY;
|
2019-03-10 10:36:40 -07:00
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_rx_add(vxlan->dev, skb->len);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len);
|
2016-02-23 18:02:58 +01:00
|
|
|
gro_cells_receive(&vxlan->gro_cells, skb);
|
2019-03-10 10:36:40 -07:00
|
|
|
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
2013-08-19 11:23:02 -07:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
drop:
|
2024-10-09 10:28:22 +08:00
|
|
|
reason = reason ?: SKB_DROP_REASON_NOT_SPECIFIED;
|
2016-02-16 21:59:02 +01:00
|
|
|
/* Consume bad packet */
|
2024-10-09 10:28:22 +08:00
|
|
|
kfree_skb_reason(skb, reason);
|
2016-02-16 21:59:02 +01:00
|
|
|
return 0;
|
2013-08-19 11:23:02 -07:00
|
|
|
}
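/* Editorial summary of the receive path above, as a sketch:
 *
 *   pull UDP+VXLAN headers -> require the VNI flag -> look up the
 *   vxlan_sock and device by VNI -> reject reserved header bits ->
 *   (GPE: derive the inner protocol) -> pull the outer header ->
 *   (RCO: fold the remote checksum) -> (collect_md: attach a metadata
 *   dst) -> (GBP: extract the policy id) -> set the inner MAC and
 *   optionally learn the source -> ECN decapsulation -> hand off via
 *   gro_cells_receive()
 *
 * Every failure leaves through the single "drop" label with a
 * specific skb_drop_reason for tracing.
 */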
|
|
|
|
|
2018-11-08 12:19:15 +01:00
|
|
|
/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */
|
|
|
|
static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan;
|
|
|
|
struct vxlan_sock *vs;
|
|
|
|
struct vxlanhdr *hdr;
|
|
|
|
__be32 vni;
|
|
|
|
|
2019-06-11 00:27:05 +02:00
|
|
|
if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN))
|
2018-11-08 12:19:15 +01:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
hdr = vxlan_hdr(skb);
|
|
|
|
|
|
|
|
if (!(hdr->vx_flags & VXLAN_HF_VNI))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
vs = rcu_dereference_sk_user_data(sk);
|
|
|
|
if (!vs)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
vni = vxlan_vni(hdr->vx_vni);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, NULL);
|
2018-11-08 12:19:15 +01:00
|
|
|
if (!vxlan)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-01-31 22:59:52 -08:00
|
|
|
static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
|
2012-11-20 02:50:14 +00:00
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
struct arphdr *parp;
|
|
|
|
u8 *arpptr, *sha;
|
|
|
|
__be32 sip, tip;
|
|
|
|
struct neighbour *n;
|
|
|
|
|
|
|
|
if (dev->flags & IFF_NOARP)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_tx_dropped(dev);
|
2024-04-26 17:27:19 +02:00
|
|
|
vxlan_vnifilter_count(vxlan, vni, NULL,
|
|
|
|
VXLAN_VNI_STATS_TX_DROPS, 0);
|
2012-11-20 02:50:14 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
parp = arp_hdr(skb);
|
|
|
|
|
|
|
|
if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
|
|
|
|
parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
|
|
|
|
parp->ar_pro != htons(ETH_P_IP) ||
|
|
|
|
parp->ar_op != htons(ARPOP_REQUEST) ||
|
|
|
|
parp->ar_hln != dev->addr_len ||
|
|
|
|
parp->ar_pln != 4)
|
|
|
|
goto out;
|
|
|
|
arpptr = (u8 *)parp + sizeof(struct arphdr);
|
|
|
|
sha = arpptr;
|
|
|
|
arpptr += dev->addr_len; /* sha */
|
|
|
|
memcpy(&sip, arpptr, sizeof(sip));
|
|
|
|
arpptr += sizeof(sip);
|
|
|
|
arpptr += dev->addr_len; /* tha */
|
|
|
|
memcpy(&tip, arpptr, sizeof(tip));
|
|
|
|
|
|
|
|
if (ipv4_is_loopback(tip) ||
|
|
|
|
ipv4_is_multicast(tip))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
n = neigh_lookup(&arp_tbl, &tip, dev);
|
|
|
|
|
|
|
|
if (n) {
|
|
|
|
struct vxlan_fdb *f;
|
|
|
|
struct sk_buff *reply;
|
|
|
|
|
2023-03-13 20:17:31 +00:00
|
|
|
if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) {
|
2012-11-20 02:50:14 +00:00
|
|
|
neigh_release(n);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2017-01-31 22:59:52 -08:00
|
|
|
f = vxlan_find_mac(vxlan, n->ha, vni);
|
2013-08-31 13:44:33 +08:00
|
|
|
if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
|
2012-11-20 02:50:14 +00:00
|
|
|
/* bridge-local neighbor */
|
|
|
|
neigh_release(n);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
|
|
|
|
n->ha, sha);
|
|
|
|
|
|
|
|
neigh_release(n);
|
|
|
|
|
2014-03-18 12:32:29 -04:00
|
|
|
if (!reply)
|
|
|
|
goto out;
|
|
|
|
|
2012-11-20 02:50:14 +00:00
|
|
|
skb_reset_mac_header(reply);
|
|
|
|
__skb_pull(reply, skb_network_offset(reply));
|
|
|
|
reply->ip_summed = CHECKSUM_UNNECESSARY;
|
|
|
|
reply->pkt_type = PACKET_HOST;
|
|
|
|
|
2022-03-06 22:57:46 +01:00
|
|
|
if (netif_rx(reply) == NET_RX_DROP) {
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_rx_dropped(dev);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(vxlan, vni, NULL,
|
|
|
|
VXLAN_VNI_STATS_RX_DROPS, 0);
|
|
|
|
}
|
|
|
|
|
2017-06-19 10:03:56 +02:00
|
|
|
} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
|
2013-08-31 13:44:33 +08:00
|
|
|
union vxlan_addr ipa = {
|
|
|
|
.sin.sin_addr.s_addr = tip,
|
2014-08-22 21:34:16 +02:00
|
|
|
.sin.sin_family = AF_INET,
|
2013-08-31 13:44:33 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
vxlan_ip_miss(dev, &ipa);
|
|
|
|
}
|
2012-11-20 02:50:14 +00:00
|
|
|
out:
|
|
|
|
consume_skb(skb);
|
|
|
|
return NETDEV_TX_OK;
|
|
|
|
}
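/* A hedged usage sketch for the ARP suppression above (names
 * illustrative): with "proxy" set the device answers ARP requests out
 * of its fdb instead of flooding them, and with "l3miss" unknown
 * targets are reported to userspace via vxlan_ip_miss():
 *
 *   ip link add vxlan0 type vxlan id 100 dstport 4789 proxy l3miss
 */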
|
|
|
|
|
2013-08-31 13:44:36 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2014-03-24 10:39:58 -04:00
|
|
|
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
|
|
|
|
struct neighbour *n, bool isrouter)
|
|
|
|
{
|
|
|
|
struct net_device *dev = request->dev;
|
|
|
|
struct sk_buff *reply;
|
|
|
|
struct nd_msg *ns, *na;
|
|
|
|
struct ipv6hdr *pip6;
|
|
|
|
u8 *daddr;
|
|
|
|
int na_olen = 8; /* opt hdr + ETH_ALEN for target */
|
|
|
|
int ns_olen;
|
|
|
|
int i, len;
|
|
|
|
|
2017-04-02 11:00:06 +02:00
|
|
|
if (!dev || !pskb_may_pull(request, request->len))
|
2014-03-24 10:39:58 -04:00
|
|
|
return NULL;
|
|
|
|
|
|
|
|
len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
|
|
|
|
sizeof(*na) + na_olen + dev->needed_tailroom;
|
|
|
|
reply = alloc_skb(len, GFP_ATOMIC);
|
|
|
|
if (reply == NULL)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
reply->protocol = htons(ETH_P_IPV6);
|
|
|
|
reply->dev = dev;
|
|
|
|
skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
|
|
|
|
skb_push(reply, sizeof(struct ethhdr));
|
2016-03-03 01:16:54 +00:00
|
|
|
skb_reset_mac_header(reply);
|
2014-03-24 10:39:58 -04:00
|
|
|
|
2017-04-02 11:00:06 +02:00
|
|
|
ns = (struct nd_msg *)(ipv6_hdr(request) + 1);
|
2014-03-24 10:39:58 -04:00
|
|
|
|
|
|
|
daddr = eth_hdr(request)->h_source;
|
2017-04-02 11:00:06 +02:00
|
|
|
ns_olen = request->len - skb_network_offset(request) -
|
|
|
|
sizeof(struct ipv6hdr) - sizeof(*ns);
|
2014-03-24 10:39:58 -04:00
|
|
|
for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
|
2020-06-01 15:58:55 +03:00
|
|
|
if (!ns->opt[i + 1]) {
|
|
|
|
kfree_skb(reply);
|
|
|
|
return NULL;
|
|
|
|
}
|
2014-03-24 10:39:58 -04:00
|
|
|
if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
|
|
|
|
daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Ethernet header */
|
|
|
|
ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
|
|
|
|
ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
|
|
|
|
eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
|
|
|
|
reply->protocol = htons(ETH_P_IPV6);
|
|
|
|
|
|
|
|
skb_pull(reply, sizeof(struct ethhdr));
|
2016-03-03 01:16:54 +00:00
|
|
|
skb_reset_network_header(reply);
|
2014-03-24 10:39:58 -04:00
|
|
|
skb_put(reply, sizeof(struct ipv6hdr));
|
|
|
|
|
|
|
|
/* IPv6 header */
|
|
|
|
|
|
|
|
pip6 = ipv6_hdr(reply);
|
|
|
|
memset(pip6, 0, sizeof(struct ipv6hdr));
|
|
|
|
pip6->version = 6;
|
|
|
|
pip6->priority = ipv6_hdr(request)->priority;
|
|
|
|
pip6->nexthdr = IPPROTO_ICMPV6;
|
|
|
|
pip6->hop_limit = 255;
|
|
|
|
pip6->daddr = ipv6_hdr(request)->saddr;
|
|
|
|
pip6->saddr = *(struct in6_addr *)n->primary_key;
|
|
|
|
|
|
|
|
skb_pull(reply, sizeof(struct ipv6hdr));
|
2016-03-03 01:16:54 +00:00
|
|
|
skb_reset_transport_header(reply);
|
2014-03-24 10:39:58 -04:00
|
|
|
|
|
|
|
/* Neighbor Advertisement */
|
networking: convert many more places to skb_put_zero()
There were many places that my previous spatch didn't find,
as pointed out by yuan linyu in various patches.
The following spatch found many more and also removes the
now unnecessary casts:
@@
identifier p, p2;
expression len;
expression skb;
type t, t2;
@@
(
-p = skb_put(skb, len);
+p = skb_put_zero(skb, len);
|
-p = (t)skb_put(skb, len);
+p = skb_put_zero(skb, len);
)
... when != p
(
p2 = (t2)p;
-memset(p2, 0, len);
|
-memset(p, 0, len);
)
@@
type t, t2;
identifier p, p2;
expression skb;
@@
t *p;
...
(
-p = skb_put(skb, sizeof(t));
+p = skb_put_zero(skb, sizeof(t));
|
-p = (t *)skb_put(skb, sizeof(t));
+p = skb_put_zero(skb, sizeof(t));
)
... when != p
(
p2 = (t2)p;
-memset(p2, 0, sizeof(*p));
|
-memset(p, 0, sizeof(*p));
)
@@
expression skb, len;
@@
-memset(skb_put(skb, len), 0, len);
+skb_put_zero(skb, len);
Apply it to the tree (with one manual fixup to keep the
comment in vxlan.c, which spatch removed.)
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-16 14:29:19 +02:00
|
|
|
na = skb_put_zero(reply, sizeof(*na) + na_olen);
|
2014-03-24 10:39:58 -04:00
|
|
|
na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
|
|
|
|
na->icmph.icmp6_router = isrouter;
|
|
|
|
na->icmph.icmp6_override = 1;
|
|
|
|
na->icmph.icmp6_solicited = 1;
|
|
|
|
na->target = ns->target;
|
|
|
|
ether_addr_copy(&na->opt[2], n->ha);
|
|
|
|
na->opt[0] = ND_OPT_TARGET_LL_ADDR;
|
|
|
|
na->opt[1] = na_olen >> 3;
|
|
|
|
|
|
|
|
na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
|
|
|
|
&pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
|
|
|
|
csum_partial(na, sizeof(*na)+na_olen, 0));
|
|
|
|
|
|
|
|
pip6->payload_len = htons(sizeof(*na)+na_olen);
|
|
|
|
|
|
|
|
skb_push(reply, sizeof(struct ipv6hdr));
|
|
|
|
|
|
|
|
reply->ip_summed = CHECKSUM_UNNECESSARY;
|
|
|
|
|
|
|
|
return reply;
|
|
|
|
}
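A note on the option arithmetic above: on Ethernet, a target link-layer
address option occupies exactly 8 octets (a 2-byte type/length header plus
the 6-byte MAC), and ND option lengths are counted in 8-octet units, which
is why na_olen is 8 and na->opt[1] ends up as 1. A minimal userspace sketch
of the same layout (fill_tll_opt is a hypothetical helper, not driver code):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	#define ETH_ALEN		6
	#define ND_OPT_TARGET_LL_ADDR	2	/* RFC 4861 option type */

	/* Lay out a target link-layer address option the way
	 * vxlan_na_create() fills na->opt[]; returns the option
	 * length in octets.
	 */
	static size_t fill_tll_opt(uint8_t *opt, const uint8_t mac[ETH_ALEN])
	{
		size_t olen = 2 + ETH_ALEN;	/* type + len + MAC = 8 */

		opt[0] = ND_OPT_TARGET_LL_ADDR;
		opt[1] = olen >> 3;		/* length in 8-octet units: 1 */
		memcpy(&opt[2], mac, ETH_ALEN);
		return olen;
	}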
|
|
|
|
|
2017-01-31 22:59:52 -08:00
|
|
|
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
|
2013-08-31 13:44:36 +08:00
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
2017-02-20 08:41:16 -08:00
|
|
|
const struct in6_addr *daddr;
|
2017-11-11 19:58:50 +08:00
|
|
|
const struct ipv6hdr *iphdr;
|
2014-03-24 10:39:58 -04:00
|
|
|
struct inet6_dev *in6_dev;
|
2017-11-11 19:58:50 +08:00
|
|
|
struct neighbour *n;
|
|
|
|
struct nd_msg *msg;
|
2013-08-31 13:44:36 +08:00
|
|
|
|
vxlan: add missing rcu_read_lock() in neigh_reduce()
syzbot complained in neigh_reduce(), because rcu_read_lock_bh()
is treated differently than rcu_read_lock()
WARNING: suspicious RCU usage
5.13.0-rc6-syzkaller #0 Not tainted
-----------------------------
include/net/addrconf.h:313 suspicious rcu_dereference_check() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
3 locks held by kworker/0:0/5:
#0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline]
#0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: atomic64_set include/asm-generic/atomic-instrumented.h:856 [inline]
#0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: atomic_long_set include/asm-generic/atomic-long.h:41 [inline]
#0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: set_work_data kernel/workqueue.c:617 [inline]
#0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: set_work_pool_and_clear_pending kernel/workqueue.c:644 [inline]
#0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x871/0x1600 kernel/workqueue.c:2247
#1: ffffc90000ca7da8 ((work_completion)(&port->wq)){+.+.}-{0:0}, at: process_one_work+0x8a5/0x1600 kernel/workqueue.c:2251
#2: ffffffff8bf795c0 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x1da/0x3130 net/core/dev.c:4180
stack backtrace:
CPU: 0 PID: 5 Comm: kworker/0:0 Not tainted 5.13.0-rc6-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: events ipvlan_process_multicast
Call Trace:
__dump_stack lib/dump_stack.c:79 [inline]
dump_stack+0x141/0x1d7 lib/dump_stack.c:120
__in6_dev_get include/net/addrconf.h:313 [inline]
__in6_dev_get include/net/addrconf.h:311 [inline]
neigh_reduce drivers/net/vxlan.c:2167 [inline]
vxlan_xmit+0x34d5/0x4c30 drivers/net/vxlan.c:2919
__netdev_start_xmit include/linux/netdevice.h:4944 [inline]
netdev_start_xmit include/linux/netdevice.h:4958 [inline]
xmit_one net/core/dev.c:3654 [inline]
dev_hard_start_xmit+0x1eb/0x920 net/core/dev.c:3670
__dev_queue_xmit+0x2133/0x3130 net/core/dev.c:4246
ipvlan_process_multicast+0xa99/0xd70 drivers/net/ipvlan/ipvlan_core.c:287
process_one_work+0x98d/0x1600 kernel/workqueue.c:2276
worker_thread+0x64c/0x1120 kernel/workqueue.c:2422
kthread+0x3b1/0x4a0 kernel/kthread.c:313
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
Fixes: f564f45c4518 ("vxlan: add ipv6 proxy support")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-21 07:44:17 -07:00
|
|
|
rcu_read_lock();
|
2013-08-31 13:44:36 +08:00
|
|
|
in6_dev = __in6_dev_get(dev);
|
|
|
|
if (!in6_dev)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
iphdr = ipv6_hdr(skb);
|
|
|
|
daddr = &iphdr->daddr;
|
2017-04-02 11:00:06 +02:00
|
|
|
msg = (struct nd_msg *)(iphdr + 1);
|
2013-08-31 13:44:36 +08:00
|
|
|
|
2014-03-24 10:39:58 -04:00
|
|
|
if (ipv6_addr_loopback(daddr) ||
|
|
|
|
ipv6_addr_is_multicast(&msg->target))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);
|
2013-08-31 13:44:36 +08:00
|
|
|
|
|
|
|
if (n) {
|
|
|
|
struct vxlan_fdb *f;
|
2014-03-24 10:39:58 -04:00
|
|
|
struct sk_buff *reply;
|
2013-08-31 13:44:36 +08:00
|
|
|
|
2023-03-13 20:17:31 +00:00
|
|
|
if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) {
|
2013-08-31 13:44:36 +08:00
|
|
|
neigh_release(n);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2017-01-31 22:59:52 -08:00
|
|
|
f = vxlan_find_mac(vxlan, n->ha, vni);
|
2013-08-31 13:44:36 +08:00
|
|
|
if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
|
|
|
|
/* bridge-local neighbor */
|
|
|
|
neigh_release(n);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2014-03-24 10:39:58 -04:00
|
|
|
reply = vxlan_na_create(skb, n,
|
|
|
|
!!(f ? f->flags & NTF_ROUTER : 0));
|
|
|
|
|
2013-08-31 13:44:36 +08:00
|
|
|
neigh_release(n);
|
2014-03-24 10:39:58 -04:00
|
|
|
|
|
|
|
if (reply == NULL)
|
|
|
|
goto out;
|
|
|
|
|
2022-03-06 22:57:46 +01:00
|
|
|
if (netif_rx(reply) == NET_RX_DROP) {
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_rx_dropped(dev);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(vxlan, vni, NULL,
|
|
|
|
VXLAN_VNI_STATS_RX_DROPS, 0);
|
|
|
|
}
|
2017-06-19 10:03:56 +02:00
|
|
|
} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
|
2014-03-24 10:39:58 -04:00
|
|
|
union vxlan_addr ipa = {
|
|
|
|
.sin6.sin6_addr = msg->target,
|
2014-08-22 21:34:16 +02:00
|
|
|
.sin6.sin6_family = AF_INET6,
|
2014-03-24 10:39:58 -04:00
|
|
|
};
|
|
|
|
|
2013-08-31 13:44:36 +08:00
|
|
|
vxlan_ip_miss(dev, &ipa);
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2021-06-21 07:44:17 -07:00
|
|
|
rcu_read_unlock();
|
2013-08-31 13:44:36 +08:00
|
|
|
consume_skb(skb);
|
|
|
|
return NETDEV_TX_OK;
|
|
|
|
}
|
|
|
|
#endif
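The proxy path above bails out for loopback destinations and multicast
targets before generating a reply. Both address tests reduce to cheap
prefix/equality checks; roughly (a simplified userspace sketch of what
ipv6_addr_is_multicast() and ipv6_addr_loopback() test, not the kernel
implementations):

	#include <netinet/in.h>
	#include <stdbool.h>
	#include <string.h>

	/* IPv6 multicast addresses are ff00::/8. */
	static bool addr_is_multicast(const struct in6_addr *a)
	{
		return a->s6_addr[0] == 0xff;
	}

	/* The IPv6 loopback address is ::1. */
	static bool addr_is_loopback(const struct in6_addr *a)
	{
		return memcmp(a, &in6addr_loopback, sizeof(*a)) == 0;
	}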
|
|
|
|
|
2012-11-20 02:50:14 +00:00
|
|
|
static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
struct neighbour *n;
|
|
|
|
|
|
|
|
if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
n = NULL;
|
|
|
|
switch (ntohs(eth_hdr(skb)->h_proto)) {
|
|
|
|
case ETH_P_IP:
|
2013-08-31 13:44:34 +08:00
|
|
|
{
|
|
|
|
struct iphdr *pip;
|
|
|
|
|
2012-11-20 02:50:14 +00:00
|
|
|
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
|
|
|
|
return false;
|
|
|
|
pip = ip_hdr(skb);
|
|
|
|
n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
|
2017-06-19 10:03:56 +02:00
|
|
|
if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
|
2013-08-31 13:44:33 +08:00
|
|
|
union vxlan_addr ipa = {
|
|
|
|
.sin.sin_addr.s_addr = pip->daddr,
|
2014-08-22 21:34:16 +02:00
|
|
|
.sin.sin_family = AF_INET,
|
2013-08-31 13:44:33 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
vxlan_ip_miss(dev, &ipa);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-11-20 02:50:14 +00:00
|
|
|
break;
|
2013-08-31 13:44:34 +08:00
|
|
|
}
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
case ETH_P_IPV6:
|
|
|
|
{
|
|
|
|
struct ipv6hdr *pip6;
|
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
|
|
|
|
return false;
|
|
|
|
pip6 = ipv6_hdr(skb);
|
|
|
|
n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
|
2017-06-19 10:03:56 +02:00
|
|
|
if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
|
2013-08-31 13:44:34 +08:00
|
|
|
union vxlan_addr ipa = {
|
|
|
|
.sin6.sin6_addr = pip6->daddr,
|
2014-08-22 21:34:16 +02:00
|
|
|
.sin6.sin6_family = AF_INET6,
|
2013-08-31 13:44:34 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
vxlan_ip_miss(dev, &ipa);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
#endif
|
2012-11-20 02:50:14 +00:00
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (n) {
|
|
|
|
bool diff;
|
|
|
|
|
drivers/net: Convert uses of compare_ether_addr to ether_addr_equal
Use the new bool function ether_addr_equal to add
some clarity and reduce the likelihood for misuse
of compare_ether_addr for sorting.
Done via cocci script: (and a little typing)
$ cat compare_ether_addr.cocci
@@
expression a,b;
@@
- !compare_ether_addr(a, b)
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- compare_ether_addr(a, b)
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- !ether_addr_equal(a, b) == 0
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- !ether_addr_equal(a, b) != 0
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- ether_addr_equal(a, b) == 0
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- ether_addr_equal(a, b) != 0
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- !!ether_addr_equal(a, b)
+ ether_addr_equal(a, b)
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-09-01 11:51:23 -07:00
|
|
|
diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
|
2012-11-20 02:50:14 +00:00
|
|
|
if (diff) {
|
|
|
|
memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
|
|
|
|
dev->addr_len);
|
|
|
|
memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
|
|
|
|
}
|
|
|
|
neigh_release(n);
|
|
|
|
return diff;
|
2013-08-31 13:44:33 +08:00
|
|
|
}
|
|
|
|
|
2012-11-20 02:50:14 +00:00
|
|
|
return false;
|
|
|
|
}
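The effect of the diff branch above is to turn the frame around at L2: the
original destination MAC becomes the source and the neighbour's resolved MAC
becomes the new destination, so the frame can be forwarded straight to the
right tunnel peer. A standalone sketch of that rewrite (hypothetical struct
and helper, assuming 6-byte Ethernet addresses):

	#include <stdbool.h>
	#include <stdint.h>
	#include <string.h>

	#define ETH_ALEN 6

	struct eth_addrs {
		uint8_t h_dest[ETH_ALEN];
		uint8_t h_source[ETH_ALEN];
	};

	/* Retarget a frame at the resolved neighbour, as
	 * route_shortcircuit() does when the looked-up MAC differs
	 * from the frame's destination.
	 */
	static bool redirect_to_neigh(struct eth_addrs *eth,
				      const uint8_t neigh_mac[ETH_ALEN])
	{
		if (memcmp(eth->h_dest, neigh_mac, ETH_ALEN) == 0)
			return false;		/* nothing to rewrite */
		memcpy(eth->h_source, eth->h_dest, ETH_ALEN);
		memcpy(eth->h_dest, neigh_mac, ETH_ALEN);
		return true;			/* headers rewritten */
	}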
|
|
|
|
|
2023-03-16 09:07:54 +02:00
|
|
|
static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, __be16 protocol)
|
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
an Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation; an IP header is encapsulated instead. The patch
does support encapsulating Ethernet, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out; it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
|
|
|
{
|
|
|
|
struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;
|
|
|
|
|
|
|
|
gpe->np_applied = 1;
|
2017-08-28 21:43:22 +02:00
|
|
|
gpe->next_protocol = tun_p_from_eth_p(protocol);
|
|
|
|
if (!gpe->next_protocol)
|
|
|
|
return -EPFNOSUPPORT;
|
|
|
|
return 0;
|
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
|
|
|
}
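tun_p_from_eth_p() translates the inner EtherType into the VXLAN-GPE "Next
Protocol" registry; unsupported protocols yield 0, which the code above
turns into -EPFNOSUPPORT. A sketch of that mapping, with registry values
taken from the VXLAN-GPE draft as an assumption (the kernel's authoritative
table lives in net/tun_proto.h; gpe_next_protocol is a hypothetical name):

	#include <stdint.h>

	#define ETH_P_IP	0x0800
	#define ETH_P_IPV6	0x86DD
	#define ETH_P_TEB	0x6558	/* transparent Ethernet bridging */

	/* VXLAN-GPE Next Protocol values per draft-ietf-nvo3-vxlan-gpe. */
	static uint8_t gpe_next_protocol(uint16_t eth_proto)
	{
		switch (eth_proto) {
		case ETH_P_IP:		return 0x01;	/* IPv4 */
		case ETH_P_IPV6:	return 0x02;	/* IPv6 */
		case ETH_P_TEB:		return 0x03;	/* Ethernet */
		default:		return 0;	/* unsupported */
		}
	}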
|
|
|
|
|
2016-02-02 18:09:16 +01:00
|
|
|
static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
|
|
|
|
int iphdr_len, __be32 vni,
|
|
|
|
struct vxlan_metadata *md, u32 vxflags,
|
2016-02-02 18:09:15 +01:00
|
|
|
bool udp_sum)
|
2013-08-31 13:44:33 +08:00
|
|
|
{
|
|
|
|
struct vxlanhdr *vxh;
|
|
|
|
int min_headroom;
|
|
|
|
int err;
|
2015-01-12 17:00:38 -08:00
|
|
|
int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
|
2016-04-05 14:47:13 +02:00
|
|
|
__be16 inner_protocol = htons(ETH_P_TEB);
|
2015-01-12 17:00:38 -08:00
|
|
|
|
2015-01-20 11:23:05 -08:00
|
|
|
if ((vxflags & VXLAN_F_REMCSUM_TX) &&
|
2015-01-12 17:00:38 -08:00
|
|
|
skb->ip_summed == CHECKSUM_PARTIAL) {
|
|
|
|
int csum_start = skb_checksum_start_offset(skb);
|
|
|
|
|
|
|
|
if (csum_start <= VXLAN_MAX_REMCSUM_START &&
|
|
|
|
!(csum_start & VXLAN_RCO_SHIFT_MASK) &&
|
|
|
|
(skb->csum_offset == offsetof(struct udphdr, check) ||
|
2016-02-11 20:57:17 +00:00
|
|
|
skb->csum_offset == offsetof(struct tcphdr, check)))
|
2015-01-12 17:00:38 -08:00
|
|
|
type |= SKB_GSO_TUNNEL_REMCSUM;
|
|
|
|
}
|
2013-08-31 13:44:33 +08:00
|
|
|
|
|
|
|
min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
|
2016-11-13 20:43:52 -08:00
|
|
|
+ VXLAN_HLEN + iphdr_len;
|
2013-08-19 11:23:22 -07:00
|
|
|
|
|
|
|
/* Need space for new headers (invalidates iph ptr) */
|
|
|
|
err = skb_cow_head(skb, min_headroom);
|
2016-04-05 14:47:13 +02:00
|
|
|
if (unlikely(err))
|
2016-11-13 20:43:54 -08:00
|
|
|
return err;
|
2013-08-19 11:23:22 -07:00
|
|
|
|
2016-04-14 15:33:37 -04:00
|
|
|
err = iptunnel_handle_offloads(skb, type);
|
|
|
|
if (err)
|
2016-11-13 20:43:54 -08:00
|
|
|
return err;
|
2015-04-09 11:19:14 -07:00
|
|
|
|
networking: make skb_push & __skb_push return void pointers
It seems like a historic accident that these return unsigned char *,
and in many places that means casts are required, more often than not.
Make these functions return void * and remove all the casts across
the tree, adding a (u8 *) cast only where the unsigned char pointer
was used directly, all done with the following spatch:
@@
expression SKB, LEN;
typedef u8;
identifier fn = { skb_push, __skb_push, skb_push_rcsum };
@@
- *(fn(SKB, LEN))
+ *(u8 *)fn(SKB, LEN)
@@
expression E, SKB, LEN;
identifier fn = { skb_push, __skb_push, skb_push_rcsum };
type T;
@@
- E = ((T *)(fn(SKB, LEN)))
+ E = fn(SKB, LEN)
@@
expression SKB, LEN;
identifier fn = { skb_push, __skb_push, skb_push_rcsum };
@@
- fn(SKB, LEN)[0]
+ *(u8 *)fn(SKB, LEN)
Note that the last part there converts from push(...)[0] to the
more idiomatic *(u8 *)push(...).
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-16 14:29:23 +02:00
|
|
|
vxh = __skb_push(skb, sizeof(*vxh));
|
2016-02-16 21:58:58 +01:00
|
|
|
vxh->vx_flags = VXLAN_HF_VNI;
|
|
|
|
vxh->vx_vni = vxlan_vni_field(vni);
|
2013-08-19 11:23:17 -07:00
|
|
|
|
2015-01-12 17:00:38 -08:00
|
|
|
if (type & SKB_GSO_TUNNEL_REMCSUM) {
|
2016-02-16 21:58:58 +01:00
|
|
|
unsigned int start;
|
2015-01-12 17:00:38 -08:00
|
|
|
|
2016-02-16 21:58:58 +01:00
|
|
|
start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
|
|
|
|
vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
|
|
|
|
vxh->vx_flags |= VXLAN_HF_RCO;
|
2015-01-12 17:00:38 -08:00
|
|
|
|
|
|
|
if (!skb_is_gso(skb)) {
|
|
|
|
skb->ip_summed = CHECKSUM_NONE;
|
|
|
|
skb->encapsulation = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-20 11:23:05 -08:00
|
|
|
if (vxflags & VXLAN_F_GBP)
|
2023-03-16 09:07:54 +02:00
|
|
|
vxlan_build_gbp_hdr(vxh, md);
|
2016-04-05 14:47:13 +02:00
|
|
|
if (vxflags & VXLAN_F_GPE) {
|
2023-03-16 09:07:54 +02:00
|
|
|
err = vxlan_build_gpe_hdr(vxh, skb->protocol);
|
2016-04-05 14:47:13 +02:00
|
|
|
if (err < 0)
|
2016-11-13 20:43:54 -08:00
|
|
|
return err;
|
2016-04-05 14:47:13 +02:00
|
|
|
inner_protocol = skb->protocol;
|
|
|
|
}
|
vxlan: Group Policy extension
Implements support for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata are mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark, the
upper 16 bits are used for flags.
SELinux allows managing labels to secure local resources. However,
distributed applications require ACLs to be implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow mapping security contexts to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
Host 1:                                  Host 2:
Group A        Group B                   Group B     Group A
+-----+        +-------------+           +-------+   +-----+
| lxc |        | SELinux CTX |           | httpd |   | VM  |
+--+--+        +--+----------+           +---+---+   +--+--+
   \-------+------/                          \-----+----/
           |                                       |
       +---+---+                               +---+---+
       | vxlan |                               | vxlan |
       +---+---+                               +---+---+
           +---------------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
OVS:
# ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
# ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 03:53:55 +01:00
|
|
|
|
2016-04-05 14:47:13 +02:00
|
|
|
skb_set_inner_protocol(skb, inner_protocol);
|
2015-12-24 14:34:54 -08:00
|
|
|
return 0;
|
2013-08-19 11:23:17 -07:00
|
|
|
}
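For orientation, the base header pushed above is the 8-byte RFC 7348
layout: a flags word whose I bit marks a valid VNI, and the 24-bit VNI
carried in the upper bits of the second word with the low 8 bits reserved.
A minimal userspace sketch of the same wire format (fill_vxlan_hdr is a
hypothetical helper; assumes a host-order vni below 2^24):

	#include <arpa/inet.h>
	#include <stdint.h>

	struct vxlan_wire_hdr {
		uint32_t vx_flags;	/* I flag = bit 27 of first word */
		uint32_t vx_vni;	/* VNI in bits 8..31, bits 0..7 rsvd */
	};

	static void fill_vxlan_hdr(struct vxlan_wire_hdr *vxh, uint32_t vni)
	{
		vxh->vx_flags = htonl(1u << 27);	/* VXLAN_HF_VNI */
		vxh->vx_vni = htonl(vni << 8);		/* vxlan_vni_field() */
	}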
|
|
|
|
|
2013-04-02 12:31:52 +00:00
|
|
|
/* Bypass encapsulation if the destination is local */
|
|
|
|
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
|
2020-08-04 07:53:44 +02:00
|
|
|
struct vxlan_dev *dst_vxlan, __be32 vni,
|
|
|
|
bool snoop)
|
2013-04-02 12:31:52 +00:00
|
|
|
{
|
2013-08-31 13:44:33 +08:00
|
|
|
union vxlan_addr loopback;
|
|
|
|
union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
|
2024-12-04 13:11:27 +01:00
|
|
|
unsigned int len = skb->len;
|
2019-02-07 12:27:38 -08:00
|
|
|
struct net_device *dev;
|
2013-04-02 12:31:52 +00:00
|
|
|
|
|
|
|
skb->pkt_type = PACKET_HOST;
|
|
|
|
skb->encapsulation = 0;
|
|
|
|
skb->dev = dst_vxlan->dev;
|
|
|
|
__skb_pull(skb, skb_network_offset(skb));
|
|
|
|
|
2013-08-31 13:44:33 +08:00
|
|
|
if (remote_ip->sa.sa_family == AF_INET) {
|
|
|
|
loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
|
|
|
|
loopback.sa.sa_family = AF_INET;
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
} else {
|
|
|
|
loopback.sin6.sin6_addr = in6addr_loopback;
|
|
|
|
loopback.sa.sa_family = AF_INET6;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2019-02-07 12:27:38 -08:00
|
|
|
rcu_read_lock();
|
|
|
|
dev = skb->dev;
|
|
|
|
if (unlikely(!(dev->flags & IFF_UP))) {
|
2024-10-09 10:28:29 +08:00
|
|
|
kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY);
|
2019-02-07 12:27:38 -08:00
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
2020-08-04 07:53:44 +02:00
|
|
|
if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop)
|
2019-02-07 12:27:38 -08:00
|
|
|
vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);
|
2013-04-02 12:31:52 +00:00
|
|
|
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_tx_add(src_vxlan->dev, len);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len);
|
2013-04-02 12:31:52 +00:00
|
|
|
|
2022-02-12 00:38:38 +01:00
|
|
|
if (__netif_rx(skb) == NET_RX_SUCCESS) {
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_rx_add(dst_vxlan->dev, len);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX,
|
|
|
|
len);
|
2013-04-02 12:31:52 +00:00
|
|
|
} else {
|
2019-02-07 12:27:38 -08:00
|
|
|
drop:
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_rx_dropped(dev);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(dst_vxlan, vni, NULL,
|
|
|
|
VXLAN_VNI_STATS_RX_DROPS, 0);
|
2013-04-02 12:31:52 +00:00
|
|
|
}
|
2019-02-07 12:27:38 -08:00
|
|
|
rcu_read_unlock();
|
2013-04-02 12:31:52 +00:00
|
|
|
}
|
|
|
|
|
2016-11-13 20:43:56 -08:00
|
|
|
static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
|
2017-06-19 10:04:00 +02:00
|
|
|
struct vxlan_dev *vxlan,
|
2023-10-20 13:55:29 +02:00
|
|
|
int addr_family,
|
2017-06-19 10:04:00 +02:00
|
|
|
__be16 dst_port, int dst_ifindex, __be32 vni,
|
|
|
|
struct dst_entry *dst,
|
2016-11-13 20:43:56 -08:00
|
|
|
u32 rt_flags)
|
|
|
|
{
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
|
|
|
|
* RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
|
|
|
|
* we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry.
|
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
|
|
|
|
#endif
|
|
|
|
/* Bypass encapsulation if the destination is local */
|
|
|
|
if (rt_flags & RTCF_LOCAL &&
|
2023-05-12 11:40:33 +08:00
|
|
|
!(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
|
|
|
|
vxlan->cfg.flags & VXLAN_F_LOCALBYPASS) {
|
2016-11-13 20:43:56 -08:00
|
|
|
struct vxlan_dev *dst_vxlan;
|
|
|
|
|
|
|
|
dst_release(dst);
|
2017-06-19 10:04:00 +02:00
|
|
|
dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
|
2023-10-20 13:55:29 +02:00
|
|
|
addr_family, dst_port,
|
2017-06-19 10:03:56 +02:00
|
|
|
vxlan->cfg.flags);
|
2016-11-13 20:43:56 -08:00
|
|
|
if (!dst_vxlan) {
|
2024-04-26 17:27:17 +02:00
|
|
|
DEV_STATS_INC(dev, tx_errors);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(vxlan, vni, NULL,
|
|
|
|
VXLAN_VNI_STATS_TX_ERRORS, 0);
|
2024-10-15 16:28:30 +08:00
|
|
|
kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND);
|
2016-11-13 20:43:56 -08:00
|
|
|
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
2020-08-04 07:53:44 +02:00
|
|
|
vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true);
|
2016-11-13 20:43:56 -08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
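The BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL) above pins down the one assumption
the shared check relies on: the IPv4 and IPv6 "local route" flags have the
same value. Outside the kernel the same compile-time guard is spelled
_Static_assert; a self-contained C11 sketch of the pattern (the flag values
here are illustrative, not the kernel's definitions):
```
#include <stdio.h>

/* Illustrative values; in the kernel, RTCF_LOCAL (IPv4) and
 * RTF_LOCAL (IPv6) happen to share one value, which is exactly
 * what the BUILD_BUG_ON pins down. */
#define RTCF_LOCAL_DEMO 0x80000000u
#define RTF_LOCAL_DEMO  0x80000000u

/* Compile-time guard: if the two flags ever diverge, the build
 * fails here instead of the shared check silently misfiring. */
_Static_assert(RTCF_LOCAL_DEMO == RTF_LOCAL_DEMO,
	       "local-route flags must stay equal for the shared check");

/* One test then covers both IPv4 and IPv6 route entries. */
static int is_local_route(unsigned int rt_flags)
{
	return !!(rt_flags & RTCF_LOCAL_DEMO);
}

int main(void)
{
	printf("local? %d\n", is_local_route(0x80000000u));
	printf("local? %d\n", is_local_route(0));
	return 0;
}
```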
|
|
|
|
|
2023-03-15 15:11:50 +02:00
|
|
|
void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
|
|
|
|
__be32 default_vni, struct vxlan_rdst *rdst, bool did_rsc)
|
2012-10-01 12:32:35 +00:00
|
|
|
{
|
2016-02-12 15:43:57 +01:00
|
|
|
struct dst_cache *dst_cache;
|
2015-07-21 10:44:00 +02:00
|
|
|
struct ip_tunnel_info *info;
|
2023-10-16 09:15:26 +02:00
|
|
|
struct ip_tunnel_key *pkey;
|
|
|
|
struct ip_tunnel_key key;
|
2012-10-01 12:32:35 +00:00
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
2024-06-19 15:34:57 +02:00
|
|
|
const struct iphdr *old_iph;
|
2015-07-21 10:43:58 +02:00
|
|
|
struct vxlan_metadata _md;
|
|
|
|
struct vxlan_metadata *md = &_md;
|
2022-03-01 05:04:38 +00:00
|
|
|
unsigned int pkt_len = skb->len;
|
2013-08-31 13:44:33 +08:00
|
|
|
__be16 src_port = 0, dst_port;
|
2016-11-13 20:43:55 -08:00
|
|
|
struct dst_entry *ndst = NULL;
|
2023-10-20 13:55:29 +02:00
|
|
|
int addr_family;
|
2023-10-16 09:15:26 +02:00
|
|
|
__u8 tos, ttl;
|
2017-06-19 10:04:00 +02:00
|
|
|
int ifindex;
|
2013-06-17 17:49:56 -07:00
|
|
|
int err;
|
2017-06-19 10:03:56 +02:00
|
|
|
u32 flags = vxlan->cfg.flags;
|
2023-10-16 09:15:26 +02:00
|
|
|
bool use_cache;
|
2016-02-02 18:09:15 +01:00
|
|
|
bool udp_sum = false;
|
2016-02-02 18:09:16 +01:00
|
|
|
bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
|
2024-10-09 10:28:27 +08:00
|
|
|
enum skb_drop_reason reason;
|
2024-06-19 15:34:57 +02:00
|
|
|
bool no_eth_encap;
|
2022-03-01 05:04:29 +00:00
|
|
|
__be32 vni = 0;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2024-06-19 15:34:57 +02:00
|
|
|
no_eth_encap = flags & VXLAN_F_GPE && skb->protocol != htons(ETH_P_TEB);
|
2024-10-09 10:28:27 +08:00
|
|
|
reason = skb_vlan_inet_prepare(skb, no_eth_encap);
|
|
|
|
if (reason)
|
2024-06-19 15:34:57 +02:00
|
|
|
goto drop;
|
|
|
|
|
2024-10-09 10:28:27 +08:00
|
|
|
reason = SKB_DROP_REASON_NOT_SPECIFIED;
|
2024-06-19 15:34:57 +02:00
|
|
|
old_iph = ip_hdr(skb);
|
|
|
|
|
2015-08-20 13:56:25 +02:00
|
|
|
info = skb_tunnel_info(skb);
|
2023-10-16 09:15:26 +02:00
|
|
|
use_cache = ip_tunnel_dst_cache_usable(skb, info);
|
2015-07-21 10:44:00 +02:00
|
|
|
|
2015-07-21 10:43:58 +02:00
|
|
|
if (rdst) {
|
2023-10-16 09:15:26 +02:00
|
|
|
memset(&key, 0, sizeof(key));
|
|
|
|
pkey = &key;
|
|
|
|
|
2023-10-20 13:55:29 +02:00
|
|
|
if (vxlan_addr_any(&rdst->remote_ip)) {
|
2016-11-13 20:43:57 -08:00
|
|
|
if (did_rsc) {
|
|
|
|
/* short-circuited back to local bridge */
|
2020-08-04 07:53:44 +02:00
|
|
|
vxlan_encap_bypass(skb, vxlan, vxlan,
|
|
|
|
default_vni, true);
|
2016-11-13 20:43:57 -08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
2023-10-20 13:55:29 +02:00
|
|
|
addr_family = vxlan->cfg.saddr.sa.sa_family;
|
2015-07-21 10:44:02 +02:00
|
|
|
dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
|
2017-01-31 22:59:52 -08:00
|
|
|
vni = (rdst->remote_vni) ? : default_vni;
|
2017-06-19 10:04:00 +02:00
|
|
|
ifindex = rdst->remote_ifindex;
|
2023-10-16 09:15:26 +02:00
|
|
|
|
2023-10-20 13:55:29 +02:00
|
|
|
if (addr_family == AF_INET) {
|
2023-10-16 09:15:26 +02:00
|
|
|
key.u.ipv4.src = vxlan->cfg.saddr.sin.sin_addr.s_addr;
|
|
|
|
key.u.ipv4.dst = rdst->remote_ip.sin.sin_addr.s_addr;
|
|
|
|
} else {
|
|
|
|
key.u.ipv6.src = vxlan->cfg.saddr.sin6.sin6_addr;
|
|
|
|
key.u.ipv6.dst = rdst->remote_ip.sin6.sin6_addr;
|
|
|
|
}
|
|
|
|
|
2016-02-12 15:43:57 +01:00
|
|
|
dst_cache = &rdst->dst_cache;
|
2016-11-13 20:43:57 -08:00
|
|
|
md->gbp = skb->mark;
|
2018-04-17 14:11:28 +08:00
|
|
|
if (flags & VXLAN_F_TTL_INHERIT) {
|
|
|
|
ttl = ip_tunnel_get_ttl(old_iph, skb);
|
|
|
|
} else {
|
|
|
|
ttl = vxlan->cfg.ttl;
|
2023-10-20 13:55:29 +02:00
|
|
|
if (!ttl && vxlan_addr_multicast(&rdst->remote_ip))
|
2018-04-17 14:11:28 +08:00
|
|
|
ttl = 1;
|
|
|
|
}
|
2016-11-13 20:43:57 -08:00
|
|
|
tos = vxlan->cfg.tos;
|
|
|
|
if (tos == 1)
|
|
|
|
tos = ip_tunnel_get_dsfield(old_iph, skb);
|
2023-10-16 09:15:26 +02:00
|
|
|
if (tos && !info)
|
|
|
|
use_cache = false;
|
2016-11-13 20:43:57 -08:00
|
|
|
|
2023-10-20 13:55:29 +02:00
|
|
|
if (addr_family == AF_INET)
|
2016-11-13 20:43:57 -08:00
|
|
|
udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
|
|
|
|
else
|
|
|
|
udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
|
2022-03-01 05:04:29 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
vxlan: add support for flowlabel inherit
By default, VXLAN encapsulation over IPv6 sets the flow label to 0, with
an option for a fixed value. This commit adds the ability to inherit the
flow label from the inner packet, like for other tunnel implementations.
This enables devices using only L3 headers for ECMP to correctly balance
VXLAN-encapsulated IPv6 packets.
```
$ ./ip/ip link add dummy1 type dummy
$ ./ip/ip addr add 2001:db8::2/64 dev dummy1
$ ./ip/ip link set up dev dummy1
$ ./ip/ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2
$ ./ip/ip link set up dev vxlan1
$ ./ip/ip addr add 2001:db8:1::2/64 dev vxlan1
$ ./ip/ip link set arp off dev vxlan1
$ ping -q 2001:db8:1::1 &
$ tshark -d udp.port==8472,vxlan -Vpni dummy1 -c1
[...]
Internet Protocol Version 6, Src: 2001:db8::2, Dst: 2001:db8::1
0110 .... = Version: 6
.... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
.... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
.... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
.... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
[...]
Virtual eXtensible Local Area Network
Flags: 0x0800, VXLAN Network ID (VNI)
Group Policy ID: 0
VXLAN Network Identifier (VNI): 100
[...]
Internet Protocol Version 6, Src: 2001:db8:1::2, Dst: 2001:db8:1::1
0110 .... = Version: 6
.... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
.... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
.... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
.... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
```
Signed-off-by: Alce Lafranque <alce@lafranque.net>
Co-developed-by: Vincent Bernat <vincent@bernat.ch>
Signed-off-by: Vincent Bernat <vincent@bernat.ch>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-11-14 11:36:57 -06:00
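The VXLAN_LABEL_INHERIT case below copies the inner packet's 20-bit flow
label into the tunnel key. A standalone sketch of just that extraction
(header layout per RFC 8200; the struct and helper names are ours),
reproducing the 0xb1afb label from the capture above:
```
#include <stdint.h>
#include <stdio.h>

/* First 4 bytes of an IPv6 header: version (4 bits),
 * traffic class (8 bits), flow label (20 bits). */
struct ipv6_hdr_start {
	uint8_t vtc_flow[4];
};

/* Hypothetical helper mirroring what "flowlabel inherit" needs:
 * return the 20-bit flow label as a host-order integer. */
static uint32_t ipv6_flow_label(const struct ipv6_hdr_start *h)
{
	return ((uint32_t)(h->vtc_flow[1] & 0x0f) << 16) |
	       ((uint32_t)h->vtc_flow[2] << 8) |
	       h->vtc_flow[3];
}

int main(void)
{
	/* Version 6, traffic class 0, flow label 0xb1afb
	 * (the label seen in the tshark capture above). */
	struct ipv6_hdr_start h = { { 0x60, 0x0b, 0x1a, 0xfb } };

	printf("flow label: 0x%05x\n", ipv6_flow_label(&h));
	return 0;
}
```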
|
|
|
switch (vxlan->cfg.label_policy) {
|
|
|
|
case VXLAN_LABEL_FIXED:
|
|
|
|
key.label = vxlan->cfg.label;
|
|
|
|
break;
|
|
|
|
case VXLAN_LABEL_INHERIT:
|
|
|
|
key.label = ip_tunnel_get_flowlabel(old_iph, skb);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
DEBUG_NET_WARN_ON_ONCE(1);
|
|
|
|
goto drop;
|
|
|
|
}
|
2022-03-01 05:04:29 +00:00
|
|
|
#endif
|
2015-07-21 10:43:58 +02:00
|
|
|
} else {
|
2020-09-25 18:56:04 -07:00
|
|
|
if (!info) {
|
|
|
|
WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
|
|
|
|
dev->name);
|
|
|
|
goto drop;
|
|
|
|
}
|
2023-10-16 09:15:26 +02:00
|
|
|
pkey = &info->key;
|
2023-10-20 13:55:29 +02:00
|
|
|
addr_family = ip_tunnel_info_af(info);
|
2016-11-13 20:43:57 -08:00
|
|
|
dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
|
|
|
|
vni = tunnel_id_to_key32(info->key.tun_id);
|
2017-06-19 10:04:00 +02:00
|
|
|
ifindex = 0;
|
2016-02-12 15:43:57 +01:00
|
|
|
dst_cache = &info->dst_cache;
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in the non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
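The endianness trap described above is easy to reproduce in userspace:
widening the 16-bit big-endian encoding of a flag does not produce the
64-bit big-endian encoding of the same flag. A small sketch (glibc
htons/htobe64) of why the flags became bit numbers:
```
#define _DEFAULT_SOURCE
#include <arpa/inet.h>
#include <endian.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* A flag stored as a pre-encoded big-endian 16-bit value. */
	uint16_t be16_flag = htons(0x0001);

	/* Naively widening it does not give the big-endian 64-bit
	 * encoding of the same flag: on little-endian hosts the set
	 * bit ends up in a different position. */
	uint64_t widened = be16_flag;
	uint64_t be64_flag = htobe64(0x0001);

	printf("widened be16 flag: 0x%016" PRIx64 "\n", widened);
	printf("be64 flag:         0x%016" PRIx64 "\n", be64_flag);

	/* Storing a bit *number* instead of an encoded value, as the
	 * patch does with the IP_TUNNEL_*_BIT counterparts, sidesteps
	 * the problem: bit 0 is bit 0 regardless of byte order. */
	enum { DEMO_TUNNEL_CSUM_BIT = 0 };
	unsigned long flags = 1UL << DEMO_TUNNEL_CSUM_BIT;

	printf("bit-number flags:  0x%lx\n", flags);
	return 0;
}
```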
|
|
|
if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
|
2019-10-29 01:24:32 +08:00
|
|
|
if (info->options_len < sizeof(*md))
|
|
|
|
goto drop;
|
2016-11-13 20:43:57 -08:00
|
|
|
md = ip_tunnel_info_opts(info);
|
2019-10-29 01:24:32 +08:00
|
|
|
}
|
2015-08-20 13:56:30 +02:00
|
|
|
ttl = info->key.ttl;
|
|
|
|
tos = info->key.tos;
|
2024-03-27 16:23:53 +01:00
|
|
|
udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
|
2015-08-20 13:56:30 +02:00
|
|
|
}
|
2016-11-13 20:43:57 -08:00
|
|
|
src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
|
|
|
|
vxlan->cfg.port_max, true);
|
2015-08-20 13:56:30 +02:00
|
|
|
|
2017-02-24 11:43:36 -08:00
|
|
|
rcu_read_lock();
|
2023-10-20 13:55:29 +02:00
|
|
|
if (addr_family == AF_INET) {
|
2016-10-28 09:59:15 -07:00
|
|
|
struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
|
2016-11-13 20:43:54 -08:00
|
|
|
struct rtable *rt;
|
2016-11-13 20:43:57 -08:00
|
|
|
__be16 df = 0;
|
2023-10-16 09:15:26 +02:00
|
|
|
__be32 saddr;
|
2016-10-28 09:59:15 -07:00
|
|
|
|
2018-12-03 10:54:40 +01:00
|
|
|
if (!ifindex)
|
|
|
|
ifindex = sock4->sock->sk->sk_bound_dev_if;
|
|
|
|
|
2023-10-16 09:15:26 +02:00
|
|
|
rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, ifindex,
|
|
|
|
&saddr, pkey, src_port, dst_port,
|
|
|
|
tos, use_cache ? dst_cache : NULL);
|
2016-11-15 16:32:11 -05:00
|
|
|
if (IS_ERR(rt)) {
|
|
|
|
err = PTR_ERR(rt);
|
2024-10-09 10:28:27 +08:00
|
|
|
reason = SKB_DROP_REASON_IP_OUTNOROUTES;
|
2016-11-13 20:43:54 -08:00
|
|
|
goto tx_error;
|
2016-11-15 16:32:11 -05:00
|
|
|
}
|
2013-08-31 13:44:33 +08:00
|
|
|
|
2016-11-13 20:43:56 -08:00
|
|
|
if (!info) {
|
vxlan: Allow configuration of DF behaviour
Allow users to set the IPv4 DF bit in outgoing packets, or to inherit its
value from the IPv4 inner header. If the encapsulated protocol is IPv6 and
DF is configured to be inherited, always set it.
For IPv4, inheriting DF from the inner header was probably intended from
the very beginning judging by the comment to vxlan_xmit(), but it wasn't
actually implemented -- also because it would have done more harm than
good, without handling for ICMP Fragmentation Needed messages.
According to RFC 7348, "Path MTU discovery MAY be used". An expired RFC
draft, draft-saum-nvo3-pmtud-over-vxlan-05, whose purpose was to describe
PMTUD implementation, says that "is a MUST that Vxlan gateways [...]
SHOULD set the DF-bit [...]", whatever that means.
Given this background, the only sane option is probably to let the user
decide, and keep the current behaviour as default.
This only applies to non-lwt tunnels: if an external control plane is
used, tunnel key will still control the DF flag.
v2:
- DF behaviour configuration only applies for non-lwt tunnels, move DF
setting to if (!info) block in vxlan_xmit_one() (Stephen Hemminger)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 12:19:16 +01:00
|
|
|
/* Bypass encapsulation if the destination is local */
|
2023-10-20 13:55:29 +02:00
|
|
|
err = encap_bypass_if_local(skb, dev, vxlan, AF_INET,
|
2017-06-19 10:04:00 +02:00
|
|
|
dst_port, ifindex, vni,
|
|
|
|
&rt->dst, rt->rt_flags);
|
2016-11-13 20:43:56 -08:00
|
|
|
if (err)
|
2017-02-24 11:43:36 -08:00
|
|
|
goto out_unlock;
|
2018-11-08 12:19:16 +01:00
|
|
|
|
|
|
|
if (vxlan->cfg.df == VXLAN_DF_SET) {
|
|
|
|
df = htons(IP_DF);
|
|
|
|
} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
|
|
|
|
struct ethhdr *eth = eth_hdr(skb);
|
|
|
|
|
|
|
|
if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
|
|
|
|
(ntohs(eth->h_proto) == ETH_P_IP &&
|
|
|
|
old_iph->frag_off & htons(IP_DF)))
|
|
|
|
df = htons(IP_DF);
|
|
|
|
}
|
2024-03-27 16:23:53 +01:00
|
|
|
} else if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT,
|
|
|
|
info->key.tun_flags)) {
|
2016-02-19 11:26:31 -08:00
|
|
|
df = htons(IP_DF);
|
2016-11-13 20:43:56 -08:00
|
|
|
}
|
2016-02-19 11:26:31 -08:00
|
|
|
|
2016-11-13 20:43:54 -08:00
|
|
|
ndst = &rt->dst;
|
vxlan: calculate correct header length for GPE
VXLAN-GPE does not add an extra inner Ethernet header. Take that into
account when calculating header length.
This causes problems in skb_tunnel_check_pmtu, where incorrect PMTU is
cached.
In the collect_md mode (which is the only mode that VXLAN-GPE
supports), there's no magic auto-setting of the tunnel interface MTU.
It can't be, since the destination and thus the underlying interface
may be different for each packet.
So, the administrator is responsible for setting the correct tunnel
interface MTU. Apparently, the administrators are capable enough to
calculate that the maximum MTU for VXLAN-GPE is (their_lower_MTU - 36).
They set the tunnel interface MTU to 1464. If you run a TCP stream over
such interface, it's then segmented according to the MTU 1464, i.e.
producing 1514 bytes frames. Which is okay, this still fits the lower
MTU.
However, skb_tunnel_check_pmtu (called from vxlan_xmit_one) uses 50 as
the header size and thus incorrectly calculates the frame size to be
1528. This leads to ICMP too big message being generated (locally),
PMTU of 1450 to be cached and the TCP stream to be resegmented.
The fix is to use the correct actual header size, especially for
skb_tunnel_check_pmtu calculation.
Fixes: e1e5314de08ba ("vxlan: implement GPE")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-20 11:05:56 +02:00
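Spelling the report's arithmetic out: over an IPv4 underlay, classic VXLAN
needs 50 bytes of headroom (20 IP + 8 UDP + 8 VXLAN + 14 inner Ethernet),
while raw-mode GPE needs only 36, since the inner Ethernet header is absent.
A sketch of the MTU math, assuming the 1500-byte underlay from the report:
```
#include <stdio.h>

/* Per-layer overheads, IPv4 underlay. */
#define OUTER_IP4 20
#define OUTER_UDP  8
#define VXLAN_HDR  8
#define INNER_ETH 14

int main(void)
{
	int lower_mtu = 1500;			/* underlay MTU */
	int tun_mtu = 1464;			/* admin-chosen: 1500 - 36 */
	int inner_frame = tun_mtu + INNER_ETH;	/* 1478-byte inner frame */

	int gpe_overhead = OUTER_IP4 + OUTER_UDP + VXLAN_HDR;	/* 36 */
	int vxlan_overhead = gpe_overhead + INNER_ETH;		/* 50 */

	/* Correct GPE accounting: the encapsulated frame fits. */
	printf("GPE:   %d + %d = %d (<= %d: ok)\n",
	       inner_frame, gpe_overhead, inner_frame + gpe_overhead,
	       lower_mtu + INNER_ETH);

	/* Buggy accounting with the 50-byte VXLAN headroom: appears
	 * oversized, triggering a local ICMP "too big" and a cached
	 * PMTU of 1500 - 50 = 1450, as described in the report. */
	printf("VXLAN: %d + %d = %d (> %d: spurious ICMP)\n",
	       inner_frame, vxlan_overhead, inner_frame + vxlan_overhead,
	       lower_mtu + INNER_ETH);
	return 0;
}
```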
|
|
|
err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom(flags & VXLAN_F_GPE),
|
2020-08-04 07:53:44 +02:00
|
|
|
netif_is_any_bridge_port(dev));
|
|
|
|
if (err < 0) {
|
|
|
|
goto tx_error;
|
|
|
|
} else if (err) {
|
|
|
|
if (info) {
|
2021-03-25 16:35:32 +01:00
|
|
|
struct ip_tunnel_info *unclone;
|
2020-08-04 07:53:44 +02:00
|
|
|
|
2021-03-25 16:35:32 +01:00
|
|
|
unclone = skb_tunnel_info_unclone(skb);
|
|
|
|
if (unlikely(!unclone))
|
|
|
|
goto tx_error;
|
|
|
|
|
2023-10-16 09:15:26 +02:00
|
|
|
unclone->key.u.ipv4.src = pkey->u.ipv4.dst;
|
|
|
|
unclone->key.u.ipv4.dst = saddr;
|
2020-08-04 07:53:44 +02:00
|
|
|
}
|
|
|
|
vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
|
|
|
|
dst_release(ndst);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2017-12-18 14:20:56 +08:00
|
|
|
|
2020-08-05 10:41:31 +08:00
|
|
|
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
|
2013-08-31 13:44:33 +08:00
|
|
|
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
|
2016-11-13 20:43:54 -08:00
|
|
|
err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
|
2016-02-16 21:58:58 +01:00
|
|
|
vni, md, flags, udp_sum);
|
2024-10-09 10:28:27 +08:00
|
|
|
if (err < 0) {
|
|
|
|
reason = SKB_DROP_REASON_NOMEM;
|
2016-11-13 20:43:54 -08:00
|
|
|
goto tx_error;
|
2024-10-09 10:28:27 +08:00
|
|
|
}
|
2016-02-02 18:09:16 +01:00
|
|
|
|
2023-10-16 09:15:26 +02:00
|
|
|
udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr,
|
|
|
|
pkey->u.ipv4.dst, tos, ttl, df,
|
2016-02-02 18:09:16 +01:00
|
|
|
src_port, dst_port, xnet, !udp_sum);
|
2013-08-31 13:44:33 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
} else {
|
2016-10-28 09:59:15 -07:00
|
|
|
struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
|
2023-10-20 13:55:29 +02:00
|
|
|
struct in6_addr saddr;
|
2013-08-31 13:44:33 +08:00
|
|
|
|
2018-12-03 10:54:40 +01:00
|
|
|
if (!ifindex)
|
|
|
|
ifindex = sock6->sock->sk->sk_bound_dev_if;
|
|
|
|
|
2023-10-20 13:55:29 +02:00
|
|
|
ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock,
|
|
|
|
ifindex, &saddr, pkey,
|
|
|
|
src_port, dst_port, tos,
|
|
|
|
use_cache ? dst_cache : NULL);
|
2015-12-07 13:04:30 +01:00
|
|
|
if (IS_ERR(ndst)) {
|
2016-11-15 16:32:11 -05:00
|
|
|
err = PTR_ERR(ndst);
|
2016-11-13 20:43:54 -08:00
|
|
|
ndst = NULL;
|
2024-10-09 10:28:27 +08:00
|
|
|
reason = SKB_DROP_REASON_IP_OUTNOROUTES;
|
2013-04-02 12:31:52 +00:00
|
|
|
goto tx_error;
|
2013-08-31 13:44:33 +08:00
|
|
|
}
|
2016-11-13 20:43:55 -08:00
|
|
|
|
2016-11-13 20:43:56 -08:00
|
|
|
if (!info) {
|
2024-04-26 15:19:52 +00:00
|
|
|
u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags;
|
2013-08-19 11:23:17 -07:00
|
|
|
|
2023-10-20 13:55:29 +02:00
|
|
|
err = encap_bypass_if_local(skb, dev, vxlan, AF_INET6,
|
2017-06-19 10:04:00 +02:00
|
|
|
dst_port, ifindex, vni,
|
|
|
|
ndst, rt6i_flags);
|
2016-11-13 20:43:56 -08:00
|
|
|
if (err)
|
2017-02-24 11:43:36 -08:00
|
|
|
goto out_unlock;
|
2016-11-13 20:43:56 -08:00
|
|
|
}
|
2016-01-20 16:22:47 -08:00
|
|
|
|
2023-07-20 11:05:56 +02:00
|
|
|
err = skb_tunnel_check_pmtu(skb, ndst,
|
|
|
|
vxlan_headroom((flags & VXLAN_F_GPE) | VXLAN_F_IPV6),
|
2020-08-04 07:53:44 +02:00
|
|
|
netif_is_any_bridge_port(dev));
|
|
|
|
if (err < 0) {
|
|
|
|
goto tx_error;
|
|
|
|
} else if (err) {
|
|
|
|
if (info) {
|
2021-03-25 16:35:32 +01:00
|
|
|
struct ip_tunnel_info *unclone;
|
2020-08-04 07:53:44 +02:00
|
|
|
|
2021-03-25 16:35:32 +01:00
|
|
|
unclone = skb_tunnel_info_unclone(skb);
|
|
|
|
if (unlikely(!unclone))
|
|
|
|
goto tx_error;
|
|
|
|
|
2023-10-20 13:55:29 +02:00
|
|
|
unclone->key.u.ipv6.src = pkey->u.ipv6.dst;
|
|
|
|
unclone->key.u.ipv6.dst = saddr;
|
2020-08-04 07:53:44 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
|
|
|
|
dst_release(ndst);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2017-12-18 14:20:56 +08:00
|
|
|
|
2020-08-05 10:41:31 +08:00
|
|
|
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
|
2013-08-31 13:44:33 +08:00
|
|
|
ttl = ttl ? : ip6_dst_hoplimit(ndst);
|
2016-02-02 18:09:16 +01:00
|
|
|
skb_scrub_packet(skb, xnet);
|
|
|
|
err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
|
2016-02-16 21:58:58 +01:00
|
|
|
vni, md, flags, udp_sum);
|
2024-10-09 10:28:27 +08:00
|
|
|
if (err < 0) {
|
|
|
|
reason = SKB_DROP_REASON_NOMEM;
|
2016-11-13 20:43:54 -08:00
|
|
|
goto tx_error;
|
2024-10-09 10:28:27 +08:00
|
|
|
}
|
2016-11-13 20:43:54 -08:00
|
|
|
|
2016-11-13 20:43:57 -08:00
|
|
|
udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
|
2023-10-20 13:55:29 +02:00
|
|
|
&saddr, &pkey->u.ipv6.dst, tos, ttl,
|
|
|
|
pkey->label, src_port, dst_port, !udp_sum);
|
2013-08-31 13:44:33 +08:00
|
|
|
#endif
|
|
|
|
}
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len);
|
2017-02-24 11:43:36 -08:00
|
|
|
out_unlock:
|
|
|
|
rcu_read_unlock();
|
2013-06-17 14:16:11 -07:00
|
|
|
return;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
|
|
|
drop:
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_tx_dropped(dev);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0);
|
2024-10-09 10:28:27 +08:00
|
|
|
kfree_skb_reason(skb, reason);
|
2016-11-13 20:43:54 -08:00
|
|
|
return;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
|
|
|
tx_error:
|
2017-02-24 11:43:36 -08:00
|
|
|
rcu_read_unlock();
|
2016-11-13 20:43:55 -08:00
|
|
|
if (err == -ELOOP)
|
2024-04-26 17:27:17 +02:00
|
|
|
DEV_STATS_INC(dev, collisions);
|
2016-11-13 20:43:55 -08:00
|
|
|
else if (err == -ENETUNREACH)
|
2024-04-26 17:27:17 +02:00
|
|
|
DEV_STATS_INC(dev, tx_carrier_errors);
|
2016-11-13 20:43:54 -08:00
|
|
|
dst_release(ndst);
|
2024-04-26 17:27:17 +02:00
|
|
|
DEV_STATS_INC(dev, tx_errors);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0);
|
2024-10-09 10:28:27 +08:00
|
|
|
kfree_skb_reason(skb, reason);
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
2020-05-21 22:26:14 -07:00
|
|
|
static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
|
|
|
|
struct vxlan_fdb *f, __be32 vni, bool did_rsc)
|
|
|
|
{
|
|
|
|
struct vxlan_rdst nh_rdst;
|
|
|
|
struct nexthop *nh;
|
|
|
|
bool do_xmit;
|
|
|
|
u32 hash;
|
|
|
|
|
|
|
|
memset(&nh_rdst, 0, sizeof(struct vxlan_rdst));
|
|
|
|
hash = skb_get_hash(skb);
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
nh = rcu_dereference(f->nh);
|
|
|
|
if (!nh) {
|
|
|
|
rcu_read_unlock();
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst);
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
if (likely(do_xmit))
|
|
|
|
vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc);
|
|
|
|
else
|
|
|
|
goto drop;
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
drop:
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_tx_dropped(dev);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(netdev_priv(dev), vni, NULL,
|
|
|
|
VXLAN_VNI_STATS_TX_DROPS, 0);
|
2020-05-21 22:26:14 -07:00
|
|
|
dev_kfree_skb(skb);
|
|
|
|
}
|
|
|
|
|
2023-07-17 11:12:27 +03:00
|
|
|
static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev,
|
|
|
|
u32 nhid, __be32 vni)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
struct vxlan_rdst nh_rdst;
|
|
|
|
struct nexthop *nh;
|
|
|
|
bool do_xmit;
|
|
|
|
u32 hash;
|
|
|
|
|
|
|
|
memset(&nh_rdst, 0, sizeof(struct vxlan_rdst));
|
|
|
|
hash = skb_get_hash(skb);
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
nh = nexthop_find_by_id(dev_net(dev), nhid);
|
|
|
|
if (unlikely(!nh || !nexthop_is_fdb(nh) || !nexthop_is_multipath(nh))) {
|
|
|
|
rcu_read_unlock();
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst);
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
if (vxlan->cfg.saddr.sa.sa_family != nh_rdst.remote_ip.sa.sa_family)
|
|
|
|
goto drop;
|
|
|
|
|
|
|
|
if (likely(do_xmit))
|
|
|
|
vxlan_xmit_one(skb, dev, vni, &nh_rdst, false);
|
|
|
|
else
|
|
|
|
goto drop;
|
|
|
|
|
|
|
|
return NETDEV_TX_OK;
|
|
|
|
|
|
|
|
drop:
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_tx_dropped(dev);
|
2023-07-17 11:12:27 +03:00
|
|
|
vxlan_vnifilter_count(netdev_priv(dev), vni, NULL,
|
|
|
|
VXLAN_VNI_STATS_TX_DROPS, 0);
|
|
|
|
dev_kfree_skb(skb);
|
|
|
|
return NETDEV_TX_OK;
|
|
|
|
}
|
|
|
|
|
2013-03-15 04:35:51 +00:00
|
|
|
/* Transmit local packets over VXLAN
|
|
|
|
*
|
|
|
|
* Outer IP header inherits ECN and DF from inner header.
|
|
|
|
* Outer UDP destination is the VXLAN assigned port.
|
|
|
|
* Source port is based on a hash of the flow (see the sketch following this function).
|
|
|
|
*/
|
|
|
|
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
2017-11-11 19:58:50 +08:00
|
|
|
struct vxlan_rdst *rdst, *fdst = NULL;
|
2015-07-21 10:44:00 +02:00
|
|
|
const struct ip_tunnel_info *info;
|
2013-03-15 04:35:51 +00:00
|
|
|
struct vxlan_fdb *f;
|
2017-11-11 19:58:50 +08:00
|
|
|
struct ethhdr *eth;
|
2017-01-31 22:59:52 -08:00
|
|
|
__be32 vni = 0;
|
2023-07-17 11:12:27 +03:00
|
|
|
u32 nhid = 0;
|
2024-08-10 10:06:32 +08:00
|
|
|
bool did_rsc;
|
2013-03-15 04:35:51 +00:00
|
|
|
|
2015-08-20 13:56:25 +02:00
|
|
|
info = skb_tunnel_info(skb);
|
2015-07-21 10:44:00 +02:00
|
|
|
|
2013-03-15 04:35:51 +00:00
|
|
|
skb_reset_mac_header(skb);
|
|
|
|
|
2017-06-19 10:03:56 +02:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
|
2017-01-31 22:59:52 -08:00
|
|
|
if (info && info->mode & IP_TUNNEL_INFO_BRIDGE &&
|
|
|
|
info->mode & IP_TUNNEL_INFO_TX) {
|
|
|
|
vni = tunnel_id_to_key32(info->key.tun_id);
|
2023-07-17 11:12:27 +03:00
|
|
|
nhid = info->key.nhid;
|
2017-01-31 22:59:52 -08:00
|
|
|
} else {
|
|
|
|
if (info && info->mode & IP_TUNNEL_INFO_TX)
|
|
|
|
vxlan_xmit_one(skb, dev, vni, NULL, false);
|
|
|
|
else
|
2024-10-09 10:28:26 +08:00
|
|
|
kfree_skb_reason(skb, SKB_DROP_REASON_TUNNEL_TXINFO);
|
2017-01-31 22:59:52 -08:00
|
|
|
return NETDEV_TX_OK;
|
|
|
|
}
|
2016-04-05 14:47:11 +02:00
|
|
|
}
|
|
|
|
|
2017-06-19 10:03:56 +02:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_PROXY) {
|
2016-04-05 14:47:11 +02:00
|
|
|
eth = eth_hdr(skb);
|
2013-08-31 13:44:36 +08:00
|
|
|
if (ntohs(eth->h_proto) == ETH_P_ARP)
|
2017-01-31 22:59:52 -08:00
|
|
|
return arp_reduce(dev, skb, vni);
|
2013-08-31 13:44:36 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2017-11-11 19:58:50 +08:00
|
|
|
else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
|
|
|
|
pskb_may_pull(skb, sizeof(struct ipv6hdr) +
|
|
|
|
sizeof(struct nd_msg)) &&
|
|
|
|
ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
|
|
|
|
struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1);
|
|
|
|
|
|
|
|
if (m->icmph.icmp6_code == 0 &&
|
|
|
|
m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
|
2017-04-02 11:00:06 +02:00
|
|
|
return neigh_reduce(dev, skb, vni);
|
2013-08-31 13:44:36 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
2013-03-15 04:35:51 +00:00
|
|
|
|
2023-07-17 11:12:27 +03:00
|
|
|
if (nhid)
|
|
|
|
return vxlan_xmit_nhid(skb, dev, nhid, vni);
|
|
|
|
|
vxlan: Add MDB data path support
Integrate MDB support into the Tx path of the VXLAN driver, allowing it
to selectively forward IP multicast traffic according to the matched MDB
entry.
If MDB entries are configured (i.e., 'VXLAN_F_MDB' is set) and the
packet is an IP multicast packet, perform up to three different lookups
according to the following priority:
1. For an (S, G) entry, using {Source VNI, Source IP, Destination IP}.
2. For a (*, G) entry, using {Source VNI, Destination IP}.
3. For the catchall MDB entry (0.0.0.0 or ::), using the source VNI.
The catchall MDB entry is similar to the catchall FDB entry
(00:00:00:00:00:00) that is currently used to transmit BUM (broadcast,
unknown unicast and multicast) traffic. However, unlike the catchall FDB
entry, this entry is only used to transmit unregistered IP multicast
traffic that is not link-local. Therefore, when configured, the catchall
FDB entry will only transmit BULL (broadcast, unknown unicast,
link-local multicast) traffic.
The catchall MDB entry is useful in deployments where inter-subnet
multicast forwarding is used and not all the VTEPs in a tenant domain
are members in all the broadcast domains. In such deployments it is
advantageous to transmit BULL (broadcast, unknown unicast and link-local
multicast) and unregistered IP multicast traffic on different tunnels.
If the same tunnel was used, a VTEP only interested in IP multicast
traffic would also pull all the BULL traffic and drop it as it is not a
member in the originating broadcast domain [1].
If the packet did not match an MDB entry (or if the packet is not an IP
multicast packet), return it to the Tx path, allowing it to be forwarded
according to the FDB.
If the packet did match an MDB entry, forward it to the associated
remote VTEPs. However, if the entry is a (*, G) entry and the associated
remote is in INCLUDE mode, then skip over it as the source IP is not in
its source list (otherwise the packet would have matched on an (S, G)
entry). Similarly, if the associated remote is marked as BLOCKED (can
only be set on (S, G) entries), then skip over it as well as the remote
is in EXCLUDE mode and the source IP is in its source list.
[1] https://datatracker.ietf.org/doc/html/draft-ietf-bess-evpn-irb-mcast#section-2.6
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-03-15 15:11:53 +02:00
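The three lookups read naturally as a fall-through cascade from most to least
specific. A hedged, self-contained C sketch of that control flow (the key
layout, toy table and helper names are ours, standing in for the driver's
hashed MDB; IPv4-only for brevity):
```
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified lookup key: source VNI plus inner source and group
 * addresses (0 plays the "any source" / catchall role). */
struct mdb_key {
	uint32_t src_vni;
	uint32_t src_ip;
	uint32_t grp_ip;
};

struct mdb_entry {
	struct mdb_key key;
	const char *remotes;	/* stand-in for the remote VTEP list */
};

/* Toy table standing in for the driver's hashed MDB. */
static const struct mdb_entry mdb[] = {
	{ { 100, 0x0a000001, 0xef000001 }, "vtep-A" },	/* (S, G)   */
	{ { 100, 0,          0xef000001 }, "vtep-B" },	/* (*, G)   */
	{ { 100, 0,          0          }, "vtep-C" },	/* catchall */
};

static const struct mdb_entry *mdb_lookup(const struct mdb_key *key)
{
	for (size_t i = 0; i < sizeof(mdb) / sizeof(mdb[0]); i++)
		if (!memcmp(&mdb[i].key, key, sizeof(*key)))
			return &mdb[i];
	return NULL;
}

/* Priority: (S, G) first, then (*, G), then the catchall entry. */
static const struct mdb_entry *mdb_entry_get(uint32_t vni,
					     uint32_t src, uint32_t grp)
{
	struct mdb_key key = { vni, src, grp };
	const struct mdb_entry *e;

	if ((e = mdb_lookup(&key)))	/* 1. (S, G) */
		return e;
	key.src_ip = 0;
	if ((e = mdb_lookup(&key)))	/* 2. (*, G) */
		return e;
	key.grp_ip = 0;
	return mdb_lookup(&key);	/* 3. catchall */
}

int main(void)
{
	/* 10.0.0.2 -> 239.0.0.1 misses (S, G) and matches (*, G). */
	const struct mdb_entry *e = mdb_entry_get(100, 0x0a000002, 0xef000001);

	printf("matched: %s\n", e ? e->remotes : "none");	/* vtep-B */
	return 0;
}
```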
|
|
|
if (vxlan->cfg.flags & VXLAN_F_MDB) {
|
|
|
|
struct vxlan_mdb_entry *mdb_entry;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
mdb_entry = vxlan_mdb_entry_skb_get(vxlan, skb, vni);
|
|
|
|
if (mdb_entry) {
|
|
|
|
netdev_tx_t ret;
|
|
|
|
|
|
|
|
ret = vxlan_mdb_xmit(vxlan, mdb_entry, skb);
|
|
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2016-04-05 14:47:11 +02:00
|
|
|
eth = eth_hdr(skb);
|
2017-01-31 22:59:52 -08:00
|
|
|
f = vxlan_find_mac(vxlan, eth->h_dest, vni);
|
2013-04-19 00:36:26 +00:00
|
|
|
did_rsc = false;
|
|
|
|
|
2017-06-19 10:03:56 +02:00
|
|
|
if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) &&
|
2013-08-31 13:44:34 +08:00
|
|
|
(ntohs(eth->h_proto) == ETH_P_IP ||
|
|
|
|
ntohs(eth->h_proto) == ETH_P_IPV6)) {
|
2013-04-19 00:36:26 +00:00
|
|
|
did_rsc = route_shortcircuit(dev, skb);
|
|
|
|
if (did_rsc)
|
2017-01-31 22:59:52 -08:00
|
|
|
f = vxlan_find_mac(vxlan, eth->h_dest, vni);
|
2013-04-19 00:36:26 +00:00
|
|
|
}
|
|
|
|
|
2013-03-15 04:35:51 +00:00
|
|
|
if (f == NULL) {
|
2017-01-31 22:59:52 -08:00
|
|
|
f = vxlan_find_mac(vxlan, all_zeros_mac, vni);
|
2013-06-25 16:01:51 +03:00
|
|
|
if (f == NULL) {
|
2017-06-19 10:03:56 +02:00
|
|
|
if ((vxlan->cfg.flags & VXLAN_F_L2MISS) &&
|
2013-06-25 16:01:51 +03:00
|
|
|
!is_multicast_ether_addr(eth->h_dest))
|
|
|
|
vxlan_fdb_miss(vxlan, eth->h_dest);
|
|
|
|
|
2024-12-04 13:11:27 +01:00
|
|
|
dev_dstats_tx_dropped(dev);
|
2022-03-01 05:04:38 +00:00
|
|
|
vxlan_vnifilter_count(vxlan, vni, NULL,
|
|
|
|
VXLAN_VNI_STATS_TX_DROPS, 0);
|
2024-12-19 11:36:05 -05:00
|
|
|
kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET);
|
2013-06-25 16:01:51 +03:00
|
|
|
return NETDEV_TX_OK;
|
|
|
|
}
|
|
|
|
}
|
2013-03-15 04:35:51 +00:00
|
|
|
|
2020-05-21 22:26:14 -07:00
|
|
|
if (rcu_access_pointer(f->nh)) {
|
|
|
|
vxlan_xmit_nh(skb, dev, f,
|
|
|
|
(vni ? : vxlan->default_dst.remote_vni), did_rsc);
|
|
|
|
} else {
|
|
|
|
list_for_each_entry_rcu(rdst, &f->remotes, list) {
|
|
|
|
struct sk_buff *skb1;
|
2013-03-15 04:35:51 +00:00
|
|
|
|
2020-05-21 22:26:14 -07:00
|
|
|
if (!fdst) {
|
|
|
|
fdst = rdst;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
skb1 = skb_clone(skb, GFP_ATOMIC);
|
|
|
|
if (skb1)
|
|
|
|
vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
|
2014-01-06 09:54:31 -08:00
|
|
|
}
|
2020-05-21 22:26:14 -07:00
|
|
|
if (fdst)
|
|
|
|
vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
|
|
|
|
else
|
2024-12-19 11:36:05 -05:00
|
|
|
kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET);
|
2013-03-15 04:35:51 +00:00
|
|
|
}
|
|
|
|
|
2013-06-17 14:16:11 -07:00
|
|
|
return NETDEV_TX_OK;
|
2013-03-15 04:35:51 +00:00
|
|
|
}
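On the source-port note in the comment above vxlan_xmit: the outer UDP source
port is derived from a hash of the inner flow and folded into the configured
[port_min, port_max] range, so underlay ECMP keeps one flow on one path while
spreading different flows across paths. A minimal userspace sketch under
those assumptions (toy hash and ephemeral port range; the kernel's
udp_flow_src_port helper takes different arguments):
```
#include <stdint.h>
#include <stdio.h>

/* Toy 32-bit mixer standing in for the kernel's skb flow hash. */
static uint32_t flow_hash(uint32_t saddr, uint32_t daddr,
			  uint16_t sport, uint16_t dport)
{
	uint32_t h = saddr ^ daddr ^ ((uint32_t)sport << 16 | dport);

	h ^= h >> 16;
	h *= 0x7feb352d;
	h ^= h >> 15;
	return h;
}

/* Fold the flow hash into [port_min, port_max], the way a
 * flow-based UDP source port is chosen for the outer header. */
static uint16_t pick_src_port(uint32_t hash,
			      uint16_t port_min, uint16_t port_max)
{
	uint32_t range = (uint32_t)port_max - port_min + 1;

	return (uint16_t)(port_min + (hash % range));
}

int main(void)
{
	uint32_t h = flow_hash(0x0a000001, 0x0a000002, 12345, 443);

	/* Same inner flow -> same outer source port. */
	printf("outer sport: %u\n", pick_src_port(h, 49152, 65535));
	return 0;
}
```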
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
/* Walk the forwarding table and purge stale entries */
|
2017-10-04 16:26:59 -07:00
|
|
|
static void vxlan_cleanup(struct timer_list *t)
|
2012-10-01 12:32:35 +00:00
|
|
|
{
|
2017-10-04 16:26:59 -07:00
|
|
|
struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer);
|
2012-10-01 12:32:35 +00:00
|
|
|
unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
|
|
|
|
unsigned int h;
|
|
|
|
|
|
|
|
if (!netif_running(vxlan->dev))
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (h = 0; h < FDB_HASH_SIZE; ++h) {
|
|
|
|
struct hlist_node *p, *n;
|
2015-05-26 10:42:04 +03:00
|
|
|
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_lock(&vxlan->hash_lock[h]);
|
2012-10-01 12:32:35 +00:00
|
|
|
hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
|
|
|
|
struct vxlan_fdb *f
|
|
|
|
= container_of(p, struct vxlan_fdb, hlist);
|
|
|
|
unsigned long timeout;
|
|
|
|
|
2017-01-23 20:44:33 -08:00
|
|
|
if (f->state & (NUD_PERMANENT | NUD_NOARP))
|
2012-10-01 12:32:35 +00:00
|
|
|
continue;
|
|
|
|
|
2017-03-27 15:46:41 -07:00
|
|
|
if (f->flags & NTF_EXT_LEARNED)
|
|
|
|
continue;
|
|
|
|
|
2015-07-21 10:44:02 +02:00
|
|
|
timeout = f->used + vxlan->cfg.age_interval * HZ;
|
2012-10-01 12:32:35 +00:00
|
|
|
if (time_before_eq(timeout, jiffies)) {
|
|
|
|
netdev_dbg(vxlan->dev,
|
|
|
|
"garbage collect %pM\n",
|
|
|
|
f->eth_addr);
|
|
|
|
f->state = NUD_STALE;
|
2018-11-21 08:02:35 +00:00
|
|
|
vxlan_fdb_destroy(vxlan, f, true, true);
|
2012-10-01 12:32:35 +00:00
|
|
|
} else if (time_before(timeout, next_timer))
|
|
|
|
next_timer = timeout;
|
|
|
|
}
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_unlock(&vxlan->hash_lock[h]);
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
mod_timer(&vxlan->age_timer, next_timer);
|
|
|
|
}
|
|
|
|
|
2017-06-02 03:24:08 +03:00
|
|
|
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan)
|
|
|
|
{
|
|
|
|
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
|
|
|
|
|
|
|
|
spin_lock(&vn->sock_lock);
|
2017-07-02 19:00:57 +02:00
|
|
|
hlist_del_init_rcu(&vxlan->hlist4.hlist);
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
hlist_del_init_rcu(&vxlan->hlist6.hlist);
|
|
|
|
#endif
|
2017-06-02 03:24:08 +03:00
|
|
|
spin_unlock(&vn->sock_lock);
|
|
|
|
}
|
|
|
|
|
2017-07-02 19:00:57 +02:00
|
|
|
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan,
|
|
|
|
struct vxlan_dev_node *node)
|
2013-08-19 11:22:48 -07:00
|
|
|
{
|
2015-03-18 14:50:44 -03:00
|
|
|
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
|
2016-02-16 21:58:58 +01:00
|
|
|
__be32 vni = vxlan->default_dst.remote_vni;
|
2013-08-19 11:22:48 -07:00
|
|
|
|
2017-07-02 19:00:57 +02:00
|
|
|
node->vxlan = vxlan;
|
2015-03-18 14:50:44 -03:00
|
|
|
spin_lock(&vn->sock_lock);
|
2017-07-02 19:00:57 +02:00
|
|
|
hlist_add_head_rcu(&node->hlist, vni_head(vs, vni));
|
2015-03-18 14:50:44 -03:00
|
|
|
spin_unlock(&vn->sock_lock);
|
2013-08-19 11:22:48 -07:00
|
|
|
}
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
/* Setup stats when device is created */
|
|
|
|
static int vxlan_init(struct net_device *dev)
|
|
|
|
{
|
2020-03-18 13:28:09 +00:00
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
int err;
|
|
|
|
|
2022-03-01 05:04:36 +00:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
|
|
|
|
vxlan_vnigroup_init(vxlan);
|
|
|
|
|
2020-03-18 13:28:09 +00:00
|
|
|
err = gro_cells_init(&vxlan->gro_cells, dev);
|
2023-01-02 08:55:56 +02:00
|
|
|
if (err)
|
2024-03-11 04:24:30 -07:00
|
|
|
goto err_vnigroup_uninit;
|
2020-03-18 13:28:09 +00:00
|
|
|
|
vxlan: mdb: Add MDB control path support
Implement MDB control path support, enabling the creation, deletion,
replacement and dumping of MDB entries in a similar fashion to the
bridge driver. Unlike the bridge driver, each entry stores a list of
remote VTEPs to which matched packets need to be replicated, and not a
list of bridge ports.
The motivating use case is the installation of MDB entries by a user
space control plane in response to received EVPN routes. As such, only
allow permanent MDB entries to be installed and do not implement
snooping functionality, avoiding a lot of unnecessary complexity.
Since entries can only be modified by user space under RTNL, use RTNL as
the write lock. Use RCU to ensure that MDB entries and remotes are not
freed while being accessed from the data path during transmission.
In terms of uAPI, reuse the existing MDB netlink interface, but add a
few new attributes to request and response messages:
* IP address of the destination VXLAN tunnel endpoint where the
multicast receivers reside.
* UDP destination port number to use to connect to the remote VXLAN
tunnel endpoint.
* VXLAN VNI Network Identifier to use to connect to the remote VXLAN
tunnel endpoint. Required when Ingress Replication (IR) is used and
the remote VTEP is not a member of originating broadcast domain
(VLAN/VNI) [1].
* Source VNI Network Identifier the MDB entry belongs to. Used only when
the VXLAN device is in external mode.
* Interface index of the outgoing interface to reach the remote VXLAN
tunnel endpoint. This is required when the underlay destination IP is
multicast (P2MP), as the multicast routing tables are not consulted.
All the new attributes are added under the 'MDBA_SET_ENTRY_ATTRS' nest
which is strictly validated by the bridge driver, thereby automatically
rejecting the new attributes.
[1] https://datatracker.ietf.org/doc/html/draft-ietf-bess-evpn-irb-mcast#section-3.2.2
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-03-15 15:11:51 +02:00
|
|
|
err = vxlan_mdb_init(vxlan);
|
|
|
|
if (err)
|
|
|
|
goto err_gro_cells_destroy;
|
|
|
|
|
net: add netdev_lockdep_set_classes() to virtual drivers
Based on a syzbot report, it appears many virtual
drivers do not yet use netdev_lockdep_set_classes(),
triggering lockdep false positives.
WARNING: possible recursive locking detected
6.8.0-rc4-next-20240212-syzkaller #0 Not tainted
syz-executor.0/19016 is trying to acquire lock:
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
but task is already holding lock:
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
lock(_xmit_ETHER#2);
lock(_xmit_ETHER#2);
*** DEADLOCK ***
May be due to missing lock nesting notation
9 locks held by syz-executor.0/19016:
#0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock net/core/rtnetlink.c:79 [inline]
#0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x82c/0x1040 net/core/rtnetlink.c:6603
#1: ffffc90000a08c00 ((&in_dev->mr_ifc_timer)){+.-.}-{0:0}, at: call_timer_fn+0xc0/0x600 kernel/time/timer.c:1697
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline]
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline]
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline]
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline]
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline]
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline]
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline]
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline]
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325
stack backtrace:
CPU: 1 PID: 19016 Comm: syz-executor.0 Not tainted 6.8.0-rc4-next-20240212-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114
check_deadlock kernel/locking/lockdep.c:3062 [inline]
validate_chain+0x15c1/0x58e0 kernel/locking/lockdep.c:3856
__lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
lock_acquire+0x1e4/0x530 kernel/locking/lockdep.c:5754
__raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline]
_raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154
spin_lock include/linux/spinlock.h:351 [inline]
__netif_tx_lock include/linux/netdevice.h:4452 [inline]
sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
__dev_xmit_skb net/core/dev.c:3784 [inline]
__dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325
neigh_output include/net/neighbour.h:542 [inline]
ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235
iptunnel_xmit+0x540/0x9b0 net/ipv4/ip_tunnel_core.c:82
ip_tunnel_xmit+0x20ee/0x2960 net/ipv4/ip_tunnel.c:831
erspan_xmit+0x9de/0x1460 net/ipv4/ip_gre.c:720
__netdev_start_xmit include/linux/netdevice.h:4989 [inline]
netdev_start_xmit include/linux/netdevice.h:5003 [inline]
xmit_one net/core/dev.c:3555 [inline]
dev_hard_start_xmit+0x242/0x770 net/core/dev.c:3571
sch_direct_xmit+0x2b6/0x5f0 net/sched/sch_generic.c:342
__dev_xmit_skb net/core/dev.c:3784 [inline]
__dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325
neigh_output include/net/neighbour.h:542 [inline]
ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235
igmpv3_send_cr net/ipv4/igmp.c:723 [inline]
igmp_ifc_timer_expire+0xb71/0xd90 net/ipv4/igmp.c:813
call_timer_fn+0x17e/0x600 kernel/time/timer.c:1700
expire_timers kernel/time/timer.c:1751 [inline]
__run_timers+0x621/0x830 kernel/time/timer.c:2038
run_timer_softirq+0x67/0xf0 kernel/time/timer.c:2051
__do_softirq+0x2bc/0x943 kernel/softirq.c:554
invoke_softirq kernel/softirq.c:428 [inline]
__irq_exit_rcu+0xf2/0x1c0 kernel/softirq.c:633
irq_exit_rcu+0x9/0x30 kernel/softirq.c:645
instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1076 [inline]
sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1076
</IRQ>
<TASK>
asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702
RIP: 0010:resched_offsets_ok kernel/sched/core.c:10127 [inline]
RIP: 0010:__might_resched+0x16f/0x780 kernel/sched/core.c:10142
Code: 00 4c 89 e8 48 c1 e8 03 48 ba 00 00 00 00 00 fc ff df 48 89 44 24 38 0f b6 04 10 84 c0 0f 85 87 04 00 00 41 8b 45 00 c1 e0 08 <01> d8 44 39 e0 0f 85 d6 00 00 00 44 89 64 24 1c 48 8d bc 24 a0 00
RSP: 0018:ffffc9000ee069e0 EFLAGS: 00000246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8880296a9e00
RDX: dffffc0000000000 RSI: ffff8880296a9e00 RDI: ffffffff8bfe8fa0
RBP: ffffc9000ee06b00 R08: ffffffff82326877 R09: 1ffff11002b5ad1b
R10: dffffc0000000000 R11: ffffed1002b5ad1c R12: 0000000000000000
R13: ffff8880296aa23c R14: 000000000000062a R15: 1ffff92001dc0d44
down_write+0x19/0x50 kernel/locking/rwsem.c:1578
kernfs_activate fs/kernfs/dir.c:1403 [inline]
kernfs_add_one+0x4af/0x8b0 fs/kernfs/dir.c:819
__kernfs_create_file+0x22e/0x2e0 fs/kernfs/file.c:1056
sysfs_add_file_mode_ns+0x24a/0x310 fs/sysfs/file.c:307
create_files fs/sysfs/group.c:64 [inline]
internal_create_group+0x4f4/0xf20 fs/sysfs/group.c:152
internal_create_groups fs/sysfs/group.c:192 [inline]
sysfs_create_groups+0x56/0x120 fs/sysfs/group.c:218
create_dir lib/kobject.c:78 [inline]
kobject_add_internal+0x472/0x8d0 lib/kobject.c:240
kobject_add_varg lib/kobject.c:374 [inline]
kobject_init_and_add+0x124/0x190 lib/kobject.c:457
netdev_queue_add_kobject net/core/net-sysfs.c:1706 [inline]
netdev_queue_update_kobjects+0x1f3/0x480 net/core/net-sysfs.c:1758
register_queue_kobjects net/core/net-sysfs.c:1819 [inline]
netdev_register_kobject+0x265/0x310 net/core/net-sysfs.c:2059
register_netdevice+0x1191/0x19c0 net/core/dev.c:10298
bond_newlink+0x3b/0x90 drivers/net/bonding/bond_netlink.c:576
rtnl_newlink_create net/core/rtnetlink.c:3506 [inline]
__rtnl_newlink net/core/rtnetlink.c:3726 [inline]
rtnl_newlink+0x158f/0x20a0 net/core/rtnetlink.c:3739
rtnetlink_rcv_msg+0x885/0x1040 net/core/rtnetlink.c:6606
netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543
netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline]
netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1367
netlink_sendmsg+0xa3c/0xd70 net/netlink/af_netlink.c:1908
sock_sendmsg_nosec net/socket.c:730 [inline]
__sock_sendmsg+0x221/0x270 net/socket.c:745
__sys_sendto+0x3a4/0x4f0 net/socket.c:2191
__do_sys_sendto net/socket.c:2203 [inline]
__se_sys_sendto net/socket.c:2199 [inline]
__x64_sys_sendto+0xde/0x100 net/socket.c:2199
do_syscall_64+0xfb/0x240
entry_SYSCALL_64_after_hwframe+0x6d/0x75
RIP: 0033:0x7fc3fa87fa9c
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240212140700.2795436-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-02-12 14:07:00 +00:00
|
|
|
netdev_lockdep_set_classes(dev);
|
2012-10-01 12:32:35 +00:00
|
|
|
return 0;
|
2023-01-02 08:55:56 +02:00
|
|
|
|
vxlan: mdb: Add MDB control path support
Implement MDB control path support, enabling the creation, deletion,
replacement and dumping of MDB entries in a similar fashion to the
bridge driver. Unlike the bridge driver, each entry stores a list of
remote VTEPs to which matched packets need to be replicated, and not a
list of bridge ports.
The motivating use case is the installation of MDB entries by a user
space control plane in response to received EVPN routes. As such, only
allow permanent MDB entries to be installed and do not implement
snooping functionality, avoiding a lot of unnecessary complexity.
Since entries can only be modified by user space under RTNL, use RTNL as
the write lock. Use RCU to ensure that MDB entries and remotes are not
freed while being accessed from the data path during transmission.
In terms of uAPI, reuse the existing MDB netlink interface, but add a
few new attributes to request and response messages:
* IP address of the destination VXLAN tunnel endpoint where the
multicast receivers reside.
* UDP destination port number to use to connect to the remote VXLAN
tunnel endpoint.
* VXLAN Network Identifier (VNI) to use to connect to the remote VXLAN
tunnel endpoint. Required when Ingress Replication (IR) is used and
the remote VTEP is not a member of the originating broadcast domain
(VLAN/VNI) [1].
* Source VXLAN Network Identifier (VNI) the MDB entry belongs to. Used only when
the VXLAN device is in external mode.
* Interface index of the outgoing interface to reach the remote VXLAN
tunnel endpoint. This is required when the underlay destination IP is
multicast (P2MP), as the multicast routing tables are not consulted.
All the new attributes are added under the 'MDBA_SET_ENTRY_ATTRS' nest,
which the bridge driver validates strictly, so the bridge driver
automatically rejects these VXLAN-specific attributes.
[1] https://datatracker.ietf.org/doc/html/draft-ietf-bess-evpn-irb-mcast#section-3.2.2
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-03-15 15:11:51 +02:00
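To make the new uAPI concrete, here is a hypothetical control-plane invocation; the iproute2 syntax is an assumption for illustration (it lives in bridge(8), not in this file):

```c
/* Hypothetical usage (iproute2 syntax assumed): install a permanent MDB
 * entry whose matched packets are replicated to a remote VTEP.
 *
 *   bridge mdb add dev vxlan0 port vxlan0 grp 239.1.1.1 permanent \
 *          dst 192.0.2.1 dst_port 4789 vni 200 src_vni 100 via eth0
 *
 * 'dst', 'dst_port', 'vni', 'src_vni' and 'via' correspond to the new
 * attributes in the 'MDBA_SET_ENTRY_ATTRS' nest described above.
 */
```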
|
|
|
err_gro_cells_destroy:
|
|
|
|
gro_cells_destroy(&vxlan->gro_cells);
|
2023-01-02 08:55:56 +02:00
|
|
|
err_vnigroup_uninit:
|
|
|
|
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
|
|
|
|
vxlan_vnigroup_uninit(vxlan);
|
|
|
|
return err;
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
2017-01-31 22:59:52 -08:00
|
|
|
static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
|
2013-06-25 16:01:51 +03:00
|
|
|
{
|
|
|
|
struct vxlan_fdb *f;
|
2019-06-06 17:57:58 +08:00
|
|
|
u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni);
|
2013-06-25 16:01:51 +03:00
|
|
|
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_lock_bh(&vxlan->hash_lock[hash_index]);
|
2017-01-31 22:59:52 -08:00
|
|
|
f = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
|
2013-06-25 16:01:51 +03:00
|
|
|
if (f)
|
2018-11-21 08:02:35 +00:00
|
|
|
vxlan_fdb_destroy(vxlan, f, true, true);
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
|
2013-06-25 16:01:51 +03:00
|
|
|
}
|
|
|
|
|
2013-06-17 14:16:11 -07:00
|
|
|
static void vxlan_uninit(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
|
2023-03-15 15:11:51 +02:00
|
|
|
vxlan_mdb_fini(vxlan);
|
|
|
|
|
2022-03-01 05:04:36 +00:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
|
|
|
|
vxlan_vnigroup_uninit(vxlan);
|
|
|
|
|
2019-03-08 16:40:57 +01:00
|
|
|
gro_cells_destroy(&vxlan->gro_cells);
|
|
|
|
|
2017-01-31 22:59:52 -08:00
|
|
|
vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
|
2013-06-17 14:16:11 -07:00
|
|
|
}
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
/* Start ageing timer and join group when device is brought up */
|
|
|
|
static int vxlan_open(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
2015-09-24 13:50:01 +02:00
|
|
|
int ret;
|
2015-03-18 14:50:44 -03:00
|
|
|
|
2015-09-24 13:50:01 +02:00
|
|
|
ret = vxlan_sock_add(vxlan);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2022-03-01 05:04:36 +00:00
|
|
|
ret = vxlan_multicast_join(vxlan);
|
|
|
|
if (ret) {
|
|
|
|
vxlan_sock_release(vxlan);
|
|
|
|
return ret;
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
2015-07-21 10:44:02 +02:00
|
|
|
if (vxlan->cfg.age_interval)
|
2012-10-01 12:32:35 +00:00
|
|
|
mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
|
|
|
|
|
2015-03-18 14:50:44 -03:00
|
|
|
return ret;
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
2023-10-09 13:06:09 +03:00
|
|
|
struct vxlan_fdb_flush_desc {
|
2023-10-09 13:06:10 +03:00
|
|
|
bool ignore_default_entry;
|
2023-10-09 13:06:09 +03:00
|
|
|
unsigned long state;
|
|
|
|
unsigned long state_mask;
|
2023-10-09 13:06:11 +03:00
|
|
|
unsigned long flags;
|
|
|
|
unsigned long flags_mask;
|
2023-10-09 13:06:12 +03:00
|
|
|
__be32 src_vni;
|
2023-10-09 13:06:13 +03:00
|
|
|
u32 nhid;
|
2023-10-09 13:06:14 +03:00
|
|
|
__be32 vni;
|
2023-10-09 13:06:15 +03:00
|
|
|
__be16 port;
|
2023-10-09 13:06:16 +03:00
|
|
|
union vxlan_addr dst_ip;
|
2023-10-09 13:06:09 +03:00
|
|
|
};
|
|
|
|
|
2023-10-09 13:06:10 +03:00
|
|
|
static bool vxlan_fdb_is_default_entry(const struct vxlan_fdb *f,
|
|
|
|
const struct vxlan_dev *vxlan)
|
|
|
|
{
|
|
|
|
return is_zero_ether_addr(f->eth_addr) && f->vni == vxlan->cfg.vni;
|
|
|
|
}
|
|
|
|
|
2023-10-09 13:06:13 +03:00
|
|
|
static bool vxlan_fdb_nhid_matches(const struct vxlan_fdb *f, u32 nhid)
|
|
|
|
{
|
|
|
|
struct nexthop *nh = rtnl_dereference(f->nh);
|
|
|
|
|
|
|
|
return nh && nh->id == nhid;
|
|
|
|
}
|
|
|
|
|
2023-10-09 13:06:09 +03:00
|
|
|
static bool vxlan_fdb_flush_matches(const struct vxlan_fdb *f,
|
2023-10-09 13:06:10 +03:00
|
|
|
const struct vxlan_dev *vxlan,
|
2023-10-09 13:06:09 +03:00
|
|
|
const struct vxlan_fdb_flush_desc *desc)
|
|
|
|
{
|
|
|
|
if (desc->state_mask && (f->state & desc->state_mask) != desc->state)
|
|
|
|
return false;
|
|
|
|
|
2023-10-09 13:06:11 +03:00
|
|
|
if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags)
|
|
|
|
return false;
|
|
|
|
|
2023-10-09 13:06:10 +03:00
|
|
|
if (desc->ignore_default_entry && vxlan_fdb_is_default_entry(f, vxlan))
|
|
|
|
return false;
|
|
|
|
|
2023-10-09 13:06:12 +03:00
|
|
|
if (desc->src_vni && f->vni != desc->src_vni)
|
|
|
|
return false;
|
|
|
|
|
2023-10-09 13:06:13 +03:00
|
|
|
if (desc->nhid && !vxlan_fdb_nhid_matches(f, desc->nhid))
|
|
|
|
return false;
|
|
|
|
|
2023-10-09 13:06:09 +03:00
|
|
|
return true;
|
|
|
|
}
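A short worked example of the state-mask semantics above, using the standard NUD constants:

```c
/* With desc->state == 0 and desc->state_mask == (NUD_PERMANENT | NUD_NOARP):
 *   - a learned entry in NUD_REACHABLE: (state & mask) == 0 == desc->state,
 *     so the entry matches and gets flushed;
 *   - a static NUD_NOARP entry: (state & mask) != 0, so the entry is kept.
 * vxlan_stop() builds exactly this descriptor further below.
 */
```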
|
|
|
|
|
2023-10-09 13:06:14 +03:00
|
|
|
static bool
|
|
|
|
vxlan_fdb_flush_should_match_remotes(const struct vxlan_fdb_flush_desc *desc)
|
|
|
|
{
|
2023-10-09 13:06:16 +03:00
|
|
|
return desc->vni || desc->port || desc->dst_ip.sa.sa_family;
|
2023-10-09 13:06:14 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool
|
|
|
|
vxlan_fdb_flush_remote_matches(const struct vxlan_fdb_flush_desc *desc,
|
|
|
|
const struct vxlan_rdst *rd)
|
|
|
|
{
|
|
|
|
if (desc->vni && rd->remote_vni != desc->vni)
|
|
|
|
return false;
|
|
|
|
|
2023-10-09 13:06:15 +03:00
|
|
|
if (desc->port && rd->remote_port != desc->port)
|
|
|
|
return false;
|
|
|
|
|
2023-10-09 13:06:16 +03:00
|
|
|
if (desc->dst_ip.sa.sa_family &&
|
|
|
|
!vxlan_addr_equal(&rd->remote_ip, &desc->dst_ip))
|
|
|
|
return false;
|
|
|
|
|
2023-10-09 13:06:14 +03:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
vxlan_fdb_flush_match_remotes(struct vxlan_fdb *f, struct vxlan_dev *vxlan,
|
|
|
|
const struct vxlan_fdb_flush_desc *desc,
|
|
|
|
bool *p_destroy_fdb)
|
|
|
|
{
|
|
|
|
bool remotes_flushed = false;
|
|
|
|
struct vxlan_rdst *rd, *tmp;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(rd, tmp, &f->remotes, list) {
|
|
|
|
if (!vxlan_fdb_flush_remote_matches(desc, rd))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
vxlan_fdb_dst_destroy(vxlan, f, rd, true);
|
|
|
|
remotes_flushed = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
*p_destroy_fdb = remotes_flushed && list_empty(&f->remotes);
|
|
|
|
}
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
/* Purge the forwarding table */
|
2023-10-09 13:06:09 +03:00
|
|
|
static void vxlan_flush(struct vxlan_dev *vxlan,
|
|
|
|
const struct vxlan_fdb_flush_desc *desc)
|
2012-10-01 12:32:35 +00:00
|
|
|
{
|
2023-10-09 13:06:14 +03:00
|
|
|
bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc);
|
2013-05-27 22:35:52 +00:00
|
|
|
unsigned int h;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
|
|
|
for (h = 0; h < FDB_HASH_SIZE; ++h) {
|
|
|
|
struct hlist_node *p, *n;
|
2019-06-06 17:57:58 +08:00
|
|
|
|
|
|
|
spin_lock_bh(&vxlan->hash_lock[h]);
|
2012-10-01 12:32:35 +00:00
|
|
|
hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
|
|
|
|
struct vxlan_fdb *f
|
|
|
|
= container_of(p, struct vxlan_fdb, hlist);
|
2023-10-09 13:06:09 +03:00
|
|
|
|
2023-10-09 13:06:10 +03:00
|
|
|
if (!vxlan_fdb_flush_matches(f, vxlan, desc))
|
2017-01-23 20:44:32 -08:00
|
|
|
continue;
|
2023-10-09 13:06:09 +03:00
|
|
|
|
2023-10-09 13:06:14 +03:00
|
|
|
if (match_remotes) {
|
|
|
|
bool destroy_fdb = false;
|
|
|
|
|
|
|
|
vxlan_fdb_flush_match_remotes(f, vxlan, desc,
|
|
|
|
&destroy_fdb);
|
|
|
|
|
|
|
|
if (!destroy_fdb)
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-08-01 07:07:50 +00:00
|
|
|
vxlan_fdb_destroy(vxlan, f, true, true);
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_unlock_bh(&vxlan->hash_lock[h]);
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
}
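As an illustration of the descriptor mechanics, here is a minimal hypothetical caller (not taken from this file) that flushes only dynamically learned entries pointing at one remote UDP port:

```c
/* Sketch only: field semantics follow the matching helpers above. */
struct vxlan_fdb_flush_desc desc = {
	.ignore_default_entry = true,			/* keep the all-zeros default entry */
	.state = 0,
	.state_mask = NUD_PERMANENT | NUD_NOARP,	/* i.e. only learned entries match */
	.port = htons(4789),				/* assumed remote UDP port */
};

vxlan_flush(vxlan, &desc);
```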
|
|
|
|
|
2023-10-09 13:06:11 +03:00
|
|
|
static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = {
|
2023-10-09 13:06:12 +03:00
|
|
|
[NDA_SRC_VNI] = { .type = NLA_U32 },
|
2023-10-09 13:06:13 +03:00
|
|
|
[NDA_NH_ID] = { .type = NLA_U32 },
|
2023-10-09 13:06:14 +03:00
|
|
|
[NDA_VNI] = { .type = NLA_U32 },
|
2023-10-09 13:06:15 +03:00
|
|
|
[NDA_PORT] = { .type = NLA_U16 },
|
2023-10-09 13:06:16 +03:00
|
|
|
[NDA_DST] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr),
|
|
|
|
sizeof(struct in6_addr)),
|
2023-10-09 13:06:11 +03:00
|
|
|
[NDA_NDM_STATE_MASK] = { .type = NLA_U16 },
|
|
|
|
[NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 },
|
|
|
|
};
|
|
|
|
|
|
|
|
#define VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF)
|
|
|
|
#define VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP)
|
|
|
|
#define VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_EXT_LEARNED | NTF_OFFLOADED | \
|
|
|
|
NTF_ROUTER)
|
|
|
|
|
|
|
|
static int vxlan_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
|
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
struct vxlan_fdb_flush_desc desc = {};
|
|
|
|
struct ndmsg *ndm = nlmsg_data(nlh);
|
|
|
|
struct nlattr *tb[NDA_MAX + 1];
|
|
|
|
u8 ndm_flags;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
ndm_flags = ndm->ndm_flags & ~VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS;
|
|
|
|
|
|
|
|
err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, vxlan_del_bulk_policy,
|
|
|
|
extack);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (ndm_flags & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS) {
|
|
|
|
NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
if (ndm->ndm_state & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES) {
|
|
|
|
NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
desc.state = ndm->ndm_state;
|
|
|
|
desc.flags = ndm_flags;
|
|
|
|
|
|
|
|
if (tb[NDA_NDM_STATE_MASK])
|
|
|
|
desc.state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]);
|
|
|
|
|
|
|
|
if (tb[NDA_NDM_FLAGS_MASK])
|
|
|
|
desc.flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]);
|
|
|
|
|
2023-10-09 13:06:12 +03:00
|
|
|
if (tb[NDA_SRC_VNI])
|
|
|
|
desc.src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
|
|
|
|
|
2023-10-09 13:06:13 +03:00
|
|
|
if (tb[NDA_NH_ID])
|
|
|
|
desc.nhid = nla_get_u32(tb[NDA_NH_ID]);
|
|
|
|
|
2023-10-09 13:06:14 +03:00
|
|
|
if (tb[NDA_VNI])
|
|
|
|
desc.vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
|
|
|
|
|
2023-10-09 13:06:15 +03:00
|
|
|
if (tb[NDA_PORT])
|
|
|
|
desc.port = nla_get_be16(tb[NDA_PORT]);
|
|
|
|
|
2023-10-09 13:06:16 +03:00
|
|
|
if (tb[NDA_DST]) {
|
|
|
|
union vxlan_addr ip;
|
|
|
|
|
|
|
|
err = vxlan_nla_get_addr(&ip, tb[NDA_DST]);
|
|
|
|
if (err) {
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack, tb[NDA_DST],
|
|
|
|
"Unsupported address family");
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
desc.dst_ip = ip;
|
|
|
|
}
|
|
|
|
|
2023-10-09 13:06:11 +03:00
|
|
|
vxlan_flush(vxlan, &desc);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
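A hypothetical user-space request exercising this handler might look as follows; the iproute2 flush syntax is assumed here purely for illustration:

```c
/*   bridge fdb flush dev vxlan0 dynamic vni 2000 dst 192.0.2.1
 *
 * NDA_VNI and NDA_DST are parsed above into desc.vni and desc.dst_ip,
 * 'dynamic' would translate into the state/state-mask pair, and the
 * NTF_MASTER/NTF_SELF bits in ndm_flags are deliberately ignored.
 */
```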
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
/* Cleanup timer and forwarding table on shutdown */
|
|
|
|
static int vxlan_stop(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
2023-10-09 13:06:09 +03:00
|
|
|
struct vxlan_fdb_flush_desc desc = {
|
2023-10-09 13:06:10 +03:00
|
|
|
/* Default entry is deleted at vxlan_uninit. */
|
|
|
|
.ignore_default_entry = true,
|
2023-10-09 13:06:09 +03:00
|
|
|
.state = 0,
|
|
|
|
.state_mask = NUD_PERMANENT | NUD_NOARP,
|
|
|
|
};
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2022-03-01 05:04:36 +00:00
|
|
|
vxlan_multicast_leave(vxlan);
|
2012-10-01 12:32:35 +00:00
|
|
|
|
|
|
|
del_timer_sync(&vxlan->age_timer);
|
|
|
|
|
2023-10-09 13:06:09 +03:00
|
|
|
vxlan_flush(vxlan, &desc);
|
2015-09-24 13:50:01 +02:00
|
|
|
vxlan_sock_release(vxlan);
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2022-03-08 21:43:09 +08:00
|
|
|
return 0;
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Stub, nothing needs to be done. */
|
|
|
|
static void vxlan_set_multicast_list(struct net_device *dev)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
net: use core MTU range checking in core net infra
geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
closer inspection and testing
macvlan:
- set min/max_mtu
tun:
- set min/max_mtu, remove tun_net_change_mtu
vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu
- Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
- This one is also not as straight-forward and could use closer inspection
and testing from vxlan folks
bridge:
- set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
is the largest possible size supported
sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)
macsec:
- min_mtu = 0, max_mtu = 65535
macvlan:
- min_mtu = 0, max_mtu = 65535
ntb_netdev:
- min_mtu = 0, max_mtu = 65535
veth:
- min_mtu = 68, max_mtu = 65535
8021q:
- min_mtu = 0, max_mtu = 65535
CC: netdev@vger.kernel.org
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Tom Herbert <tom@herbertland.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Paolo Abeni <pabeni@redhat.com>
CC: Jiri Benc <jbenc@redhat.com>
CC: WANG Cong <xiyou.wangcong@gmail.com>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Pravin B Shelar <pshelar@ovn.org>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: Pravin Shelar <pshelar@nicira.com>
CC: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 13:55:20 -04:00
|
|
|
static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
|
2013-12-18 00:21:08 +01:00
|
|
|
{
|
2016-10-20 13:55:20 -04:00
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
struct vxlan_rdst *dst = &vxlan->default_dst;
|
|
|
|
struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
|
|
|
|
dst->remote_ifindex);
|
2013-12-18 00:21:08 +01:00
|
|
|
|
2016-10-20 13:55:20 -04:00
|
|
|
/* This check is different from dev->max_mtu, because it looks at
|
|
|
|
* the lowerdev->mtu, rather than the static dev->max_mtu
|
|
|
|
*/
|
|
|
|
if (lowerdev) {
|
vxlan: calculate correct header length for GPE
VXLAN-GPE does not add an extra inner Ethernet header. Take that into
account when calculating header length.
This causes problems in skb_tunnel_check_pmtu, where incorrect PMTU is
cached.
In the collect_md mode (which is the only mode that VXLAN-GPE
supports), there's no magic auto-setting of the tunnel interface MTU.
It can't be, since the destination and thus the underlying interface
may be different for each packet.
So, the administrator is responsible for setting the correct tunnel
interface MTU. Apparently, the administrators are capable enough to
calculate that the maximum MTU for VXLAN-GPE is (their_lower_MTU - 36).
They set the tunnel interface MTU to 1464. If you run a TCP stream over
such interface, it's then segmented according to the MTU 1464, i.e.
producing 1514 bytes frames. Which is okay, this still fits the lower
MTU.
However, skb_tunnel_check_pmtu (called from vxlan_xmit_one) uses 50 as
the header size and thus incorrectly calculates the frame size to be
1528. This leads to ICMP too big message being generated (locally),
PMTU of 1450 to be cached and the TCP stream to be resegmented.
The fix is to use the correct actual header size, especially for
skb_tunnel_check_pmtu calculation.
Fixes: e1e5314de08ba ("vxlan: implement GPE")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-20 11:05:56 +02:00
|
|
|
int max_mtu = lowerdev->mtu - vxlan_headroom(vxlan->cfg.flags);
|
2016-10-20 13:55:20 -04:00
|
|
|
if (new_mtu > max_mtu)
|
2016-02-10 00:05:55 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2024-05-06 10:28:12 +00:00
|
|
|
WRITE_ONCE(dev->mtu, new_mtu);
|
2013-12-18 00:21:08 +01:00
|
|
|
return 0;
|
|
|
|
}
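The figures quoted in the GPE header-length commit above make this check concrete; the per-layer sizes below are the usual IPv4 values, spelled out purely as an illustration:

```c
/* Worked example (IPv4 underlay):
 *   non-GPE headroom = 20 (IP) + 8 (UDP) + 8 (VXLAN) + 14 (inner Ethernet) = 50
 *   GPE headroom     = 20 (IP) + 8 (UDP) + 8 (VXLAN)                       = 36
 * With lowerdev->mtu == 1500, max_mtu is therefore 1450 for plain VXLAN and
 * 1464 for VXLAN-GPE, matching the 1464/36 figures in the commit message above.
 */
```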
|
|
|
|
|
2015-10-22 18:17:16 -07:00
|
|
|
static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
struct ip_tunnel_info *info = skb_tunnel_info(skb);
|
|
|
|
__be16 sport, dport;
|
|
|
|
|
|
|
|
sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
|
|
|
|
vxlan->cfg.port_max, true);
|
|
|
|
dport = info->key.tp_dst ? : vxlan->cfg.dst_port;
|
|
|
|
|
2015-12-07 13:04:31 +01:00
|
|
|
if (ip_tunnel_info_af(info) == AF_INET) {
|
2016-10-28 09:59:15 -07:00
|
|
|
struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
|
2016-02-02 18:09:14 +01:00
|
|
|
struct rtable *rt;
|
|
|
|
|
2023-10-16 09:15:26 +02:00
|
|
|
if (!sock4)
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, 0,
|
|
|
|
&info->key.u.ipv4.src,
|
|
|
|
&info->key,
|
|
|
|
sport, dport, info->key.tos,
|
|
|
|
&info->dst_cache);
|
2016-02-02 18:09:14 +01:00
|
|
|
if (IS_ERR(rt))
|
|
|
|
return PTR_ERR(rt);
|
|
|
|
ip_rt_put(rt);
|
2015-12-07 13:04:31 +01:00
|
|
|
} else {
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2016-11-13 20:43:53 -08:00
|
|
|
struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
|
2015-12-07 13:04:31 +01:00
|
|
|
struct dst_entry *ndst;
|
|
|
|
|
2023-10-20 13:55:29 +02:00
|
|
|
if (!sock6)
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock,
|
|
|
|
0, &info->key.u.ipv6.src,
|
|
|
|
&info->key,
|
|
|
|
sport, dport, info->key.tos,
|
|
|
|
&info->dst_cache);
|
2015-12-07 13:04:31 +01:00
|
|
|
if (IS_ERR(ndst))
|
|
|
|
return PTR_ERR(ndst);
|
|
|
|
dst_release(ndst);
|
|
|
|
#else /* !CONFIG_IPV6 */
|
|
|
|
return -EPFNOSUPPORT;
|
|
|
|
#endif
|
|
|
|
}
|
2016-02-02 18:09:14 +01:00
|
|
|
info->key.tp_src = sport;
|
|
|
|
info->key.tp_dst = dport;
|
2015-12-07 13:04:31 +01:00
|
|
|
return 0;
|
2015-10-22 18:17:16 -07:00
|
|
|
}
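A sketch of how a metadata-based caller might consume the result; the surrounding context is an assumption, not code from this driver:

```c
/* Sketch, assuming skb carries collect-metadata tunnel info attached by an
 * external control plane (e.g. tc or OVS):
 */
if (!dev_fill_metadata_dst(dev, skb)) {
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	__be16 sport = info->key.tp_src;	/* chosen via udp_flow_src_port() */
	__be16 dport = info->key.tp_dst;	/* destination port resolved above */
	/* ... use sport/dport, e.g. for flow dissection or tracing ... */
}
```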
|
|
|
|
|
2016-04-05 14:47:10 +02:00
|
|
|
static const struct net_device_ops vxlan_netdev_ether_ops = {
|
2012-10-01 12:32:35 +00:00
|
|
|
.ndo_init = vxlan_init,
|
2013-06-17 14:16:11 -07:00
|
|
|
.ndo_uninit = vxlan_uninit,
|
2012-10-01 12:32:35 +00:00
|
|
|
.ndo_open = vxlan_open,
|
|
|
|
.ndo_stop = vxlan_stop,
|
|
|
|
.ndo_start_xmit = vxlan_xmit,
|
|
|
|
.ndo_set_rx_mode = vxlan_set_multicast_list,
|
2013-12-18 00:21:08 +01:00
|
|
|
.ndo_change_mtu = vxlan_change_mtu,
|
2012-10-01 12:32:35 +00:00
|
|
|
.ndo_validate_addr = eth_validate_addr,
|
|
|
|
.ndo_set_mac_address = eth_mac_addr,
|
|
|
|
.ndo_fdb_add = vxlan_fdb_add,
|
|
|
|
.ndo_fdb_del = vxlan_fdb_delete,
|
2023-10-09 13:06:11 +03:00
|
|
|
.ndo_fdb_del_bulk = vxlan_fdb_delete_bulk,
|
2012-10-01 12:32:35 +00:00
|
|
|
.ndo_fdb_dump = vxlan_fdb_dump,
|
2018-12-15 22:35:10 -08:00
|
|
|
.ndo_fdb_get = vxlan_fdb_get,
|
2023-03-15 15:11:54 +02:00
|
|
|
.ndo_mdb_add = vxlan_mdb_add,
|
|
|
|
.ndo_mdb_del = vxlan_mdb_del,
|
2023-12-17 10:32:41 +02:00
|
|
|
.ndo_mdb_del_bulk = vxlan_mdb_del_bulk,
|
2023-03-15 15:11:54 +02:00
|
|
|
.ndo_mdb_dump = vxlan_mdb_dump,
|
2023-10-25 15:30:17 +03:00
|
|
|
.ndo_mdb_get = vxlan_mdb_get,
|
2015-10-22 18:17:16 -07:00
|
|
|
.ndo_fill_metadata_dst = vxlan_fill_metadata_dst,
|
2012-10-01 12:32:35 +00:00
|
|
|
};
|
|
|
|
|
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
|
|
|
static const struct net_device_ops vxlan_netdev_raw_ops = {
|
|
|
|
.ndo_init = vxlan_init,
|
|
|
|
.ndo_uninit = vxlan_uninit,
|
|
|
|
.ndo_open = vxlan_open,
|
|
|
|
.ndo_stop = vxlan_stop,
|
|
|
|
.ndo_start_xmit = vxlan_xmit,
|
|
|
|
.ndo_change_mtu = vxlan_change_mtu,
|
|
|
|
.ndo_fill_metadata_dst = vxlan_fill_metadata_dst,
|
|
|
|
};
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
/* Info for udev, that this is a virtual tunnel endpoint */
|
2024-02-17 17:13:26 -03:00
|
|
|
static const struct device_type vxlan_type = {
|
2012-10-01 12:32:35 +00:00
|
|
|
.name = "vxlan",
|
|
|
|
};
|
|
|
|
|
2016-07-11 13:12:28 +02:00
|
|
|
/* Calls the caller's ndo_udp_tunnel_add in order to
|
2013-09-13 07:34:13 -07:00
|
|
|
* supply the listening VXLAN udp ports. Callers are expected
|
2016-07-11 13:12:28 +02:00
|
|
|
* to implement the ndo_udp_tunnel_add.
|
2013-09-04 02:13:38 -07:00
|
|
|
*/
|
2017-07-21 12:49:32 +02:00
|
|
|
static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
|
2013-09-04 02:13:38 -07:00
|
|
|
{
|
|
|
|
struct vxlan_sock *vs;
|
|
|
|
struct net *net = dev_net(dev);
|
|
|
|
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
|
2013-09-13 07:34:13 -07:00
|
|
|
unsigned int i;
|
2013-09-04 02:13:38 -07:00
|
|
|
|
|
|
|
spin_lock(&vn->sock_lock);
|
|
|
|
for (i = 0; i < PORT_HASH_SIZE; ++i) {
|
2017-07-21 12:49:32 +02:00
|
|
|
hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
|
|
|
|
unsigned short type;
|
|
|
|
|
|
|
|
if (vs->flags & VXLAN_F_GPE)
|
|
|
|
type = UDP_TUNNEL_TYPE_VXLAN_GPE;
|
|
|
|
else
|
|
|
|
type = UDP_TUNNEL_TYPE_VXLAN;
|
|
|
|
|
|
|
|
if (push)
|
|
|
|
udp_tunnel_push_rx_port(dev, vs->sock, type);
|
|
|
|
else
|
|
|
|
udp_tunnel_drop_rx_port(dev, vs->sock, type);
|
|
|
|
}
|
2013-09-04 02:13:38 -07:00
|
|
|
}
|
|
|
|
spin_unlock(&vn->sock_lock);
|
|
|
|
}
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
/* Initialize the device structure. */
|
|
|
|
static void vxlan_setup(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
2013-05-27 22:35:52 +00:00
|
|
|
unsigned int h;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2016-04-28 16:36:30 +02:00
|
|
|
eth_hw_addr_random(dev);
|
|
|
|
ether_setup(dev);
|
|
|
|
|
net: Fix inconsistent teardown and release of private netdev state.
Network devices can allocate resources and private memory using
netdev_ops->ndo_init(). However, the release of these resources
can occur in one of two different places.
Either netdev_ops->ndo_uninit() or netdev->destructor().
The decision of which operation frees the resources depends upon
whether it is necessary for all netdev refs to be released before it
is safe to perform the freeing.
netdev_ops->ndo_uninit() presumably can occur right after the
NETDEV_UNREGISTER notifier completes and the unicast and multicast
address lists are flushed.
netdev->destructor(), on the other hand, does not run until the
netdev references all go away.
Further complicating the situation is that netdev->destructor()
almost universally does also a free_netdev().
This creates a problem for the logic in register_netdevice().
Because all callers of register_netdevice() manage the freeing
of the netdev, and invoke free_netdev(dev) if register_netdevice()
fails.
If netdev_ops->ndo_init() succeeds, but something else fails inside
of register_netdevice(), it does call ndo_ops->ndo_uninit(). But
it is not able to invoke netdev->destructor().
This is because netdev->destructor() will do a free_netdev() and
then the caller of register_netdevice() will do the same.
However, this means that the resources that would normally be released
by netdev->destructor() will not be.
Over the years drivers have added local hacks to deal with this, by
invoking their destructor parts by hand when register_netdevice()
fails.
Many drivers do not try to deal with this, and instead we have leaks.
Let's close this hole by formalizing the distinction between what
private things need to be freed up by netdev->destructor() and whether
the driver needs unregister_netdevice() to perform the free_netdev().
netdev->priv_destructor() performs all actions to free up the private
resources that used to be freed by netdev->destructor(), except for
free_netdev().
netdev->needs_free_netdev is a boolean that indicates whether
free_netdev() should be done at the end of unregister_netdevice().
Now, register_netdevice() can sanely release all resources after
ndo_ops->ndo_init() succeeds, by invoking both ndo_ops->ndo_uninit()
and netdev->priv_destructor().
And at the end of unregister_netdevice(), we invoke
netdev->priv_destructor() and optionally call free_netdev().
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-08 12:52:56 -04:00
|
|
|
dev->needs_free_netdev = true;
|
2012-10-01 12:32:35 +00:00
|
|
|
SET_NETDEV_DEVTYPE(dev, &vxlan_type);
|
|
|
|
|
2021-01-15 17:47:45 +08:00
|
|
|
dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
|
2012-12-07 14:14:18 +00:00
|
|
|
dev->features |= NETIF_F_RXCSUM;
|
2013-03-07 13:22:36 +00:00
|
|
|
dev->features |= NETIF_F_GSO_SOFTWARE;
|
2012-12-07 14:14:18 +00:00
|
|
|
|
2013-08-19 11:23:29 -07:00
|
|
|
dev->vlan_features = dev->features;
|
2021-01-15 17:47:45 +08:00
|
|
|
dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
|
|
|
|
dev->hw_features |= NETIF_F_RXCSUM;
|
2013-03-07 13:22:36 +00:00
|
|
|
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
|
2014-10-05 18:38:35 -07:00
|
|
|
netif_keep_dst(dev);
|
2024-08-29 14:33:36 +02:00
|
|
|
dev->priv_flags |= IFF_NO_QUEUE;
|
|
|
|
dev->change_proto_down = true;
|
2024-08-29 14:33:37 +02:00
|
|
|
dev->lltx = true;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2017-06-19 10:03:55 +02:00
|
|
|
/* MTU range: 68 - 65535 */
|
|
|
|
dev->min_mtu = ETH_MIN_MTU;
|
|
|
|
dev->max_mtu = ETH_MAX_MTU;
|
|
|
|
|
2024-12-04 13:11:27 +01:00
|
|
|
dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
|
2013-05-16 11:35:20 +00:00
|
|
|
INIT_LIST_HEAD(&vxlan->next);
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2017-10-04 16:26:59 -07:00
|
|
|
timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);
|
2012-10-01 12:32:35 +00:00
|
|
|
|
|
|
|
vxlan->dev = dev;
|
|
|
|
|
2019-06-06 17:57:58 +08:00
|
|
|
for (h = 0; h < FDB_HASH_SIZE; ++h) {
|
|
|
|
spin_lock_init(&vxlan->hash_lock[h]);
|
2012-10-01 12:32:35 +00:00
|
|
|
INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
|
2019-06-06 17:57:58 +08:00
|
|
|
}
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
2016-04-05 14:47:10 +02:00
|
|
|
static void vxlan_ether_setup(struct net_device *dev)
|
|
|
|
{
|
|
|
|
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
|
|
|
|
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
|
|
|
|
dev->netdev_ops = &vxlan_netdev_ether_ops;
|
|
|
|
}
|
|
|
|
|
2016-04-05 14:47:13 +02:00
|
|
|
static void vxlan_raw_setup(struct net_device *dev)
|
|
|
|
{
|
2016-04-28 16:36:30 +02:00
|
|
|
dev->header_ops = NULL;
|
2016-04-05 14:47:13 +02:00
|
|
|
dev->type = ARPHRD_NONE;
|
|
|
|
dev->hard_header_len = 0;
|
|
|
|
dev->addr_len = 0;
|
|
|
|
dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
|
|
|
|
dev->netdev_ops = &vxlan_netdev_raw_ops;
|
|
|
|
}
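For reference, a raw-mode device of this kind would typically be created as follows; the iproute2 syntax and the 4790 VXLAN-GPE port are assumptions for illustration:

```c
/*   ip link add vxlan0 type vxlan external gpe dstport 4790
 *
 * The resulting netdev is ARPHRD_NONE with no link-layer header, exactly as
 * configured above; GPE currently supports only collect-metadata (external)
 * operation, per the commit message for "vxlan: implement GPE".
 */
```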
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
|
2023-05-12 11:40:33 +08:00
|
|
|
[IFLA_VXLAN_UNSPEC] = { .strict_start_type = IFLA_VXLAN_LOCALBYPASS },
|
2012-10-01 12:32:35 +00:00
|
|
|
[IFLA_VXLAN_ID] = { .type = NLA_U32 },
|
2019-12-09 10:31:43 -08:00
|
|
|
[IFLA_VXLAN_GROUP] = { .len = sizeof_field(struct iphdr, daddr) },
|
2013-08-31 13:44:33 +08:00
|
|
|
[IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) },
|
2012-10-01 12:32:35 +00:00
|
|
|
[IFLA_VXLAN_LINK] = { .type = NLA_U32 },
|
2019-12-09 10:31:43 -08:00
|
|
|
[IFLA_VXLAN_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) },
|
2013-08-31 13:44:33 +08:00
|
|
|
[IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) },
|
2012-10-01 12:32:35 +00:00
|
|
|
[IFLA_VXLAN_TOS] = { .type = NLA_U8 },
|
|
|
|
[IFLA_VXLAN_TTL] = { .type = NLA_U8 },
|
2016-03-09 03:00:03 +01:00
|
|
|
[IFLA_VXLAN_LABEL] = { .type = NLA_U32 },
|
2012-10-01 12:32:35 +00:00
|
|
|
[IFLA_VXLAN_LEARNING] = { .type = NLA_U8 },
|
|
|
|
[IFLA_VXLAN_AGEING] = { .type = NLA_U32 },
|
|
|
|
[IFLA_VXLAN_LIMIT] = { .type = NLA_U32 },
|
2012-10-09 20:35:50 +00:00
|
|
|
[IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) },
|
2012-11-20 02:50:14 +00:00
|
|
|
[IFLA_VXLAN_PROXY] = { .type = NLA_U8 },
|
|
|
|
[IFLA_VXLAN_RSC] = { .type = NLA_U8 },
|
|
|
|
[IFLA_VXLAN_L2MISS] = { .type = NLA_U8 },
|
|
|
|
[IFLA_VXLAN_L3MISS] = { .type = NLA_U8 },
|
2015-07-30 20:10:22 -07:00
|
|
|
[IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 },
|
2013-04-27 11:31:57 +00:00
|
|
|
[IFLA_VXLAN_PORT] = { .type = NLA_U16 },
|
2014-11-06 18:06:01 -08:00
|
|
|
[IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 },
|
|
|
|
[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
|
|
|
|
[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 },
|
2015-01-12 17:00:38 -08:00
|
|
|
[IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 },
|
|
|
|
[IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 },
|
vxlan: Group Policy extension
Implements support for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata is mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark, the
upper 16 bits are used for flags.
SELinux allows labels to be managed to secure local resources. However,
distributed applications require ACLs to be implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow security contexts to be mapped to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
 Host 1:                                     Host 2:
 Group A       Group B             Group B     Group A
 +-----+  +-------------+          +-------+  +-----+
 | lxc |  | SELinux CTX |          | httpd |  | VM  |
 +--+--+  +--+----------+          +---+---+  +--+--+
     \---+---/                         \----+---/
         |                                  |
     +---+---+                          +---+---+
     | vxlan |                          | vxlan |
     +---+---+                          +---+---+
         +------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
OVS:
# ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
# ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 03:53:55 +01:00
|
|
|
[IFLA_VXLAN_GBP] = { .type = NLA_FLAG, },
|
2016-04-05 14:47:13 +02:00
|
|
|
[IFLA_VXLAN_GPE] = { .type = NLA_FLAG, },
|
2015-02-10 16:30:32 -08:00
|
|
|
[IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG },
|
2018-04-17 14:11:28 +08:00
|
|
|
[IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG },
|
vxlan: Allow configuration of DF behaviour
Allow users to set the IPv4 DF bit in outgoing packets, or to inherit its
value from the IPv4 inner header. If the encapsulated protocol is IPv6 and
DF is configured to be inherited, always set it.
For IPv4, inheriting DF from the inner header was probably intended from
the very beginning judging by the comment to vxlan_xmit(), but it wasn't
actually implemented -- also because it would have done more harm than
good, without handling for ICMP Fragmentation Needed messages.
According to RFC 7348, "Path MTU discovery MAY be used". An expired RFC
draft, draft-saum-nvo3-pmtud-over-vxlan-05, whose purpose was to describe
PMTUD implementation, says that "is a MUST that Vxlan gateways [...]
SHOULD set the DF-bit [...]", whatever that means.
Given this background, the only sane option is probably to let the user
decide, and keep the current behaviour as default.
This only applies to non-lwt tunnels: if an external control plane is
used, tunnel key will still control the DF flag.
v2:
- DF behaviour configuration only applies for non-lwt tunnels, move DF
setting to if (!info) block in vxlan_xmit_one() (Stephen Hemminger)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 12:19:16 +01:00
|
|
|
[IFLA_VXLAN_DF] = { .type = NLA_U8 },
|
2022-03-01 05:04:36 +00:00
|
|
|
[IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 },
|
2023-05-12 11:40:33 +08:00
|
|
|
[IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1),
|
vxlan: add support for flowlabel inherit
By default, VXLAN encapsulation over IPv6 sets the flow label to 0, with
an option for a fixed value. This commit adds the ability to inherit the
flow label from the inner packet, like for other tunnel implementations.
This enables devices using only L3 headers for ECMP to correctly balance
VXLAN-encapsulated IPv6 packets.
```
$ ./ip/ip link add dummy1 type dummy
$ ./ip/ip addr add 2001:db8::2/64 dev dummy1
$ ./ip/ip link set up dev dummy1
$ ./ip/ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2
$ ./ip/ip link set up dev vxlan1
$ ./ip/ip addr add 2001:db8:1::2/64 dev vxlan1
$ ./ip/ip link set arp off dev vxlan1
$ ping -q 2001:db8:1::1 &
$ tshark -d udp.port==8472,vxlan -Vpni dummy1 -c1
[...]
Internet Protocol Version 6, Src: 2001:db8::2, Dst: 2001:db8::1
0110 .... = Version: 6
.... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
.... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
.... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
.... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
[...]
Virtual eXtensible Local Area Network
Flags: 0x0800, VXLAN Network ID (VNI)
Group Policy ID: 0
VXLAN Network Identifier (VNI): 100
[...]
Internet Protocol Version 6, Src: 2001:db8:1::2, Dst: 2001:db8:1::1
0110 .... = Version: 6
.... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
.... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
.... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
.... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
```
Signed-off-by: Alce Lafranque <alce@lafranque.net>
Co-developed-by: Vincent Bernat <vincent@bernat.ch>
Signed-off-by: Vincent Bernat <vincent@bernat.ch>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-11-14 11:36:57 -06:00
|
|
|
[IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX),
|
2024-12-05 16:40:57 +01:00
|
|
|
[IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)),
|
2012-10-01 12:32:35 +00:00
|
|
|
};
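A brief note on the strict_start_type entry at the top of this policy, since its effect is easy to miss:

```c
/* Attributes numbered at or above IFLA_VXLAN_LOCALBYPASS are parsed under
 * strict validation: an unknown or malformed attribute in that range is
 * rejected with an error instead of being silently skipped, which is the
 * legacy behaviour for the older attribute types.
 */
```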
|
|
|
|
|
2017-06-25 23:56:01 +02:00
|
|
|
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
|
|
|
|
struct netlink_ext_ack *extack)
|
2012-10-01 12:32:35 +00:00
|
|
|
{
|
|
|
|
if (tb[IFLA_ADDRESS]) {
|
|
|
|
if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
|
2017-08-11 15:20:59 -07:00
|
|
|
NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
|
|
|
|
"Provided link layer address is not Ethernet");
|
2012-10-01 12:32:35 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
|
2017-08-11 15:20:59 -07:00
|
|
|
NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
|
|
|
|
"Provided Ethernet address is not unicast");
|
2012-10-01 12:32:35 +00:00
|
|
|
return -EADDRNOTAVAIL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-06-19 10:03:55 +02:00
|
|
|
if (tb[IFLA_MTU]) {
|
2017-06-27 14:42:43 +02:00
|
|
|
u32 mtu = nla_get_u32(tb[IFLA_MTU]);
|
2017-06-19 10:03:55 +02:00
|
|
|
|
2017-08-11 15:20:59 -07:00
|
|
|
if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
|
|
|
|
"MTU must be between 68 and 65535");
|
2017-06-19 10:03:55 +02:00
|
|
|
return -EINVAL;
|
2017-08-11 15:20:59 -07:00
|
|
|
}
|
2017-06-19 10:03:55 +02:00
|
|
|
}
|
|
|
|
|
2017-08-11 15:20:59 -07:00
|
|
|
if (!data) {
|
|
|
|
NL_SET_ERR_MSG(extack,
|
|
|
|
"Required attributes not provided to perform the operation");
|
2012-10-01 12:32:35 +00:00
|
|
|
return -EINVAL;
|
2017-08-11 15:20:59 -07:00
|
|
|
}
|
2012-10-01 12:32:35 +00:00
|
|
|
|
|
|
|
if (data[IFLA_VXLAN_ID]) {
|
2017-06-19 10:03:55 +02:00
|
|
|
u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
|
|
|
|
|
2017-08-11 15:20:59 -07:00
|
|
|
if (id >= VXLAN_N_VID) {
|
2020-04-22 17:29:50 +02:00
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID],
|
2017-08-11 15:20:59 -07:00
|
|
|
"VXLAN ID must be lower than 16777216");
|
2012-10-01 12:32:35 +00:00
|
|
|
return -ERANGE;
|
2017-08-11 15:20:59 -07:00
|
|
|
}
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
2012-10-09 20:35:50 +00:00
|
|
|
if (data[IFLA_VXLAN_PORT_RANGE]) {
|
|
|
|
const struct ifla_vxlan_port_range *p
|
|
|
|
= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
|
|
|
|
|
|
|
|
if (ntohs(p->high) < ntohs(p->low)) {
|
2020-04-22 17:29:50 +02:00
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE],
|
2017-08-11 15:20:59 -07:00
|
|
|
"Invalid source port range");
|
2012-10-09 20:35:50 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-11-08 12:19:16 +01:00
|
|
|
if (data[IFLA_VXLAN_DF]) {
|
|
|
|
enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);
|
|
|
|
|
|
|
|
if (df < 0 || df > VXLAN_DF_MAX) {
|
2020-04-22 17:29:50 +02:00
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_DF],
|
2018-11-08 12:19:16 +01:00
|
|
|
"Invalid DF attribute");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
return 0;
|
|
|
|
}
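A concrete instance of the ID bound enforced above:

```c
/* VXLAN_N_VID is 1 << 24, so a request with id 16777216 fails with -ERANGE,
 * while id 16777215 (0xFFFFFF) is the largest VNI that is accepted.
 */
```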
|
|
|
|
|
2013-01-29 23:43:07 +00:00
|
|
|
static void vxlan_get_drvinfo(struct net_device *netdev,
|
|
|
|
struct ethtool_drvinfo *drvinfo)
|
|
|
|
{
|
2022-08-30 22:14:52 +02:00
|
|
|
strscpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
|
|
|
|
strscpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
|
2013-01-29 23:43:07 +00:00
|
|
|
}
|
|
|
|
|
static int vxlan_get_link_ksettings(struct net_device *dev,
				    struct ethtool_link_ksettings *cmd)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
							 dst->remote_ifindex);

	if (!lowerdev) {
		cmd->base.duplex = DUPLEX_UNKNOWN;
		cmd->base.port = PORT_OTHER;
		cmd->base.speed = SPEED_UNKNOWN;

		return 0;
	}

	return __ethtool_get_link_ksettings(lowerdev, cmd);
}
static const struct ethtool_ops vxlan_ethtool_ops = {
	.get_drvinfo	= vxlan_get_drvinfo,
	.get_link	= ethtool_op_get_link,
	.get_link_ksettings = vxlan_get_link_ksettings,
};
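
/* Open the kernel UDP socket that carries the tunnel. An IPv6 socket is
 * bound v6-only so that a companion IPv4 socket can share the same port;
 * receive checksum handling follows VXLAN_F_UDP_ZERO_CSUM6_RX.
 */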
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
					__be16 port, u32 flags, int ifindex)
{
	struct socket *sock;
	struct udp_port_cfg udp_conf;
	int err;

	memset(&udp_conf, 0, sizeof(udp_conf));

	if (ipv6) {
		udp_conf.family = AF_INET6;
		udp_conf.use_udp6_rx_checksums =
		    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
		udp_conf.ipv6_v6only = 1;
	} else {
		udp_conf.family = AF_INET;
	}

	udp_conf.local_udp_port = port;
	udp_conf.bind_ifindex = ifindex;

	/* Open UDP socket */
	err = udp_sock_create(net, &udp_conf, &sock);
	if (err < 0)
		return ERR_PTR(err);

	udp_allow_gso(sock->sk);
	return sock;
}
/* Create new listen socket if needed */
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
					      __be16 port, u32 flags,
					      int ifindex)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int h;
	struct udp_tunnel_sock_cfg tunnel_cfg;

	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);

	sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
	if (IS_ERR(sock)) {
		kfree(vs);
		return ERR_CAST(sock);
	}

	vs->sock = sock;
	refcount_set(&vs->refcnt, 1);
	vs->flags = (flags & VXLAN_F_RCV_FLAGS);

	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
	udp_tunnel_notify_add_rx_port(sock,
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
				      UDP_TUNNEL_TYPE_VXLAN);
	spin_unlock(&vn->sock_lock);

	/* Mark socket as an encapsulation socket. */
	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
	tunnel_cfg.sk_user_data = vs;
	tunnel_cfg.encap_type = 1;
	tunnel_cfg.encap_rcv = vxlan_rcv;
	tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
	tunnel_cfg.encap_destroy = NULL;
	if (vs->flags & VXLAN_F_GPE) {
		tunnel_cfg.gro_receive = vxlan_gpe_gro_receive;
		tunnel_cfg.gro_complete = vxlan_gpe_gro_complete;
	} else {
		tunnel_cfg.gro_receive = vxlan_gro_receive;
		tunnel_cfg.gro_complete = vxlan_gro_complete;
	}

	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);

	return vs;
}
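
/* Attach the device to an existing shared socket matching the address
 * family, destination port, receive flags and L3 master device, or create
 * a new one. The refcount_inc_not_zero() check avoids reusing a socket
 * that is concurrently being destroyed.
 */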
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
	struct vxlan_sock *vs = NULL;
	struct vxlan_dev_node *node;
	int l3mdev_index = 0;

	if (vxlan->cfg.remote_ifindex)
		l3mdev_index = l3mdev_master_upper_ifindex_by_index(
			vxlan->net, vxlan->cfg.remote_ifindex);

	if (!vxlan->cfg.no_share) {
		spin_lock(&vn->sock_lock);
		vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
				     vxlan->cfg.dst_port, vxlan->cfg.flags,
				     l3mdev_index);
		if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
			spin_unlock(&vn->sock_lock);
			return -EBUSY;
		}
		spin_unlock(&vn->sock_lock);
	}
	if (!vs)
		vs = vxlan_socket_create(vxlan->net, ipv6,
					 vxlan->cfg.dst_port, vxlan->cfg.flags,
					 l3mdev_index);
	if (IS_ERR(vs))
		return PTR_ERR(vs);
#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6) {
		rcu_assign_pointer(vxlan->vn6_sock, vs);
		node = &vxlan->hlist6;
	} else
#endif
	{
		rcu_assign_pointer(vxlan->vn4_sock, vs);
		node = &vxlan->hlist4;
	}

	if (metadata && (vxlan->cfg.flags & VXLAN_F_VNIFILTER))
		vxlan_vs_add_vnigrp(vxlan, vs, ipv6);
	else
		vxlan_vs_add_dev(vs, vxlan, node);

	return 0;
}
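
/* Create the receive socket(s) for the device. In metadata (collect
 * metadata) mode both IPv6 and IPv4 sockets are opened, since the address
 * family is only known per packet; -EAFNOSUPPORT from the IPv6 side is
 * tolerated so IPv4-only hosts still work.
 */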
static int vxlan_sock_add(struct vxlan_dev *vxlan)
{
	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
	bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
	bool ipv4 = !ipv6 || metadata;
	int ret = 0;

	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
#if IS_ENABLED(CONFIG_IPV6)
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
	if (ipv6) {
		ret = __vxlan_sock_add(vxlan, true);
		if (ret < 0 && ret != -EAFNOSUPPORT)
			ipv4 = false;
	}
#endif
	if (ipv4)
		ret = __vxlan_sock_add(vxlan, false);
	if (ret < 0)
		vxlan_sock_release(vxlan);
	return ret;
}
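
/* A VNI collides when another device in the same namespace uses it (either
 * as its single VNI or via its VNI filter) with the same destination port
 * and receive-relevant flags; devices using link-local IPv6 addresses are
 * further distinguished by their lower interface.
 */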
int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan,
		     struct vxlan_config *conf, __be32 vni)
{
	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
	struct vxlan_dev *tmp;

	list_for_each_entry(tmp, &vn->vxlan_list, next) {
		if (tmp == vxlan)
			continue;
		if (tmp->cfg.flags & VXLAN_F_VNIFILTER) {
			if (!vxlan_vnifilter_lookup(tmp, vni))
				continue;
		} else if (tmp->cfg.vni != vni) {
			continue;
		}
		if (tmp->cfg.dst_port != conf->dst_port)
			continue;
		if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
		    (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
			continue;

		if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    tmp->cfg.remote_ifindex != conf->remote_ifindex)
			continue;

		return -EEXIST;
	}

	return 0;
}
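
/* Validate and normalize a configuration before it is applied: resolve the
 * address families, enforce GPE and IPv6 scope restrictions, fill in the
 * default destination port and ageing interval, and reject VNIs already in
 * use.
 */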
static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
				 struct net_device **lower,
				 struct vxlan_dev *old,
				 struct netlink_ext_ack *extack)
{
	bool use_ipv6 = false;

	if (conf->flags & VXLAN_F_GPE) {
		/* For now, allow GPE only together with
		 * COLLECT_METADATA. This can be relaxed later; in such
		 * case, the other side of the PtP link will have to be
		 * provided.
		 */
		if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
		    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			NL_SET_ERR_MSG(extack,
				       "VXLAN GPE does not support this combination of attributes");
			return -EINVAL;
		}
	}

	if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) {
		/* Unless IPv6 is explicitly requested, assume IPv4 */
		conf->remote_ip.sa.sa_family = AF_INET;
		conf->saddr.sa.sa_family = AF_INET;
	} else if (!conf->remote_ip.sa.sa_family) {
		conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family;
	} else if (!conf->saddr.sa.sa_family) {
		conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
	}

	if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) {
		NL_SET_ERR_MSG(extack,
			       "Local and remote address must be from the same family");
		return -EINVAL;
	}

	if (vxlan_addr_multicast(&conf->saddr)) {
		NL_SET_ERR_MSG(extack, "Local address cannot be multicast");
		return -EINVAL;
	}

	if (conf->saddr.sa.sa_family == AF_INET6) {
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG(extack,
				       "IPv6 support not enabled in the kernel");
			return -EPFNOSUPPORT;
		}
		use_ipv6 = true;
		conf->flags |= VXLAN_F_IPV6;

		if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			int local_type =
				ipv6_addr_type(&conf->saddr.sin6.sin6_addr);
			int remote_type =
				ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr);

			if (local_type & IPV6_ADDR_LINKLOCAL) {
				if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
				    (remote_type != IPV6_ADDR_ANY)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
					return -EINVAL;
				}

				conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
			} else {
				if (remote_type ==
				    (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
					return -EINVAL;
				}

				conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL;
			}
		}
	}

	if (conf->label && !use_ipv6) {
		NL_SET_ERR_MSG(extack,
			       "Label attribute only applies to IPv6 VXLAN devices");
		return -EINVAL;
	}

	if (conf->label_policy && !use_ipv6) {
		NL_SET_ERR_MSG(extack,
			       "Label policy only applies to IPv6 VXLAN devices");
		return -EINVAL;
	}

	if (conf->remote_ifindex) {
		struct net_device *lowerdev;

		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
		if (!lowerdev) {
			NL_SET_ERR_MSG(extack,
				       "Invalid local interface, device not found");
			return -ENODEV;
		}

#if IS_ENABLED(CONFIG_IPV6)
		if (use_ipv6) {
			struct inet6_dev *idev = __in6_dev_get(lowerdev);

			if (idev && idev->cnf.disable_ipv6) {
				NL_SET_ERR_MSG(extack,
					       "IPv6 support disabled by administrator");
				return -EPERM;
			}
		}
#endif

		*lower = lowerdev;
	} else {
		if (vxlan_addr_multicast(&conf->remote_ip)) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for multicast remote destination");

			return -EINVAL;
		}

#if IS_ENABLED(CONFIG_IPV6)
		if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for link-local local/remote addresses");
			return -EINVAL;
		}
#endif

		*lower = NULL;
	}

	if (!conf->dst_port) {
		if (conf->flags & VXLAN_F_GPE)
			conf->dst_port = htons(IANA_VXLAN_GPE_UDP_PORT);
		else
			conf->dst_port = htons(vxlan_port);
	}

	if (!conf->age_interval)
		conf->age_interval = FDB_AGE_DEFAULT;

	if (vxlan_vni_in_use(src_net, old, conf, conf->vni)) {
		NL_SET_ERR_MSG(extack,
			       "A VXLAN device with the specified VNI already exists");
		return -EEXIST;
	}

	return 0;
}
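
/* Apply a validated configuration to the device. Headroom and the maximum
 * MTU are derived from the lower device; in metadata mode the IPv6 header
 * size is assumed for headroom since the per-packet family is unknown.
 */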
static void vxlan_config_apply(struct net_device *dev,
			       struct vxlan_config *conf,
			       struct net_device *lowerdev,
			       struct net *src_net,
			       bool changelink)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	unsigned short needed_headroom = ETH_HLEN;
	int max_mtu = ETH_MAX_MTU;
	u32 flags = conf->flags;

	if (!changelink) {
		if (flags & VXLAN_F_GPE)
			vxlan_raw_setup(dev);
		else
			vxlan_ether_setup(dev);

		if (conf->mtu)
			dev->mtu = conf->mtu;

		vxlan->net = src_net;
	}

	dst->remote_vni = conf->vni;

	memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));

	if (lowerdev) {
		dst->remote_ifindex = conf->remote_ifindex;

		netif_inherit_tso_max(dev, lowerdev);

		needed_headroom = lowerdev->hard_header_len;
		needed_headroom += lowerdev->needed_headroom;

		dev->needed_tailroom = lowerdev->needed_tailroom;

		max_mtu = lowerdev->mtu - vxlan_headroom(flags);
		if (max_mtu < ETH_MIN_MTU)
			max_mtu = ETH_MIN_MTU;

		if (!changelink && !conf->mtu)
			dev->mtu = max_mtu;
	}

	if (dev->mtu > max_mtu)
		dev->mtu = max_mtu;

	if (flags & VXLAN_F_COLLECT_METADATA)
		flags |= VXLAN_F_IPV6;
	needed_headroom += vxlan_headroom(flags);
	dev->needed_headroom = needed_headroom;

	memcpy(&vxlan->cfg, conf, sizeof(*conf));
}
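
/* Thin wrapper tying validation and application together; newlink paths
 * pass changelink == false, changelink paths true.
 */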
static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
			       struct vxlan_config *conf, bool changelink,
			       struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *lowerdev;
	int ret;

	ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack);
	if (ret)
		return ret;

	vxlan_config_apply(dev, conf, lowerdev, src_net, changelink);

	return 0;
}
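
/* Register a new VXLAN device. The default FDB entry is created before
 * registration but only inserted and notified once the device is fully set
 * up, so the error paths below must dispose of it by hand (see the comment
 * at errout).
 */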
static int __vxlan_dev_create(struct net *net, struct net_device *dev,
			      struct vxlan_config *conf,
			      struct netlink_ext_ack *extack)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *remote_dev = NULL;
	struct vxlan_fdb *f = NULL;
	bool unregister = false;
	struct vxlan_rdst *dst;
	int err;

	dst = &vxlan->default_dst;
	err = vxlan_dev_configure(net, dev, conf, false, extack);
	if (err)
		return err;

	dev->ethtool_ops = &vxlan_ethtool_ops;

	/* create an fdb entry for a valid default destination */
	if (!vxlan_addr_any(&dst->remote_ip)) {
		err = vxlan_fdb_create(vxlan, all_zeros_mac,
				       &dst->remote_ip,
				       NUD_REACHABLE | NUD_PERMANENT,
				       vxlan->cfg.dst_port,
				       dst->remote_vni,
				       dst->remote_vni,
				       dst->remote_ifindex,
				       NTF_SELF, 0, &f, extack);
		if (err)
			return err;
	}

	err = register_netdevice(dev);
	if (err)
		goto errout;
	unregister = true;

	if (dst->remote_ifindex) {
		remote_dev = __dev_get_by_index(net, dst->remote_ifindex);
		if (!remote_dev) {
			err = -ENODEV;
			goto errout;
		}

		err = netdev_upper_dev_link(remote_dev, dev, extack);
		if (err)
			goto errout;
	}

	err = rtnl_configure_link(dev, NULL, 0, NULL);
	if (err < 0)
		goto unlink;

	if (f) {
		vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f);

		/* notify default fdb entry */
		err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f),
				       RTM_NEWNEIGH, true, extack);
		if (err) {
			vxlan_fdb_destroy(vxlan, f, false, false);
			if (remote_dev)
				netdev_upper_dev_unlink(remote_dev, dev);
			goto unregister;
		}
	}

	list_add(&vxlan->next, &vn->vxlan_list);
	if (remote_dev)
		dst->remote_dev = remote_dev;
	return 0;
unlink:
	if (remote_dev)
		netdev_upper_dev_unlink(remote_dev, dev);
errout:
	/* unregister_netdevice() destroys the default FDB entry with deletion
	 * notification. But the addition notification was not sent yet, so
	 * destroy the entry by hand here.
	 */
	if (f)
		__vxlan_fdb_free(f);
unregister:
	if (unregister)
		unregister_netdevice(dev);
	return err;
}
/* Set/clear flags based on attribute */
static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[],
			 int attrtype, unsigned long mask, bool changelink,
			 bool changelink_supported,
			 struct netlink_ext_ack *extack)
{
	unsigned long flags;

	if (!tb[attrtype])
		return 0;

	if (changelink && !changelink_supported) {
		vxlan_flag_attr_error(attrtype, extack);
		return -EOPNOTSUPP;
	}

	if (vxlan_policy[attrtype].type == NLA_FLAG)
		flags = conf->flags | mask;
	else if (nla_get_u8(tb[attrtype]))
		flags = conf->flags | mask;
	else
		flags = conf->flags & ~mask;

	conf->flags = flags;

	return 0;
}
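
/* Translate netlink attributes into a vxlan_config. On changelink the
 * existing configuration is the starting point, and attributes that cannot
 * be changed on a live device are rejected with -EOPNOTSUPP.
 */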
static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
			 struct net_device *dev, struct vxlan_config *conf,
			 bool changelink, struct netlink_ext_ack *extack)
{
	struct vxlanhdr used_bits = {
		.vx_flags = VXLAN_HF_VNI,
		.vx_vni = VXLAN_VNI_MASK,
	};
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int err = 0;

	memset(conf, 0, sizeof(*conf));

	/* if changelink operation, start with old existing cfg */
	if (changelink)
		memcpy(conf, &vxlan->cfg, sizeof(*conf));

	if (data[IFLA_VXLAN_ID]) {
		__be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));

		if (changelink && (vni != conf->vni)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI");
			return -EOPNOTSUPP;
		}
		conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
	}

	if (data[IFLA_VXLAN_GROUP]) {
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group");
			return -EOPNOTSUPP;
		}

		conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
		conf->remote_ip.sa.sa_family = AF_INET;
	} else if (data[IFLA_VXLAN_GROUP6]) {
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel");
			return -EPFNOSUPPORT;
		}

		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group");
			return -EOPNOTSUPP;
		}

		conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
		conf->remote_ip.sa.sa_family = AF_INET6;
	}

	if (data[IFLA_VXLAN_LOCAL]) {
		if (changelink && (conf->saddr.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old");
			return -EOPNOTSUPP;
		}

		conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
		conf->saddr.sa.sa_family = AF_INET;
	} else if (data[IFLA_VXLAN_LOCAL6]) {
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel");
			return -EPFNOSUPPORT;
		}

		if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old");
			return -EOPNOTSUPP;
		}

		/* TODO: respect scope id */
		conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
		conf->saddr.sa.sa_family = AF_INET6;
	}

	if (data[IFLA_VXLAN_LINK])
		conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);

	if (data[IFLA_VXLAN_TOS])
		conf->tos = nla_get_u8(data[IFLA_VXLAN_TOS]);

	if (data[IFLA_VXLAN_TTL])
		conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);

	if (data[IFLA_VXLAN_TTL_INHERIT]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT,
				    VXLAN_F_TTL_INHERIT, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_LABEL])
		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
			      IPV6_FLOWLABEL_MASK;
	if (data[IFLA_VXLAN_LABEL_POLICY])
		conf->label_policy = nla_get_u32(data[IFLA_VXLAN_LABEL_POLICY]);

	if (data[IFLA_VXLAN_LEARNING]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
				    VXLAN_F_LEARN, changelink, true,
				    extack);
		if (err)
			return err;
	} else if (!changelink) {
		/* default to learn on a new device */
		conf->flags |= VXLAN_F_LEARN;
	}

	if (data[IFLA_VXLAN_AGEING])
		conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);

	if (data[IFLA_VXLAN_PROXY]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY,
				    VXLAN_F_PROXY, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_RSC]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC,
				    VXLAN_F_RSC, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_L2MISS]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS,
				    VXLAN_F_L2MISS, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_L3MISS]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS,
				    VXLAN_F_L3MISS, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_LIMIT]) {
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT],
					    "Cannot change limit");
			return -EOPNOTSUPP;
		}
		conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
	}

	if (data[IFLA_VXLAN_COLLECT_METADATA]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA,
				    VXLAN_F_COLLECT_METADATA, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		if (!changelink) {
			const struct ifla_vxlan_port_range *p
				= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
			conf->port_min = ntohs(p->low);
			conf->port_max = ntohs(p->high);
		} else {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
					    "Cannot change port range");
			return -EOPNOTSUPP;
		}
	}

	if (data[IFLA_VXLAN_PORT]) {
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT],
					    "Cannot change port");
			return -EOPNOTSUPP;
		}
		conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
	}

	if (data[IFLA_VXLAN_UDP_CSUM]) {
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM],
					    "Cannot change UDP_CSUM flag");
			return -EOPNOTSUPP;
		}
		if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
	}

	if (data[IFLA_VXLAN_LOCALBYPASS]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS,
				    VXLAN_F_LOCALBYPASS, changelink,
				    true, extack);
		if (err)
			return err;
	} else if (!changelink) {
		/* default to local bypass on a new device */
		conf->flags |= VXLAN_F_LOCALBYPASS;
	}

	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
				    VXLAN_F_UDP_ZERO_CSUM6_TX, changelink,
				    false, extack);
		if (err)
			return err;
	}
|
|
|
|
2017-02-20 08:29:19 -08:00
|
|
|
if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) {
|
2019-02-25 22:03:01 -08:00
|
|
|
err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
|
|
|
|
VXLAN_F_UDP_ZERO_CSUM6_RX, changelink,
|
|
|
|
false, extack);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2017-02-20 08:29:19 -08:00
|
|
|
}
|
2014-06-04 17:20:29 -07:00
|
|
|
|
2017-02-20 08:29:19 -08:00
|
|
|
if (data[IFLA_VXLAN_REMCSUM_TX]) {
|
2019-02-25 22:03:01 -08:00
|
|
|
err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX,
|
|
|
|
VXLAN_F_REMCSUM_TX, changelink, false,
|
|
|
|
extack);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2017-02-20 08:29:19 -08:00
|
|
|
}
|
2015-01-12 17:00:38 -08:00
|
|
|
|
2017-02-20 08:29:19 -08:00
|
|
|
if (data[IFLA_VXLAN_REMCSUM_RX]) {
|
2019-02-25 22:03:01 -08:00
|
|
|
err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX,
|
|
|
|
VXLAN_F_REMCSUM_RX, changelink, false,
|
|
|
|
extack);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2024-12-05 16:40:54 +01:00
|
|
|
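/* Remote checksum offload repurposes the RCO header flag and the
 * reserved low byte of the VNI word, so record both as used bits.
 */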
used_bits.vx_flags |= VXLAN_HF_RCO;
|
|
|
|
used_bits.vx_vni |= ~VXLAN_VNI_MASK;
|
2017-02-20 08:29:19 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (data[IFLA_VXLAN_GBP]) {
|
2019-02-25 22:03:01 -08:00
|
|
|
err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP,
|
|
|
|
VXLAN_F_GBP, changelink, false, extack);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2024-12-05 16:40:54 +01:00
|
|
|
used_bits.vx_flags |= VXLAN_GBP_USED_BITS;
|
2017-02-20 08:29:19 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (data[IFLA_VXLAN_GPE]) {
|
2019-02-25 22:03:01 -08:00
|
|
|
err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE,
|
|
|
|
VXLAN_F_GPE, changelink, false,
|
|
|
|
extack);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2024-12-05 16:40:54 +01:00
|
|
|
|
|
|
|
used_bits.vx_flags |= VXLAN_GPE_USED_BITS;
|
2017-02-20 08:29:19 -08:00
|
|
|
}
|
|
|
|
|
2024-12-05 16:40:57 +01:00
|
|
|
if (data[IFLA_VXLAN_RESERVED_BITS]) {
|
|
|
|
struct vxlanhdr reserved_bits;
|
|
|
|
|
|
|
|
if (changelink) {
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack,
|
|
|
|
data[IFLA_VXLAN_RESERVED_BITS],
|
|
|
|
"Cannot change reserved_bits");
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
|
|
|
nla_memcpy(&reserved_bits, data[IFLA_VXLAN_RESERVED_BITS],
|
|
|
|
sizeof(reserved_bits));
|
|
|
|
if (used_bits.vx_flags & reserved_bits.vx_flags ||
|
|
|
|
used_bits.vx_vni & reserved_bits.vx_vni) {
|
|
|
|
__be64 ub_be64, rb_be64;
|
|
|
|
|
|
|
|
memcpy(&ub_be64, &used_bits, sizeof(ub_be64));
|
|
|
|
memcpy(&rb_be64, &reserved_bits, sizeof(rb_be64));
|
|
|
|
|
|
|
|
NL_SET_ERR_MSG_ATTR_FMT(extack,
|
|
|
|
data[IFLA_VXLAN_RESERVED_BITS],
|
|
|
|
"Used bits %#018llx cannot overlap reserved bits %#018llx",
|
|
|
|
be64_to_cpu(ub_be64),
|
|
|
|
be64_to_cpu(rb_be64));
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
conf->reserved_bits = reserved_bits;
|
|
|
|
} else {
|
|
|
|
/* For backwards compatibility, only allow reserved fields to be
|
|
|
|
* used by VXLAN extensions if explicitly requested.
|
|
|
|
*/
|
|
|
|
conf->reserved_bits = (struct vxlanhdr) {
|
|
|
|
.vx_flags = ~used_bits.vx_flags,
|
|
|
|
.vx_vni = ~used_bits.vx_vni,
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2017-02-20 08:29:19 -08:00
|
|
|
if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
|
2019-02-25 22:03:01 -08:00
|
|
|
err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL,
|
|
|
|
VXLAN_F_REMCSUM_NOPARTIAL, changelink,
|
|
|
|
false, extack);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2017-02-20 08:29:19 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (tb[IFLA_MTU]) {
|
2019-02-25 22:03:01 -08:00
|
|
|
if (changelink) {
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
|
|
|
|
"Cannot change mtu");
|
2017-02-20 08:29:19 -08:00
|
|
|
return -EOPNOTSUPP;
|
2019-02-25 22:03:01 -08:00
|
|
|
}
|
2017-02-20 08:29:19 -08:00
|
|
|
conf->mtu = nla_get_u32(tb[IFLA_MTU]);
|
|
|
|
}
|
|
|
|
|
vxlan: Allow configuration of DF behaviour
Allow users to set the IPv4 DF bit in outgoing packets, or to inherit its
value from the IPv4 inner header. If the encapsulated protocol is IPv6 and
DF is configured to be inherited, always set it.
For IPv4, inheriting DF from the inner header was probably intended from
the very beginning judging by the comment to vxlan_xmit(), but it wasn't
actually implemented -- also because it would have done more harm than
good, without handling for ICMP Fragmentation Needed messages.
According to RFC 7348, "Path MTU discovery MAY be used". An expired RFC
draft, draft-saum-nvo3-pmtud-over-vxlan-05, whose purpose was to describe
PMTUD implementation, says that "is a MUST that Vxlan gateways [...]
SHOULD set the DF-bit [...]", whatever that means.
Given this background, the only sane option is probably to let the user
decide, and keep the current behaviour as default.
This only applies to non-lwt tunnels: if an external control plane is
used, tunnel key will still control the DF flag.
v2:
- DF behaviour configuration only applies for non-lwt tunnels, move DF
setting to if (!info) block in vxlan_xmit_one() (Stephen Hemminger)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 12:19:16 +01:00
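As a usage sketch of the knob this parses (assuming an iproute2 build that exposes the vxlan "df" keyword; the attribute itself is what the kernel accepts here):
```
# Copy the inner IPv4 DF bit into the outer header; "df set" and
# "df unset" force the bit instead (unset is the historical default).
$ ip link add vxlan0 type vxlan id 42 remote 192.0.2.1 dstport 4789 df inherit
```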
|
|
|
if (data[IFLA_VXLAN_DF])
|
|
|
|
conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);
|
|
|
|
|
2022-03-01 05:04:36 +00:00
|
|
|
if (data[IFLA_VXLAN_VNIFILTER]) {
|
|
|
|
err = vxlan_nl2flag(conf, data, IFLA_VXLAN_VNIFILTER,
|
|
|
|
VXLAN_F_VNIFILTER, changelink, false,
|
|
|
|
extack);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if ((conf->flags & VXLAN_F_VNIFILTER) &&
|
|
|
|
!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_VNIFILTER],
|
|
|
|
"vxlan vnifilter only valid in collect metadata mode");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-02-20 08:29:19 -08:00
|
|
|
return 0;
|
|
|
|
}
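The IFLA_VXLAN_VNIFILTER branch above rejects VNI filtering on devices that are not in collect-metadata mode. A minimal configuration that passes that check might look as follows (assuming an iproute2 recent enough to know the "vnifilter" keyword and the "bridge vni" object):
```
$ ip link add vxlan0 type vxlan dstport 4789 external vnifilter
$ ip link set up dev vxlan0
# With vnifilter, the set of VNIs the device terminates is managed explicitly:
$ bridge vni add dev vxlan0 vni 100
```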
|
|
|
|
|
|
|
|
static int vxlan_newlink(struct net *src_net, struct net_device *dev,
|
2017-06-25 23:55:59 +02:00
|
|
|
struct nlattr *tb[], struct nlattr *data[],
|
|
|
|
struct netlink_ext_ack *extack)
|
2017-02-20 08:29:19 -08:00
|
|
|
{
|
|
|
|
struct vxlan_config conf;
|
|
|
|
int err;
|
|
|
|
|
2019-02-25 22:03:01 -08:00
|
|
|
err = vxlan_nl2conf(tb, data, dev, &conf, false, extack);
|
2017-02-20 08:29:19 -08:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2017-08-11 15:20:59 -07:00
|
|
|
return __vxlan_dev_create(src_net, dev, &conf, extack);
|
2017-02-20 08:29:19 -08:00
|
|
|
}
|
2015-01-12 17:00:38 -08:00
|
|
|
|
2017-02-20 08:29:19 -08:00
|
|
|
static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
|
2017-06-25 23:56:00 +02:00
|
|
|
struct nlattr *data[],
|
|
|
|
struct netlink_ext_ack *extack)
|
2017-02-20 08:29:19 -08:00
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
2019-01-16 23:06:39 +00:00
|
|
|
struct net_device *lowerdev;
|
2017-02-20 08:29:19 -08:00
|
|
|
struct vxlan_config conf;
|
2019-10-21 18:47:57 +00:00
|
|
|
struct vxlan_rdst *dst;
|
2017-02-20 08:29:19 -08:00
|
|
|
int err;
|
|
|
|
|
2019-10-21 18:47:57 +00:00
|
|
|
dst = &vxlan->default_dst;
|
2019-02-25 22:03:01 -08:00
|
|
|
err = vxlan_nl2conf(tb, data, dev, &conf, true, extack);
|
2017-02-20 08:29:19 -08:00
|
|
|
if (err)
|
|
|
|
return err;
|
vxlan: Group Policy extension
Implements support for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata is mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark, the
upper 16 bits are used for flags.
SELinux allows managing labels to secure local resources. However,
distributed applications require ACLs to be implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow mapping security contexts to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
Host 1: Host 2:
Group A Group B Group B Group A
+-----+ +-------------+ +-------+ +-----+
| lxc | | SELinux CTX | | httpd | | VM |
+--+--+ +--+----------+ +---+---+ +--+--+
\---+---/ \----+---/
| |
+---+---+ +---+---+
| vxlan | | vxlan |
+---+---+ +---+---+
+------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
OVS:
# ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
# ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 03:53:55 +01:00
|
|
|
|
2019-01-16 23:06:39 +00:00
|
|
|
err = vxlan_config_validate(vxlan->net, &conf, &lowerdev,
|
|
|
|
vxlan, extack);
|
2017-02-20 08:29:19 -08:00
|
|
|
if (err)
|
|
|
|
return err;
|
2015-02-10 16:30:32 -08:00
|
|
|
|
2019-10-30 08:15:12 +00:00
|
|
|
if (dst->remote_dev == lowerdev)
|
|
|
|
lowerdev = NULL;
|
|
|
|
|
2019-10-21 18:47:57 +00:00
|
|
|
err = netdev_adjacent_change_prepare(dst->remote_dev, lowerdev, dev,
|
|
|
|
extack);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2017-02-20 08:29:19 -08:00
|
|
|
/* handle default dst entry */
|
2019-01-16 23:06:41 +00:00
|
|
|
if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) {
|
2019-06-06 17:57:58 +08:00
|
|
|
u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni);
|
|
|
|
|
|
|
|
spin_lock_bh(&vxlan->hash_lock[hash_index]);
|
2019-01-16 23:06:41 +00:00
|
|
|
if (!vxlan_addr_any(&conf.remote_ip)) {
|
vxlan: changelink: Fix handling of default remotes
Default remotes are stored as FDB entries with an Ethernet address of
00:00:00:00:00:00. When a request is made to change a remote address of
a VXLAN device, vxlan_changelink() first deletes the existing default
remote, and then creates a new FDB entry.
This works well as long as the list of default remotes matches exactly
the configuration of a VXLAN remote address. Thus when the VXLAN device
has a remote of X, there should be exactly one default remote FDB entry
X. If the VXLAN device has no remote address, there should be no such
entry.
Besides using "ip link set", it is possible to manipulate the list of
default remotes by using the "bridge fdb" command. It is therefore easy to break
the above condition. Under such circumstances, the __vxlan_fdb_delete()
call doesn't delete the FDB entry itself, but just one remote. The
following vxlan_fdb_create() then creates a new FDB entry, leading to a
situation where two entries exist for the address 00:00:00:00:00:00,
each with a different subset of default remotes.
An even more obvious breakage rooted in the same cause can be observed
when a remote address is configured for a VXLAN device that did not have
one before. In that case vxlan_changelink() doesn't remove any remote,
and just creates a new FDB entry for the new address:
$ ip link add name vx up type vxlan id 2000 dstport 4789
$ bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.20 self permanent
$ bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.30 self permanent
$ ip link set dev vx type vxlan remote 192.0.2.30
$ bridge fdb sh dev vx | grep 00:00:00:00:00:00
00:00:00:00:00:00 dst 192.0.2.30 self permanent <- new entry, 1 rdst
00:00:00:00:00:00 dst 192.0.2.20 self permanent <- orig. entry, 2 rdsts
00:00:00:00:00:00 dst 192.0.2.30 self permanent
To fix this, instead of calling vxlan_fdb_create() directly, defer to
vxlan_fdb_update(). That has logic to handle the duplicates properly.
Additionally, it also handles notifications, so drop that call from
changelink as well.
Fixes: 0241b836732f ("vxlan: fix default fdb entry netlink notify ordering during netdev create")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-18 13:16:02 +00:00
|
|
|
err = vxlan_fdb_update(vxlan, all_zeros_mac,
|
2019-01-16 23:06:41 +00:00
|
|
|
&conf.remote_ip,
|
2017-02-20 08:29:19 -08:00
|
|
|
NUD_REACHABLE | NUD_PERMANENT,
|
2018-12-18 13:16:02 +00:00
|
|
|
NLM_F_APPEND | NLM_F_CREATE,
|
2017-02-20 08:29:19 -08:00
|
|
|
vxlan->cfg.dst_port,
|
2019-01-16 23:06:41 +00:00
|
|
|
conf.vni, conf.vni,
|
|
|
|
conf.remote_ifindex,
|
2020-05-21 22:26:14 -07:00
|
|
|
NTF_SELF, 0, true, extack);
|
2017-02-20 08:29:19 -08:00
|
|
|
if (err) {
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
|
2019-10-21 18:47:57 +00:00
|
|
|
netdev_adjacent_change_abort(dst->remote_dev,
|
|
|
|
lowerdev, dev);
|
2017-02-20 08:29:19 -08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
}
|
2019-01-16 23:06:43 +00:00
|
|
|
if (!vxlan_addr_any(&dst->remote_ip))
|
|
|
|
__vxlan_fdb_delete(vxlan, all_zeros_mac,
|
|
|
|
dst->remote_ip,
|
|
|
|
vxlan->cfg.dst_port,
|
|
|
|
dst->remote_vni,
|
|
|
|
dst->remote_vni,
|
|
|
|
dst->remote_ifindex,
|
|
|
|
true);
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
|
2022-03-01 05:04:36 +00:00
|
|
|
|
|
|
|
/* If vni filtering device, also update fdb entries of
|
|
|
|
* all vnis that were using default remote ip
|
|
|
|
*/
|
|
|
|
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
|
|
|
|
err = vxlan_vnilist_update_group(vxlan, &dst->remote_ip,
|
|
|
|
&conf.remote_ip, extack);
|
|
|
|
if (err) {
|
|
|
|
netdev_adjacent_change_abort(dst->remote_dev,
|
|
|
|
lowerdev, dev);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
}
|
2017-02-20 08:29:19 -08:00
|
|
|
}
|
2016-05-27 10:49:11 +08:00
|
|
|
|
2019-01-16 23:06:41 +00:00
|
|
|
if (conf.age_interval != vxlan->cfg.age_interval)
|
|
|
|
mod_timer(&vxlan->age_timer, jiffies);
|
|
|
|
|
2019-10-21 18:47:57 +00:00
|
|
|
netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev);
|
2020-06-08 14:53:01 -07:00
|
|
|
if (lowerdev && lowerdev != dst->remote_dev)
|
2019-10-21 18:47:57 +00:00
|
|
|
dst->remote_dev = lowerdev;
|
2019-01-16 23:06:41 +00:00
|
|
|
vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true);
|
2017-02-20 08:29:19 -08:00
|
|
|
return 0;
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void vxlan_dellink(struct net_device *dev, struct list_head *head)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
2023-10-09 13:06:10 +03:00
|
|
|
struct vxlan_fdb_flush_desc desc = {
|
|
|
|
/* Default entry is deleted at vxlan_uninit. */
|
|
|
|
.ignore_default_entry = true,
|
|
|
|
};
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2023-10-09 13:06:09 +03:00
|
|
|
vxlan_flush(vxlan, &desc);
|
2017-01-23 20:44:32 -08:00
|
|
|
|
2013-05-16 11:35:20 +00:00
|
|
|
list_del(&vxlan->next);
|
2012-10-01 12:32:35 +00:00
|
|
|
unregister_netdevice_queue(dev, head);
|
2019-10-21 18:47:57 +00:00
|
|
|
if (vxlan->default_dst.remote_dev)
|
|
|
|
netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev);
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static size_t vxlan_get_size(const struct net_device *dev)
|
|
|
|
{
|
|
|
|
return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */
|
2013-08-31 13:44:33 +08:00
|
|
|
nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
|
2012-10-01 12:32:35 +00:00
|
|
|
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
|
2013-08-31 13:44:33 +08:00
|
|
|
nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
|
2012-10-01 12:32:35 +00:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */
|
2018-09-26 10:35:42 +08:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL_INHERIT */
|
2012-10-01 12:32:35 +00:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */
|
2018-11-08 12:19:16 +01:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_DF */
|
2016-03-09 03:00:03 +01:00
|
|
|
nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
|
2023-11-14 11:36:57 -06:00
|
|
|
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LABEL_POLICY */
|
2012-10-01 12:32:35 +00:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */
|
2012-11-20 02:50:14 +00:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */
|
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */
|
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */
|
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */
|
2015-08-04 22:51:07 -07:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */
|
2012-10-01 12:32:35 +00:00
|
|
|
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
|
|
|
|
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
|
2014-06-04 17:20:29 -07:00
|
|
|
nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
|
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
|
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
|
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
|
2015-01-12 17:00:38 -08:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
|
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
|
2023-05-12 11:40:33 +08:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */
|
2023-10-27 14:44:10 -04:00
|
|
|
/* IFLA_VXLAN_PORT_RANGE */
|
|
|
|
nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
|
2023-09-18 11:40:15 -04:00
|
|
|
nla_total_size(0) + /* IFLA_VXLAN_GBP */
|
|
|
|
nla_total_size(0) + /* IFLA_VXLAN_GPE */
|
|
|
|
nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */
|
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */
|
2024-12-05 16:40:57 +01:00
|
|
|
/* IFLA_VXLAN_RESERVED_BITS */
|
|
|
|
nla_total_size(sizeof(struct vxlanhdr)) +
|
2012-10-01 12:32:35 +00:00
|
|
|
0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
|
|
|
|
{
|
|
|
|
const struct vxlan_dev *vxlan = netdev_priv(dev);
|
2013-04-16 02:50:52 +00:00
|
|
|
const struct vxlan_rdst *dst = &vxlan->default_dst;
|
2012-10-09 20:35:50 +00:00
|
|
|
struct ifla_vxlan_port_range ports = {
|
2015-07-21 10:44:02 +02:00
|
|
|
.low = htons(vxlan->cfg.port_min),
|
|
|
|
.high = htons(vxlan->cfg.port_max),
|
2012-10-09 20:35:50 +00:00
|
|
|
};
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2016-02-16 21:58:58 +01:00
|
|
|
if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
|
2012-10-01 12:32:35 +00:00
|
|
|
goto nla_put_failure;
|
|
|
|
|
2013-08-31 13:44:33 +08:00
|
|
|
if (!vxlan_addr_any(&dst->remote_ip)) {
|
|
|
|
if (dst->remote_ip.sa.sa_family == AF_INET) {
|
2015-03-29 16:59:25 +02:00
|
|
|
if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
|
|
|
|
dst->remote_ip.sin.sin_addr.s_addr))
|
2013-08-31 13:44:33 +08:00
|
|
|
goto nla_put_failure;
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
} else {
|
2015-03-29 16:59:25 +02:00
|
|
|
if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
|
|
|
|
&dst->remote_ip.sin6.sin6_addr))
|
2013-08-31 13:44:33 +08:00
|
|
|
goto nla_put_failure;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2013-04-16 02:50:52 +00:00
|
|
|
if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
|
2012-10-01 12:32:35 +00:00
|
|
|
goto nla_put_failure;
|
|
|
|
|
2015-07-21 10:44:02 +02:00
|
|
|
if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
|
|
|
|
if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
|
2015-03-29 16:59:25 +02:00
|
|
|
if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
|
2015-07-21 10:44:02 +02:00
|
|
|
vxlan->cfg.saddr.sin.sin_addr.s_addr))
|
2013-08-31 13:44:33 +08:00
|
|
|
goto nla_put_failure;
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
} else {
|
2015-03-29 16:59:25 +02:00
|
|
|
if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
|
2015-07-21 10:44:02 +02:00
|
|
|
&vxlan->cfg.saddr.sin6.sin6_addr))
|
2013-08-31 13:44:33 +08:00
|
|
|
goto nla_put_failure;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2015-07-21 10:44:02 +02:00
|
|
|
if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
|
2018-09-26 10:35:42 +08:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
|
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
|
2015-07-21 10:44:02 +02:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
|
2018-11-08 12:19:16 +01:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
|
2016-03-09 03:00:03 +01:00
|
|
|
nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
|
2023-11-14 11:36:57 -06:00
|
|
|
nla_put_u32(skb, IFLA_VXLAN_LABEL_POLICY, vxlan->cfg.label_policy) ||
|
2012-11-20 02:50:14 +00:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_LEARNING,
|
2019-12-30 17:52:22 +08:00
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
|
2012-11-20 02:50:14 +00:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_PROXY,
|
2019-12-30 17:52:22 +08:00
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_PROXY)) ||
|
2017-06-19 10:03:56 +02:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_RSC,
|
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_RSC)) ||
|
2012-11-20 02:50:14 +00:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_L2MISS,
|
2019-12-30 17:52:22 +08:00
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_L2MISS)) ||
|
2012-11-20 02:50:14 +00:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_L3MISS,
|
2019-12-30 17:52:22 +08:00
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_L3MISS)) ||
|
2015-08-04 22:51:07 -07:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
|
2017-06-19 10:03:56 +02:00
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) ||
|
2015-07-21 10:44:02 +02:00
|
|
|
nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
|
|
|
|
nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
|
|
|
|
nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
|
2014-06-04 17:20:29 -07:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
|
2019-12-30 17:52:22 +08:00
|
|
|
!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
|
2014-06-04 17:20:29 -07:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
|
2019-12-30 17:52:22 +08:00
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
|
2014-06-04 17:20:29 -07:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
|
2019-12-30 17:52:22 +08:00
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
|
2015-01-12 17:00:38 -08:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
|
2019-12-30 17:52:22 +08:00
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
|
2015-01-12 17:00:38 -08:00
|
|
|
nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
|
2023-05-12 11:40:33 +08:00
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) ||
|
|
|
|
nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS,
|
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS)))
|
2012-10-01 12:32:35 +00:00
|
|
|
goto nla_put_failure;
|
|
|
|
|
2012-10-09 20:35:50 +00:00
|
|
|
if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
2017-06-19 10:03:56 +02:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_GBP &&
|
2015-01-15 03:53:55 +01:00
|
|
|
nla_put_flag(skb, IFLA_VXLAN_GBP))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
2017-06-19 10:03:56 +02:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_GPE &&
|
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
|
|
|
nla_put_flag(skb, IFLA_VXLAN_GPE))
|
|
|
|
goto nla_put_failure;
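As the GPE commit message above notes, only collect-metadata mode is supported, so a device is created with "external"; a minimal sketch (assuming an iproute2 with the "gpe" keyword; 4790 is the IANA-assigned VXLAN-GPE port):
```
$ ip link add vxlan0 type vxlan external gpe dstport 4790
$ ip link set up dev vxlan0
# Tunnel metadata is then supplied per route, e.g. via lwtunnel encap:
$ ip route add 198.51.100.0/24 encap ip id 100 dst 192.0.2.1 dev vxlan0
```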
|
|
|
|
|
2017-06-19 10:03:56 +02:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL &&
|
2015-02-10 16:30:32 -08:00
|
|
|
nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
2022-03-01 05:04:36 +00:00
|
|
|
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER &&
|
|
|
|
nla_put_u8(skb, IFLA_VXLAN_VNIFILTER,
|
|
|
|
!!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
2024-12-05 16:40:57 +01:00
|
|
|
if (nla_put(skb, IFLA_VXLAN_RESERVED_BITS,
|
|
|
|
sizeof(vxlan->cfg.reserved_bits),
|
|
|
|
&vxlan->cfg.reserved_bits))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
nla_put_failure:
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
|
|
|
|
2015-01-15 15:11:17 +01:00
|
|
|
static struct net *vxlan_get_link_net(const struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
|
2024-05-03 19:20:59 +00:00
|
|
|
return READ_ONCE(vxlan->net);
|
2015-01-15 15:11:17 +01:00
|
|
|
}
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
|
|
|
|
.kind = "vxlan",
|
|
|
|
.maxtype = IFLA_VXLAN_MAX,
|
|
|
|
.policy = vxlan_policy,
|
|
|
|
.priv_size = sizeof(struct vxlan_dev),
|
|
|
|
.setup = vxlan_setup,
|
|
|
|
.validate = vxlan_validate,
|
|
|
|
.newlink = vxlan_newlink,
|
2017-02-20 08:29:19 -08:00
|
|
|
.changelink = vxlan_changelink,
|
2012-10-01 12:32:35 +00:00
|
|
|
.dellink = vxlan_dellink,
|
|
|
|
.get_size = vxlan_get_size,
|
|
|
|
.fill_info = vxlan_fill_info,
|
2015-01-15 15:11:17 +01:00
|
|
|
.get_link_net = vxlan_get_link_net,
|
2012-10-01 12:32:35 +00:00
|
|
|
};
|
|
|
|
|
2016-06-13 10:31:05 +02:00
|
|
|
struct net_device *vxlan_dev_create(struct net *net, const char *name,
|
|
|
|
u8 name_assign_type,
|
|
|
|
struct vxlan_config *conf)
|
|
|
|
{
|
|
|
|
struct nlattr *tb[IFLA_MAX + 1];
|
|
|
|
struct net_device *dev;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
memset(&tb, 0, sizeof(tb));
|
|
|
|
|
|
|
|
dev = rtnl_create_link(net, name, name_assign_type,
|
2018-11-06 12:51:14 -08:00
|
|
|
&vxlan_link_ops, tb, NULL);
|
2016-06-13 10:31:05 +02:00
|
|
|
if (IS_ERR(dev))
|
|
|
|
return dev;
|
|
|
|
|
2017-08-11 15:20:59 -07:00
|
|
|
err = __vxlan_dev_create(net, dev, conf, NULL);
|
2016-06-13 10:31:05 +02:00
|
|
|
if (err < 0) {
|
|
|
|
free_netdev(dev);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
|
|
|
|
2022-10-28 04:42:21 -04:00
|
|
|
err = rtnl_configure_link(dev, NULL, 0, NULL);
|
2016-06-13 10:31:05 +02:00
|
|
|
if (err < 0) {
|
|
|
|
LIST_HEAD(list_kill);
|
|
|
|
|
|
|
|
vxlan_dellink(dev, &list_kill);
|
|
|
|
unregister_netdevice_many(&list_kill);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
|
|
|
|
|
|
|
return dev;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(vxlan_dev_create);
|
|
|
|
|
2014-01-13 18:41:19 +01:00
|
|
|
static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
|
|
|
|
struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan, *next;
|
|
|
|
LIST_HEAD(list_kill);
|
|
|
|
|
|
|
|
list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
|
|
|
|
struct vxlan_rdst *dst = &vxlan->default_dst;
|
|
|
|
|
|
|
|
/* In case we created vxlan device with carrier
|
|
|
|
* and we lose the carrier due to module unload
|
|
|
|
* we also need to remove vxlan device. In other
|
|
|
|
* cases, it's not necessary and remote_ifindex
|
|
|
|
* is 0 here, so no matches.
|
|
|
|
*/
|
|
|
|
if (dst->remote_ifindex == dev->ifindex)
|
|
|
|
vxlan_dellink(vxlan->dev, &list_kill);
|
|
|
|
}
|
|
|
|
|
|
|
|
unregister_netdevice_many(&list_kill);
|
|
|
|
}
|
|
|
|
|
2016-04-18 21:19:47 +02:00
|
|
|
static int vxlan_netdevice_event(struct notifier_block *unused,
|
|
|
|
unsigned long event, void *ptr)
|
2014-01-13 18:41:19 +01:00
|
|
|
{
|
|
|
|
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
net: vxlan: convert to act as a pernet subsystem
As per suggestion from Eric W. Biederman, vxlan should be using
{un,}register_pernet_subsys() instead of {un,}register_pernet_device()
to ensure the vxlan_net structure is initialized before and cleaned
up after all network devices in a given network namespace, i.e. when
dealing with network notifiers. This is similarly handled already in
commit 91e2ff3528ac ("net: Teach vlans to cleanup as a pernet subsystem")
and, thus, improves upon fd27e0d44a89 ("net: vxlan: do not use vxlan_net
before checking event type"). Just as in 91e2ff3528ac, we do not need
to explicitly handle deletion of vxlan devices as network namespace
exit calls dellink on all remaining virtual devices, and
rtnl_link_unregister() calls dellink on all outstanding devices in that
network namespace, so we can entirely drop the pernet exit operation
as well. Moreover, on vxlan module exit, rcu_barrier() is called by
netns since commit 3a765edadb28 ("netns: Add an explicit rcu_barrier
to unregister_pernet_{device|subsys}"), so this may be omitted. Tested
with various scenarios and works well on my side.
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-22 21:07:53 +01:00
|
|
|
struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
|
2014-01-13 18:41:19 +01:00
|
|
|
|
2021-01-06 13:06:35 -08:00
|
|
|
if (event == NETDEV_UNREGISTER)
|
2014-01-13 18:41:19 +01:00
|
|
|
vxlan_handle_lowerdev_unregister(vn, dev);
|
2021-01-06 13:06:35 -08:00
|
|
|
else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO)
|
|
|
|
vxlan_offload_rx_ports(dev, true);
|
|
|
|
else if (event == NETDEV_UDP_TUNNEL_DROP_INFO)
|
|
|
|
vxlan_offload_rx_ports(dev, false);
|
2014-01-13 18:41:19 +01:00
|
|
|
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct notifier_block vxlan_notifier_block __read_mostly = {
|
2016-04-18 21:19:47 +02:00
|
|
|
.notifier_call = vxlan_netdevice_event,
|
2014-01-13 18:41:19 +01:00
|
|
|
};
|
|
|
|
|
2018-10-17 08:53:26 +00:00
|
|
|
static void
|
|
|
|
vxlan_fdb_offloaded_set(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_vxlan_fdb_info *fdb_info)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
struct vxlan_rdst *rdst;
|
|
|
|
struct vxlan_fdb *f;
|
2019-06-06 17:57:58 +08:00
|
|
|
u32 hash_index;
|
|
|
|
|
|
|
|
hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
|
2018-10-17 08:53:26 +00:00
|
|
|
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_lock_bh(&vxlan->hash_lock[hash_index]);
|
2018-10-17 08:53:26 +00:00
|
|
|
|
|
|
|
f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
|
|
|
|
if (!f)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip,
|
|
|
|
fdb_info->remote_port,
|
|
|
|
fdb_info->remote_vni,
|
|
|
|
fdb_info->remote_ifindex);
|
|
|
|
if (!rdst)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
rdst->offloaded = fdb_info->offloaded;
|
|
|
|
|
|
|
|
out:
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
|
2018-10-17 08:53:26 +00:00
|
|
|
}
|
|
|
|
|
2018-11-21 08:02:39 +00:00
|
|
|
static int
|
|
|
|
vxlan_fdb_external_learn_add(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_vxlan_fdb_info *fdb_info)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
2019-01-16 23:06:54 +00:00
|
|
|
struct netlink_ext_ack *extack;
|
2019-06-06 17:57:58 +08:00
|
|
|
u32 hash_index;
|
2018-11-21 08:02:39 +00:00
|
|
|
int err;
|
|
|
|
|
2019-06-06 17:57:58 +08:00
|
|
|
hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
|
2019-01-16 23:06:54 +00:00
|
|
|
extack = switchdev_notifier_info_to_extack(&fdb_info->info);
|
|
|
|
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_lock_bh(&vxlan->hash_lock[hash_index]);
|
2018-11-21 08:02:39 +00:00
|
|
|
err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip,
|
|
|
|
NUD_REACHABLE,
|
|
|
|
NLM_F_CREATE | NLM_F_REPLACE,
|
|
|
|
fdb_info->remote_port,
|
|
|
|
fdb_info->vni,
|
|
|
|
fdb_info->remote_vni,
|
|
|
|
fdb_info->remote_ifindex,
|
|
|
|
NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
|
2020-05-21 22:26:14 -07:00
|
|
|
0, false, extack);
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
|
2018-11-21 08:02:39 +00:00
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
vxlan_fdb_external_learn_del(struct net_device *dev,
|
|
|
|
struct switchdev_notifier_vxlan_fdb_info *fdb_info)
|
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan = netdev_priv(dev);
|
|
|
|
struct vxlan_fdb *f;
|
2019-06-06 17:57:58 +08:00
|
|
|
u32 hash_index;
|
2018-11-21 08:02:39 +00:00
|
|
|
int err = 0;
|
|
|
|
|
2019-06-06 17:57:58 +08:00
|
|
|
hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
|
|
|
|
spin_lock_bh(&vxlan->hash_lock[hash_index]);
|
2018-11-21 08:02:39 +00:00
|
|
|
|
|
|
|
f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
|
|
|
|
if (!f)
|
|
|
|
err = -ENOENT;
|
|
|
|
else if (f->flags & NTF_EXT_LEARNED)
|
|
|
|
err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr,
|
|
|
|
fdb_info->remote_ip,
|
|
|
|
fdb_info->remote_port,
|
|
|
|
fdb_info->vni,
|
|
|
|
fdb_info->remote_vni,
|
|
|
|
fdb_info->remote_ifindex,
|
|
|
|
false);
|
|
|
|
|
2019-06-06 17:57:58 +08:00
|
|
|
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
|
2018-11-21 08:02:39 +00:00
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2018-10-17 08:53:26 +00:00
|
|
|
static int vxlan_switchdev_event(struct notifier_block *unused,
|
|
|
|
unsigned long event, void *ptr)
|
|
|
|
{
|
|
|
|
struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
|
2018-11-21 08:02:39 +00:00
|
|
|
struct switchdev_notifier_vxlan_fdb_info *fdb_info;
|
|
|
|
int err = 0;
|
2018-10-17 08:53:26 +00:00
|
|
|
|
|
|
|
switch (event) {
|
|
|
|
case SWITCHDEV_VXLAN_FDB_OFFLOADED:
|
|
|
|
vxlan_fdb_offloaded_set(dev, ptr);
|
|
|
|
break;
|
2018-11-21 08:02:39 +00:00
|
|
|
case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE:
|
|
|
|
fdb_info = ptr;
|
|
|
|
err = vxlan_fdb_external_learn_add(dev, fdb_info);
|
|
|
|
if (err) {
|
|
|
|
err = notifier_from_errno(err);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
fdb_info->offloaded = true;
|
|
|
|
vxlan_fdb_offloaded_set(dev, fdb_info);
|
|
|
|
break;
|
|
|
|
case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE:
|
|
|
|
fdb_info = ptr;
|
|
|
|
err = vxlan_fdb_external_learn_del(dev, fdb_info);
|
|
|
|
if (err) {
|
|
|
|
err = notifier_from_errno(err);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
fdb_info->offloaded = false;
|
|
|
|
vxlan_fdb_offloaded_set(dev, fdb_info);
|
|
|
|
break;
|
2018-10-17 08:53:26 +00:00
|
|
|
}
|
|
|
|
|
2018-11-21 08:02:39 +00:00
|
|
|
return err;
|
2018-10-17 08:53:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
|
|
|
|
.notifier_call = vxlan_switchdev_event,
|
|
|
|
};
|
|
|
|
|
2020-05-28 22:12:36 -07:00
|
|
|
static void vxlan_fdb_nh_flush(struct nexthop *nh)
|
|
|
|
{
|
|
|
|
struct vxlan_fdb *fdb;
|
|
|
|
struct vxlan_dev *vxlan;
|
|
|
|
u32 hash_index;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) {
|
|
|
|
vxlan = rcu_dereference(fdb->vdev);
|
|
|
|
WARN_ON(!vxlan);
|
|
|
|
hash_index = fdb_head_index(vxlan, fdb->eth_addr,
|
|
|
|
vxlan->default_dst.remote_vni);
|
|
|
|
spin_lock_bh(&vxlan->hash_lock[hash_index]);
|
|
|
|
if (!hlist_unhashed(&fdb->hlist))
|
|
|
|
vxlan_fdb_destroy(vxlan, fdb, false, false);
|
|
|
|
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2020-05-21 22:26:16 -07:00
|
|
|
static int vxlan_nexthop_event(struct notifier_block *nb,
|
|
|
|
unsigned long event, void *ptr)
|
|
|
|
{
|
2020-11-04 15:30:26 +02:00
|
|
|
struct nh_notifier_info *info = ptr;
|
|
|
|
struct nexthop *nh;
|
|
|
|
|
|
|
|
if (event != NEXTHOP_EVENT_DEL)
|
|
|
|
return NOTIFY_DONE;
|
2020-05-21 22:26:16 -07:00
|
|
|
|
2020-11-04 15:30:26 +02:00
|
|
|
nh = nexthop_find_by_id(info->net, info->id);
|
|
|
|
if (!nh)
|
2020-05-21 22:26:16 -07:00
|
|
|
return NOTIFY_DONE;
|
|
|
|
|
2020-05-28 22:12:36 -07:00
|
|
|
vxlan_fdb_nh_flush(nh);
|
2020-05-21 22:26:16 -07:00
|
|
|
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
static __net_init int vxlan_init_net(struct net *net)
|
|
|
|
{
|
|
|
|
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
|
2013-05-27 22:35:52 +00:00
|
|
|
unsigned int h;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2013-05-16 11:35:20 +00:00
|
|
|
INIT_LIST_HEAD(&vn->vxlan_list);
|
2013-06-17 14:16:11 -07:00
|
|
|
spin_lock_init(&vn->sock_lock);
|
2020-11-01 13:39:26 +02:00
|
|
|
vn->nexthop_notifier_block.notifier_call = vxlan_nexthop_event;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2013-05-16 11:35:20 +00:00
|
|
|
for (h = 0; h < PORT_HASH_SIZE; ++h)
|
|
|
|
INIT_HLIST_HEAD(&vn->sock_list[h]);
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2020-11-04 15:30:34 +02:00
|
|
|
return register_nexthop_notifier(net, &vn->nexthop_notifier_block,
|
|
|
|
NULL);
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
|
2024-02-06 14:43:05 +00:00
|
|
|
static void __net_exit vxlan_destroy_tunnels(struct vxlan_net *vn,
|
|
|
|
struct list_head *dev_to_kill)
|
2014-04-24 10:02:49 +02:00
|
|
|
{
|
|
|
|
struct vxlan_dev *vxlan, *next;
|
|
|
|
|
2024-02-06 14:43:05 +00:00
|
|
|
list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next)
|
|
|
|
vxlan_dellink(vxlan->dev, dev_to_kill);
|
2014-04-24 10:02:49 +02:00
|
|
|
}
|
|
|
|
|
2024-02-06 14:43:05 +00:00
|
|
|
static void __net_exit vxlan_exit_batch_rtnl(struct list_head *net_list,
|
|
|
|
struct list_head *dev_to_kill)
|
2017-12-16 17:54:49 +08:00
|
|
|
{
|
|
|
|
struct net *net;
|
|
|
|
|
2024-02-06 14:43:05 +00:00
|
|
|
ASSERT_RTNL();
|
2020-11-01 13:39:26 +02:00
|
|
|
list_for_each_entry(net, net_list, exit_list) {
|
|
|
|
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
|
|
|
|
|
2024-02-06 14:43:05 +00:00
|
|
|
__unregister_nexthop_notifier(net, &vn->nexthop_notifier_block);
|
2017-12-16 17:54:49 +08:00
|
|
|
|
2024-02-06 14:43:05 +00:00
|
|
|
vxlan_destroy_tunnels(vn, dev_to_kill);
|
|
|
|
}
|
|
|
|
}
|
2021-02-21 15:45:52 +00:00
|
|
|
|
2024-02-06 14:43:05 +00:00
|
|
|
static void __net_exit vxlan_exit_net(struct net *net)
|
|
|
|
{
|
|
|
|
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
|
|
|
|
unsigned int h;
|
2021-02-21 15:45:52 +00:00
|
|
|
|
2024-02-06 14:43:05 +00:00
|
|
|
for (h = 0; h < PORT_HASH_SIZE; ++h)
|
|
|
|
WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
|
2017-12-16 17:54:49 +08:00
|
|
|
}
|
|
|
|
|
2012-10-01 12:32:35 +00:00
|
|
|
static struct pernet_operations vxlan_net_ops = {
|
|
|
|
.init = vxlan_init_net,
|
2024-02-06 14:43:05 +00:00
|
|
|
.exit_batch_rtnl = vxlan_exit_batch_rtnl,
|
|
|
|
.exit = vxlan_exit_net,
|
2012-10-01 12:32:35 +00:00
|
|
|
.id = &vxlan_net_id,
|
|
|
|
.size = sizeof(struct vxlan_net),
|
|
|
|
};
|
|
|
|
|
|
|
|
static int __init vxlan_init_module(void)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
|
|
|
|
|
2014-01-22 21:07:53 +01:00
|
|
|
rc = register_pernet_subsys(&vxlan_net_ops);
|
2012-10-01 12:32:35 +00:00
|
|
|
if (rc)
|
|
|
|
goto out1;
|
|
|
|
|
2014-01-13 18:41:19 +01:00
|
|
|
rc = register_netdevice_notifier(&vxlan_notifier_block);
|
2012-10-01 12:32:35 +00:00
|
|
|
if (rc)
|
|
|
|
goto out2;
|
|
|
|
|
2018-10-17 08:53:26 +00:00
|
|
|
rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block);
|
2014-01-13 18:41:19 +01:00
|
|
|
if (rc)
|
|
|
|
goto out3;
|
2012-10-01 12:32:35 +00:00
|
|
|
|
2018-10-17 08:53:26 +00:00
|
|
|
rc = rtnl_link_register(&vxlan_link_ops);
|
|
|
|
if (rc)
|
|
|
|
goto out4;
|
|
|
|
|
2024-10-08 11:47:33 -07:00
|
|
|
rc = vxlan_vnifilter_init();
|
|
|
|
if (rc)
|
|
|
|
goto out5;
|
2022-03-01 05:04:36 +00:00
|
|
|
|
2014-01-13 18:41:19 +01:00
|
|
|
return 0;
|
2024-10-08 11:47:33 -07:00
|
|
|
out5:
|
|
|
|
rtnl_link_unregister(&vxlan_link_ops);
|
2018-10-17 08:53:26 +00:00
|
|
|
out4:
|
|
|
|
unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
|
2014-01-13 18:41:19 +01:00
|
|
|
out3:
|
|
|
|
unregister_netdevice_notifier(&vxlan_notifier_block);
|
2012-10-01 12:32:35 +00:00
|
|
|
out2:
|
2014-01-22 21:07:53 +01:00
|
|
|
unregister_pernet_subsys(&vxlan_net_ops);
|
2012-10-01 12:32:35 +00:00
|
|
|
out1:
|
|
|
|
return rc;
|
|
|
|
}
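The function above follows the kernel's customary goto-based unwind idiom. A generic sketch of the pattern, with hypothetical demo_* helpers standing in for the real registrations:

#include <linux/init.h>
#include <linux/module.h>

/* Stand-in registration helpers (hypothetical; they always succeed here). */
static int demo_register_a(void) { return 0; }
static void demo_unregister_a(void) { }
static int demo_register_b(void) { return 0; }

static int __init demo_init(void)
{
	int rc;

	rc = demo_register_a();
	if (rc)
		goto out;

	rc = demo_register_b();
	if (rc)
		goto unregister_a;	/* undo only what already succeeded */

	return 0;

unregister_a:
	demo_unregister_a();
out:
	return rc;
}

Each failure label undoes exactly the registrations that succeeded before it, in reverse order, which is why the out5..out1 labels above mirror the registration sequence.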
|
2013-05-27 22:35:53 +00:00
|
|
|
/* Initialise late so that, when vxlan is built in, the networking
 * infrastructure it depends on is ready first.
 */
late_initcall(vxlan_init_module);
|
2012-10-01 12:32:35 +00:00
|
|
|
|
|
|
|
static void __exit vxlan_cleanup_module(void)
|
|
|
|
{
|
2022-03-01 05:04:36 +00:00
|
|
|
/* Tear everything down in the reverse order of vxlan_init_module(). */
vxlan_vnifilter_uninit();
|
2013-06-17 14:16:09 -07:00
|
|
|
rtnl_link_unregister(&vxlan_link_ops);
|
2018-10-17 08:53:26 +00:00
|
|
|
unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
|
2014-01-13 18:41:19 +01:00
|
|
|
unregister_netdevice_notifier(&vxlan_notifier_block);
|
2014-01-22 21:07:53 +01:00
|
|
|
unregister_pernet_subsys(&vxlan_net_ops);
|
|
|
|
/* rcu_barrier() is called by netns */
|
2012-10-01 12:32:35 +00:00
|
|
|
}
|
|
|
|
module_exit(vxlan_cleanup_module);
|
|
|
|
|
|
|
|
MODULE_LICENSE("GPL");
|
|
|
|
MODULE_VERSION(VXLAN_VERSION);
|
2013-04-27 11:31:52 +00:00
|
|
|
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
|
2014-01-17 11:00:33 -08:00
|
|
|
MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
|
2012-10-01 12:32:35 +00:00
|
|
|
MODULE_ALIAS_RTNL_LINK("vxlan");
|