2019-06-04 08:11:33 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2015-05-13 16:57:30 +00:00
|
|
|
/*
|
|
|
|
* GENEVE: Generic Network Virtualization Encapsulation
|
|
|
|
*
|
|
|
|
* Copyright (c) 2015 Red Hat, Inc.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2020-11-20 22:50:52 +00:00
|
|
|
#include <linux/ethtool.h>
|
2015-05-13 16:57:30 +00:00
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/etherdevice.h>
|
|
|
|
#include <linux/hash.h>
|
2019-03-22 13:06:09 +00:00
|
|
|
#include <net/ipv6_stubs.h>
|
2015-08-27 06:46:52 +00:00
|
|
|
#include <net/dst_metadata.h>
|
2015-08-28 23:54:40 +00:00
|
|
|
#include <net/gro_cells.h>
|
2015-05-13 16:57:30 +00:00
|
|
|
#include <net/rtnetlink.h>
|
|
|
|
#include <net/geneve.h>
|
2021-11-15 17:05:51 +00:00
|
|
|
#include <net/gro.h>
|
2015-08-27 06:46:54 +00:00
|
|
|
#include <net/protocol.h>
|
2015-05-13 16:57:30 +00:00
|
|
|
|
|
|
|
#define GENEVE_NETDEV_VER "0.6"
|
|
|
|
|
|
|
|
#define GENEVE_N_VID (1u << 24)
|
|
|
|
#define GENEVE_VID_MASK (GENEVE_N_VID - 1)
|
|
|
|
|
|
|
|
#define VNI_HASH_BITS 10
|
|
|
|
#define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
|
|
|
|
|
|
|
|
static bool log_ecn_error = true;
|
|
|
|
module_param(log_ecn_error, bool, 0644);
|
|
|
|
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
|
|
|
|
|
2015-08-27 06:46:54 +00:00
|
|
|
#define GENEVE_VER 0
|
|
|
|
#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
|
2018-04-19 12:42:30 +00:00
|
|
|
#define GENEVE_IPV4_HLEN (ETH_HLEN + sizeof(struct iphdr) + GENEVE_BASE_HLEN)
|
|
|
|
#define GENEVE_IPV6_HLEN (ETH_HLEN + sizeof(struct ipv6hdr) + GENEVE_BASE_HLEN)
|
2015-08-27 06:46:54 +00:00
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
/* per-network namespace private data for this module */
|
|
|
|
struct geneve_net {
|
2015-08-27 06:46:54 +00:00
|
|
|
struct list_head geneve_list;
|
|
|
|
struct list_head sock_list;
|
2015-05-13 16:57:30 +00:00
|
|
|
};
|
|
|
|
|
netns: make struct pernet_operations::id unsigned int
Make struct pernet_operations::id unsigned.
There are 2 reasons to do so:
1)
This field is really an index into a zero-based array and
thus is an unsigned entity. Using a negative value is an out-of-bounds
access by definition.
2)
On x86_64 unsigned 32-bit data which are mixed with pointers
via array indexing or offsets added or subtracted to pointers
are preferred to signed 32-bit data.
"int" being used as an array index needs to be sign-extended
to 64-bit before being used.
void f(long *p, int i)
{
g(p[i]);
}
roughly translates to
movsx rsi, esi
mov rdi, [rsi+...]
call g
MOVSX is 3 byte instruction which isn't necessary if the variable is
unsigned because x86_64 is zero extending by default.
Now, there is net_generic() function which, you guessed it right, uses
"int" as an array index:
static inline void *net_generic(const struct net *net, int id)
{
...
ptr = ng->ptr[id - 1];
...
}
And this function is used a lot, so those sign extensions add up.
Patch snipes ~1730 bytes on allyesconfig kernel (without all junk
messing with code generation):
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
Unfortunately some functions actually grow bigger.
This is a seemingly random artefact of code generation with register
allocator being used differently. gcc decides that some variable
needs to live in new r8+ registers and every access now requires REX
prefix. Or it is shifted into r12, so [r12+0] addressing mode has to be
used which is longer than [r8]
However, overall balance is in negative direction:
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
function old new delta
nfsd4_lock 3886 3959 +73
tipc_link_build_proto_msg 1096 1140 +44
mac80211_hwsim_new_radio 2776 2808 +32
tipc_mon_rcv 1032 1058 +26
svcauth_gss_legacy_init 1413 1429 +16
tipc_bcbase_select_primary 379 392 +13
nfsd4_exchange_id 1247 1260 +13
nfsd4_setclientid_confirm 782 793 +11
...
put_client_renew_locked 494 480 -14
ip_set_sockfn_get 730 716 -14
geneve_sock_add 829 813 -16
nfsd4_sequence_done 721 703 -18
nlmclnt_lookup_host 708 686 -22
nfsd4_lockt 1085 1063 -22
nfs_get_client 1077 1050 -27
tcf_bpf_init 1106 1076 -30
nfsd4_encode_fattr 5997 5930 -67
Total: Before=154856051, After=154854321, chg -0.00%
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-17 01:58:21 +00:00
|
|
|
/* Id of this module's per-netns data; presumably the pernet_operations id
 * passed to net_generic() -- confirm where the pernet ops are defined.
 */
static unsigned int geneve_net_id;
|
2015-08-27 06:46:54 +00:00
|
|
|
|
2017-07-02 17:00:58 +00:00
|
|
|
struct geneve_dev_node {
|
|
|
|
struct hlist_node hlist;
|
|
|
|
struct geneve_dev *geneve;
|
|
|
|
};
|
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
struct geneve_config {
|
|
|
|
struct ip_tunnel_info info;
|
|
|
|
bool collect_md;
|
|
|
|
bool use_udp6_rx_checksums;
|
|
|
|
bool ttl_inherit;
|
|
|
|
enum ifla_geneve_df df;
|
2022-03-16 06:15:57 +00:00
|
|
|
bool inner_proto_inherit;
|
2020-07-06 15:18:08 +00:00
|
|
|
};
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
/* Pseudo network device */
|
|
|
|
struct geneve_dev {
|
2017-07-02 17:00:58 +00:00
|
|
|
struct geneve_dev_node hlist4; /* vni hash table for IPv4 socket */
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
struct geneve_dev_node hlist6; /* vni hash table for IPv6 socket */
|
|
|
|
#endif
|
2015-05-13 16:57:30 +00:00
|
|
|
struct net *net; /* netns for packet i/o */
|
|
|
|
struct net_device *dev; /* netdev for geneve tunnel */
|
2016-10-28 16:59:16 +00:00
|
|
|
struct geneve_sock __rcu *sock4; /* IPv4 socket used for geneve tunnel */
|
2015-10-26 21:01:44 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2016-10-28 16:59:16 +00:00
|
|
|
struct geneve_sock __rcu *sock6; /* IPv6 socket used for geneve tunnel */
|
2015-10-26 21:01:44 +00:00
|
|
|
#endif
|
2015-05-13 16:57:30 +00:00
|
|
|
struct list_head next; /* geneve's per namespace list */
|
2015-08-28 23:54:40 +00:00
|
|
|
struct gro_cells gro_cells;
|
2020-07-06 15:18:08 +00:00
|
|
|
struct geneve_config cfg;
|
2015-05-13 16:57:30 +00:00
|
|
|
};
|
|
|
|
|
2015-08-27 06:46:54 +00:00
|
|
|
struct geneve_sock {
|
|
|
|
bool collect_md;
|
|
|
|
struct list_head list;
|
|
|
|
struct socket *sock;
|
|
|
|
struct rcu_head rcu;
|
|
|
|
int refcnt;
|
2015-08-27 06:46:55 +00:00
|
|
|
struct hlist_head vni_list[VNI_HASH_SIZE];
|
2015-08-27 06:46:54 +00:00
|
|
|
};
|
2015-05-13 16:57:30 +00:00
|
|
|
|
|
|
|
/* Combine the three VNI bytes (vni[0] is the most significant) into one
 * 24-bit value and hash it with hash_32() to VNI_HASH_BITS bits; the result
 * is used as an index into geneve_sock::vni_list[].
 */
static inline __u32 geneve_net_vni_hash(u8 vni[3])
{
	__u32 key = vni[2];

	key |= vni[1] << 8;
	key |= vni[0] << 16;

	return hash_32(key, VNI_HASH_BITS);
}
|
|
|
|
|
2015-08-27 06:46:52 +00:00
|
|
|
static __be64 vni_to_tunnel_id(const __u8 *vni)
|
|
|
|
{
|
|
|
|
#ifdef __BIG_ENDIAN
|
|
|
|
return (vni[0] << 16) | (vni[1] << 8) | vni[2];
|
|
|
|
#else
|
|
|
|
return (__force __be64)(((__force u64)vni[0] << 40) |
|
|
|
|
((__force u64)vni[1] << 48) |
|
|
|
|
((__force u64)vni[2] << 56));
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
/* Convert 64 bit tunnel ID to 24 bit VNI. */
|
|
|
|
static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
|
|
|
|
{
|
|
|
|
#ifdef __BIG_ENDIAN
|
|
|
|
vni[0] = (__force __u8)(tun_id >> 16);
|
|
|
|
vni[1] = (__force __u8)(tun_id >> 8);
|
|
|
|
vni[2] = (__force __u8)tun_id;
|
|
|
|
#else
|
|
|
|
vni[0] = (__force __u8)((__force u64)tun_id >> 40);
|
|
|
|
vni[1] = (__force __u8)((__force u64)tun_id >> 48);
|
|
|
|
vni[2] = (__force __u8)((__force u64)tun_id >> 56);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2016-11-21 19:03:01 +00:00
|
|
|
/* Tell whether the VNI embedded in a tunnel ID equals a raw 3-byte VNI.
 * @tun_id points at the eight bytes of a big-endian tunnel ID; the VNI
 * occupies its last three bytes (see vni_to_tunnel_id()), hence offset 5.
 */
static bool eq_tun_id_and_vni(u8 *tun_id, u8 *vni)
{
	const u8 *id_vni = tun_id + 5;

	return memcmp(vni, id_vni, 3) == 0;
}
|
|
|
|
|
2016-02-18 10:22:49 +00:00
|
|
|
static sa_family_t geneve_get_sk_family(struct geneve_sock *gs)
|
|
|
|
{
|
|
|
|
return gs->sock->sk->sk_family;
|
|
|
|
}
|
|
|
|
|
2015-08-27 06:46:55 +00:00
|
|
|
static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
|
2015-08-27 06:46:54 +00:00
|
|
|
__be32 addr, u8 vni[])
|
2015-05-13 16:57:30 +00:00
|
|
|
{
|
|
|
|
struct hlist_head *vni_list_head;
|
2017-07-02 17:00:58 +00:00
|
|
|
struct geneve_dev_node *node;
|
2015-05-13 16:57:30 +00:00
|
|
|
__u32 hash;
|
|
|
|
|
|
|
|
/* Find the device for this VNI */
|
2015-08-27 06:46:54 +00:00
|
|
|
hash = geneve_net_vni_hash(vni);
|
2015-08-27 06:46:55 +00:00
|
|
|
vni_list_head = &gs->vni_list[hash];
|
2017-07-02 17:00:58 +00:00
|
|
|
hlist_for_each_entry_rcu(node, vni_list_head, hlist) {
|
2020-07-06 15:18:08 +00:00
|
|
|
if (eq_tun_id_and_vni((u8 *)&node->geneve->cfg.info.key.tun_id, vni) &&
|
|
|
|
addr == node->geneve->cfg.info.key.u.ipv4.dst)
|
2017-07-02 17:00:58 +00:00
|
|
|
return node->geneve;
|
2015-10-26 21:01:44 +00:00
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
/* IPv6 counterpart of geneve_lookup(): find the geneve device on socket
 * @gs whose configured tunnel key matches both the 3-byte @vni and the
 * IPv6 address @addr6.
 *
 * Walks the hash bucket with hlist_for_each_entry_rcu(), so the caller is
 * expected to provide the matching RCU protection.
 *
 * Returns the device, or NULL when nothing matches.
 */
static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs,
					 struct in6_addr addr6, u8 vni[])
{
	struct hlist_head *bucket = &gs->vni_list[geneve_net_vni_hash(vni)];
	struct geneve_dev_node *node;

	hlist_for_each_entry_rcu(node, bucket, hlist) {
		struct geneve_dev *geneve = node->geneve;

		if (!eq_tun_id_and_vni((u8 *)&geneve->cfg.info.key.tun_id, vni))
			continue;
		if (!ipv6_addr_equal(&addr6, &geneve->cfg.info.key.u.ipv6.dst))
			continue;

		return geneve;
	}

	return NULL;
}
|
2015-10-26 21:01:44 +00:00
|
|
|
#endif
|
2015-08-27 06:46:52 +00:00
|
|
|
|
2015-08-27 06:46:54 +00:00
|
|
|
static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return (struct genevehdr *)(udp_hdr(skb) + 1);
|
|
|
|
}
|
|
|
|
|
2016-02-18 10:22:50 +00:00
|
|
|
static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs,
|
|
|
|
struct sk_buff *skb)
|
2015-08-27 06:46:52 +00:00
|
|
|
{
|
2015-10-26 21:01:44 +00:00
|
|
|
static u8 zero_vni[3];
|
2016-11-21 19:02:58 +00:00
|
|
|
u8 *vni;
|
2015-08-27 06:46:52 +00:00
|
|
|
|
2016-02-18 10:22:49 +00:00
|
|
|
if (geneve_get_sk_family(gs) == AF_INET) {
|
2016-02-18 10:22:50 +00:00
|
|
|
struct iphdr *iph;
|
2016-11-21 19:02:58 +00:00
|
|
|
__be32 addr;
|
2016-02-18 10:22:50 +00:00
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
iph = ip_hdr(skb); /* outer IP header... */
|
2015-08-27 06:46:54 +00:00
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
if (gs->collect_md) {
|
|
|
|
vni = zero_vni;
|
|
|
|
addr = 0;
|
|
|
|
} else {
|
2016-02-18 10:22:50 +00:00
|
|
|
vni = geneve_hdr(skb)->vni;
|
2015-10-26 21:01:44 +00:00
|
|
|
addr = iph->saddr;
|
|
|
|
}
|
|
|
|
|
2016-02-18 10:22:50 +00:00
|
|
|
return geneve_lookup(gs, addr, vni);
|
2015-10-26 21:01:44 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2016-02-18 10:22:49 +00:00
|
|
|
} else if (geneve_get_sk_family(gs) == AF_INET6) {
|
2016-11-21 19:02:58 +00:00
|
|
|
static struct in6_addr zero_addr6;
|
2016-02-18 10:22:50 +00:00
|
|
|
struct ipv6hdr *ip6h;
|
|
|
|
struct in6_addr addr6;
|
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
ip6h = ipv6_hdr(skb); /* outer IPv6 header... */
|
2015-08-27 06:46:54 +00:00
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
if (gs->collect_md) {
|
|
|
|
vni = zero_vni;
|
|
|
|
addr6 = zero_addr6;
|
|
|
|
} else {
|
2016-02-18 10:22:50 +00:00
|
|
|
vni = geneve_hdr(skb)->vni;
|
2015-10-26 21:01:44 +00:00
|
|
|
addr6 = ip6h->saddr;
|
|
|
|
}
|
|
|
|
|
2016-02-18 10:22:50 +00:00
|
|
|
return geneve6_lookup(gs, addr6, vni);
|
2015-10-26 21:01:44 +00:00
|
|
|
#endif
|
|
|
|
}
|
2016-02-18 10:22:50 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* geneve receive/decap routine */
|
|
|
|
static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct genevehdr *gnvh = geneve_hdr(skb);
|
|
|
|
struct metadata_dst *tun_dst = NULL;
|
2017-06-09 00:07:48 +00:00
|
|
|
unsigned int len;
|
2024-02-29 13:11:52 +00:00
|
|
|
int nh, err = 0;
|
2016-02-18 10:22:50 +00:00
|
|
|
void *oiph;
|
2015-05-13 16:57:30 +00:00
|
|
|
|
2015-08-27 06:46:54 +00:00
|
|
|
if (ip_tunnel_collect_metadata() || gs->collect_md) {
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
IP_TUNNEL_DECLARE_FLAGS(flags) = { };
|
2015-08-27 06:46:52 +00:00
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_KEY_BIT, flags);
|
|
|
|
__assign_bit(IP_TUNNEL_OAM_BIT, flags, gnvh->oam);
|
|
|
|
__assign_bit(IP_TUNNEL_CRIT_OPT_BIT, flags, gnvh->critical);
|
2015-08-27 06:46:52 +00:00
|
|
|
|
2016-02-18 10:22:49 +00:00
|
|
|
tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags,
|
2015-08-27 06:46:52 +00:00
|
|
|
vni_to_tunnel_id(gnvh->vni),
|
|
|
|
gnvh->opt_len * 4);
|
2017-06-09 00:07:48 +00:00
|
|
|
if (!tun_dst) {
|
2024-01-04 16:36:33 +00:00
|
|
|
DEV_STATS_INC(geneve->dev, rx_dropped);
|
2015-08-27 06:46:52 +00:00
|
|
|
goto drop;
|
2017-06-09 00:07:48 +00:00
|
|
|
}
|
2015-08-27 06:46:52 +00:00
|
|
|
/* Update tunnel dst according to Geneve options. */
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
ip_tunnel_flags_zero(flags);
|
|
|
|
__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, flags);
|
2015-08-31 01:09:38 +00:00
|
|
|
ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
|
2018-06-27 04:39:36 +00:00
|
|
|
gnvh->options, gnvh->opt_len * 4,
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
flags);
|
2015-08-27 06:46:52 +00:00
|
|
|
} else {
|
|
|
|
/* Drop packets w/ critical options,
|
|
|
|
* since we don't support any...
|
|
|
|
*/
|
2017-06-09 00:07:48 +00:00
|
|
|
if (gnvh->critical) {
|
2024-01-04 16:36:33 +00:00
|
|
|
DEV_STATS_INC(geneve->dev, rx_frame_errors);
|
|
|
|
DEV_STATS_INC(geneve->dev, rx_errors);
|
2015-08-27 06:46:52 +00:00
|
|
|
goto drop;
|
2017-06-09 00:07:48 +00:00
|
|
|
}
|
2015-08-27 06:46:52 +00:00
|
|
|
}
|
2015-05-13 16:57:30 +00:00
|
|
|
|
2015-08-27 06:46:52 +00:00
|
|
|
if (tun_dst)
|
|
|
|
skb_dst_set(skb, &tun_dst->dst);
|
|
|
|
|
2022-03-16 06:15:57 +00:00
|
|
|
if (gnvh->proto_type == htons(ETH_P_TEB)) {
|
|
|
|
skb_reset_mac_header(skb);
|
|
|
|
skb->protocol = eth_type_trans(skb, geneve->dev);
|
|
|
|
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
|
|
|
|
|
|
|
|
/* Ignore packet loops (and multicast echo) */
|
|
|
|
if (ether_addr_equal(eth_hdr(skb)->h_source,
|
|
|
|
geneve->dev->dev_addr)) {
|
2024-01-04 16:36:33 +00:00
|
|
|
DEV_STATS_INC(geneve->dev, rx_errors);
|
2022-03-16 06:15:57 +00:00
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
skb_reset_mac_header(skb);
|
|
|
|
skb->dev = geneve->dev;
|
|
|
|
skb->pkt_type = PACKET_HOST;
|
2017-06-09 00:07:48 +00:00
|
|
|
}
|
2020-12-09 22:39:56 +00:00
|
|
|
|
2024-02-29 13:11:52 +00:00
|
|
|
/* Save offset of outer header relative to skb->head,
|
|
|
|
* because we are going to reset the network header to the inner header
|
|
|
|
* and might change skb->head.
|
|
|
|
*/
|
|
|
|
nh = skb_network_header(skb) - skb->head;
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
skb_reset_network_header(skb);
|
|
|
|
|
2024-02-29 13:11:52 +00:00
|
|
|
if (!pskb_inet_may_pull(skb)) {
|
|
|
|
DEV_STATS_INC(geneve->dev, rx_length_errors);
|
|
|
|
DEV_STATS_INC(geneve->dev, rx_errors);
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Get the outer header. */
|
|
|
|
oiph = skb->head + nh;
|
|
|
|
|
2016-02-18 10:22:50 +00:00
|
|
|
if (geneve_get_sk_family(gs) == AF_INET)
|
|
|
|
err = IP_ECN_decapsulate(oiph, skb);
|
2015-10-26 21:01:44 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2016-02-18 10:22:50 +00:00
|
|
|
else
|
|
|
|
err = IP6_ECN_decapsulate(oiph, skb);
|
2015-10-26 21:01:44 +00:00
|
|
|
#endif
|
2015-05-13 16:57:30 +00:00
|
|
|
|
|
|
|
if (unlikely(err)) {
|
2015-10-26 21:01:44 +00:00
|
|
|
if (log_ecn_error) {
|
2016-02-18 10:22:50 +00:00
|
|
|
if (geneve_get_sk_family(gs) == AF_INET)
|
2015-10-26 21:01:44 +00:00
|
|
|
net_info_ratelimited("non-ECT from %pI4 "
|
|
|
|
"with TOS=%#x\n",
|
2016-02-18 10:22:50 +00:00
|
|
|
&((struct iphdr *)oiph)->saddr,
|
|
|
|
((struct iphdr *)oiph)->tos);
|
2015-10-26 21:01:44 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2016-02-18 10:22:50 +00:00
|
|
|
else
|
2015-10-26 21:01:44 +00:00
|
|
|
net_info_ratelimited("non-ECT from %pI6\n",
|
2016-02-18 10:22:50 +00:00
|
|
|
&((struct ipv6hdr *)oiph)->saddr);
|
2015-10-26 21:01:44 +00:00
|
|
|
#endif
|
|
|
|
}
|
2015-05-13 16:57:30 +00:00
|
|
|
if (err > 1) {
|
2024-01-04 16:36:33 +00:00
|
|
|
DEV_STATS_INC(geneve->dev, rx_frame_errors);
|
|
|
|
DEV_STATS_INC(geneve->dev, rx_errors);
|
2015-05-13 16:57:30 +00:00
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-06-09 00:07:48 +00:00
|
|
|
len = skb->len;
|
|
|
|
err = gro_cells_receive(&geneve->gro_cells, skb);
|
2020-10-05 20:34:58 +00:00
|
|
|
if (likely(err == NET_RX_SUCCESS))
|
|
|
|
dev_sw_netstats_rx_add(geneve->dev, len);
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
return;
|
|
|
|
drop:
|
|
|
|
/* Consume bad packet */
|
|
|
|
kfree_skb(skb);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Setup stats when device is created */
|
|
|
|
static int geneve_init(struct net_device *dev)
|
|
|
|
{
|
2015-08-28 23:54:40 +00:00
|
|
|
struct geneve_dev *geneve = netdev_priv(dev);
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = gro_cells_init(&geneve->gro_cells, dev);
|
2024-03-05 17:29:09 +00:00
|
|
|
if (err)
|
2015-08-28 23:54:40 +00:00
|
|
|
return err;
|
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
err = dst_cache_init(&geneve->cfg.info.dst_cache, GFP_KERNEL);
|
2016-02-12 14:43:58 +00:00
|
|
|
if (err) {
|
|
|
|
gro_cells_destroy(&geneve->gro_cells);
|
|
|
|
return err;
|
|
|
|
}
|
net: add netdev_lockdep_set_classes() to virtual drivers
Based on a syzbot report, it appears many virtual
drivers do not yet use netdev_lockdep_set_classes(),
triggering lockdep false positives.
WARNING: possible recursive locking detected
6.8.0-rc4-next-20240212-syzkaller #0 Not tainted
syz-executor.0/19016 is trying to acquire lock:
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
but task is already holding lock:
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
lock(_xmit_ETHER#2);
lock(_xmit_ETHER#2);
*** DEADLOCK ***
May be due to missing lock nesting notation
9 locks held by syz-executor.0/19016:
#0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock net/core/rtnetlink.c:79 [inline]
#0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x82c/0x1040 net/core/rtnetlink.c:6603
#1: ffffc90000a08c00 ((&in_dev->mr_ifc_timer)){+.-.}-{0:0}, at: call_timer_fn+0xc0/0x600 kernel/time/timer.c:1697
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline]
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline]
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline]
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline]
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline]
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline]
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline]
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline]
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325
stack backtrace:
CPU: 1 PID: 19016 Comm: syz-executor.0 Not tainted 6.8.0-rc4-next-20240212-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114
check_deadlock kernel/locking/lockdep.c:3062 [inline]
validate_chain+0x15c1/0x58e0 kernel/locking/lockdep.c:3856
__lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
lock_acquire+0x1e4/0x530 kernel/locking/lockdep.c:5754
__raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline]
_raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154
spin_lock include/linux/spinlock.h:351 [inline]
__netif_tx_lock include/linux/netdevice.h:4452 [inline]
sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
__dev_xmit_skb net/core/dev.c:3784 [inline]
__dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325
neigh_output include/net/neighbour.h:542 [inline]
ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235
iptunnel_xmit+0x540/0x9b0 net/ipv4/ip_tunnel_core.c:82
ip_tunnel_xmit+0x20ee/0x2960 net/ipv4/ip_tunnel.c:831
erspan_xmit+0x9de/0x1460 net/ipv4/ip_gre.c:720
__netdev_start_xmit include/linux/netdevice.h:4989 [inline]
netdev_start_xmit include/linux/netdevice.h:5003 [inline]
xmit_one net/core/dev.c:3555 [inline]
dev_hard_start_xmit+0x242/0x770 net/core/dev.c:3571
sch_direct_xmit+0x2b6/0x5f0 net/sched/sch_generic.c:342
__dev_xmit_skb net/core/dev.c:3784 [inline]
__dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325
neigh_output include/net/neighbour.h:542 [inline]
ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235
igmpv3_send_cr net/ipv4/igmp.c:723 [inline]
igmp_ifc_timer_expire+0xb71/0xd90 net/ipv4/igmp.c:813
call_timer_fn+0x17e/0x600 kernel/time/timer.c:1700
expire_timers kernel/time/timer.c:1751 [inline]
__run_timers+0x621/0x830 kernel/time/timer.c:2038
run_timer_softirq+0x67/0xf0 kernel/time/timer.c:2051
__do_softirq+0x2bc/0x943 kernel/softirq.c:554
invoke_softirq kernel/softirq.c:428 [inline]
__irq_exit_rcu+0xf2/0x1c0 kernel/softirq.c:633
irq_exit_rcu+0x9/0x30 kernel/softirq.c:645
instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1076 [inline]
sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1076
</IRQ>
<TASK>
asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702
RIP: 0010:resched_offsets_ok kernel/sched/core.c:10127 [inline]
RIP: 0010:__might_resched+0x16f/0x780 kernel/sched/core.c:10142
Code: 00 4c 89 e8 48 c1 e8 03 48 ba 00 00 00 00 00 fc ff df 48 89 44 24 38 0f b6 04 10 84 c0 0f 85 87 04 00 00 41 8b 45 00 c1 e0 08 <01> d8 44 39 e0 0f 85 d6 00 00 00 44 89 64 24 1c 48 8d bc 24 a0 00
RSP: 0018:ffffc9000ee069e0 EFLAGS: 00000246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8880296a9e00
RDX: dffffc0000000000 RSI: ffff8880296a9e00 RDI: ffffffff8bfe8fa0
RBP: ffffc9000ee06b00 R08: ffffffff82326877 R09: 1ffff11002b5ad1b
R10: dffffc0000000000 R11: ffffed1002b5ad1c R12: 0000000000000000
R13: ffff8880296aa23c R14: 000000000000062a R15: 1ffff92001dc0d44
down_write+0x19/0x50 kernel/locking/rwsem.c:1578
kernfs_activate fs/kernfs/dir.c:1403 [inline]
kernfs_add_one+0x4af/0x8b0 fs/kernfs/dir.c:819
__kernfs_create_file+0x22e/0x2e0 fs/kernfs/file.c:1056
sysfs_add_file_mode_ns+0x24a/0x310 fs/sysfs/file.c:307
create_files fs/sysfs/group.c:64 [inline]
internal_create_group+0x4f4/0xf20 fs/sysfs/group.c:152
internal_create_groups fs/sysfs/group.c:192 [inline]
sysfs_create_groups+0x56/0x120 fs/sysfs/group.c:218
create_dir lib/kobject.c:78 [inline]
kobject_add_internal+0x472/0x8d0 lib/kobject.c:240
kobject_add_varg lib/kobject.c:374 [inline]
kobject_init_and_add+0x124/0x190 lib/kobject.c:457
netdev_queue_add_kobject net/core/net-sysfs.c:1706 [inline]
netdev_queue_update_kobjects+0x1f3/0x480 net/core/net-sysfs.c:1758
register_queue_kobjects net/core/net-sysfs.c:1819 [inline]
netdev_register_kobject+0x265/0x310 net/core/net-sysfs.c:2059
register_netdevice+0x1191/0x19c0 net/core/dev.c:10298
bond_newlink+0x3b/0x90 drivers/net/bonding/bond_netlink.c:576
rtnl_newlink_create net/core/rtnetlink.c:3506 [inline]
__rtnl_newlink net/core/rtnetlink.c:3726 [inline]
rtnl_newlink+0x158f/0x20a0 net/core/rtnetlink.c:3739
rtnetlink_rcv_msg+0x885/0x1040 net/core/rtnetlink.c:6606
netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543
netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline]
netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1367
netlink_sendmsg+0xa3c/0xd70 net/netlink/af_netlink.c:1908
sock_sendmsg_nosec net/socket.c:730 [inline]
__sock_sendmsg+0x221/0x270 net/socket.c:745
__sys_sendto+0x3a4/0x4f0 net/socket.c:2191
__do_sys_sendto net/socket.c:2203 [inline]
__se_sys_sendto net/socket.c:2199 [inline]
__x64_sys_sendto+0xde/0x100 net/socket.c:2199
do_syscall_64+0xfb/0x240
entry_SYSCALL_64_after_hwframe+0x6d/0x75
RIP: 0033:0x7fc3fa87fa9c
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240212140700.2795436-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-02-12 14:07:00 +00:00
|
|
|
netdev_lockdep_set_classes(dev);
|
2015-05-13 16:57:30 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void geneve_uninit(struct net_device *dev)
|
|
|
|
{
|
2015-08-28 23:54:40 +00:00
|
|
|
struct geneve_dev *geneve = netdev_priv(dev);
|
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
dst_cache_destroy(&geneve->cfg.info.dst_cache);
|
2015-08-28 23:54:40 +00:00
|
|
|
gro_cells_destroy(&geneve->gro_cells);
|
2015-05-13 16:57:30 +00:00
|
|
|
}
|
|
|
|
|
2015-08-27 06:46:54 +00:00
|
|
|
/* Callback from net/ipv4/udp.c to receive packets */
|
|
|
|
static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct genevehdr *geneveh;
|
2016-02-18 10:22:50 +00:00
|
|
|
struct geneve_dev *geneve;
|
2015-08-27 06:46:54 +00:00
|
|
|
struct geneve_sock *gs;
|
2022-03-16 06:15:57 +00:00
|
|
|
__be16 inner_proto;
|
2015-08-27 06:46:54 +00:00
|
|
|
int opts_len;
|
|
|
|
|
2017-06-09 00:07:48 +00:00
|
|
|
/* Need UDP and Geneve header to be present */
|
2015-08-27 06:46:54 +00:00
|
|
|
if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
|
2016-05-19 13:58:33 +00:00
|
|
|
goto drop;
|
2015-08-27 06:46:54 +00:00
|
|
|
|
|
|
|
/* Return packets with reserved bits set */
|
|
|
|
geneveh = geneve_hdr(skb);
|
|
|
|
if (unlikely(geneveh->ver != GENEVE_VER))
|
2016-05-19 13:58:33 +00:00
|
|
|
goto drop;
|
2015-08-27 06:46:54 +00:00
|
|
|
|
2016-02-18 10:22:50 +00:00
|
|
|
gs = rcu_dereference_sk_user_data(sk);
|
|
|
|
if (!gs)
|
|
|
|
goto drop;
|
|
|
|
|
|
|
|
geneve = geneve_lookup_skb(gs, skb);
|
|
|
|
if (!geneve)
|
|
|
|
goto drop;
|
|
|
|
|
net: geneve: accept every ethertype
The Geneve encapsulation, as defined in RFC 8926, has a Protocol Type
field, which states the Ethertype of the payload appearing after the
Geneve header.
Commit 435fe1c0c1f7 ("net: geneve: support IPv4/IPv6 as inner protocol")
introduced a new IFLA_GENEVE_INNER_PROTO_INHERIT flag that allowed the
use of other Ethertypes than Ethernet. However, it did not get rid of a
restriction that prohibits receiving payloads other than Ethernet,
instead the commit white-listed additional Ethertypes, IPv4 and IPv6.
This patch removes this restriction, making it possible to receive any
Ethertype as a payload, if the IFLA_GENEVE_INNER_PROTO_INHERIT flag is
set.
The restriction was set in place back in commit 0b5e8b8eeae4
("net: Add Geneve tunneling protocol driver"), which implemented a
protocol layer driver for Geneve to be used with Open vSwitch. The
relevant discussion about introducing the Ethertype white-list can be
found here:
https://lore.kernel.org/netdev/CAEP_g=_1q3ACX5NTHxLDnysL+dTMUVzdLpgw1apLKEdDSWPztw@mail.gmail.com/
<quote>
>> + if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
>
> Why? I thought the point of geneve carrying protocol field was to
> allow protocols other than Ethernet... is this temporary maybe?
Yes, it is temporary. Currently OVS only handles Ethernet packets but
this restriction can be lifted once we have a consumer that is capable
of handling other protocols.
</quote>
This white-list was then ported to a generic Geneve netdevice in commit
371bd1061d29 ("geneve: Consolidate Geneve functionality in single
module."). Preserving the Ethertype white-list at this point made sense,
as the Geneve device could send out only Ethernet payloads anyways.
However, now that the Geneve netdevice supports encapsulating other
payloads with IFLA_GENEVE_INNER_PROTO_INHERIT and we have a consumer
capable of other protocols, it seems appropriate to lift the restriction
and allow any Geneve payload to be received.
Signed-off-by: Josef Miegl <josef@miegl.cz>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Eyal Birger <eyal.birger@gmail.com>
Link: https://lore.kernel.org/r/20230319220954.21834-1-josef@miegl.cz
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2023-03-19 22:09:54 +00:00
|
|
|
inner_proto = geneveh->proto_type;
|
|
|
|
|
2022-03-16 06:15:57 +00:00
|
|
|
if (unlikely((!geneve->cfg.inner_proto_inherit &&
|
|
|
|
inner_proto != htons(ETH_P_TEB)))) {
|
2024-01-04 16:36:33 +00:00
|
|
|
DEV_STATS_INC(geneve->dev, rx_dropped);
|
2022-03-16 06:15:57 +00:00
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
2015-08-27 06:46:54 +00:00
|
|
|
opts_len = geneveh->opt_len * 4;
|
2022-03-16 06:15:57 +00:00
|
|
|
if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto,
|
2017-06-09 00:07:48 +00:00
|
|
|
!net_eq(geneve->net, dev_net(geneve->dev)))) {
|
2024-01-04 16:36:33 +00:00
|
|
|
DEV_STATS_INC(geneve->dev, rx_dropped);
|
2015-08-27 06:46:54 +00:00
|
|
|
goto drop;
|
2017-06-09 00:07:48 +00:00
|
|
|
}
|
2015-08-27 06:46:54 +00:00
|
|
|
|
2016-02-18 10:22:50 +00:00
|
|
|
geneve_rx(geneve, gs, skb);
|
2015-08-27 06:46:54 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
drop:
|
|
|
|
/* Consume bad packet */
|
|
|
|
kfree_skb(skb);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-11-08 11:19:18 +00:00
|
|
|
/* Callback from net/ipv{4,6}/udp.c to check that we have a tunnel for errors */
|
|
|
|
static int geneve_udp_encap_err_lookup(struct sock *sk, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct genevehdr *geneveh;
|
|
|
|
struct geneve_sock *gs;
|
|
|
|
u8 zero_vni[3] = { 0 };
|
|
|
|
u8 *vni = zero_vni;
|
|
|
|
|
2019-06-10 22:27:06 +00:00
|
|
|
if (!pskb_may_pull(skb, skb_transport_offset(skb) + GENEVE_BASE_HLEN))
|
2018-11-08 11:19:18 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
geneveh = geneve_hdr(skb);
|
|
|
|
if (geneveh->ver != GENEVE_VER)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (geneveh->proto_type != htons(ETH_P_TEB))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
gs = rcu_dereference_sk_user_data(sk);
|
|
|
|
if (!gs)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
if (geneve_get_sk_family(gs) == AF_INET) {
|
|
|
|
struct iphdr *iph = ip_hdr(skb);
|
|
|
|
__be32 addr4 = 0;
|
|
|
|
|
|
|
|
if (!gs->collect_md) {
|
|
|
|
vni = geneve_hdr(skb)->vni;
|
|
|
|
addr4 = iph->daddr;
|
|
|
|
}
|
|
|
|
|
|
|
|
return geneve_lookup(gs, addr4, vni) ? 0 : -ENOENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
if (geneve_get_sk_family(gs) == AF_INET6) {
|
|
|
|
struct ipv6hdr *ip6h = ipv6_hdr(skb);
|
2018-11-17 01:36:27 +00:00
|
|
|
struct in6_addr addr6;
|
|
|
|
|
|
|
|
memset(&addr6, 0, sizeof(struct in6_addr));
|
2018-11-08 11:19:18 +00:00
|
|
|
|
|
|
|
if (!gs->collect_md) {
|
|
|
|
vni = geneve_hdr(skb)->vni;
|
|
|
|
addr6 = ip6h->daddr;
|
|
|
|
}
|
|
|
|
|
|
|
|
return geneve6_lookup(gs, addr6, vni) ? 0 : -ENOENT;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return -EPFNOSUPPORT;
|
|
|
|
}
|
|
|
|
|
2015-08-27 06:46:54 +00:00
|
|
|
static struct socket *geneve_create_sock(struct net *net, bool ipv6,
|
2016-11-21 19:02:58 +00:00
|
|
|
__be16 port, bool ipv6_rx_csum)
|
2015-08-27 06:46:54 +00:00
|
|
|
{
|
|
|
|
struct socket *sock;
|
|
|
|
struct udp_port_cfg udp_conf;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
memset(&udp_conf, 0, sizeof(udp_conf));
|
|
|
|
|
|
|
|
if (ipv6) {
|
|
|
|
udp_conf.family = AF_INET6;
|
2015-10-26 21:01:44 +00:00
|
|
|
udp_conf.ipv6_v6only = 1;
|
2016-11-21 19:02:58 +00:00
|
|
|
udp_conf.use_udp6_rx_checksums = ipv6_rx_csum;
|
2015-08-27 06:46:54 +00:00
|
|
|
} else {
|
|
|
|
udp_conf.family = AF_INET;
|
|
|
|
udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
|
|
|
|
}
|
|
|
|
|
|
|
|
udp_conf.local_udp_port = port;
|
|
|
|
|
|
|
|
/* Open UDP socket */
|
|
|
|
err = udp_sock_create(net, &udp_conf, &sock);
|
|
|
|
if (err < 0)
|
|
|
|
return ERR_PTR(err);
|
|
|
|
|
2021-03-30 10:28:54 +00:00
|
|
|
udp_allow_gso(sock->sk);
|
2015-08-27 06:46:54 +00:00
|
|
|
return sock;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int geneve_hlen(struct genevehdr *gh)
|
|
|
|
{
|
|
|
|
return sizeof(*gh) + gh->opt_len * 4;
|
|
|
|
}
|
|
|
|
|
2018-06-24 05:13:49 +00:00
|
|
|
static struct sk_buff *geneve_gro_receive(struct sock *sk,
|
|
|
|
struct list_head *head,
|
|
|
|
struct sk_buff *skb)
|
2015-08-27 06:46:54 +00:00
|
|
|
{
|
2018-06-24 05:13:49 +00:00
|
|
|
struct sk_buff *pp = NULL;
|
|
|
|
struct sk_buff *p;
|
2015-08-27 06:46:54 +00:00
|
|
|
struct genevehdr *gh, *gh2;
|
|
|
|
unsigned int hlen, gh_len, off_gnv;
|
|
|
|
const struct packet_offload *ptype;
|
|
|
|
__be16 type;
|
|
|
|
int flush = 1;
|
|
|
|
|
|
|
|
off_gnv = skb_gro_offset(skb);
|
|
|
|
hlen = off_gnv + sizeof(*gh);
|
2022-08-23 07:10:49 +00:00
|
|
|
gh = skb_gro_header(skb, hlen, off_gnv);
|
|
|
|
if (unlikely(!gh))
|
|
|
|
goto out;
|
2015-08-27 06:46:54 +00:00
|
|
|
|
|
|
|
if (gh->ver != GENEVE_VER || gh->oam)
|
|
|
|
goto out;
|
|
|
|
gh_len = geneve_hlen(gh);
|
|
|
|
|
|
|
|
hlen = off_gnv + gh_len;
|
2024-03-01 19:37:37 +00:00
|
|
|
if (!skb_gro_may_pull(skb, hlen)) {
|
2015-08-27 06:46:54 +00:00
|
|
|
gh = skb_gro_header_slow(skb, hlen, off_gnv);
|
|
|
|
if (unlikely(!gh))
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2018-06-24 05:13:49 +00:00
|
|
|
list_for_each_entry(p, head, list) {
|
2015-08-27 06:46:54 +00:00
|
|
|
if (!NAPI_GRO_CB(p)->same_flow)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
gh2 = (struct genevehdr *)(p->data + off_gnv);
|
|
|
|
if (gh->opt_len != gh2->opt_len ||
|
|
|
|
memcmp(gh, gh2, gh_len)) {
|
|
|
|
NAPI_GRO_CB(p)->same_flow = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-04-13 08:44:40 +00:00
|
|
|
skb_gro_pull(skb, gh_len);
|
|
|
|
skb_gro_postpull_rcsum(skb, gh, gh_len);
|
2015-08-27 06:46:54 +00:00
|
|
|
type = gh->proto_type;
|
2022-04-13 08:44:40 +00:00
|
|
|
if (likely(type == htons(ETH_P_TEB)))
|
|
|
|
return call_gro_receive(eth_gro_receive, head, skb);
|
2015-08-27 06:46:54 +00:00
|
|
|
|
|
|
|
ptype = gro_find_receive_by_type(type);
|
2016-03-09 17:24:23 +00:00
|
|
|
if (!ptype)
|
2021-11-23 22:56:07 +00:00
|
|
|
goto out;
|
2015-08-27 06:46:54 +00:00
|
|
|
|
2016-10-20 13:58:02 +00:00
|
|
|
pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
|
2016-03-09 17:24:23 +00:00
|
|
|
flush = 0;
|
2015-08-27 06:46:54 +00:00
|
|
|
|
|
|
|
out:
|
2018-06-30 15:38:55 +00:00
|
|
|
skb_gro_flush_final(skb, pp, flush);
|
2015-08-27 06:46:54 +00:00
|
|
|
|
|
|
|
return pp;
|
|
|
|
}
|
|
|
|
|
2016-04-05 15:22:55 +00:00
|
|
|
static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb,
|
|
|
|
int nhoff)
|
2015-08-27 06:46:54 +00:00
|
|
|
{
|
|
|
|
struct genevehdr *gh;
|
|
|
|
struct packet_offload *ptype;
|
|
|
|
__be16 type;
|
|
|
|
int gh_len;
|
|
|
|
int err = -ENOSYS;
|
|
|
|
|
|
|
|
gh = (struct genevehdr *)(skb->data + nhoff);
|
|
|
|
gh_len = geneve_hlen(gh);
|
|
|
|
type = gh->proto_type;
|
|
|
|
|
2022-04-13 08:44:40 +00:00
|
|
|
/* since skb->encapsulation is set, eth_gro_complete() sets the inner mac header */
|
|
|
|
if (likely(type == htons(ETH_P_TEB)))
|
|
|
|
return eth_gro_complete(skb, nhoff + gh_len);
|
|
|
|
|
2015-08-27 06:46:54 +00:00
|
|
|
ptype = gro_find_complete_by_type(type);
|
|
|
|
if (ptype)
|
|
|
|
err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
|
|
|
|
|
2016-05-03 23:10:21 +00:00
|
|
|
skb_set_inner_mac_header(skb, nhoff + gh_len);
|
|
|
|
|
2015-08-27 06:46:54 +00:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Create new listen socket if needed */
|
|
|
|
static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
|
2016-11-21 19:02:58 +00:00
|
|
|
bool ipv6, bool ipv6_rx_csum)
|
2015-08-27 06:46:54 +00:00
|
|
|
{
|
|
|
|
struct geneve_net *gn = net_generic(net, geneve_net_id);
|
|
|
|
struct geneve_sock *gs;
|
|
|
|
struct socket *sock;
|
|
|
|
struct udp_tunnel_sock_cfg tunnel_cfg;
|
2015-08-27 06:46:55 +00:00
|
|
|
int h;
|
2015-08-27 06:46:54 +00:00
|
|
|
|
|
|
|
gs = kzalloc(sizeof(*gs), GFP_KERNEL);
|
|
|
|
if (!gs)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
sock = geneve_create_sock(net, ipv6, port, ipv6_rx_csum);
|
2015-08-27 06:46:54 +00:00
|
|
|
if (IS_ERR(sock)) {
|
|
|
|
kfree(gs);
|
|
|
|
return ERR_CAST(sock);
|
|
|
|
}
|
|
|
|
|
|
|
|
gs->sock = sock;
|
|
|
|
gs->refcnt = 1;
|
2015-08-27 06:46:55 +00:00
|
|
|
for (h = 0; h < VNI_HASH_SIZE; ++h)
|
|
|
|
INIT_HLIST_HEAD(&gs->vni_list[h]);
|
2015-08-27 06:46:54 +00:00
|
|
|
|
|
|
|
/* Initialize the geneve udp offloads structure */
|
2016-06-16 19:20:52 +00:00
|
|
|
udp_tunnel_notify_add_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE);
|
2015-08-27 06:46:54 +00:00
|
|
|
|
|
|
|
/* Mark socket as an encapsulation socket */
|
2016-04-05 15:22:55 +00:00
|
|
|
memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
|
2015-08-27 06:46:54 +00:00
|
|
|
tunnel_cfg.sk_user_data = gs;
|
|
|
|
tunnel_cfg.encap_type = 1;
|
2016-04-05 15:22:55 +00:00
|
|
|
tunnel_cfg.gro_receive = geneve_gro_receive;
|
|
|
|
tunnel_cfg.gro_complete = geneve_gro_complete;
|
2015-08-27 06:46:54 +00:00
|
|
|
tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
|
2018-11-08 11:19:18 +00:00
|
|
|
tunnel_cfg.encap_err_lookup = geneve_udp_encap_err_lookup;
|
2015-08-27 06:46:54 +00:00
|
|
|
tunnel_cfg.encap_destroy = NULL;
|
|
|
|
setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
|
|
|
|
list_add(&gs->list, &gn->sock_list);
|
|
|
|
return gs;
|
|
|
|
}
|
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
static void __geneve_sock_release(struct geneve_sock *gs)
|
2015-08-27 06:46:54 +00:00
|
|
|
{
|
2015-10-26 21:01:44 +00:00
|
|
|
if (!gs || --gs->refcnt)
|
2015-08-27 06:46:54 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
list_del(&gs->list);
|
2016-06-16 19:20:52 +00:00
|
|
|
udp_tunnel_notify_del_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE);
|
2015-08-27 06:46:54 +00:00
|
|
|
udp_tunnel_sock_release(gs->sock);
|
|
|
|
kfree_rcu(gs, rcu);
|
|
|
|
}
|
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
static void geneve_sock_release(struct geneve_dev *geneve)
|
|
|
|
{
|
2016-10-28 16:59:16 +00:00
|
|
|
struct geneve_sock *gs4 = rtnl_dereference(geneve->sock4);
|
2015-10-26 21:01:44 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2016-10-28 16:59:16 +00:00
|
|
|
struct geneve_sock *gs6 = rtnl_dereference(geneve->sock6);
|
|
|
|
|
|
|
|
rcu_assign_pointer(geneve->sock6, NULL);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
rcu_assign_pointer(geneve->sock4, NULL);
|
|
|
|
synchronize_net();
|
|
|
|
|
|
|
|
__geneve_sock_release(gs4);
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
__geneve_sock_release(gs6);
|
2015-10-26 21:01:44 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2015-08-27 06:46:54 +00:00
|
|
|
static struct geneve_sock *geneve_find_sock(struct geneve_net *gn,
|
2015-10-26 21:01:44 +00:00
|
|
|
sa_family_t family,
|
2015-08-27 06:46:54 +00:00
|
|
|
__be16 dst_port)
|
|
|
|
{
|
|
|
|
struct geneve_sock *gs;
|
|
|
|
|
|
|
|
list_for_each_entry(gs, &gn->sock_list, list) {
|
|
|
|
if (inet_sk(gs->sock->sk)->inet_sport == dst_port &&
|
2016-02-18 10:22:49 +00:00
|
|
|
geneve_get_sk_family(gs) == family) {
|
2015-08-27 06:46:54 +00:00
|
|
|
return gs;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
/* Attach @geneve to a socket of the requested family (IPv6 when @ipv6 is
 * true, IPv4 otherwise).
 *
 * An existing socket for the device's destination port is reused with its
 * reference count bumped; otherwise a new one is created.  The device is
 * then published on the socket's VNI hash list.  Returns 0 on success or
 * the error from geneve_socket_create().
 */
static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
{
	struct net *net = geneve->net;
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	__be16 port = geneve->cfg.info.key.tp_dst;
	struct geneve_dev_node *node;
	struct geneve_sock *gs;
	__u32 bucket;
	__u8 vni[3];

	gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, port);
	if (gs) {
		gs->refcnt++;
	} else {
		gs = geneve_socket_create(net, port, ipv6,
					  geneve->cfg.use_udp6_rx_checksums);
		if (IS_ERR(gs))
			return PTR_ERR(gs);
	}

	gs->collect_md = geneve->cfg.collect_md;

#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6) {
		rcu_assign_pointer(geneve->sock6, gs);
		node = &geneve->hlist6;
	} else
#endif
	{
		rcu_assign_pointer(geneve->sock4, gs);
		node = &geneve->hlist4;
	}
	node->geneve = geneve;

	/* Hash the device into the socket's table by its VNI */
	tunnel_id_to_vni(geneve->cfg.info.key.tun_id, vni);
	bucket = geneve_net_vni_hash(vni);
	hlist_add_head_rcu(&node->hlist, &gs->vni_list[bucket]);

	return 0;
}
|
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
static int geneve_open(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct geneve_dev *geneve = netdev_priv(dev);
|
2020-07-06 15:18:08 +00:00
|
|
|
bool metadata = geneve->cfg.collect_md;
|
2019-02-28 13:56:04 +00:00
|
|
|
bool ipv4, ipv6;
|
2015-10-26 21:01:44 +00:00
|
|
|
int ret = 0;
|
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
ipv6 = geneve->cfg.info.mode & IP_TUNNEL_INFO_IPV6 || metadata;
|
2019-02-28 13:56:04 +00:00
|
|
|
ipv4 = !ipv6 || metadata;
|
2015-10-26 21:01:44 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2019-02-28 13:56:04 +00:00
|
|
|
if (ipv6) {
|
2015-10-26 21:01:44 +00:00
|
|
|
ret = geneve_sock_add(geneve, true);
|
2019-02-28 13:56:04 +00:00
|
|
|
if (ret < 0 && ret != -EAFNOSUPPORT)
|
|
|
|
ipv4 = false;
|
|
|
|
}
|
2015-10-26 21:01:44 +00:00
|
|
|
#endif
|
2019-02-28 13:56:04 +00:00
|
|
|
if (ipv4)
|
2015-10-26 21:01:44 +00:00
|
|
|
ret = geneve_sock_add(geneve, false);
|
|
|
|
if (ret < 0)
|
|
|
|
geneve_sock_release(geneve);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
static int geneve_stop(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct geneve_dev *geneve = netdev_priv(dev);
|
|
|
|
|
2017-07-02 17:00:58 +00:00
|
|
|
hlist_del_init_rcu(&geneve->hlist4.hlist);
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
hlist_del_init_rcu(&geneve->hlist6.hlist);
|
|
|
|
#endif
|
2015-10-26 21:01:44 +00:00
|
|
|
geneve_sock_release(geneve);
|
2015-08-27 06:46:54 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
static void geneve_build_header(struct genevehdr *geneveh,
|
2022-03-16 06:15:57 +00:00
|
|
|
const struct ip_tunnel_info *info,
|
|
|
|
__be16 inner_proto)
|
2015-10-26 21:01:44 +00:00
|
|
|
{
|
|
|
|
geneveh->ver = GENEVE_VER;
|
2016-11-21 19:02:59 +00:00
|
|
|
geneveh->opt_len = info->options_len / 4;
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
geneveh->oam = test_bit(IP_TUNNEL_OAM_BIT, info->key.tun_flags);
|
|
|
|
geneveh->critical = test_bit(IP_TUNNEL_CRIT_OPT_BIT,
|
|
|
|
info->key.tun_flags);
|
2015-10-26 21:01:44 +00:00
|
|
|
geneveh->rsvd1 = 0;
|
2016-11-21 19:02:59 +00:00
|
|
|
tunnel_id_to_vni(info->key.tun_id, geneveh->vni);
|
2022-03-16 06:15:57 +00:00
|
|
|
geneveh->proto_type = inner_proto;
|
2015-10-26 21:01:44 +00:00
|
|
|
geneveh->rsvd2 = 0;
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags))
|
2018-06-27 04:39:36 +00:00
|
|
|
ip_tunnel_info_opts_get(geneveh->options, info);
|
2015-10-26 21:01:44 +00:00
|
|
|
}
|
|
|
|
|
2016-11-21 19:02:59 +00:00
|
|
|
static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
|
|
|
|
const struct ip_tunnel_info *info,
|
2022-03-16 06:15:57 +00:00
|
|
|
bool xnet, int ip_hdr_len,
|
|
|
|
bool inner_proto_inherit)
|
2015-08-27 06:46:54 +00:00
|
|
|
{
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
|
2015-10-26 21:01:44 +00:00
|
|
|
struct genevehdr *gnvh;
|
2022-03-16 06:15:57 +00:00
|
|
|
__be16 inner_proto;
|
2015-10-26 21:01:44 +00:00
|
|
|
int min_headroom;
|
|
|
|
int err;
|
|
|
|
|
2016-11-21 19:02:59 +00:00
|
|
|
skb_reset_mac_header(skb);
|
2015-10-26 21:01:44 +00:00
|
|
|
skb_scrub_packet(skb, xnet);
|
|
|
|
|
2016-11-21 19:02:59 +00:00
|
|
|
min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len +
|
|
|
|
GENEVE_BASE_HLEN + info->options_len + ip_hdr_len;
|
2015-10-26 21:01:44 +00:00
|
|
|
err = skb_cow_head(skb, min_headroom);
|
2016-04-14 19:33:37 +00:00
|
|
|
if (unlikely(err))
|
2015-10-26 21:01:44 +00:00
|
|
|
goto free_dst;
|
|
|
|
|
2016-04-14 19:33:37 +00:00
|
|
|
err = udp_tunnel_handle_offloads(skb, udp_sum);
|
2016-04-19 14:30:56 +00:00
|
|
|
if (err)
|
2015-10-26 21:01:44 +00:00
|
|
|
goto free_dst;
|
|
|
|
|
networking: make skb_push & __skb_push return void pointers
It seems like a historic accident that these return unsigned char *,
and in many places that means casts are required, more often than not.
Make these functions return void * and remove all the casts across
the tree, adding a (u8 *) cast only where the unsigned char pointer
was used directly, all done with the following spatch:
@@
expression SKB, LEN;
typedef u8;
identifier fn = { skb_push, __skb_push, skb_push_rcsum };
@@
- *(fn(SKB, LEN))
+ *(u8 *)fn(SKB, LEN)
@@
expression E, SKB, LEN;
identifier fn = { skb_push, __skb_push, skb_push_rcsum };
type T;
@@
- E = ((T *)(fn(SKB, LEN)))
+ E = fn(SKB, LEN)
@@
expression SKB, LEN;
identifier fn = { skb_push, __skb_push, skb_push_rcsum };
@@
- fn(SKB, LEN)[0]
+ *(u8 *)fn(SKB, LEN)
Note that the last part there converts from push(...)[0] to the
more idiomatic *(u8 *)push(...).
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-16 12:29:23 +00:00
|
|
|
gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len);
|
2022-03-16 06:15:57 +00:00
|
|
|
inner_proto = inner_proto_inherit ? skb->protocol : htons(ETH_P_TEB);
|
|
|
|
geneve_build_header(gnvh, info, inner_proto);
|
|
|
|
skb_set_inner_protocol(skb, inner_proto);
|
2015-10-26 21:01:44 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
free_dst:
|
|
|
|
dst_release(dst);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2023-10-16 07:15:24 +00:00
|
|
|
static u8 geneve_get_dsfield(struct sk_buff *skb, struct net_device *dev,
|
|
|
|
const struct ip_tunnel_info *info,
|
|
|
|
bool *use_cache)
|
|
|
|
{
|
|
|
|
struct geneve_dev *geneve = netdev_priv(dev);
|
|
|
|
u8 dsfield;
|
|
|
|
|
|
|
|
dsfield = info->key.tos;
|
|
|
|
if (dsfield == 1 && !geneve->cfg.collect_md) {
|
|
|
|
dsfield = ip_tunnel_get_dsfield(ip_hdr(skb), skb);
|
|
|
|
*use_cache = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return dsfield;
|
|
|
|
}
|
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
|
2016-11-21 19:02:59 +00:00
|
|
|
struct geneve_dev *geneve,
|
|
|
|
const struct ip_tunnel_info *info)
|
2015-05-13 16:57:30 +00:00
|
|
|
{
|
2024-06-06 20:32:48 +00:00
|
|
|
bool inner_proto_inherit = geneve->cfg.inner_proto_inherit;
|
2016-11-21 19:02:58 +00:00
|
|
|
bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
|
|
|
|
struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
|
|
|
|
const struct ip_tunnel_key *key = &info->key;
|
|
|
|
struct rtable *rt;
|
2023-10-16 07:15:25 +00:00
|
|
|
bool use_cache;
|
2015-06-01 19:51:34 +00:00
|
|
|
__u8 tos, ttl;
|
2018-11-08 11:19:19 +00:00
|
|
|
__be16 df = 0;
|
2023-10-16 07:15:25 +00:00
|
|
|
__be32 saddr;
|
2015-08-27 06:46:52 +00:00
|
|
|
__be16 sport;
|
2016-11-21 19:03:00 +00:00
|
|
|
int err;
|
2015-08-27 06:46:49 +00:00
|
|
|
|
2024-06-06 20:32:48 +00:00
|
|
|
if (!skb_vlan_inet_prepare(skb, inner_proto_inherit))
|
2021-04-11 11:28:24 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2023-10-16 07:15:25 +00:00
|
|
|
if (!gs4)
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
use_cache = ip_tunnel_dst_cache_usable(skb, info);
|
|
|
|
tos = geneve_get_dsfield(skb, dev, info, &use_cache);
|
2020-09-16 09:19:35 +00:00
|
|
|
sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
|
2023-10-16 07:15:25 +00:00
|
|
|
|
|
|
|
rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr,
|
|
|
|
&info->key,
|
|
|
|
sport, geneve->cfg.info.key.tp_dst, tos,
|
|
|
|
use_cache ?
|
|
|
|
(struct dst_cache *)&info->dst_cache : NULL);
|
2016-11-21 19:02:58 +00:00
|
|
|
if (IS_ERR(rt))
|
|
|
|
return PTR_ERR(rt);
|
2015-08-27 06:46:54 +00:00
|
|
|
|
2020-08-04 05:53:45 +00:00
|
|
|
err = skb_tunnel_check_pmtu(skb, &rt->dst,
|
|
|
|
GENEVE_IPV4_HLEN + info->options_len,
|
|
|
|
netif_is_any_bridge_port(dev));
|
|
|
|
if (err < 0) {
|
|
|
|
dst_release(&rt->dst);
|
|
|
|
return err;
|
|
|
|
} else if (err) {
|
|
|
|
struct ip_tunnel_info *info;
|
|
|
|
|
|
|
|
info = skb_tunnel_info(skb);
|
|
|
|
if (info) {
|
2021-03-25 15:35:33 +00:00
|
|
|
struct ip_tunnel_info *unclone;
|
|
|
|
|
|
|
|
unclone = skb_tunnel_info_unclone(skb);
|
|
|
|
if (unlikely(!unclone)) {
|
|
|
|
dst_release(&rt->dst);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2023-10-16 07:15:25 +00:00
|
|
|
unclone->key.u.ipv4.dst = saddr;
|
|
|
|
unclone->key.u.ipv4.src = info->key.u.ipv4.dst;
|
2020-08-04 05:53:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, ETH_HLEN)) {
|
|
|
|
dst_release(&rt->dst);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
skb->protocol = eth_type_trans(skb, geneve->dev);
|
2022-02-11 23:38:38 +00:00
|
|
|
__netif_rx(skb);
|
2020-08-04 05:53:45 +00:00
|
|
|
dst_release(&rt->dst);
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
2017-12-25 06:43:58 +00:00
|
|
|
|
2023-10-16 07:15:25 +00:00
|
|
|
tos = ip_tunnel_ecn_encap(tos, ip_hdr(skb), skb);
|
2020-07-06 15:18:08 +00:00
|
|
|
if (geneve->cfg.collect_md) {
|
2015-08-27 06:46:54 +00:00
|
|
|
ttl = key->ttl;
|
2018-11-08 11:19:19 +00:00
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
df = test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) ?
|
|
|
|
htons(IP_DF) : 0;
|
2015-08-27 06:46:52 +00:00
|
|
|
} else {
|
2020-07-06 15:18:08 +00:00
|
|
|
if (geneve->cfg.ttl_inherit)
|
2018-09-12 02:04:21 +00:00
|
|
|
ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb);
|
|
|
|
else
|
|
|
|
ttl = key->ttl;
|
|
|
|
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
|
2018-11-08 11:19:19 +00:00
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
if (geneve->cfg.df == GENEVE_DF_SET) {
|
2018-11-08 11:19:19 +00:00
|
|
|
df = htons(IP_DF);
|
2020-07-06 15:18:08 +00:00
|
|
|
} else if (geneve->cfg.df == GENEVE_DF_INHERIT) {
|
2018-11-08 11:19:19 +00:00
|
|
|
struct ethhdr *eth = eth_hdr(skb);
|
|
|
|
|
|
|
|
if (ntohs(eth->h_proto) == ETH_P_IPV6) {
|
|
|
|
df = htons(IP_DF);
|
|
|
|
} else if (ntohs(eth->h_proto) == ETH_P_IP) {
|
|
|
|
struct iphdr *iph = ip_hdr(skb);
|
|
|
|
|
|
|
|
if (iph->frag_off & htons(IP_DF))
|
|
|
|
df = htons(IP_DF);
|
|
|
|
}
|
|
|
|
}
|
2015-05-13 16:57:30 +00:00
|
|
|
}
|
|
|
|
|
2022-03-16 06:15:57 +00:00
|
|
|
err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr),
|
2024-06-06 20:32:48 +00:00
|
|
|
inner_proto_inherit);
|
2016-11-21 19:02:58 +00:00
|
|
|
if (unlikely(err))
|
|
|
|
return err;
|
2016-06-21 08:26:49 +00:00
|
|
|
|
2023-10-16 07:15:25 +00:00
|
|
|
udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, saddr, info->key.u.ipv4.dst,
|
2020-07-06 15:18:08 +00:00
|
|
|
tos, ttl, df, sport, geneve->cfg.info.key.tp_dst,
|
2016-11-21 19:02:58 +00:00
|
|
|
!net_eq(geneve->net, dev_net(geneve->dev)),
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
!test_bit(IP_TUNNEL_CSUM_BIT,
|
|
|
|
info->key.tun_flags));
|
2016-11-21 19:02:58 +00:00
|
|
|
return 0;
|
2015-05-13 16:57:30 +00:00
|
|
|
}
|
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2016-11-21 19:02:58 +00:00
|
|
|
static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
|
2016-11-21 19:02:59 +00:00
|
|
|
struct geneve_dev *geneve,
|
|
|
|
const struct ip_tunnel_info *info)
|
2015-10-26 21:01:44 +00:00
|
|
|
{
|
2024-06-06 20:32:48 +00:00
|
|
|
bool inner_proto_inherit = geneve->cfg.inner_proto_inherit;
|
2016-11-21 19:02:58 +00:00
|
|
|
bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
|
|
|
|
struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
|
|
|
|
const struct ip_tunnel_key *key = &info->key;
|
2015-10-26 21:01:44 +00:00
|
|
|
struct dst_entry *dst = NULL;
|
2023-10-20 11:55:28 +00:00
|
|
|
struct in6_addr saddr;
|
|
|
|
bool use_cache;
|
2015-10-26 21:01:45 +00:00
|
|
|
__u8 prio, ttl;
|
2015-10-26 21:01:44 +00:00
|
|
|
__be16 sport;
|
2016-11-21 19:03:00 +00:00
|
|
|
int err;
|
2015-10-26 21:01:44 +00:00
|
|
|
|
2024-06-06 20:32:48 +00:00
|
|
|
if (!skb_vlan_inet_prepare(skb, inner_proto_inherit))
|
2021-04-11 11:28:24 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2023-10-20 11:55:28 +00:00
|
|
|
if (!gs6)
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
use_cache = ip_tunnel_dst_cache_usable(skb, info);
|
|
|
|
prio = geneve_get_dsfield(skb, dev, info, &use_cache);
|
2020-09-16 09:19:35 +00:00
|
|
|
sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
|
2023-10-20 11:55:28 +00:00
|
|
|
|
|
|
|
dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0,
|
|
|
|
&saddr, key, sport,
|
|
|
|
geneve->cfg.info.key.tp_dst, prio,
|
|
|
|
use_cache ?
|
|
|
|
(struct dst_cache *)&info->dst_cache : NULL);
|
2016-11-21 19:02:58 +00:00
|
|
|
if (IS_ERR(dst))
|
|
|
|
return PTR_ERR(dst);
|
2015-10-26 21:01:44 +00:00
|
|
|
|
2020-08-04 05:53:45 +00:00
|
|
|
err = skb_tunnel_check_pmtu(skb, dst,
|
|
|
|
GENEVE_IPV6_HLEN + info->options_len,
|
|
|
|
netif_is_any_bridge_port(dev));
|
|
|
|
if (err < 0) {
|
|
|
|
dst_release(dst);
|
|
|
|
return err;
|
|
|
|
} else if (err) {
|
|
|
|
struct ip_tunnel_info *info = skb_tunnel_info(skb);
|
|
|
|
|
|
|
|
if (info) {
|
2021-03-25 15:35:33 +00:00
|
|
|
struct ip_tunnel_info *unclone;
|
|
|
|
|
|
|
|
unclone = skb_tunnel_info_unclone(skb);
|
|
|
|
if (unlikely(!unclone)) {
|
|
|
|
dst_release(dst);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2023-10-20 11:55:28 +00:00
|
|
|
unclone->key.u.ipv6.dst = saddr;
|
|
|
|
unclone->key.u.ipv6.src = info->key.u.ipv6.dst;
|
2020-08-04 05:53:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, ETH_HLEN)) {
|
|
|
|
dst_release(dst);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
skb->protocol = eth_type_trans(skb, geneve->dev);
|
2022-02-11 23:38:38 +00:00
|
|
|
__netif_rx(skb);
|
2020-08-04 05:53:45 +00:00
|
|
|
dst_release(dst);
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
2017-12-25 06:43:58 +00:00
|
|
|
|
2023-10-20 11:55:28 +00:00
|
|
|
prio = ip_tunnel_ecn_encap(prio, ip_hdr(skb), skb);
|
2020-07-06 15:18:08 +00:00
|
|
|
if (geneve->cfg.collect_md) {
|
2016-11-21 19:02:58 +00:00
|
|
|
ttl = key->ttl;
|
|
|
|
} else {
|
2020-07-06 15:18:08 +00:00
|
|
|
if (geneve->cfg.ttl_inherit)
|
2018-09-12 02:04:21 +00:00
|
|
|
ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb);
|
|
|
|
else
|
|
|
|
ttl = key->ttl;
|
|
|
|
ttl = ttl ? : ip6_dst_hoplimit(dst);
|
2016-11-21 19:02:58 +00:00
|
|
|
}
|
2022-03-16 06:15:57 +00:00
|
|
|
err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr),
|
2024-06-06 20:32:48 +00:00
|
|
|
inner_proto_inherit);
|
2016-11-21 19:02:58 +00:00
|
|
|
if (unlikely(err))
|
|
|
|
return err;
|
2015-10-26 21:01:44 +00:00
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
|
2023-10-20 11:55:28 +00:00
|
|
|
&saddr, &key->u.ipv6.dst, prio, ttl,
|
2020-07-06 15:18:08 +00:00
|
|
|
info->key.label, sport, geneve->cfg.info.key.tp_dst,
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
!test_bit(IP_TUNNEL_CSUM_BIT,
|
|
|
|
info->key.tun_flags));
|
2016-11-21 19:02:58 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
2015-10-26 21:01:44 +00:00
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct geneve_dev *geneve = netdev_priv(dev);
|
|
|
|
struct ip_tunnel_info *info = NULL;
|
|
|
|
int err;
|
2015-12-10 20:37:45 +00:00
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
if (geneve->cfg.collect_md) {
|
2016-11-21 19:02:58 +00:00
|
|
|
info = skb_tunnel_info(skb);
|
|
|
|
if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) {
|
|
|
|
netdev_dbg(dev, "no tunnel metadata\n");
|
geneve: change from tx_error to tx_dropped on missing metadata
If the geneve interface is in collect_md (external) mode, it can't send any
packets submitted directly to its net interface, as such packets won't have
metadata attached. This is expected.
However, the kernel itself sends some packets to the interface, most
notably, IPv6 DAD, IPv6 multicast listener reports, etc. This is not wrong,
as tunnel metadata can be specified in routing table (although technically,
that has never worked for IPv6, but hopefully will be fixed eventually) and
then the interface must correctly participate in IPv6 housekeeping.
The problem is that any such attempt increases the tx_error counter. Just
bringing up a geneve interface with IPv6 enabled is enough to see a number
of tx_errors. That causes confusion among users, prompting them to find
a network error where there is none.
Change the counter used to tx_dropped. That better conveys the meaning
(there's nothing wrong going on, just some packets are getting dropped) and
hopefully will make admins panic less.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-03 09:12:14 +00:00
|
|
|
dev_kfree_skb(skb);
|
2024-01-04 16:36:33 +00:00
|
|
|
DEV_STATS_INC(dev, tx_dropped);
|
geneve: change from tx_error to tx_dropped on missing metadata
If the geneve interface is in collect_md (external) mode, it can't send any
packets submitted directly to its net interface, as such packets won't have
metadata attached. This is expected.
However, the kernel itself sends some packets to the interface, most
notably, IPv6 DAD, IPv6 multicast listener reports, etc. This is not wrong,
as tunnel metadata can be specified in routing table (although technically,
that has never worked for IPv6, but hopefully will be fixed eventually) and
then the interface must correctly participate in IPv6 housekeeping.
The problem is that any such attempt increases the tx_error counter. Just
bringing up a geneve interface with IPv6 enabled is enough to see a number
of tx_errors. That causes confusion among users, prompting them to find
a network error where there is none.
Change the counter used to tx_dropped. That better conveys the meaning
(there's nothing wrong going on, just some packets are getting dropped) and
hopefully will make admins panic less.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-03 09:12:14 +00:00
|
|
|
return NETDEV_TX_OK;
|
2016-11-21 19:02:58 +00:00
|
|
|
}
|
2015-10-26 21:01:44 +00:00
|
|
|
} else {
|
2020-07-06 15:18:08 +00:00
|
|
|
info = &geneve->cfg.info;
|
2015-10-26 21:01:44 +00:00
|
|
|
}
|
2016-03-09 02:00:04 +00:00
|
|
|
|
2017-02-24 19:43:37 +00:00
|
|
|
rcu_read_lock();
|
2016-11-21 19:02:58 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
if (info->mode & IP_TUNNEL_INFO_IPV6)
|
|
|
|
err = geneve6_xmit_skb(skb, dev, geneve, info);
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
err = geneve_xmit_skb(skb, dev, geneve, info);
|
2017-02-24 19:43:37 +00:00
|
|
|
rcu_read_unlock();
|
2015-10-26 21:01:44 +00:00
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
if (likely(!err))
|
|
|
|
return NETDEV_TX_OK;
|
geneve: change from tx_error to tx_dropped on missing metadata
If the geneve interface is in collect_md (external) mode, it can't send any
packets submitted directly to its net interface, as such packets won't have
metadata attached. This is expected.
However, the kernel itself sends some packets to the interface, most
notably, IPv6 DAD, IPv6 multicast listener reports, etc. This is not wrong,
as tunnel metadata can be specified in routing table (although technically,
that has never worked for IPv6, but hopefully will be fixed eventually) and
then the interface must correctly participate in IPv6 housekeeping.
The problem is that any such attempt increases the tx_error counter. Just
bringing up a geneve interface with IPv6 enabled is enough to see a number
of tx_errors. That causes confusion among users, prompting them to find
a network error where there is none.
Change the counter used to tx_dropped. That better conveys the meaning
(there's nothing wrong going on, just some packets are getting dropped) and
hopefully will make admins panic less.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-03 09:12:14 +00:00
|
|
|
|
2020-08-04 05:53:45 +00:00
|
|
|
if (err != -EMSGSIZE)
|
|
|
|
dev_kfree_skb(skb);
|
2016-04-14 19:33:37 +00:00
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
if (err == -ELOOP)
|
2024-01-04 16:36:33 +00:00
|
|
|
DEV_STATS_INC(dev, collisions);
|
2015-10-26 21:01:44 +00:00
|
|
|
else if (err == -ENETUNREACH)
|
2024-01-04 16:36:33 +00:00
|
|
|
DEV_STATS_INC(dev, tx_carrier_errors);
|
2016-06-21 08:26:49 +00:00
|
|
|
|
2024-01-04 16:36:33 +00:00
|
|
|
DEV_STATS_INC(dev, tx_errors);
|
2015-10-26 21:01:44 +00:00
|
|
|
return NETDEV_TX_OK;
|
|
|
|
}
|
|
|
|
|
net: use core MTU range checking in core net infra
geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
closer inspection and testing
macvlan:
- set min/max_mtu
tun:
- set min/max_mtu, remove tun_net_change_mtu
vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu
- Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
- This one is also not as straight-forward and could use closer inspection
and testing from vxlan folks
bridge:
- set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
is the largest possible size supported
sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)
macsec:
- min_mtu = 0, max_mtu = 65535
macvlan:
- min_mtu = 0, max_mtu = 65535
ntb_netdev:
- min_mtu = 0, max_mtu = 65535
veth:
- min_mtu = 68, max_mtu = 65535
8021q:
- min_mtu = 0, max_mtu = 65535
CC: netdev@vger.kernel.org
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Tom Herbert <tom@herbertland.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Paolo Abeni <pabeni@redhat.com>
CC: Jiri Benc <jbenc@redhat.com>
CC: WANG Cong <xiyou.wangcong@gmail.com>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Pravin B Shelar <pshelar@ovn.org>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: Pravin Shelar <pshelar@nicira.com>
CC: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 17:55:20 +00:00
|
|
|
static int geneve_change_mtu(struct net_device *dev, int new_mtu)
|
2016-02-10 00:05:57 +00:00
|
|
|
{
|
net: use core MTU range checking in core net infra
geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
closer inspection and testing
macvlan:
- set min/max_mtu
tun:
- set min/max_mtu, remove tun_net_change_mtu
vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu
- Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
- This one is also not as straight-forward and could use closer inspection
and testing from vxlan folks
bridge:
- set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
is the largest possible size supported
sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)
macsec:
- min_mtu = 0, max_mtu = 65535
macvlan:
- min_mtu = 0, max_mtu = 65535
ntb_netdev:
- min_mtu = 0, max_mtu = 65535
veth:
- min_mtu = 68, max_mtu = 65535
8021q:
- min_mtu = 0, max_mtu = 65535
CC: netdev@vger.kernel.org
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Tom Herbert <tom@herbertland.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Paolo Abeni <pabeni@redhat.com>
CC: Jiri Benc <jbenc@redhat.com>
CC: WANG Cong <xiyou.wangcong@gmail.com>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Pravin B Shelar <pshelar@ovn.org>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: Pravin Shelar <pshelar@nicira.com>
CC: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 17:55:20 +00:00
|
|
|
if (new_mtu > dev->max_mtu)
|
|
|
|
new_mtu = dev->max_mtu;
|
2018-04-19 12:42:31 +00:00
|
|
|
else if (new_mtu < dev->min_mtu)
|
|
|
|
new_mtu = dev->min_mtu;
|
2016-02-18 17:43:29 +00:00
|
|
|
|
2024-05-06 10:28:12 +00:00
|
|
|
WRITE_ONCE(dev->mtu, new_mtu);
|
2016-02-10 00:05:57 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-10-23 01:17:16 +00:00
|
|
|
static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct ip_tunnel_info *info = skb_tunnel_info(skb);
|
|
|
|
struct geneve_dev *geneve = netdev_priv(dev);
|
2020-09-16 09:19:35 +00:00
|
|
|
__be16 sport;
|
2015-10-23 01:17:16 +00:00
|
|
|
|
2015-10-27 13:49:00 +00:00
|
|
|
if (ip_tunnel_info_af(info) == AF_INET) {
|
2016-11-21 19:02:58 +00:00
|
|
|
struct rtable *rt;
|
2017-07-21 05:44:20 +00:00
|
|
|
struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
|
2023-10-16 07:15:25 +00:00
|
|
|
bool use_cache;
|
|
|
|
__be32 saddr;
|
|
|
|
u8 tos;
|
|
|
|
|
|
|
|
if (!gs4)
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
use_cache = ip_tunnel_dst_cache_usable(skb, info);
|
|
|
|
tos = geneve_get_dsfield(skb, dev, info, &use_cache);
|
2020-09-16 09:19:35 +00:00
|
|
|
sport = udp_flow_src_port(geneve->net, skb,
|
|
|
|
1, USHRT_MAX, true);
|
2016-11-21 19:02:58 +00:00
|
|
|
|
2023-10-16 07:15:25 +00:00
|
|
|
rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr,
|
|
|
|
&info->key,
|
|
|
|
sport, geneve->cfg.info.key.tp_dst,
|
|
|
|
tos,
|
|
|
|
use_cache ? &info->dst_cache : NULL);
|
2015-10-27 13:49:00 +00:00
|
|
|
if (IS_ERR(rt))
|
|
|
|
return PTR_ERR(rt);
|
2015-10-23 01:17:16 +00:00
|
|
|
|
2015-10-27 13:49:00 +00:00
|
|
|
ip_rt_put(rt);
|
2023-10-16 07:15:25 +00:00
|
|
|
info->key.u.ipv4.src = saddr;
|
2015-10-27 13:49:00 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
} else if (ip_tunnel_info_af(info) == AF_INET6) {
|
2016-11-21 19:02:58 +00:00
|
|
|
struct dst_entry *dst;
|
2017-07-21 05:44:20 +00:00
|
|
|
struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
|
2023-10-20 11:55:28 +00:00
|
|
|
struct in6_addr saddr;
|
|
|
|
bool use_cache;
|
|
|
|
u8 prio;
|
|
|
|
|
|
|
|
if (!gs6)
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
use_cache = ip_tunnel_dst_cache_usable(skb, info);
|
|
|
|
prio = geneve_get_dsfield(skb, dev, info, &use_cache);
|
2020-09-16 09:19:35 +00:00
|
|
|
sport = udp_flow_src_port(geneve->net, skb,
|
|
|
|
1, USHRT_MAX, true);
|
2016-11-21 19:02:58 +00:00
|
|
|
|
2023-10-20 11:55:28 +00:00
|
|
|
dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0,
|
|
|
|
&saddr, &info->key, sport,
|
|
|
|
geneve->cfg.info.key.tp_dst, prio,
|
|
|
|
use_cache ? &info->dst_cache : NULL);
|
2015-10-27 13:49:00 +00:00
|
|
|
if (IS_ERR(dst))
|
|
|
|
return PTR_ERR(dst);
|
|
|
|
|
|
|
|
dst_release(dst);
|
2023-10-20 11:55:28 +00:00
|
|
|
info->key.u.ipv6.src = saddr;
|
2015-10-27 13:49:00 +00:00
|
|
|
#endif
|
|
|
|
} else {
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2015-10-23 01:17:16 +00:00
|
|
|
|
2020-09-16 09:19:35 +00:00
|
|
|
info->key.tp_src = sport;
|
2020-07-06 15:18:08 +00:00
|
|
|
info->key.tp_dst = geneve->cfg.info.key.tp_dst;
|
2015-10-23 01:17:16 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
static const struct net_device_ops geneve_netdev_ops = {
|
|
|
|
.ndo_init = geneve_init,
|
|
|
|
.ndo_uninit = geneve_uninit,
|
|
|
|
.ndo_open = geneve_open,
|
|
|
|
.ndo_stop = geneve_stop,
|
|
|
|
.ndo_start_xmit = geneve_xmit,
|
2016-02-10 00:05:57 +00:00
|
|
|
.ndo_change_mtu = geneve_change_mtu,
|
2015-05-13 16:57:30 +00:00
|
|
|
.ndo_validate_addr = eth_validate_addr,
|
|
|
|
.ndo_set_mac_address = eth_mac_addr,
|
2015-10-23 01:17:16 +00:00
|
|
|
.ndo_fill_metadata_dst = geneve_fill_metadata_dst,
|
2015-05-13 16:57:30 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static void geneve_get_drvinfo(struct net_device *dev,
|
|
|
|
struct ethtool_drvinfo *drvinfo)
|
|
|
|
{
|
2022-08-30 20:14:52 +00:00
|
|
|
strscpy(drvinfo->version, GENEVE_NETDEV_VER, sizeof(drvinfo->version));
|
|
|
|
strscpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver));
|
2015-05-13 16:57:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static const struct ethtool_ops geneve_ethtool_ops = {
|
|
|
|
.get_drvinfo = geneve_get_drvinfo,
|
|
|
|
.get_link = ethtool_op_get_link,
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Info for udev, that this is a virtual tunnel endpoint */
|
2024-02-17 20:13:28 +00:00
|
|
|
static const struct device_type geneve_type = {
|
2015-05-13 16:57:30 +00:00
|
|
|
.name = "geneve",
|
|
|
|
};
|
|
|
|
|
2016-07-11 11:12:28 +00:00
|
|
|
/* Calls the ndo_udp_tunnel_add of the caller in order to
|
2015-12-14 20:21:20 +00:00
|
|
|
* supply the listening GENEVE udp ports. Callers are expected
|
2016-07-11 11:12:28 +00:00
|
|
|
* to implement the ndo_udp_tunnel_add.
|
2015-12-14 20:21:20 +00:00
|
|
|
*/
|
2017-07-21 10:49:32 +00:00
|
|
|
static void geneve_offload_rx_ports(struct net_device *dev, bool push)
|
2015-12-14 20:21:20 +00:00
|
|
|
{
|
|
|
|
struct net *net = dev_net(dev);
|
|
|
|
struct geneve_net *gn = net_generic(net, geneve_net_id);
|
|
|
|
struct geneve_sock *gs;
|
2016-04-18 19:19:48 +00:00
|
|
|
|
2015-12-14 20:21:20 +00:00
|
|
|
rcu_read_lock();
|
2017-07-21 10:49:32 +00:00
|
|
|
list_for_each_entry_rcu(gs, &gn->sock_list, list) {
|
|
|
|
if (push) {
|
|
|
|
udp_tunnel_push_rx_port(dev, gs->sock,
|
|
|
|
UDP_TUNNEL_TYPE_GENEVE);
|
|
|
|
} else {
|
|
|
|
udp_tunnel_drop_rx_port(dev, gs->sock,
|
|
|
|
UDP_TUNNEL_TYPE_GENEVE);
|
|
|
|
}
|
|
|
|
}
|
2015-12-14 20:21:20 +00:00
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
/* Initialize the device structure. */
|
|
|
|
static void geneve_setup(struct net_device *dev)
|
|
|
|
{
|
|
|
|
ether_setup(dev);
|
|
|
|
|
|
|
|
dev->netdev_ops = &geneve_netdev_ops;
|
|
|
|
dev->ethtool_ops = &geneve_ethtool_ops;
|
net: Fix inconsistent teardown and release of private netdev state.
Network devices can allocate reasources and private memory using
netdev_ops->ndo_init(). However, the release of these resources
can occur in one of two different places.
Either netdev_ops->ndo_uninit() or netdev->destructor().
The decision of which operation frees the resources depends upon
whether it is necessary for all netdev refs to be released before it
is safe to perform the freeing.
netdev_ops->ndo_uninit() presumably can occur right after the
NETDEV_UNREGISTER notifier completes and the unicast and multicast
address lists are flushed.
netdev->destructor(), on the other hand, does not run until the
netdev references all go away.
Further complicating the situation is that netdev->destructor()
almost universally does also a free_netdev().
This creates a problem for the logic in register_netdevice().
Because all callers of register_netdevice() manage the freeing
of the netdev, and invoke free_netdev(dev) if register_netdevice()
fails.
If netdev_ops->ndo_init() succeeds, but something else fails inside
of register_netdevice(), it does call ndo_ops->ndo_uninit(). But
it is not able to invoke netdev->destructor().
This is because netdev->destructor() will do a free_netdev() and
then the caller of register_netdevice() will do the same.
However, this means that the resources that would normally be released
by netdev->destructor() will not be.
Over the years drivers have added local hacks to deal with this, by
invoking their destructor parts by hand when register_netdevice()
fails.
Many drivers do not try to deal with this, and instead we have leaks.
Let's close this hole by formalizing the distinction between what
private things need to be freed up by netdev->destructor() and whether
the driver needs unregister_netdevice() to perform the free_netdev().
netdev->priv_destructor() performs all actions to free up the private
resources that used to be freed by netdev->destructor(), except for
free_netdev().
netdev->needs_free_netdev is a boolean that indicates whether
free_netdev() should be done at the end of unregister_netdevice().
Now, register_netdevice() can sanely release all resources after
ndo_ops->ndo_init() succeeds, by invoking both ndo_ops->ndo_uninit()
and netdev->priv_destructor().
And at the end of unregister_netdevice(), we invoke
netdev->priv_destructor() and optionally call free_netdev().
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-08 16:52:56 +00:00
|
|
|
dev->needs_free_netdev = true;
|
2015-05-13 16:57:30 +00:00
|
|
|
|
|
|
|
SET_NETDEV_DEVTYPE(dev, &geneve_type);
|
|
|
|
|
|
|
|
dev->features |= NETIF_F_LLTX;
|
2021-01-15 09:47:46 +00:00
|
|
|
dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
|
2015-05-13 16:57:30 +00:00
|
|
|
dev->features |= NETIF_F_RXCSUM;
|
|
|
|
dev->features |= NETIF_F_GSO_SOFTWARE;
|
|
|
|
|
2021-01-15 09:47:46 +00:00
|
|
|
dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
|
|
|
|
dev->hw_features |= NETIF_F_RXCSUM;
|
2015-05-13 16:57:30 +00:00
|
|
|
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
|
|
|
|
|
2024-03-05 17:29:09 +00:00
|
|
|
dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
|
net: use core MTU range checking in core net infra
geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
closer inspection and testing
macvlan:
- set min/max_mtu
tun:
- set min/max_mtu, remove tun_net_change_mtu
vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu
- Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
- This one is also not as straight-forward and could use closer inspection
and testing from vxlan folks
bridge:
- set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
is the largest possible size supported
sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)
macsec:
- min_mtu = 0, max_mtu = 65535
macvlan:
- min_mtu = 0, max_mtu = 65535
ntb_netdev:
- min_mtu = 0, max_mtu = 65535
veth:
- min_mtu = 68, max_mtu = 65535
8021q:
- min_mtu = 0, max_mtu = 65535
CC: netdev@vger.kernel.org
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Tom Herbert <tom@herbertland.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Paolo Abeni <pabeni@redhat.com>
CC: Jiri Benc <jbenc@redhat.com>
CC: WANG Cong <xiyou.wangcong@gmail.com>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Pravin B Shelar <pshelar@ovn.org>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: Pravin Shelar <pshelar@nicira.com>
CC: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 17:55:20 +00:00
|
|
|
/* MTU range: 68 - (something less than 65535) */
|
|
|
|
dev->min_mtu = ETH_MIN_MTU;
|
|
|
|
/* The max_mtu calculation does not take account of GENEVE
|
|
|
|
* options, to avoid excluding potentially valid
|
|
|
|
* configurations. This will be further reduced by IPvX hdr size.
|
|
|
|
*/
|
|
|
|
dev->max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - dev->hard_header_len;
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
netif_keep_dst(dev);
|
2016-02-17 14:31:35 +00:00
|
|
|
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
|
2015-08-18 08:30:31 +00:00
|
|
|
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
|
2015-08-27 06:46:48 +00:00
|
|
|
eth_hw_addr_random(dev);
|
2015-05-13 16:57:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
|
2022-03-22 04:39:54 +00:00
|
|
|
[IFLA_GENEVE_UNSPEC] = { .strict_start_type = IFLA_GENEVE_INNER_PROTO_INHERIT },
|
2015-05-13 16:57:30 +00:00
|
|
|
[IFLA_GENEVE_ID] = { .type = NLA_U32 },
|
2019-12-09 18:31:43 +00:00
|
|
|
[IFLA_GENEVE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) },
|
2015-10-26 21:01:44 +00:00
|
|
|
[IFLA_GENEVE_REMOTE6] = { .len = sizeof(struct in6_addr) },
|
2015-06-01 19:51:34 +00:00
|
|
|
[IFLA_GENEVE_TTL] = { .type = NLA_U8 },
|
2015-06-01 19:51:35 +00:00
|
|
|
[IFLA_GENEVE_TOS] = { .type = NLA_U8 },
|
2016-03-09 02:00:04 +00:00
|
|
|
[IFLA_GENEVE_LABEL] = { .type = NLA_U32 },
|
2015-08-27 06:46:51 +00:00
|
|
|
[IFLA_GENEVE_PORT] = { .type = NLA_U16 },
|
2015-08-27 06:46:52 +00:00
|
|
|
[IFLA_GENEVE_COLLECT_METADATA] = { .type = NLA_FLAG },
|
2015-12-10 20:37:45 +00:00
|
|
|
[IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 },
|
|
|
|
[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
|
|
|
|
[IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 },
|
2018-09-12 02:04:21 +00:00
|
|
|
[IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_U8 },
|
2018-11-08 11:19:19 +00:00
|
|
|
[IFLA_GENEVE_DF] = { .type = NLA_U8 },
|
2022-03-22 04:39:54 +00:00
|
|
|
[IFLA_GENEVE_INNER_PROTO_INHERIT] = { .type = NLA_FLAG },
|
2015-05-13 16:57:30 +00:00
|
|
|
};
|
|
|
|
|
2017-06-25 21:56:01 +00:00
|
|
|
static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
|
|
|
|
struct netlink_ext_ack *extack)
|
2015-05-13 16:57:30 +00:00
|
|
|
{
|
|
|
|
if (tb[IFLA_ADDRESS]) {
|
2017-08-09 08:09:28 +00:00
|
|
|
if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
|
|
|
|
"Provided link layer address is not Ethernet");
|
2015-05-13 16:57:30 +00:00
|
|
|
return -EINVAL;
|
2017-08-09 08:09:28 +00:00
|
|
|
}
|
2015-05-13 16:57:30 +00:00
|
|
|
|
2017-08-09 08:09:28 +00:00
|
|
|
if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
|
|
|
|
"Provided Ethernet address is not unicast");
|
2015-05-13 16:57:30 +00:00
|
|
|
return -EADDRNOTAVAIL;
|
2017-08-09 08:09:28 +00:00
|
|
|
}
|
2015-05-13 16:57:30 +00:00
|
|
|
}
|
|
|
|
|
2017-08-09 08:09:28 +00:00
|
|
|
if (!data) {
|
|
|
|
NL_SET_ERR_MSG(extack,
|
|
|
|
"Not enough attributes provided to perform the operation");
|
2015-05-13 16:57:30 +00:00
|
|
|
return -EINVAL;
|
2017-08-09 08:09:28 +00:00
|
|
|
}
|
2015-05-13 16:57:30 +00:00
|
|
|
|
|
|
|
if (data[IFLA_GENEVE_ID]) {
|
|
|
|
__u32 vni = nla_get_u32(data[IFLA_GENEVE_ID]);
|
|
|
|
|
2017-08-09 08:09:28 +00:00
|
|
|
if (vni >= GENEVE_N_VID) {
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_ID],
|
|
|
|
"Geneve ID must be lower than 16777216");
|
2015-05-13 16:57:30 +00:00
|
|
|
return -ERANGE;
|
2017-08-09 08:09:28 +00:00
|
|
|
}
|
2015-05-13 16:57:30 +00:00
|
|
|
}
|
|
|
|
|
2018-11-08 11:19:19 +00:00
|
|
|
if (data[IFLA_GENEVE_DF]) {
|
|
|
|
enum ifla_geneve_df df = nla_get_u8(data[IFLA_GENEVE_DF]);
|
|
|
|
|
|
|
|
if (df < 0 || df > GENEVE_DF_MAX) {
|
2020-04-22 15:29:51 +00:00
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_DF],
|
2018-11-08 11:19:19 +00:00
|
|
|
"Invalid DF attribute");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-08-27 06:46:54 +00:00
|
|
|
static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
|
2016-11-21 19:02:58 +00:00
|
|
|
const struct ip_tunnel_info *info,
|
2015-08-27 06:46:54 +00:00
|
|
|
bool *tun_on_same_port,
|
|
|
|
bool *tun_collect_md)
|
|
|
|
{
|
2016-11-21 19:02:58 +00:00
|
|
|
struct geneve_dev *geneve, *t = NULL;
|
2015-08-27 06:46:54 +00:00
|
|
|
|
|
|
|
*tun_on_same_port = false;
|
|
|
|
*tun_collect_md = false;
|
|
|
|
list_for_each_entry(geneve, &gn->geneve_list, next) {
|
2020-07-06 15:18:08 +00:00
|
|
|
if (info->key.tp_dst == geneve->cfg.info.key.tp_dst) {
|
|
|
|
*tun_collect_md = geneve->cfg.collect_md;
|
2015-08-27 06:46:54 +00:00
|
|
|
*tun_on_same_port = true;
|
|
|
|
}
|
2020-07-06 15:18:08 +00:00
|
|
|
if (info->key.tun_id == geneve->cfg.info.key.tun_id &&
|
|
|
|
info->key.tp_dst == geneve->cfg.info.key.tp_dst &&
|
|
|
|
!memcmp(&info->key.u, &geneve->cfg.info.key.u, sizeof(info->key.u)))
|
2015-08-27 06:46:54 +00:00
|
|
|
t = geneve;
|
|
|
|
}
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
static bool is_tnl_info_zero(const struct ip_tunnel_info *info)
|
|
|
|
{
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
return !(info->key.tun_id || info->key.tos ||
|
|
|
|
!ip_tunnel_flags_empty(info->key.tun_flags) ||
|
2017-10-20 11:31:36 +00:00
|
|
|
info->key.ttl || info->key.label || info->key.tp_src ||
|
|
|
|
memchr_inv(&info->key.u, 0, sizeof(info->key.u)));
|
2016-11-21 19:02:58 +00:00
|
|
|
}
|
|
|
|
|
2017-07-21 05:44:20 +00:00
|
|
|
static bool geneve_dst_addr_equal(struct ip_tunnel_info *a,
|
|
|
|
struct ip_tunnel_info *b)
|
|
|
|
{
|
|
|
|
if (ip_tunnel_info_af(a) == AF_INET)
|
|
|
|
return a->key.u.ipv4.dst == b->key.u.ipv4.dst;
|
|
|
|
else
|
|
|
|
return ipv6_addr_equal(&a->key.u.ipv6.dst, &b->key.u.ipv6.dst);
|
|
|
|
}
|
|
|
|
|
2015-08-27 06:46:52 +00:00
|
|
|
static int geneve_configure(struct net *net, struct net_device *dev,
|
2017-08-09 08:09:28 +00:00
|
|
|
struct netlink_ext_ack *extack,
|
2020-07-06 15:18:08 +00:00
|
|
|
const struct geneve_config *cfg)
|
2015-05-13 16:57:30 +00:00
|
|
|
{
|
|
|
|
struct geneve_net *gn = net_generic(net, geneve_net_id);
|
2015-08-27 06:46:54 +00:00
|
|
|
struct geneve_dev *t, *geneve = netdev_priv(dev);
|
2020-07-06 15:18:08 +00:00
|
|
|
const struct ip_tunnel_info *info = &cfg->info;
|
2015-08-27 06:46:54 +00:00
|
|
|
bool tun_collect_md, tun_on_same_port;
|
2015-12-23 15:54:27 +00:00
|
|
|
int err, encap_len;
|
2015-05-13 16:57:30 +00:00
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
if (cfg->collect_md && !is_tnl_info_zero(info)) {
|
2017-08-09 08:09:28 +00:00
|
|
|
NL_SET_ERR_MSG(extack,
|
|
|
|
"Device is externally controlled, so attributes (VNI, Port, and so on) must not be specified");
|
2015-10-26 21:01:44 +00:00
|
|
|
return -EINVAL;
|
2017-08-09 08:09:28 +00:00
|
|
|
}
|
2015-05-13 16:57:30 +00:00
|
|
|
|
|
|
|
geneve->net = net;
|
|
|
|
geneve->dev = dev;
|
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
t = geneve_find_dev(gn, info, &tun_on_same_port, &tun_collect_md);
|
2015-08-27 06:46:54 +00:00
|
|
|
if (t)
|
|
|
|
return -EBUSY;
|
|
|
|
|
2015-12-23 15:54:27 +00:00
|
|
|
/* make enough headroom for basic scenario */
|
|
|
|
encap_len = GENEVE_BASE_HLEN + ETH_HLEN;
|
2020-07-06 15:18:08 +00:00
|
|
|
if (!cfg->collect_md && ip_tunnel_info_af(info) == AF_INET) {
|
2015-12-23 15:54:27 +00:00
|
|
|
encap_len += sizeof(struct iphdr);
|
net: use core MTU range checking in core net infra
geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
closer inspection and testing
macvlan:
- set min/max_mtu
tun:
- set min/max_mtu, remove tun_net_change_mtu
vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu
- Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
- This one is also not as straight-forward and could use closer inspection
and testing from vxlan folks
bridge:
- set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
is the largest possible size supported
sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)
macsec:
- min_mtu = 0, max_mtu = 65535
macvlan:
- min_mtu = 0, max_mtu = 65535
ntb_netdev:
- min_mtu = 0, max_mtu = 65535
veth:
- min_mtu = 68, max_mtu = 65535
8021q:
- min_mtu = 0, max_mtu = 65535
CC: netdev@vger.kernel.org
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Tom Herbert <tom@herbertland.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Paolo Abeni <pabeni@redhat.com>
CC: Jiri Benc <jbenc@redhat.com>
CC: WANG Cong <xiyou.wangcong@gmail.com>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Pravin B Shelar <pshelar@ovn.org>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: Pravin Shelar <pshelar@nicira.com>
CC: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 17:55:20 +00:00
|
|
|
dev->max_mtu -= sizeof(struct iphdr);
|
|
|
|
} else {
|
2015-12-23 15:54:27 +00:00
|
|
|
encap_len += sizeof(struct ipv6hdr);
|
net: use core MTU range checking in core net infra
geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
closer inspection and testing
macvlan:
- set min/max_mtu
tun:
- set min/max_mtu, remove tun_net_change_mtu
vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu
- Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
- This one is also not as straight-forward and could use closer inspection
and testing from vxlan folks
bridge:
- set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
is the largest possible size supported
sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)
macsec:
- min_mtu = 0, max_mtu = 65535
macvlan:
- min_mtu = 0, max_mtu = 65535
ntb_netdev:
- min_mtu = 0, max_mtu = 65535
veth:
- min_mtu = 68, max_mtu = 65535
8021q:
- min_mtu = 0, max_mtu = 65535
CC: netdev@vger.kernel.org
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Tom Herbert <tom@herbertland.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Paolo Abeni <pabeni@redhat.com>
CC: Jiri Benc <jbenc@redhat.com>
CC: WANG Cong <xiyou.wangcong@gmail.com>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Pravin B Shelar <pshelar@ovn.org>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: Pravin Shelar <pshelar@nicira.com>
CC: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 17:55:20 +00:00
|
|
|
dev->max_mtu -= sizeof(struct ipv6hdr);
|
|
|
|
}
|
2015-12-23 15:54:27 +00:00
|
|
|
dev->needed_headroom = encap_len + ETH_HLEN;
|
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
if (cfg->collect_md) {
|
2017-08-09 08:09:28 +00:00
|
|
|
if (tun_on_same_port) {
|
|
|
|
NL_SET_ERR_MSG(extack,
|
|
|
|
"There can be only one externally controlled device on a destination port");
|
2015-08-27 06:46:54 +00:00
|
|
|
return -EPERM;
|
2017-08-09 08:09:28 +00:00
|
|
|
}
|
2015-08-27 06:46:54 +00:00
|
|
|
} else {
|
2017-08-09 08:09:28 +00:00
|
|
|
if (tun_collect_md) {
|
|
|
|
NL_SET_ERR_MSG(extack,
|
|
|
|
"There already exists an externally controlled device on this destination port");
|
2015-08-27 06:46:54 +00:00
|
|
|
return -EPERM;
|
2017-08-09 08:09:28 +00:00
|
|
|
}
|
2015-08-27 06:46:54 +00:00
|
|
|
}
|
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
dst_cache_reset(&geneve->cfg.info.dst_cache);
|
|
|
|
memcpy(&geneve->cfg, cfg, sizeof(*cfg));
|
2016-02-12 14:43:58 +00:00
|
|
|
|
2022-03-16 06:15:57 +00:00
|
|
|
if (geneve->cfg.inner_proto_inherit) {
|
|
|
|
dev->header_ops = NULL;
|
|
|
|
dev->type = ARPHRD_NONE;
|
|
|
|
dev->hard_header_len = 0;
|
|
|
|
dev->addr_len = 0;
|
2023-03-12 16:45:57 +00:00
|
|
|
dev->flags = IFF_POINTOPOINT | IFF_NOARP;
|
2022-03-16 06:15:57 +00:00
|
|
|
}
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
err = register_netdevice(dev);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2015-08-27 06:46:52 +00:00
|
|
|
list_add(&geneve->next, &gn->geneve_list);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
/* Zero the whole of @info, then set the tunnel destination port to
 * @dst_port (host byte order on input, stored in network byte order).
 */
static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port)
{
	memset(info, 0, sizeof(*info));

	info->key.tp_dst = htons(dst_port);
}
|
|
|
|
|
2017-08-09 08:09:28 +00:00
|
|
|
static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
|
|
|
|
struct netlink_ext_ack *extack,
|
2020-07-06 15:18:08 +00:00
|
|
|
struct geneve_config *cfg, bool changelink)
|
2015-08-27 06:46:52 +00:00
|
|
|
{
|
2020-07-06 15:18:08 +00:00
|
|
|
struct ip_tunnel_info *info = &cfg->info;
|
2017-08-09 08:09:28 +00:00
|
|
|
int attrtype;
|
|
|
|
|
|
|
|
if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) {
|
|
|
|
NL_SET_ERR_MSG(extack,
|
|
|
|
"Cannot specify both IPv4 and IPv6 Remote addresses");
|
2015-10-26 21:01:44 +00:00
|
|
|
return -EINVAL;
|
2017-08-09 08:09:28 +00:00
|
|
|
}
|
2015-10-26 21:01:44 +00:00
|
|
|
|
|
|
|
if (data[IFLA_GENEVE_REMOTE]) {
|
2017-08-09 08:09:28 +00:00
|
|
|
if (changelink && (ip_tunnel_info_af(info) == AF_INET6)) {
|
|
|
|
attrtype = IFLA_GENEVE_REMOTE;
|
|
|
|
goto change_notsup;
|
|
|
|
}
|
2017-07-21 05:44:20 +00:00
|
|
|
|
|
|
|
info->key.u.ipv4.dst =
|
2015-10-26 21:01:44 +00:00
|
|
|
nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
|
2016-11-21 19:02:58 +00:00
|
|
|
|
2019-09-02 23:29:36 +00:00
|
|
|
if (ipv4_is_multicast(info->key.u.ipv4.dst)) {
|
2017-08-09 08:09:28 +00:00
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE],
|
|
|
|
"Remote IPv4 address cannot be Multicast");
|
2016-11-21 19:02:58 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
2015-10-26 21:01:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (data[IFLA_GENEVE_REMOTE6]) {
|
2018-04-19 12:42:29 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2017-08-09 08:09:28 +00:00
|
|
|
if (changelink && (ip_tunnel_info_af(info) == AF_INET)) {
|
|
|
|
attrtype = IFLA_GENEVE_REMOTE6;
|
|
|
|
goto change_notsup;
|
|
|
|
}
|
2017-07-21 05:44:20 +00:00
|
|
|
|
|
|
|
info->mode = IP_TUNNEL_INFO_IPV6;
|
|
|
|
info->key.u.ipv6.dst =
|
2015-10-26 21:01:44 +00:00
|
|
|
nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]);
|
|
|
|
|
2017-07-21 05:44:20 +00:00
|
|
|
if (ipv6_addr_type(&info->key.u.ipv6.dst) &
|
2015-10-26 21:01:44 +00:00
|
|
|
IPV6_ADDR_LINKLOCAL) {
|
2017-08-09 08:09:28 +00:00
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6],
|
|
|
|
"Remote IPv6 address cannot be link-local");
|
2015-10-26 21:01:44 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
2017-07-21 05:44:20 +00:00
|
|
|
if (ipv6_addr_is_multicast(&info->key.u.ipv6.dst)) {
|
2017-08-09 08:09:28 +00:00
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6],
|
|
|
|
"Remote IPv6 address cannot be Multicast");
|
2016-11-21 19:02:58 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
|
2020-07-06 15:18:08 +00:00
|
|
|
cfg->use_udp6_rx_checksums = true;
|
2016-11-21 19:02:58 +00:00
|
|
|
#else
|
2017-08-09 08:09:28 +00:00
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6],
|
|
|
|
"IPv6 support not enabled in the kernel");
|
2016-11-21 19:02:58 +00:00
|
|
|
return -EPFNOSUPPORT;
|
|
|
|
#endif
|
2015-10-26 21:01:44 +00:00
|
|
|
}
|
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
if (data[IFLA_GENEVE_ID]) {
|
|
|
|
__u32 vni;
|
|
|
|
__u8 tvni[3];
|
2017-07-21 05:44:20 +00:00
|
|
|
__be64 tunid;
|
2016-11-21 19:02:58 +00:00
|
|
|
|
2015-10-16 23:36:00 +00:00
|
|
|
vni = nla_get_u32(data[IFLA_GENEVE_ID]);
|
2016-11-21 19:02:58 +00:00
|
|
|
tvni[0] = (vni & 0x00ff0000) >> 16;
|
|
|
|
tvni[1] = (vni & 0x0000ff00) >> 8;
|
|
|
|
tvni[2] = vni & 0x000000ff;
|
2015-08-27 06:46:52 +00:00
|
|
|
|
2017-07-21 05:44:20 +00:00
|
|
|
tunid = vni_to_tunnel_id(tvni);
|
2017-08-09 08:09:28 +00:00
|
|
|
if (changelink && (tunid != info->key.tun_id)) {
|
|
|
|
attrtype = IFLA_GENEVE_ID;
|
|
|
|
goto change_notsup;
|
|
|
|
}
|
2017-07-21 05:44:20 +00:00
|
|
|
info->key.tun_id = tunid;
|
2016-11-21 19:02:58 +00:00
|
|
|
}
|
2017-07-21 05:44:20 +00:00
|
|
|
|
2018-09-29 15:06:29 +00:00
|
|
|
if (data[IFLA_GENEVE_TTL_INHERIT]) {
|
|
|
|
if (nla_get_u8(data[IFLA_GENEVE_TTL_INHERIT]))
|
2020-07-06 15:18:08 +00:00
|
|
|
cfg->ttl_inherit = true;
|
2018-09-29 15:06:29 +00:00
|
|
|
else
|
2020-07-06 15:18:08 +00:00
|
|
|
cfg->ttl_inherit = false;
|
2018-09-29 15:06:29 +00:00
|
|
|
} else if (data[IFLA_GENEVE_TTL]) {
|
2017-07-21 05:44:20 +00:00
|
|
|
info->key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
|
2020-07-06 15:18:08 +00:00
|
|
|
cfg->ttl_inherit = false;
|
2018-09-29 15:06:29 +00:00
|
|
|
}
|
2018-09-12 02:04:21 +00:00
|
|
|
|
2015-06-01 19:51:35 +00:00
|
|
|
if (data[IFLA_GENEVE_TOS])
|
2017-07-21 05:44:20 +00:00
|
|
|
info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
|
2015-06-01 19:51:35 +00:00
|
|
|
|
2018-11-08 11:19:19 +00:00
|
|
|
if (data[IFLA_GENEVE_DF])
|
2020-07-06 15:18:08 +00:00
|
|
|
cfg->df = nla_get_u8(data[IFLA_GENEVE_DF]);
|
2018-11-08 11:19:19 +00:00
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
if (data[IFLA_GENEVE_LABEL]) {
|
2017-07-21 05:44:20 +00:00
|
|
|
info->key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
|
2016-11-21 19:02:58 +00:00
|
|
|
IPV6_FLOWLABEL_MASK;
|
2017-08-09 08:09:28 +00:00
|
|
|
if (info->key.label && (!(info->mode & IP_TUNNEL_INFO_IPV6))) {
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_LABEL],
|
|
|
|
"Label attribute only applies for IPv6 Geneve devices");
|
2016-11-21 19:02:58 +00:00
|
|
|
return -EINVAL;
|
2017-08-09 08:09:28 +00:00
|
|
|
}
|
2016-11-21 19:02:58 +00:00
|
|
|
}
|
2016-03-09 02:00:04 +00:00
|
|
|
|
2017-07-21 05:44:20 +00:00
|
|
|
if (data[IFLA_GENEVE_PORT]) {
|
2017-08-09 08:09:28 +00:00
|
|
|
if (changelink) {
|
|
|
|
attrtype = IFLA_GENEVE_PORT;
|
|
|
|
goto change_notsup;
|
|
|
|
}
|
2017-07-21 05:44:20 +00:00
|
|
|
info->key.tp_dst = nla_get_be16(data[IFLA_GENEVE_PORT]);
|
|
|
|
}
|
2015-05-13 16:57:30 +00:00
|
|
|
|
2017-07-21 05:44:20 +00:00
|
|
|
if (data[IFLA_GENEVE_COLLECT_METADATA]) {
|
2017-08-09 08:09:28 +00:00
|
|
|
if (changelink) {
|
|
|
|
attrtype = IFLA_GENEVE_COLLECT_METADATA;
|
|
|
|
goto change_notsup;
|
|
|
|
}
|
2020-07-06 15:18:08 +00:00
|
|
|
cfg->collect_md = true;
|
2017-07-21 05:44:20 +00:00
|
|
|
}
|
2015-05-13 16:57:30 +00:00
|
|
|
|
2017-07-21 05:44:20 +00:00
|
|
|
if (data[IFLA_GENEVE_UDP_CSUM]) {
|
2017-08-09 08:09:28 +00:00
|
|
|
if (changelink) {
|
|
|
|
attrtype = IFLA_GENEVE_UDP_CSUM;
|
|
|
|
goto change_notsup;
|
|
|
|
}
|
2017-07-21 05:44:20 +00:00
|
|
|
if (nla_get_u8(data[IFLA_GENEVE_UDP_CSUM]))
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
|
2017-07-21 05:44:20 +00:00
|
|
|
}
|
2015-12-10 20:37:45 +00:00
|
|
|
|
2017-07-21 05:44:20 +00:00
|
|
|
if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]) {
|
2017-11-23 03:27:24 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2017-08-09 08:09:28 +00:00
|
|
|
if (changelink) {
|
|
|
|
attrtype = IFLA_GENEVE_UDP_ZERO_CSUM6_TX;
|
|
|
|
goto change_notsup;
|
|
|
|
}
|
2017-07-21 05:44:20 +00:00
|
|
|
if (nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]))
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__clear_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
|
2017-11-23 03:27:24 +00:00
|
|
|
#else
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX],
|
|
|
|
"IPv6 support not enabled in the kernel");
|
|
|
|
return -EPFNOSUPPORT;
|
|
|
|
#endif
|
2017-07-21 05:44:20 +00:00
|
|
|
}
|
2015-12-10 20:37:45 +00:00
|
|
|
|
2017-07-21 05:44:20 +00:00
|
|
|
if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]) {
|
2017-11-23 03:27:24 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2017-08-09 08:09:28 +00:00
|
|
|
if (changelink) {
|
|
|
|
attrtype = IFLA_GENEVE_UDP_ZERO_CSUM6_RX;
|
|
|
|
goto change_notsup;
|
|
|
|
}
|
2017-07-21 05:44:20 +00:00
|
|
|
if (nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]))
|
2020-07-06 15:18:08 +00:00
|
|
|
cfg->use_udp6_rx_checksums = false;
|
2017-11-23 03:27:24 +00:00
|
|
|
#else
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX],
|
|
|
|
"IPv6 support not enabled in the kernel");
|
|
|
|
return -EPFNOSUPPORT;
|
|
|
|
#endif
|
2017-07-21 05:44:20 +00:00
|
|
|
}
|
|
|
|
|
2022-03-16 06:15:57 +00:00
|
|
|
if (data[IFLA_GENEVE_INNER_PROTO_INHERIT]) {
|
|
|
|
if (changelink) {
|
|
|
|
attrtype = IFLA_GENEVE_INNER_PROTO_INHERIT;
|
|
|
|
goto change_notsup;
|
|
|
|
}
|
|
|
|
cfg->inner_proto_inherit = true;
|
|
|
|
}
|
|
|
|
|
2017-07-21 05:44:20 +00:00
|
|
|
return 0;
|
2017-08-09 08:09:28 +00:00
|
|
|
change_notsup:
|
|
|
|
NL_SET_ERR_MSG_ATTR(extack, data[attrtype],
|
2022-03-16 06:15:57 +00:00
|
|
|
"Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, and UDP checksum attributes are not supported");
|
2017-08-09 08:09:28 +00:00
|
|
|
return -EOPNOTSUPP;
|
2017-07-21 05:44:20 +00:00
|
|
|
}
|
|
|
|
|
geneve: configure MTU based on a lower device
Currently, on a new link creation or when 'remote' address parameter
is updated, an MTU is not changed and always equals 1500. When a lower
device has a larger MTU, it might not be efficient, e.g. for UDP, and
requires the manual MTU adjustments to match the MTU of the lower
device.
This patch tries to automate this process, finds a lower device using
the 'remote' address parameter, then uses its MTU to tune GENEVE's MTU:
* on a new link creation
* when 'remote' parameter is changed
Also with this patch, the MTU from a user, on a new link creation, is
passed to geneve_change_mtu() where it is verified, and the MTU adjustment
with a lower device is skipped in that case. Prior to that change, it was
possible to set invalid MTU values on a new link creation.
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-19 12:42:32 +00:00
|
|
|
static void geneve_link_config(struct net_device *dev,
|
|
|
|
struct ip_tunnel_info *info, struct nlattr *tb[])
|
|
|
|
{
|
|
|
|
struct geneve_dev *geneve = netdev_priv(dev);
|
|
|
|
int ldev_mtu = 0;
|
|
|
|
|
|
|
|
if (tb[IFLA_MTU]) {
|
|
|
|
geneve_change_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (ip_tunnel_info_af(info)) {
|
|
|
|
case AF_INET: {
|
|
|
|
struct flowi4 fl4 = { .daddr = info->key.u.ipv4.dst };
|
|
|
|
struct rtable *rt = ip_route_output_key(geneve->net, &fl4);
|
|
|
|
|
|
|
|
if (!IS_ERR(rt) && rt->dst.dev) {
|
|
|
|
ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV4_HLEN;
|
|
|
|
ip_rt_put(rt);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
case AF_INET6: {
|
2019-02-07 10:36:10 +00:00
|
|
|
struct rt6_info *rt;
|
|
|
|
|
|
|
|
if (!__in6_dev_get(dev))
|
|
|
|
break;
|
|
|
|
|
|
|
|
rt = rt6_lookup(geneve->net, &info->key.u.ipv6.dst, NULL, 0,
|
|
|
|
NULL, 0);
|
geneve: configure MTU based on a lower device
Currently, on a new link creation or when 'remote' address parameter
is updated, an MTU is not changed and always equals 1500. When a lower
device has a larger MTU, it might not be efficient, e.g. for UDP, and
requires the manual MTU adjustments to match the MTU of the lower
device.
This patch tries to automate this process, finds a lower device using
the 'remote' address parameter, then uses its MTU to tune GENEVE's MTU:
* on a new link creation
* when 'remote' parameter is changed
Also with this patch, the MTU from a user, on a new link creation, is
passed to geneve_change_mtu() where it is verified, and MTU adjustments
with a lower device is skipped in that case. Prior that change, it was
possible to set the invalid MTU values on a new link creation.
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-19 12:42:32 +00:00
|
|
|
|
|
|
|
if (rt && rt->dst.dev)
|
|
|
|
ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV6_HLEN;
|
|
|
|
ip6_rt_put(rt);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ldev_mtu <= 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
geneve_change_mtu(dev, ldev_mtu - info->options_len);
|
|
|
|
}
|
|
|
|
|
2017-07-21 05:44:20 +00:00
|
|
|
static int geneve_newlink(struct net *net, struct net_device *dev,
|
|
|
|
struct nlattr *tb[], struct nlattr *data[],
|
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
2020-07-06 15:18:08 +00:00
|
|
|
struct geneve_config cfg = {
|
|
|
|
.df = GENEVE_DF_UNSET,
|
|
|
|
.use_udp6_rx_checksums = false,
|
|
|
|
.ttl_inherit = false,
|
|
|
|
.collect_md = false,
|
|
|
|
};
|
2017-07-21 05:44:20 +00:00
|
|
|
int err;
|
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
init_tnl_info(&cfg.info, GENEVE_UDP_PORT);
|
|
|
|
err = geneve_nl2info(tb, data, extack, &cfg, false);
|
2017-07-21 05:44:20 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
2015-12-10 20:37:45 +00:00
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
err = geneve_configure(net, dev, extack, &cfg);
|
geneve: configure MTU based on a lower device
Currently, on a new link creation or when 'remote' address parameter
is updated, an MTU is not changed and always equals 1500. When a lower
device has a larger MTU, it might not be efficient, e.g. for UDP, and
requires the manual MTU adjustments to match the MTU of the lower
device.
This patch tries to automate this process, finds a lower device using
the 'remote' address parameter, then uses its MTU to tune GENEVE's MTU:
* on a new link creation
* when 'remote' parameter is changed
Also with this patch, the MTU from a user, on a new link creation, is
passed to geneve_change_mtu() where it is verified, and MTU adjustments
with a lower device is skipped in that case. Prior that change, it was
possible to set the invalid MTU values on a new link creation.
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-19 12:42:32 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
geneve_link_config(dev, &cfg.info, tb);
|
geneve: configure MTU based on a lower device
Currently, on a new link creation or when 'remote' address parameter
is updated, an MTU is not changed and always equals 1500. When a lower
device has a larger MTU, it might not be efficient, e.g. for UDP, and
requires the manual MTU adjustments to match the MTU of the lower
device.
This patch tries to automate this process, finds a lower device using
the 'remote' address parameter, then uses its MTU to tune GENEVE's MTU:
* on a new link creation
* when 'remote' parameter is changed
Also with this patch, the MTU from a user, on a new link creation, is
passed to geneve_change_mtu() where it is verified, and MTU adjustments
with a lower device is skipped in that case. Prior that change, it was
possible to set the invalid MTU values on a new link creation.
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-19 12:42:32 +00:00
|
|
|
|
|
|
|
return 0;
|
2015-05-13 16:57:30 +00:00
|
|
|
}
|
|
|
|
|
2017-07-21 05:44:20 +00:00
|
|
|
/* Quiesces the geneve device data path for both TX and RX.
|
|
|
|
*
|
|
|
|
* On transmit geneve checks for non-NULL geneve_sock before it proceeds.
|
|
|
|
* So, if we set that socket to NULL under RCU and wait for synchronize_net()
|
|
|
|
* to complete for the existing set of in-flight packets to be transmitted,
|
|
|
|
* then we would have quiesced the transmit data path. All the future packets
|
|
|
|
* will get dropped until we unquiesce the data path.
|
|
|
|
*
|
|
|
|
* On receive geneve dereference the geneve_sock stashed in the socket. So,
|
|
|
|
* if we set that to NULL under RCU and wait for synchronize_net() to
|
|
|
|
* complete, then we would have quiesced the receive data path.
|
|
|
|
*/
|
|
|
|
static void geneve_quiesce(struct geneve_dev *geneve, struct geneve_sock **gs4,
|
|
|
|
struct geneve_sock **gs6)
|
|
|
|
{
|
|
|
|
*gs4 = rtnl_dereference(geneve->sock4);
|
|
|
|
rcu_assign_pointer(geneve->sock4, NULL);
|
|
|
|
if (*gs4)
|
|
|
|
rcu_assign_sk_user_data((*gs4)->sock->sk, NULL);
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
*gs6 = rtnl_dereference(geneve->sock6);
|
|
|
|
rcu_assign_pointer(geneve->sock6, NULL);
|
|
|
|
if (*gs6)
|
|
|
|
rcu_assign_sk_user_data((*gs6)->sock->sk, NULL);
|
|
|
|
#else
|
|
|
|
*gs6 = NULL;
|
|
|
|
#endif
|
|
|
|
synchronize_net();
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Resumes the geneve device data path for both TX and RX. */
|
|
|
|
static void geneve_unquiesce(struct geneve_dev *geneve, struct geneve_sock *gs4,
|
|
|
|
struct geneve_sock __maybe_unused *gs6)
|
|
|
|
{
|
|
|
|
rcu_assign_pointer(geneve->sock4, gs4);
|
|
|
|
if (gs4)
|
|
|
|
rcu_assign_sk_user_data(gs4->sock->sk, gs4);
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
rcu_assign_pointer(geneve->sock6, gs6);
|
|
|
|
if (gs6)
|
|
|
|
rcu_assign_sk_user_data(gs6->sock->sk, gs6);
|
|
|
|
#endif
|
|
|
|
synchronize_net();
|
|
|
|
}
|
|
|
|
|
|
|
|
static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
|
|
|
|
struct nlattr *data[],
|
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
|
|
|
struct geneve_dev *geneve = netdev_priv(dev);
|
|
|
|
struct geneve_sock *gs4, *gs6;
|
2020-07-06 15:18:08 +00:00
|
|
|
struct geneve_config cfg;
|
2017-07-21 05:44:20 +00:00
|
|
|
int err;
|
|
|
|
|
|
|
|
/* If the geneve device is configured for metadata (or externally
|
|
|
|
* controlled, for example, OVS), then nothing can be changed.
|
|
|
|
*/
|
2020-07-06 15:18:08 +00:00
|
|
|
if (geneve->cfg.collect_md)
|
2017-07-21 05:44:20 +00:00
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
/* Start with the existing info. */
|
2020-07-06 15:18:08 +00:00
|
|
|
memcpy(&cfg, &geneve->cfg, sizeof(cfg));
|
|
|
|
err = geneve_nl2info(tb, data, extack, &cfg, true);
|
2017-07-21 05:44:20 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
if (!geneve_dst_addr_equal(&geneve->cfg.info, &cfg.info)) {
|
|
|
|
dst_cache_reset(&cfg.info.dst_cache);
|
|
|
|
geneve_link_config(dev, &cfg.info, tb);
|
geneve: configure MTU based on a lower device
Currently, on a new link creation or when 'remote' address parameter
is updated, an MTU is not changed and always equals 1500. When a lower
device has a larger MTU, it might not be efficient, e.g. for UDP, and
requires the manual MTU adjustments to match the MTU of the lower
device.
This patch tries to automate this process, finds a lower device using
the 'remote' address parameter, then uses its MTU to tune GENEVE's MTU:
* on a new link creation
* when 'remote' parameter is changed
Also with this patch, the MTU from a user, on a new link creation, is
passed to geneve_change_mtu() where it is verified, and MTU adjustments
with a lower device is skipped in that case. Prior that change, it was
possible to set the invalid MTU values on a new link creation.
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-19 12:42:32 +00:00
|
|
|
}
|
2017-07-21 05:44:20 +00:00
|
|
|
|
|
|
|
geneve_quiesce(geneve, &gs4, &gs6);
|
2020-07-06 15:18:08 +00:00
|
|
|
memcpy(&geneve->cfg, &cfg, sizeof(cfg));
|
2017-07-21 05:44:20 +00:00
|
|
|
geneve_unquiesce(geneve, gs4, gs6);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
static void geneve_dellink(struct net_device *dev, struct list_head *head)
|
|
|
|
{
|
|
|
|
struct geneve_dev *geneve = netdev_priv(dev);
|
|
|
|
|
|
|
|
list_del(&geneve->next);
|
|
|
|
unregister_netdevice_queue(dev, head);
|
|
|
|
}
|
|
|
|
|
|
|
|
static size_t geneve_get_size(const struct net_device *dev)
|
|
|
|
{
|
|
|
|
return nla_total_size(sizeof(__u32)) + /* IFLA_GENEVE_ID */
|
2015-10-26 21:01:44 +00:00
|
|
|
nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */
|
2015-06-01 19:51:34 +00:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */
|
2015-06-01 19:51:35 +00:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */
|
2018-11-08 11:19:19 +00:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_DF */
|
2016-03-09 02:00:04 +00:00
|
|
|
nla_total_size(sizeof(__be32)) + /* IFLA_GENEVE_LABEL */
|
2015-09-22 17:09:32 +00:00
|
|
|
nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */
|
2015-08-27 06:46:52 +00:00
|
|
|
nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */
|
2015-12-10 20:37:45 +00:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */
|
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */
|
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */
|
2018-09-12 02:04:21 +00:00
|
|
|
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */
|
2022-03-22 04:39:54 +00:00
|
|
|
nla_total_size(0) + /* IFLA_GENEVE_INNER_PROTO_INHERIT */
|
2015-05-13 16:57:30 +00:00
|
|
|
0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct geneve_dev *geneve = netdev_priv(dev);
|
2020-07-06 15:18:08 +00:00
|
|
|
struct ip_tunnel_info *info = &geneve->cfg.info;
|
|
|
|
bool ttl_inherit = geneve->cfg.ttl_inherit;
|
|
|
|
bool metadata = geneve->cfg.collect_md;
|
2016-11-21 19:02:58 +00:00
|
|
|
__u8 tmp_vni[3];
|
2015-05-13 16:57:30 +00:00
|
|
|
__u32 vni;
|
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
tunnel_id_to_vni(info->key.tun_id, tmp_vni);
|
|
|
|
vni = (tmp_vni[0] << 16) | (tmp_vni[1] << 8) | tmp_vni[2];
|
2015-05-13 16:57:30 +00:00
|
|
|
if (nla_put_u32(skb, IFLA_GENEVE_ID, vni))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
2017-11-15 01:43:09 +00:00
|
|
|
if (!metadata && ip_tunnel_info_af(info) == AF_INET) {
|
2015-10-26 21:01:44 +00:00
|
|
|
if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE,
|
2016-11-21 19:02:58 +00:00
|
|
|
info->key.u.ipv4.dst))
|
|
|
|
goto nla_put_failure;
|
|
|
|
if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM,
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
test_bit(IP_TUNNEL_CSUM_BIT,
|
|
|
|
info->key.tun_flags)))
|
2015-10-26 21:01:44 +00:00
|
|
|
goto nla_put_failure;
|
2016-11-21 19:02:58 +00:00
|
|
|
|
2015-10-26 21:01:44 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2017-11-15 01:43:09 +00:00
|
|
|
} else if (!metadata) {
|
2015-10-26 21:01:44 +00:00
|
|
|
if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6,
|
2016-11-21 19:02:58 +00:00
|
|
|
&info->key.u.ipv6.dst))
|
|
|
|
goto nla_put_failure;
|
|
|
|
if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
!test_bit(IP_TUNNEL_CSUM_BIT,
|
|
|
|
info->key.tun_flags)))
|
2016-11-21 19:02:58 +00:00
|
|
|
goto nla_put_failure;
|
2017-05-23 22:37:27 +00:00
|
|
|
#endif
|
2017-11-15 01:43:09 +00:00
|
|
|
}
|
2015-05-13 16:57:30 +00:00
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) ||
|
|
|
|
nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) ||
|
|
|
|
nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label))
|
2015-06-01 19:51:34 +00:00
|
|
|
goto nla_put_failure;
|
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
if (nla_put_u8(skb, IFLA_GENEVE_DF, geneve->cfg.df))
|
2018-11-08 11:19:19 +00:00
|
|
|
goto nla_put_failure;
|
|
|
|
|
2016-11-21 19:02:58 +00:00
|
|
|
if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst))
|
2015-08-27 06:46:51 +00:00
|
|
|
goto nla_put_failure;
|
|
|
|
|
2017-11-15 01:43:09 +00:00
|
|
|
if (metadata && nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA))
|
2017-11-23 03:27:24 +00:00
|
|
|
goto nla_put_failure;
|
2017-11-15 01:43:09 +00:00
|
|
|
|
2017-11-23 03:27:24 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2017-11-15 01:43:09 +00:00
|
|
|
if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
|
2020-07-06 15:18:08 +00:00
|
|
|
!geneve->cfg.use_udp6_rx_checksums))
|
2017-11-15 01:43:09 +00:00
|
|
|
goto nla_put_failure;
|
2017-11-23 03:27:24 +00:00
|
|
|
#endif
|
2017-11-15 01:43:09 +00:00
|
|
|
|
2018-09-12 02:04:21 +00:00
|
|
|
if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
2022-03-16 06:15:57 +00:00
|
|
|
if (geneve->cfg.inner_proto_inherit &&
|
|
|
|
nla_put_flag(skb, IFLA_GENEVE_INNER_PROTO_INHERIT))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
nla_put_failure:
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Link operations registered for the "geneve" link kind. */
static struct rtnl_link_ops geneve_link_ops __read_mostly = {
	/* identification and sizing */
	.kind		= "geneve",
	.priv_size	= sizeof(struct geneve_dev),

	/* netlink attribute handling */
	.maxtype	= IFLA_GENEVE_MAX,
	.policy		= geneve_policy,
	.validate	= geneve_validate,
	.get_size	= geneve_get_size,
	.fill_info	= geneve_fill_info,

	/* device life cycle */
	.setup		= geneve_setup,
	.newlink	= geneve_newlink,
	.changelink	= geneve_changelink,
	.dellink	= geneve_dellink,
};
|
|
|
|
|
2015-08-27 06:46:52 +00:00
|
|
|
struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
|
|
|
|
u8 name_assign_type, u16 dst_port)
|
|
|
|
{
|
|
|
|
struct nlattr *tb[IFLA_MAX + 1];
|
|
|
|
struct net_device *dev;
|
2016-06-13 08:31:04 +00:00
|
|
|
LIST_HEAD(list_kill);
|
2015-08-27 06:46:52 +00:00
|
|
|
int err;
|
2020-07-06 15:18:08 +00:00
|
|
|
struct geneve_config cfg = {
|
|
|
|
.df = GENEVE_DF_UNSET,
|
|
|
|
.use_udp6_rx_checksums = true,
|
|
|
|
.ttl_inherit = false,
|
|
|
|
.collect_md = true,
|
|
|
|
};
|
2015-08-27 06:46:52 +00:00
|
|
|
|
|
|
|
memset(tb, 0, sizeof(tb));
|
|
|
|
dev = rtnl_create_link(net, name, name_assign_type,
|
2018-11-06 20:51:14 +00:00
|
|
|
&geneve_link_ops, tb, NULL);
|
2015-08-27 06:46:52 +00:00
|
|
|
if (IS_ERR(dev))
|
|
|
|
return dev;
|
|
|
|
|
2020-07-06 15:18:08 +00:00
|
|
|
init_tnl_info(&cfg.info, dst_port);
|
|
|
|
err = geneve_configure(net, dev, NULL, &cfg);
|
2016-06-13 08:31:04 +00:00
|
|
|
if (err) {
|
|
|
|
free_netdev(dev);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
vxlan, gre, geneve: Set a large MTU on ovs-created tunnel devices
Prior to 4.3, openvswitch tunnel vports (vxlan, gre and geneve) could
transmit vxlan packets of any size, constrained only by the ability to
send out the resulting packets. 4.3 introduced netdevs corresponding
to tunnel vports. These netdevs have an MTU, which limits the size of
a packet that can be successfully encapsulated. The default MTU
values are low (1500 or less), which is awkwardly small in the context
of physical networks supporting jumbo frames, and leads to a
conspicuous change in behaviour for userspace.
Instead, set the MTU on openvswitch-created netdevs to be the relevant
maximum (i.e. the maximum IP packet size minus any relevant overhead),
effectively restoring the behaviour prior to 4.3.
Signed-off-by: David Wragg <david@weave.works>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-10 00:05:58 +00:00
|
|
|
|
|
|
|
/* openvswitch users expect packet sizes to be unrestricted,
|
|
|
|
* so set the largest MTU we can.
|
|
|
|
*/
|
net: use core MTU range checking in core net infra
geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
closer inspection and testing
macvlan:
- set min/max_mtu
tun:
- set min/max_mtu, remove tun_net_change_mtu
vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu
- Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
- This one is also not as straight-forward and could use closer inspection
and testing from vxlan folks
bridge:
- set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
is the largest possible size supported
sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)
macsec:
- min_mtu = 0, max_mtu = 65535
macvlan:
- min_mtu = 0, max_mtu = 65535
ntb_netdev:
- min_mtu = 0, max_mtu = 65535
veth:
- min_mtu = 68, max_mtu = 65535
8021q:
- min_mtu = 0, max_mtu = 65535
CC: netdev@vger.kernel.org
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Tom Herbert <tom@herbertland.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Paolo Abeni <pabeni@redhat.com>
CC: Jiri Benc <jbenc@redhat.com>
CC: WANG Cong <xiyou.wangcong@gmail.com>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Pravin B Shelar <pshelar@ovn.org>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: Pravin Shelar <pshelar@nicira.com>
CC: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 17:55:20 +00:00
|
|
|
err = geneve_change_mtu(dev, IP_MAX_MTU);
|
vxlan, gre, geneve: Set a large MTU on ovs-created tunnel devices
Prior to 4.3, openvswitch tunnel vports (vxlan, gre and geneve) could
transmit vxlan packets of any size, constrained only by the ability to
send out the resulting packets. 4.3 introduced netdevs corresponding
to tunnel vports. These netdevs have an MTU, which limits the size of
a packet that can be successfully encapsulated. The default MTU
values are low (1500 or less), which is awkwardly small in the context
of physical networks supporting jumbo frames, and leads to a
conspicuous change in behaviour for userspace.
Instead, set the MTU on openvswitch-created netdevs to be the relevant
maximum (i.e. the maximum IP packet size minus any relevant overhead),
effectively restoring the behaviour prior to 4.3.
Signed-off-by: David Wragg <david@weave.works>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-10 00:05:58 +00:00
|
|
|
if (err)
|
|
|
|
goto err;
|
|
|
|
|
2022-10-28 08:42:21 +00:00
|
|
|
err = rtnl_configure_link(dev, NULL, 0, NULL);
|
2016-06-13 08:31:07 +00:00
|
|
|
if (err < 0)
|
|
|
|
goto err;
|
|
|
|
|
2015-08-27 06:46:52 +00:00
|
|
|
return dev;
|
2016-11-21 19:02:58 +00:00
|
|
|
err:
|
2016-06-13 08:31:04 +00:00
|
|
|
geneve_dellink(dev, &list_kill);
|
|
|
|
unregister_netdevice_many(&list_kill);
|
vxlan, gre, geneve: Set a large MTU on ovs-created tunnel devices
Prior to 4.3, openvswitch tunnel vports (vxlan, gre and geneve) could
transmit vxlan packets of any size, constrained only by the ability to
send out the resulting packets. 4.3 introduced netdevs corresponding
to tunnel vports. These netdevs have an MTU, which limits the size of
a packet that can be successfully encapsulated. The default MTU
values are low (1500 or less), which is awkwardly small in the context
of physical networks supporting jumbo frames, and leads to a
conspicuous change in behaviour for userspace.
Instead, set the MTU on openvswitch-created netdevs to be the relevant
maximum (i.e. the maximum IP packet size minus any relevant overhead),
effectively restoring the behaviour prior to 4.3.
Signed-off-by: David Wragg <david@weave.works>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-10 00:05:58 +00:00
|
|
|
return ERR_PTR(err);
|
2015-08-27 06:46:52 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(geneve_dev_create_fb);
|
|
|
|
|
2016-04-18 19:19:48 +00:00
|
|
|
static int geneve_netdevice_event(struct notifier_block *unused,
|
|
|
|
unsigned long event, void *ptr)
|
|
|
|
{
|
|
|
|
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
|
|
|
|
2021-01-06 21:06:35 +00:00
|
|
|
if (event == NETDEV_UDP_TUNNEL_PUSH_INFO)
|
|
|
|
geneve_offload_rx_ports(dev, true);
|
|
|
|
else if (event == NETDEV_UDP_TUNNEL_DROP_INFO)
|
|
|
|
geneve_offload_rx_ports(dev, false);
|
2016-04-18 19:19:48 +00:00
|
|
|
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Notifier block that routes netdevice events to geneve_netdevice_event(). */
static struct notifier_block geneve_notifier_block __read_mostly = {
	.notifier_call	= geneve_netdevice_event,
};
|
|
|
|
|
2015-05-13 16:57:30 +00:00
|
|
|
static __net_init int geneve_init_net(struct net *net)
|
|
|
|
{
|
|
|
|
struct geneve_net *gn = net_generic(net, geneve_net_id);
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&gn->geneve_list);
|
2015-08-27 06:46:54 +00:00
|
|
|
INIT_LIST_HEAD(&gn->sock_list);
|
2015-05-13 16:57:30 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-12-16 09:54:50 +00:00
|
|
|
static void geneve_destroy_tunnels(struct net *net, struct list_head *head)
|
2015-05-13 16:57:30 +00:00
|
|
|
{
|
|
|
|
struct geneve_net *gn = net_generic(net, geneve_net_id);
|
|
|
|
struct geneve_dev *geneve, *next;
|
|
|
|
struct net_device *dev, *aux;
|
|
|
|
|
|
|
|
/* gather any geneve devices that were moved into this ns */
|
|
|
|
for_each_netdev_safe(net, dev, aux)
|
|
|
|
if (dev->rtnl_link_ops == &geneve_link_ops)
|
2017-12-16 09:54:50 +00:00
|
|
|
unregister_netdevice_queue(dev, head);
|
2015-05-13 16:57:30 +00:00
|
|
|
|
|
|
|
/* now gather any other geneve devices that were created in this ns */
|
|
|
|
list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) {
|
|
|
|
/* If geneve->dev is in the same netns, it was already added
|
|
|
|
* to the list by the previous loop.
|
|
|
|
*/
|
|
|
|
if (!net_eq(dev_net(geneve->dev), net))
|
2017-12-16 09:54:50 +00:00
|
|
|
unregister_netdevice_queue(geneve->dev, head);
|
2015-05-13 16:57:30 +00:00
|
|
|
}
|
2017-12-16 09:54:50 +00:00
|
|
|
}
|
|
|
|
|
2024-02-06 14:43:02 +00:00
|
|
|
/*
 * Batched namespace exit hook: for each namespace on @net_list, queue its
 * geneve devices onto @dev_to_kill via geneve_destroy_tunnels().
 */
static void __net_exit geneve_exit_batch_rtnl(struct list_head *net_list,
					      struct list_head *dev_to_kill)
{
	struct net *net;

	list_for_each_entry(net, net_list, exit_list) {
		geneve_destroy_tunnels(net, dev_to_kill);
	}
}
|
2020-03-14 07:18:42 +00:00
|
|
|
|
2024-02-06 14:43:02 +00:00
|
|
|
/*
 * Per-namespace exit hook: warn (once) if @net's sock_list is not empty
 * by the time the namespace goes away.
 */
static void __net_exit geneve_exit_net(struct net *net)
{
	const struct geneve_net *pernet = net_generic(net, geneve_net_id);

	WARN_ON_ONCE(!list_empty(&pernet->sock_list));
}
|
|
|
|
|
|
|
|
static struct pernet_operations geneve_net_ops = {
|
|
|
|
.init = geneve_init_net,
|
2024-02-06 14:43:02 +00:00
|
|
|
.exit_batch_rtnl = geneve_exit_batch_rtnl,
|
|
|
|
.exit = geneve_exit_net,
|
2015-05-13 16:57:30 +00:00
|
|
|
.id = &geneve_net_id,
|
|
|
|
.size = sizeof(struct geneve_net),
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Module init: register the pernet subsystem, the netdevice notifier and
 * the rtnl link ops, in that order. If a later step fails, the earlier
 * registrations are undone in reverse order.
 *
 * Returns 0 on success or the error code of the failing registration.
 */
static int __init geneve_init_module(void)
{
	int err;

	err = register_pernet_subsys(&geneve_net_ops);
	if (err)
		return err;	/* nothing to unwind yet */

	err = register_netdevice_notifier(&geneve_notifier_block);
	if (err)
		goto err_unreg_pernet;

	err = rtnl_link_register(&geneve_link_ops);
	if (err)
		goto err_unreg_notifier;

	return 0;

err_unreg_notifier:
	unregister_netdevice_notifier(&geneve_notifier_block);
err_unreg_pernet:
	unregister_pernet_subsys(&geneve_net_ops);
	return err;
}
late_initcall(geneve_init_module);
|
|
|
|
|
|
|
|
/*
 * Module exit: drop the link ops, the netdevice notifier and the pernet
 * subsystem, i.e. the reverse of the order used by geneve_init_module().
 */
static void __exit geneve_cleanup_module(void)
{
	rtnl_link_unregister(&geneve_link_ops);
	unregister_netdevice_notifier(&geneve_notifier_block);
	unregister_pernet_subsys(&geneve_net_ops);
}
module_exit(geneve_cleanup_module);
|
|
|
|
|
|
|
|
/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_VERSION(GENEVE_NETDEV_VER);
MODULE_AUTHOR("John W. Linville <linville@tuxdriver.com>");
MODULE_DESCRIPTION("Interface driver for GENEVE encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("geneve");
|