ip6_tunnel: add collect_md mode to IPv6 tunnels

Similar to gre, vxlan, geneve tunnels allow IPIP6 and IP6IP6 tunnels
to operate in 'collect metadata' mode.
Unlike ipv4 code here it's possible to reuse ip6_tnl_xmit() function
for both collect_md and traditional tunnels.
bpf_skb_[gs]et_tunnel_key() helpers and ovs (in the future) are the users.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Alexei Starovoitov 2016-09-15 13:00:30 -07:00 committed by David S. Miller
parent cfc7381b30
commit 8d79266bc4
2 changed files with 138 additions and 49 deletions

View File

@ -23,6 +23,7 @@ struct __ip6_tnl_parm {
__u8 proto; /* tunnel protocol */ __u8 proto; /* tunnel protocol */
__u8 encap_limit; /* encapsulation limit for tunnel */ __u8 encap_limit; /* encapsulation limit for tunnel */
__u8 hop_limit; /* hop limit for tunnel */ __u8 hop_limit; /* hop limit for tunnel */
bool collect_md;
__be32 flowinfo; /* traffic class and flowlabel for tunnel */ __be32 flowinfo; /* traffic class and flowlabel for tunnel */
__u32 flags; /* tunnel flags */ __u32 flags; /* tunnel flags */
struct in6_addr laddr; /* local tunnel end-point address */ struct in6_addr laddr; /* local tunnel end-point address */

View File

@ -57,6 +57,7 @@
#include <net/inet_ecn.h> #include <net/inet_ecn.h>
#include <net/net_namespace.h> #include <net/net_namespace.h>
#include <net/netns/generic.h> #include <net/netns/generic.h>
#include <net/dst_metadata.h>
MODULE_AUTHOR("Ville Nuorvala"); MODULE_AUTHOR("Ville Nuorvala");
MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_DESCRIPTION("IPv6 tunneling device");
@ -90,6 +91,7 @@ struct ip6_tnl_net {
struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE];
struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu *tnls_wc[1];
struct ip6_tnl __rcu **tnls[2]; struct ip6_tnl __rcu **tnls[2];
struct ip6_tnl __rcu *collect_md_tun;
}; };
static struct net_device_stats *ip6_get_stats(struct net_device *dev) static struct net_device_stats *ip6_get_stats(struct net_device *dev)
@ -166,6 +168,10 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
return t; return t;
} }
t = rcu_dereference(ip6n->collect_md_tun);
if (t)
return t;
t = rcu_dereference(ip6n->tnls_wc[0]); t = rcu_dereference(ip6n->tnls_wc[0]);
if (t && (t->dev->flags & IFF_UP)) if (t && (t->dev->flags & IFF_UP))
return t; return t;
@ -209,6 +215,8 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
{ {
struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
if (t->parms.collect_md)
rcu_assign_pointer(ip6n->collect_md_tun, t);
rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(t->next , rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t); rcu_assign_pointer(*tp, t);
} }
@ -224,6 +232,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
struct ip6_tnl __rcu **tp; struct ip6_tnl __rcu **tp;
struct ip6_tnl *iter; struct ip6_tnl *iter;
if (t->parms.collect_md)
rcu_assign_pointer(ip6n->collect_md_tun, NULL);
for (tp = ip6_tnl_bucket(ip6n, &t->parms); for (tp = ip6_tnl_bucket(ip6n, &t->parms);
(iter = rtnl_dereference(*tp)) != NULL; (iter = rtnl_dereference(*tp)) != NULL;
tp = &iter->next) { tp = &iter->next) {
@ -829,6 +840,9 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
if (tun_dst)
skb_dst_set(skb, (struct dst_entry *)tun_dst);
gro_cells_receive(&tunnel->gro_cells, skb); gro_cells_receive(&tunnel->gro_cells, skb);
return 0; return 0;
@ -865,6 +879,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
{ {
struct ip6_tnl *t; struct ip6_tnl *t;
const struct ipv6hdr *ipv6h = ipv6_hdr(skb); const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct metadata_dst *tun_dst = NULL;
int ret = -1; int ret = -1;
rcu_read_lock(); rcu_read_lock();
@ -881,7 +896,12 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
goto drop; goto drop;
if (iptunnel_pull_header(skb, 0, tpi->proto, false)) if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop; goto drop;
ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate, if (t->parms.collect_md) {
tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
if (!tun_dst)
return 0;
}
ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
log_ecn_error); log_ecn_error);
} }
@ -1012,8 +1032,16 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
int mtu; int mtu;
unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
unsigned int max_headroom = psh_hlen; unsigned int max_headroom = psh_hlen;
u8 hop_limit;
int err = -1; int err = -1;
if (t->parms.collect_md) {
hop_limit = skb_tunnel_info(skb)->key.ttl;
goto route_lookup;
} else {
hop_limit = t->parms.hop_limit;
}
/* NBMA tunnel */ /* NBMA tunnel */
if (ipv6_addr_any(&t->parms.raddr)) { if (ipv6_addr_any(&t->parms.raddr)) {
struct in6_addr *addr6; struct in6_addr *addr6;
@ -1043,6 +1071,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
goto tx_err_link_failure; goto tx_err_link_failure;
if (!dst) { if (!dst) {
route_lookup:
dst = ip6_route_output(net, NULL, fl6); dst = ip6_route_output(net, NULL, fl6);
if (dst->error) if (dst->error)
@ -1053,6 +1082,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
dst = NULL; dst = NULL;
goto tx_err_link_failure; goto tx_err_link_failure;
} }
if (t->parms.collect_md &&
ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
&fl6->daddr, 0, &fl6->saddr))
goto tx_err_link_failure;
ndst = dst; ndst = dst;
} }
@ -1071,7 +1104,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
} }
if (mtu < IPV6_MIN_MTU) if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU; mtu = IPV6_MIN_MTU;
if (skb_dst(skb)) if (skb_dst(skb) && !t->parms.collect_md)
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
if (skb->len > mtu && !skb_is_gso(skb)) { if (skb->len > mtu && !skb_is_gso(skb)) {
*pmtu = mtu; *pmtu = mtu;
@ -1111,8 +1144,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
skb = new_skb; skb = new_skb;
} }
if (!fl6->flowi6_mark && ndst) if (t->parms.collect_md) {
dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); if (t->encap.type != TUNNEL_ENCAP_NONE)
goto tx_err_dst_release;
} else {
if (!fl6->flowi6_mark && ndst)
dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
}
skb_dst_set(skb, dst); skb_dst_set(skb, dst);
if (encap_limit >= 0) { if (encap_limit >= 0) {
@ -1137,7 +1175,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
ipv6h = ipv6_hdr(skb); ipv6h = ipv6_hdr(skb);
ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
ipv6h->hop_limit = t->parms.hop_limit; ipv6h->hop_limit = hop_limit;
ipv6h->nexthdr = proto; ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr; ipv6h->saddr = fl6->saddr;
ipv6h->daddr = fl6->daddr; ipv6h->daddr = fl6->daddr;
@ -1170,19 +1208,34 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (tproto != IPPROTO_IPIP && tproto != 0) if (tproto != IPPROTO_IPIP && tproto != 0)
return -1; return -1;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPIP;
dsfield = ipv4_get_dsfield(iph); dsfield = ipv4_get_dsfield(iph);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) if (t->parms.collect_md) {
fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) struct ip_tunnel_info *tun_info;
& IPV6_TCLASS_MASK; const struct ip_tunnel_key *key;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark; tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
ip_tunnel_info_af(tun_info) != AF_INET6))
return -1;
key = &tun_info->key;
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPIP;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
} else {
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPIP;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
& IPV6_TCLASS_MASK;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
}
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1; return -1;
@ -1220,29 +1273,47 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
ip6_tnl_addr_conflict(t, ipv6h)) ip6_tnl_addr_conflict(t, ipv6h))
return -1; return -1;
offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
if (offset > 0) {
struct ipv6_tlv_tnl_enc_lim *tel;
tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
if (tel->encap_limit == 0) {
icmpv6_send(skb, ICMPV6_PARAMPROB,
ICMPV6_HDR_FIELD, offset + 2);
return -1;
}
encap_limit = tel->encap_limit - 1;
} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPV6;
dsfield = ipv6_get_dsfield(ipv6h); dsfield = ipv6_get_dsfield(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); if (t->parms.collect_md) {
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) struct ip_tunnel_info *tun_info;
fl6.flowlabel |= ip6_flowlabel(ipv6h); const struct ip_tunnel_key *key;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark; tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
ip_tunnel_info_af(tun_info) != AF_INET6))
return -1;
key = &tun_info->key;
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPV6;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
} else {
offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
if (offset > 0) {
struct ipv6_tlv_tnl_enc_lim *tel;
tel = (void *)&skb_network_header(skb)[offset];
if (tel->encap_limit == 0) {
icmpv6_send(skb, ICMPV6_PARAMPROB,
ICMPV6_HDR_FIELD, offset + 2);
return -1;
}
encap_limit = tel->encap_limit - 1;
} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
encap_limit = t->parms.encap_limit;
}
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPV6;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
fl6.flowlabel |= ip6_flowlabel(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
}
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1; return -1;
@ -1741,6 +1812,10 @@ static int ip6_tnl_dev_init(struct net_device *dev)
if (err) if (err)
return err; return err;
ip6_tnl_link_config(t); ip6_tnl_link_config(t);
if (t->parms.collect_md) {
dev->features |= NETIF_F_NETNS_LOCAL;
netif_keep_dst(dev);
}
return 0; return 0;
} }
@ -1811,6 +1886,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
if (data[IFLA_IPTUN_PROTO]) if (data[IFLA_IPTUN_PROTO])
parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
if (data[IFLA_IPTUN_COLLECT_METADATA])
parms->collect_md = true;
} }
static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[], static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
@ -1850,6 +1928,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[]) struct nlattr *tb[], struct nlattr *data[])
{ {
struct net *net = dev_net(dev); struct net *net = dev_net(dev);
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
struct ip6_tnl *nt, *t; struct ip6_tnl *nt, *t;
struct ip_tunnel_encap ipencap; struct ip_tunnel_encap ipencap;
@ -1864,9 +1943,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
ip6_tnl_netlink_parms(data, &nt->parms); ip6_tnl_netlink_parms(data, &nt->parms);
t = ip6_tnl_locate(net, &nt->parms, 0); if (nt->parms.collect_md) {
if (!IS_ERR(t)) if (rtnl_dereference(ip6n->collect_md_tun))
return -EEXIST; return -EEXIST;
} else {
t = ip6_tnl_locate(net, &nt->parms, 0);
if (!IS_ERR(t))
return -EEXIST;
}
return ip6_tnl_create2(dev); return ip6_tnl_create2(dev);
} }
@ -1890,6 +1974,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
return err; return err;
} }
ip6_tnl_netlink_parms(data, &p); ip6_tnl_netlink_parms(data, &p);
if (p.collect_md)
return -EINVAL;
t = ip6_tnl_locate(net, &p, 0); t = ip6_tnl_locate(net, &p, 0);
if (!IS_ERR(t)) { if (!IS_ERR(t)) {
@ -1937,6 +2023,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev)
nla_total_size(2) + nla_total_size(2) +
/* IFLA_IPTUN_ENCAP_DPORT */ /* IFLA_IPTUN_ENCAP_DPORT */
nla_total_size(2) + nla_total_size(2) +
/* IFLA_IPTUN_COLLECT_METADATA */
nla_total_size(0) +
0; 0;
} }
@ -1955,16 +2043,15 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto)) nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
goto nla_put_failure; goto nla_put_failure;
if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) ||
tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) ||
nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) ||
tunnel->encap.sport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags))
nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
tunnel->encap.dport) ||
nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
tunnel->encap.flags))
goto nla_put_failure; goto nla_put_failure;
if (parm->collect_md)
if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
goto nla_put_failure;
return 0; return 0;
nla_put_failure: nla_put_failure:
@ -1992,6 +2079,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
}; };
static struct rtnl_link_ops ip6_link_ops __read_mostly = { static struct rtnl_link_ops ip6_link_ops __read_mostly = {