2019-05-29 14:12:43 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2013-10-04 01:16:47 +00:00
|
|
|
/*
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datapath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitations: First, there is a 3-level
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoids a recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
* Copyright (c) 2007-2017 Nicira, Inc.
|
2013-10-04 01:16:47 +00:00
|
|
|
*/
|
|
|
|
|
2014-02-04 01:18:21 +00:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
#include "flow.h"
|
|
|
|
#include "datapath.h"
|
|
|
|
#include <linux/uaccess.h>
|
|
|
|
#include <linux/netdevice.h>
|
|
|
|
#include <linux/etherdevice.h>
|
|
|
|
#include <linux/if_ether.h>
|
|
|
|
#include <linux/if_vlan.h>
|
|
|
|
#include <net/llc_pdu.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/jhash.h>
|
|
|
|
#include <linux/jiffies.h>
|
|
|
|
#include <linux/llc.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/in.h>
|
|
|
|
#include <linux/rcupdate.h>
|
|
|
|
#include <linux/if_arp.h>
|
|
|
|
#include <linux/ip.h>
|
|
|
|
#include <linux/ipv6.h>
|
|
|
|
#include <linux/sctp.h>
|
|
|
|
#include <linux/tcp.h>
|
|
|
|
#include <linux/udp.h>
|
|
|
|
#include <linux/icmp.h>
|
|
|
|
#include <linux/icmpv6.h>
|
|
|
|
#include <linux/rculist.h>
|
2014-10-03 22:35:33 +00:00
|
|
|
#include <net/geneve.h>
|
2013-10-04 01:16:47 +00:00
|
|
|
#include <net/ip.h>
|
|
|
|
#include <net/ipv6.h>
|
|
|
|
#include <net/ndisc.h>
|
2014-10-06 12:05:13 +00:00
|
|
|
#include <net/mpls.h>
|
2015-07-21 08:44:06 +00:00
|
|
|
#include <net/vxlan.h>
|
2017-11-07 13:07:02 +00:00
|
|
|
#include <net/tun_proto.h>
|
2018-01-25 21:20:11 +00:00
|
|
|
#include <net/erspan.h>
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2023-08-11 14:12:50 +00:00
|
|
|
#include "drop.h"
|
2013-10-04 01:16:47 +00:00
|
|
|
#include "flow_netlink.h"
|
|
|
|
|
2015-01-15 02:53:58 +00:00
|
|
|
/*
 * One entry of a length table.
 *
 * NOTE(review): presumably describes the expected payload length of a
 * netlink attribute, with the negative OVS_ATTR_* sentinels used as
 * special values of @len - confirm against the tables built from it.
 */
struct ovs_len_tbl {
	/* Length value for this entry (signed, so it can hold sentinels). */
	int len;
	/*
	 * Link to another length table.
	 * NOTE(review): looks like the table for nested attributes - confirm.
	 */
	const struct ovs_len_tbl *next;
};
|
|
|
|
|
|
|
|
/*
 * Negative sentinel values.
 * NOTE(review): presumably stored in ovs_len_tbl.len to mark nested and
 * variable-length attributes respectively - confirm at the users.
 * Parenthesized so the unary minus cannot combine with neighbouring
 * operators at the expansion site.
 */
#define OVS_ATTR_NESTED			(-1)
#define OVS_ATTR_VARIABLE		(-2)

/*
 * NOTE(review): by its name, an upper bound on nesting depth when copying
 * actions; the code that enforces it is not visible here - confirm.
 */
#define OVS_COPY_ACTIONS_MAX_DEPTH	16
|
2015-01-15 02:53:58 +00:00
|
|
|
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datapath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitations: First, there is a 3-level
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoids a recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
static bool actions_may_change_flow(const struct nlattr *actions)
|
|
|
|
{
|
|
|
|
struct nlattr *nla;
|
|
|
|
int rem;
|
|
|
|
|
|
|
|
nla_for_each_nested(nla, actions, rem) {
|
|
|
|
u16 action = nla_type(nla);
|
|
|
|
|
|
|
|
switch (action) {
|
|
|
|
case OVS_ACTION_ATTR_OUTPUT:
|
|
|
|
case OVS_ACTION_ATTR_RECIRC:
|
|
|
|
case OVS_ACTION_ATTR_TRUNC:
|
|
|
|
case OVS_ACTION_ATTR_USERSPACE:
|
2023-08-11 14:12:50 +00:00
|
|
|
case OVS_ACTION_ATTR_DROP:
|
2024-07-04 08:56:56 +00:00
|
|
|
case OVS_ACTION_ATTR_PSAMPLE:
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_CT:
|
2017-10-10 20:54:44 +00:00
|
|
|
case OVS_ACTION_ATTR_CT_CLEAR:
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
case OVS_ACTION_ATTR_HASH:
|
|
|
|
case OVS_ACTION_ATTR_POP_ETH:
|
|
|
|
case OVS_ACTION_ATTR_POP_MPLS:
|
2017-11-07 13:07:02 +00:00
|
|
|
case OVS_ACTION_ATTR_POP_NSH:
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
case OVS_ACTION_ATTR_POP_VLAN:
|
|
|
|
case OVS_ACTION_ATTR_PUSH_ETH:
|
|
|
|
case OVS_ACTION_ATTR_PUSH_MPLS:
|
2017-11-07 13:07:02 +00:00
|
|
|
case OVS_ACTION_ATTR_PUSH_NSH:
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
case OVS_ACTION_ATTR_PUSH_VLAN:
|
|
|
|
case OVS_ACTION_ATTR_SAMPLE:
|
|
|
|
case OVS_ACTION_ATTR_SET:
|
|
|
|
case OVS_ACTION_ATTR_SET_MASKED:
|
2017-11-10 20:09:43 +00:00
|
|
|
case OVS_ACTION_ATTR_METER:
|
2019-03-26 00:43:46 +00:00
|
|
|
case OVS_ACTION_ATTR_CHECK_PKT_LEN:
|
2019-12-21 03:20:46 +00:00
|
|
|
case OVS_ACTION_ATTR_ADD_MPLS:
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
case OVS_ACTION_ATTR_DEC_TTL:
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
default:
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2014-10-19 19:03:40 +00:00
|
|
|
/*
 * Extend the byte range recorded for @match so that it covers
 * [@offset, @offset + @size), with both ends rounded outwards to a
 * multiple of sizeof(long).
 *
 * @match:   match whose range is updated.
 * @offset:  start offset of the region being written.
 * @size:    length of the region in bytes.
 * @is_mask: true to update match->mask->range, false for match->range.
 */
static void update_range(struct sw_flow_match *match,
			 size_t offset, size_t size, bool is_mask)
{
	size_t start = rounddown(offset, sizeof(long));
	size_t end = roundup(offset + size, sizeof(long));
	struct sw_flow_key_range *range;

	range = is_mask ? &match->mask->range : &match->range;

	/* start == end is treated as "nothing recorded yet": just adopt. */
	if (range->start == range->end) {
		range->start = start;
		range->end = end;
		return;
	}

	/* Otherwise only ever widen the existing range. */
	if (start < range->start)
		range->start = start;

	if (end > range->end)
		range->end = end;
}
|
|
|
|
|
|
|
|
/*
 * Assign @value to @field of the match's key, or of its mask's key when
 * @is_mask is true, after extending the corresponding recorded range to
 * cover that field.
 *
 * @match and @is_mask are expanded more than once, so pass expressions
 * without side effects.
 */
#define SW_FLOW_KEY_PUT(match, field, value, is_mask)			    \
	do {								    \
		update_range(match, offsetof(struct sw_flow_key, field),    \
			     sizeof((match)->key->field), is_mask);	    \
		if (is_mask)						    \
			(match)->mask->key.field = value;		    \
		else							    \
			(match)->key->field = value;			    \
	} while (0)
|
|
|
|
|
2014-10-03 22:35:33 +00:00
|
|
|
/*
 * Copy @len bytes from @value_p to byte @offset within the match's key,
 * or within its mask's key when @is_mask is true, after extending the
 * corresponding recorded range to cover [@offset, @offset + @len).
 *
 * @offset is parenthesized in the pointer arithmetic so that an argument
 * containing lower-precedence operators (e.g. a conditional expression)
 * is added as a whole.  @match, @offset, @len and @is_mask are expanded
 * more than once, so pass expressions without side effects.
 */
#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask)	    \
	do {								    \
		update_range(match, offset, len, is_mask);		    \
		if (is_mask)						    \
			memcpy((u8 *)&(match)->mask->key + (offset),	    \
			       value_p, len);				    \
		else							    \
			memcpy((u8 *)(match)->key + (offset),		    \
			       value_p, len);				    \
	} while (0)
|
|
|
|
|
2014-10-03 22:35:33 +00:00
|
|
|
/*
 * Field-name flavour of SW_FLOW_KEY_MEMCPY_OFFSET(): the destination
 * offset is offsetof(struct sw_flow_key, @field); all other arguments
 * are forwarded unchanged.
 */
#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask)		      \
	SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \
				  value_p, len, is_mask)
|
|
|
|
|
2014-10-19 19:03:40 +00:00
|
|
|
/*
 * Fill every byte of @field with @value, in the match's key or, when
 * @is_mask is true, in its mask's key, after extending the corresponding
 * recorded range to cover that field.
 *
 * @match, @value and @is_mask are expanded more than once, so pass
 * expressions without side effects.
 */
#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask)		    \
	do {								    \
		update_range(match, offsetof(struct sw_flow_key, field),    \
			     sizeof((match)->key->field), is_mask);	    \
		if (is_mask)						    \
			memset((u8 *)&(match)->mask->key.field, value,	    \
			       sizeof((match)->mask->key.field));	    \
		else							    \
			memset((u8 *)&(match)->key->field, value,	    \
			       sizeof((match)->key->field));		    \
	} while (0)
|
2013-10-04 01:16:47 +00:00
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
/*
 * Copy an @nbits-wide bitmap from @value_p into bitmap @field of the
 * match's key, or of its mask's key when @is_mask is true, after
 * extending the corresponding recorded range by bitmap_size(@nbits)
 * bytes starting at the field's offset.
 *
 * @is_mask is parenthesized where it selects the destination so that an
 * argument which is itself a conditional or assignment expression is
 * evaluated as a whole.  @match, @nbits and @is_mask are expanded more
 * than once, so pass expressions without side effects.
 */
#define SW_FLOW_KEY_BITMAP_COPY(match, field, value_p, nbits, is_mask) ({     \
	update_range(match, offsetof(struct sw_flow_key, field),	      \
		     bitmap_size(nbits), is_mask);			      \
	bitmap_copy((is_mask) ? (match)->mask->key.field :		      \
				(match)->key->field,			      \
		    value_p, nbits);					      \
})
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
/*
 * Account for the transport attributes that IPv4 and IPv6 handle the same
 * way (UDP, SCTP and TCP), selected by the key's ip.proto.
 *
 * The matching attribute bits are added to *key_expected.  They are added
 * to *mask_allowed only when a mask is present and it matches ip.proto
 * exactly (0xff).  TCP contributes both OVS_KEY_ATTR_TCP and
 * OVS_KEY_ATTR_TCP_FLAGS.
 */
static void match_validate_l4(const struct sw_flow_match *match,
			      u64 *key_expected, u64 *mask_allowed)
{
	u64 attrs;

	switch (match->key->ip.proto) {
	case IPPROTO_UDP:
		attrs = 1ULL << OVS_KEY_ATTR_UDP;
		break;
	case IPPROTO_SCTP:
		attrs = 1ULL << OVS_KEY_ATTR_SCTP;
		break;
	case IPPROTO_TCP:
		attrs = (1ULL << OVS_KEY_ATTR_TCP)
		      | (1ULL << OVS_KEY_ATTR_TCP_FLAGS);
		break;
	default:
		return;
	}

	*key_expected |= attrs;
	if (match->mask && match->mask->key.ip.proto == 0xff)
		*mask_allowed |= attrs;
}

/*
 * Check that the set of key and mask attributes supplied for @match is
 * consistent with the values already stored in the match.
 *
 * @match:      match whose key (and optional mask) drive the checks
 * @key_attrs:  bitmap (bit n == attribute n) of key attributes supplied
 * @mask_attrs: bitmap of mask attributes supplied
 * @log:        passed through to OVS_NLERR() when a check fails
 *
 * Two bitmaps are derived from the key's eth.type, ip.proto, ip.frag and
 * tp.src values:
 *   - key_expected: attributes that must be present in @key_attrs;
 *   - mask_allowed: attributes that may be present in @mask_attrs.  A
 *     protocol-dependent attribute is only allowed in the mask when the
 *     mask matches the selecting field (eth.type / ip.proto / tp.src)
 *     with the exact value tested below.
 *
 * Returns true when every expected key attribute is present and no
 * disallowed mask attribute is present, false otherwise.
 *
 * All bit constants are built with 1ULL so the arithmetic is done in the
 * same 64-bit width as the u64 bitmaps, regardless of the attribute number.
 */
static bool match_validate(const struct sw_flow_match *match,
			   u64 key_attrs, u64 mask_attrs, bool log)
{
	u64 key_expected = 0;
	u64 mask_allowed = key_attrs;  /* At most allow all key attributes */
	/* True when a mask exists and it matches the EtherType exactly. */
	bool eth_type_exact = match->mask &&
			      match->mask->key.eth.type == htons(0xffff);
	/* True when a mask exists and it matches the IP protocol exactly. */
	bool ip_proto_exact = match->mask &&
			      match->mask->key.ip.proto == 0xff;

	/* The following mask attributes allowed only if they
	 * pass the validation tests. */
	mask_allowed &= ~((1ULL << OVS_KEY_ATTR_IPV4)
			| (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)
			| (1ULL << OVS_KEY_ATTR_IPV6)
			| (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)
			| (1ULL << OVS_KEY_ATTR_TCP)
			| (1ULL << OVS_KEY_ATTR_TCP_FLAGS)
			| (1ULL << OVS_KEY_ATTR_UDP)
			| (1ULL << OVS_KEY_ATTR_SCTP)
			| (1ULL << OVS_KEY_ATTR_ICMP)
			| (1ULL << OVS_KEY_ATTR_ICMPV6)
			| (1ULL << OVS_KEY_ATTR_ARP)
			| (1ULL << OVS_KEY_ATTR_ND)
			| (1ULL << OVS_KEY_ATTR_MPLS)
			| (1ULL << OVS_KEY_ATTR_NSH));

	/* Always allowed mask fields. */
	mask_allowed |= ((1ULL << OVS_KEY_ATTR_TUNNEL)
		       | (1ULL << OVS_KEY_ATTR_IN_PORT)
		       | (1ULL << OVS_KEY_ATTR_ETHERTYPE));

	/* Check key attributes. */
	if (match->key->eth.type == htons(ETH_P_ARP)
			|| match->key->eth.type == htons(ETH_P_RARP)) {
		key_expected |= 1ULL << OVS_KEY_ATTR_ARP;
		if (eth_type_exact)
			mask_allowed |= 1ULL << OVS_KEY_ATTR_ARP;
	}

	if (eth_p_mpls(match->key->eth.type)) {
		key_expected |= 1ULL << OVS_KEY_ATTR_MPLS;
		if (eth_type_exact)
			mask_allowed |= 1ULL << OVS_KEY_ATTR_MPLS;
	}

	if (match->key->eth.type == htons(ETH_P_IP)) {
		key_expected |= 1ULL << OVS_KEY_ATTR_IPV4;
		if (eth_type_exact) {
			mask_allowed |= 1ULL << OVS_KEY_ATTR_IPV4;
			mask_allowed |= 1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4;
		}

		/* Later fragments are not checked for L4 attributes. */
		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
			match_validate_l4(match, &key_expected, &mask_allowed);

			if (match->key->ip.proto == IPPROTO_ICMP) {
				key_expected |= 1ULL << OVS_KEY_ATTR_ICMP;
				if (ip_proto_exact)
					mask_allowed |= 1ULL << OVS_KEY_ATTR_ICMP;
			}
		}
	}

	if (match->key->eth.type == htons(ETH_P_IPV6)) {
		key_expected |= 1ULL << OVS_KEY_ATTR_IPV6;
		if (eth_type_exact) {
			mask_allowed |= 1ULL << OVS_KEY_ATTR_IPV6;
			mask_allowed |= 1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6;
		}

		/* Later fragments are not checked for L4 attributes. */
		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
			match_validate_l4(match, &key_expected, &mask_allowed);

			if (match->key->ip.proto == IPPROTO_ICMPV6) {
				key_expected |= 1ULL << OVS_KEY_ATTR_ICMPV6;
				if (ip_proto_exact)
					mask_allowed |= 1ULL << OVS_KEY_ATTR_ICMPV6;

				if (match->key->tp.src ==
						htons(NDISC_NEIGHBOUR_SOLICITATION) ||
				    match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
					key_expected |= 1ULL << OVS_KEY_ATTR_ND;
					/* Original direction conntrack tuple
					 * uses the same space as the ND fields
					 * in the key, so both are not allowed
					 * at the same time.
					 */
					mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
					if (match->mask && (match->mask->key.tp.src == htons(0xff)))
						mask_allowed |= 1ULL << OVS_KEY_ATTR_ND;
				}
			}
		}
	}

	if (match->key->eth.type == htons(ETH_P_NSH)) {
		key_expected |= 1ULL << OVS_KEY_ATTR_NSH;
		if (eth_type_exact)
			mask_allowed |= 1ULL << OVS_KEY_ATTR_NSH;
	}

	if ((key_attrs & key_expected) != key_expected) {
		/* Key attributes check failed. */
		OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)",
			  (unsigned long long)key_attrs,
			  (unsigned long long)key_expected);
		return false;
	}

	if ((mask_attrs & mask_allowed) != mask_attrs) {
		/* Mask attributes check failed. */
		OVS_NLERR(log, "Unexpected mask (mask=%llx, allowed=%llx)",
			  (unsigned long long)mask_attrs,
			  (unsigned long long)mask_allowed);
		return false;
	}

	return true;
}
|
|
|
|
|
2014-11-06 14:51:24 +00:00
|
|
|
/*
 * Sum of the netlink attribute sizes, as reported by nla_total_size() and
 * nla_total_size_64bit(), for the tunnel key attributes listed below.
 *
 * Keep this in step with the OVS_TUNNEL_KEY_ATTR_* set: a newly added
 * tunnel key attribute most likely needs an entry here.
 */
size_t ovs_tun_key_attr_size(void)
{
	size_t total = 0;

	total += nla_total_size_64bit(8);	/* OVS_TUNNEL_KEY_ATTR_ID */
	total += nla_total_size(16);	/* OVS_TUNNEL_KEY_ATTR_IPV[46]_SRC */
	total += nla_total_size(16);	/* OVS_TUNNEL_KEY_ATTR_IPV[46]_DST */
	total += nla_total_size(1);	/* OVS_TUNNEL_KEY_ATTR_TOS */
	total += nla_total_size(1);	/* OVS_TUNNEL_KEY_ATTR_TTL */
	total += nla_total_size(0);	/* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
	total += nla_total_size(0);	/* OVS_TUNNEL_KEY_ATTR_CSUM */
	total += nla_total_size(0);	/* OVS_TUNNEL_KEY_ATTR_OAM */

	/* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS and OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS
	 * are mutually exclusive with OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and
	 * covered by its size, so only the Geneve entry is counted.
	 */
	total += nla_total_size(256);	/* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */

	total += nla_total_size(2);	/* OVS_TUNNEL_KEY_ATTR_TP_SRC */
	total += nla_total_size(2);	/* OVS_TUNNEL_KEY_ATTR_TP_DST */

	return total;
}
|
|
|
|
|
2017-11-14 06:27:03 +00:00
|
|
|
static size_t ovs_nsh_key_attr_size(void)
|
2017-11-07 13:07:02 +00:00
|
|
|
{
|
|
|
|
/* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider
|
|
|
|
* updating this function.
|
|
|
|
*/
|
|
|
|
return nla_total_size(NSH_BASE_HDR_LEN) /* OVS_NSH_KEY_ATTR_BASE */
|
|
|
|
/* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are
|
|
|
|
* mutually exclusive, so the bigger one can cover
|
|
|
|
* the small one.
|
|
|
|
*/
|
|
|
|
+ nla_total_size(NSH_CTX_HDRS_MAX_LEN);
|
|
|
|
}
|
|
|
|
|
2014-10-18 23:14:14 +00:00
|
|
|
size_t ovs_key_attr_size(void)
|
|
|
|
{
|
|
|
|
/* Whenever adding new OVS_KEY_ FIELDS, we should consider
|
|
|
|
* updating this function.
|
|
|
|
*/
|
2022-03-09 22:20:33 +00:00
|
|
|
BUILD_BUG_ON(OVS_KEY_ATTR_MAX != 32);
|
2014-10-18 23:14:14 +00:00
|
|
|
|
|
|
|
return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
|
|
|
|
+ nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */
|
2014-11-06 14:51:24 +00:00
|
|
|
+ ovs_tun_key_attr_size()
|
2014-10-18 23:14:14 +00:00
|
|
|
+ nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */
|
|
|
|
+ nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */
|
|
|
|
+ nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */
|
|
|
|
+ nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */
|
2015-10-06 18:00:00 +00:00
|
|
|
+ nla_total_size(4) /* OVS_KEY_ATTR_CT_STATE */
|
2015-08-26 18:31:48 +00:00
|
|
|
+ nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */
|
2015-08-26 18:31:49 +00:00
|
|
|
+ nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */
|
2015-10-01 22:00:37 +00:00
|
|
|
+ nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */
|
openvswitch: Add original direction conntrack tuple to sw_flow_key.
Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key. The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry. This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.
The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.
The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple. This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.
When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state. While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards. If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change. When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.
It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information. If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.
The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields. Hence, the IP addresses are overlaid in union with ARP
and ND fields. This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets. ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-09 19:21:59 +00:00
|
|
|
+ nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
|
2017-11-07 13:07:02 +00:00
|
|
|
+ nla_total_size(0) /* OVS_KEY_ATTR_NSH */
|
|
|
|
+ ovs_nsh_key_attr_size()
|
2014-10-18 23:14:14 +00:00
|
|
|
+ nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
|
|
|
|
+ nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
|
|
|
|
+ nla_total_size(4) /* OVS_KEY_ATTR_VLAN */
|
|
|
|
+ nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */
|
|
|
|
+ nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
|
|
|
|
+ nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */
|
|
|
|
+ nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */
|
2022-02-24 00:54:09 +00:00
|
|
|
+ nla_total_size(28) /* OVS_KEY_ATTR_ND */
|
|
|
|
+ nla_total_size(2); /* OVS_KEY_ATTR_IPV6_EXTHDRS */
|
2014-10-18 23:14:14 +00:00
|
|
|
}
|
|
|
|
|
2015-09-12 01:38:28 +00:00
|
|
|
static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = {
|
|
|
|
[OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) },
|
|
|
|
};
|
|
|
|
|
2015-01-15 02:53:58 +00:00
|
|
|
static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) },
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) },
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_IPV4_DST] = { .len = sizeof(u32) },
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_TOS] = { .len = 1 },
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_TTL] = { .len = 1 },
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = { .len = 0 },
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_CSUM] = { .len = 0 },
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) },
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) },
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 },
|
2015-09-12 01:38:28 +00:00
|
|
|
[OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE },
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED,
|
|
|
|
.next = ovs_vxlan_ext_key_lens },
|
2015-10-05 11:09:47 +00:00
|
|
|
[OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
|
|
|
|
[OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) },
|
2018-01-25 21:20:11 +00:00
|
|
|
[OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = OVS_ATTR_VARIABLE },
|
2019-03-28 04:43:23 +00:00
|
|
|
[OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE] = { .len = 0 },
|
2015-01-15 02:53:58 +00:00
|
|
|
};
|
|
|
|
|
2017-11-07 13:07:02 +00:00
|
|
|
static const struct ovs_len_tbl
|
|
|
|
ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = {
|
|
|
|
[OVS_NSH_KEY_ATTR_BASE] = { .len = sizeof(struct ovs_nsh_key_base) },
|
|
|
|
[OVS_NSH_KEY_ATTR_MD1] = { .len = sizeof(struct ovs_nsh_key_md1) },
|
|
|
|
[OVS_NSH_KEY_ATTR_MD2] = { .len = OVS_ATTR_VARIABLE },
|
|
|
|
};
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
|
2015-01-15 02:53:58 +00:00
|
|
|
static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
|
|
|
|
[OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED },
|
|
|
|
[OVS_KEY_ATTR_PRIORITY] = { .len = sizeof(u32) },
|
|
|
|
[OVS_KEY_ATTR_IN_PORT] = { .len = sizeof(u32) },
|
|
|
|
[OVS_KEY_ATTR_SKB_MARK] = { .len = sizeof(u32) },
|
|
|
|
[OVS_KEY_ATTR_ETHERNET] = { .len = sizeof(struct ovs_key_ethernet) },
|
|
|
|
[OVS_KEY_ATTR_VLAN] = { .len = sizeof(__be16) },
|
|
|
|
[OVS_KEY_ATTR_ETHERTYPE] = { .len = sizeof(__be16) },
|
|
|
|
[OVS_KEY_ATTR_IPV4] = { .len = sizeof(struct ovs_key_ipv4) },
|
|
|
|
[OVS_KEY_ATTR_IPV6] = { .len = sizeof(struct ovs_key_ipv6) },
|
|
|
|
[OVS_KEY_ATTR_TCP] = { .len = sizeof(struct ovs_key_tcp) },
|
|
|
|
[OVS_KEY_ATTR_TCP_FLAGS] = { .len = sizeof(__be16) },
|
|
|
|
[OVS_KEY_ATTR_UDP] = { .len = sizeof(struct ovs_key_udp) },
|
|
|
|
[OVS_KEY_ATTR_SCTP] = { .len = sizeof(struct ovs_key_sctp) },
|
|
|
|
[OVS_KEY_ATTR_ICMP] = { .len = sizeof(struct ovs_key_icmp) },
|
|
|
|
[OVS_KEY_ATTR_ICMPV6] = { .len = sizeof(struct ovs_key_icmpv6) },
|
|
|
|
[OVS_KEY_ATTR_ARP] = { .len = sizeof(struct ovs_key_arp) },
|
|
|
|
[OVS_KEY_ATTR_ND] = { .len = sizeof(struct ovs_key_nd) },
|
|
|
|
[OVS_KEY_ATTR_RECIRC_ID] = { .len = sizeof(u32) },
|
|
|
|
[OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) },
|
|
|
|
[OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED,
|
|
|
|
.next = ovs_tunnel_key_lens, },
|
2019-11-04 01:57:44 +00:00
|
|
|
[OVS_KEY_ATTR_MPLS] = { .len = OVS_ATTR_VARIABLE },
|
2015-10-06 18:00:00 +00:00
|
|
|
[OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) },
|
2015-08-26 18:31:48 +00:00
|
|
|
[OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) },
|
2015-08-26 18:31:49 +00:00
|
|
|
[OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) },
|
2015-10-01 22:00:37 +00:00
|
|
|
[OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) },
|
openvswitch: Add original direction conntrack tuple to sw_flow_key.
Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key. The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry. This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.
The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.
The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple. This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.
When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state. While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards. If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change. When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.
It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information. If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.
The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields. Hence, the IP addresses are overlaid in union with ARP
and ND fields. This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets. ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-09 19:21:59 +00:00
|
|
|
[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = {
|
|
|
|
.len = sizeof(struct ovs_key_ct_tuple_ipv4) },
|
|
|
|
[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
|
|
|
|
.len = sizeof(struct ovs_key_ct_tuple_ipv6) },
|
2017-11-07 13:07:02 +00:00
|
|
|
[OVS_KEY_ATTR_NSH] = { .len = OVS_ATTR_NESTED,
|
|
|
|
.next = ovs_nsh_key_attr_lens, },
|
2022-02-24 00:54:09 +00:00
|
|
|
[OVS_KEY_ATTR_IPV6_EXTHDRS] = {
|
|
|
|
.len = sizeof(struct ovs_key_ipv6_exthdrs) },
|
2013-10-04 01:16:47 +00:00
|
|
|
};
|
|
|
|
|
2015-09-12 01:38:28 +00:00
|
|
|
static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
|
|
|
|
{
|
|
|
|
return expected_len == attr_len ||
|
|
|
|
expected_len == OVS_ATTR_NESTED ||
|
|
|
|
expected_len == OVS_ATTR_VARIABLE;
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
/* Report whether a buffer consists solely of zero bytes.
 *
 * @fp:   start of the buffer; may be NULL.
 * @size: number of bytes to examine.
 *
 * Returns true only when @fp is non-NULL and none of the first @size bytes
 * is non-zero (an empty, non-NULL buffer therefore counts as all-zero).
 * A NULL @fp yields false.
 */
static bool is_all_zero(const u8 *fp, size_t size)
{
	/* Index has the same type as @size: avoids a signed/unsigned
	 * comparison and signed overflow for sizes above INT_MAX.
	 */
	size_t i;

	if (!fp)
		return false;

	for (i = 0; i < size; i++)
		if (fp[i])
			return false;

	return true;
}
|
|
|
|
|
|
|
|
static int __parse_flow_nlattrs(const struct nlattr *attr,
|
|
|
|
const struct nlattr *a[],
|
2014-11-06 15:03:05 +00:00
|
|
|
u64 *attrsp, bool log, bool nz)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
|
|
|
const struct nlattr *nla;
|
|
|
|
u64 attrs;
|
|
|
|
int rem;
|
|
|
|
|
|
|
|
attrs = *attrsp;
|
|
|
|
nla_for_each_nested(nla, attr, rem) {
|
|
|
|
u16 type = nla_type(nla);
|
|
|
|
int expected_len;
|
|
|
|
|
|
|
|
if (type > OVS_KEY_ATTR_MAX) {
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "Key type %d is out of range max %d",
|
2013-10-04 01:16:47 +00:00
|
|
|
type, OVS_KEY_ATTR_MAX);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2022-03-09 22:20:33 +00:00
|
|
|
if (type == OVS_KEY_ATTR_PACKET_TYPE ||
|
|
|
|
type == OVS_KEY_ATTR_ND_EXTENSIONS ||
|
|
|
|
type == OVS_KEY_ATTR_TUNNEL_INFO) {
|
|
|
|
OVS_NLERR(log, "Key type %d is not supported", type);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (attrs & (1ULL << type)) {
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "Duplicate key (type %d).", type);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2015-01-15 02:53:58 +00:00
|
|
|
expected_len = ovs_key_lens[type].len;
|
2015-09-12 01:38:28 +00:00
|
|
|
if (!check_attr_len(nla_len(nla), expected_len)) {
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "Key %d has unexpected len %d expected %d",
|
|
|
|
type, nla_len(nla), expected_len);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2019-01-14 09:16:56 +00:00
|
|
|
if (!nz || !is_all_zero(nla_data(nla), nla_len(nla))) {
|
2022-03-09 22:20:33 +00:00
|
|
|
attrs |= 1ULL << type;
|
2013-10-04 01:16:47 +00:00
|
|
|
a[type] = nla;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (rem) {
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "Message has %d unknown bytes.", rem);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
*attrsp = attrs;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Parse mask attributes: same as __parse_flow_nlattrs() with the nz flag
 * set, so attributes whose payload is entirely zero are not recorded.
 */
static int parse_flow_mask_nlattrs(const struct nlattr *attr,
				   const struct nlattr *a[], u64 *attrsp,
				   bool log)
{
	const bool skip_all_zero = true;

	return __parse_flow_nlattrs(attr, a, attrsp, log, skip_all_zero);
}
|
|
|
|
|
openvswitch: Add original direction conntrack tuple to sw_flow_key.
Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key. The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry. This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.
The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.
The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple. This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.
When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state. While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards. If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change. When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.
It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information. If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.
The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields. Hence, the IP addresses are overlaid in union with ARP
and ND fields. This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets. ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-09 19:21:59 +00:00
|
|
|
int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
|
|
|
|
u64 *attrsp, bool log)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
2014-11-06 15:03:05 +00:00
|
|
|
return __parse_flow_nlattrs(attr, a, attrsp, log, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int genev_tun_opt_from_nlattr(const struct nlattr *a,
|
|
|
|
struct sw_flow_match *match, bool is_mask,
|
|
|
|
bool log)
|
|
|
|
{
|
|
|
|
unsigned long opt_key_offset;
|
|
|
|
|
|
|
|
if (nla_len(a) > sizeof(match->key->tun_opts)) {
|
|
|
|
OVS_NLERR(log, "Geneve option length err (len %d, max %zu).",
|
|
|
|
nla_len(a), sizeof(match->key->tun_opts));
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nla_len(a) % 4 != 0) {
|
|
|
|
OVS_NLERR(log, "Geneve opt len %d is not a multiple of 4.",
|
|
|
|
nla_len(a));
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We need to record the length of the options passed
|
|
|
|
* down, otherwise packets with the same format but
|
|
|
|
* additional options will be silently matched.
|
|
|
|
*/
|
|
|
|
if (!is_mask) {
|
|
|
|
SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a),
|
|
|
|
false);
|
|
|
|
} else {
|
|
|
|
/* This is somewhat unusual because it looks at
|
|
|
|
* both the key and mask while parsing the
|
|
|
|
* attributes (and by extension assumes the key
|
|
|
|
* is parsed first). Normally, we would verify
|
|
|
|
* that each is the correct length and that the
|
|
|
|
* attributes line up in the validate function.
|
|
|
|
* However, that is difficult because this is
|
|
|
|
* variable length and we won't have the
|
|
|
|
* information later.
|
|
|
|
*/
|
|
|
|
if (match->key->tun_opts_len != nla_len(a)) {
|
|
|
|
OVS_NLERR(log, "Geneve option len %d != mask len %d",
|
|
|
|
match->key->tun_opts_len, nla_len(a));
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
|
|
|
|
}
|
|
|
|
|
2015-01-15 02:53:57 +00:00
|
|
|
opt_key_offset = TUN_METADATA_OFFSET(nla_len(a));
|
2014-11-06 15:03:05 +00:00
|
|
|
SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a),
|
|
|
|
nla_len(a), is_mask);
|
|
|
|
return 0;
|
2013-10-04 01:16:47 +00:00
|
|
|
}
|
|
|
|
|
2015-09-12 01:38:28 +00:00
|
|
|
static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr,
|
2015-01-15 02:53:59 +00:00
|
|
|
struct sw_flow_match *match, bool is_mask,
|
|
|
|
bool log)
|
|
|
|
{
|
2015-09-12 01:38:28 +00:00
|
|
|
struct nlattr *a;
|
|
|
|
int rem;
|
2015-01-15 02:53:59 +00:00
|
|
|
unsigned long opt_key_offset;
|
2015-07-21 08:44:06 +00:00
|
|
|
struct vxlan_metadata opts;
|
2015-01-15 02:53:59 +00:00
|
|
|
|
|
|
|
BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
|
|
|
|
|
|
|
|
memset(&opts, 0, sizeof(opts));
|
2015-09-12 01:38:28 +00:00
|
|
|
nla_for_each_nested(a, attr, rem) {
|
|
|
|
int type = nla_type(a);
|
2015-01-15 02:53:59 +00:00
|
|
|
|
2015-09-12 01:38:28 +00:00
|
|
|
if (type > OVS_VXLAN_EXT_MAX) {
|
|
|
|
OVS_NLERR(log, "VXLAN extension %d out of range max %d",
|
|
|
|
type, OVS_VXLAN_EXT_MAX);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!check_attr_len(nla_len(a),
|
|
|
|
ovs_vxlan_ext_key_lens[type].len)) {
|
|
|
|
OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d",
|
|
|
|
type, nla_len(a),
|
|
|
|
ovs_vxlan_ext_key_lens[type].len);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case OVS_VXLAN_EXT_GBP:
|
|
|
|
opts.gbp = nla_get_u32(a);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
OVS_NLERR(log, "Unknown VXLAN extension attribute %d",
|
|
|
|
type);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (rem) {
|
|
|
|
OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.",
|
|
|
|
rem);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2015-01-15 02:53:59 +00:00
|
|
|
|
|
|
|
if (!is_mask)
|
|
|
|
SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false);
|
|
|
|
else
|
|
|
|
SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
|
|
|
|
|
|
|
|
opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts));
|
|
|
|
SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts),
|
|
|
|
is_mask);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-01-25 21:20:11 +00:00
|
|
|
static int erspan_tun_opt_from_nlattr(const struct nlattr *a,
|
|
|
|
struct sw_flow_match *match, bool is_mask,
|
|
|
|
bool log)
|
|
|
|
{
|
|
|
|
unsigned long opt_key_offset;
|
|
|
|
|
|
|
|
BUILD_BUG_ON(sizeof(struct erspan_metadata) >
|
|
|
|
sizeof(match->key->tun_opts));
|
|
|
|
|
|
|
|
if (nla_len(a) > sizeof(match->key->tun_opts)) {
|
|
|
|
OVS_NLERR(log, "ERSPAN option length err (len %d, max %zu).",
|
|
|
|
nla_len(a), sizeof(match->key->tun_opts));
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!is_mask)
|
|
|
|
SW_FLOW_KEY_PUT(match, tun_opts_len,
|
|
|
|
sizeof(struct erspan_metadata), false);
|
|
|
|
else
|
|
|
|
SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
|
|
|
|
|
|
|
|
opt_key_offset = TUN_METADATA_OFFSET(nla_len(a));
|
|
|
|
SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a),
|
|
|
|
nla_len(a), is_mask);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-10-05 11:09:47 +00:00
|
|
|
static int ip_tun_from_nlattr(const struct nlattr *attr,
|
|
|
|
struct sw_flow_match *match, bool is_mask,
|
|
|
|
bool log)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
2015-10-21 03:47:46 +00:00
|
|
|
bool ttl = false, ipv4 = false, ipv6 = false;
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { };
|
2019-03-28 04:43:23 +00:00
|
|
|
bool info_bridge_mode = false;
|
2015-10-21 03:47:46 +00:00
|
|
|
int opts_type = 0;
|
2013-10-04 01:16:47 +00:00
|
|
|
struct nlattr *a;
|
|
|
|
int rem;
|
|
|
|
|
|
|
|
nla_for_each_nested(a, attr, rem) {
|
|
|
|
int type = nla_type(a);
|
2014-11-06 15:03:05 +00:00
|
|
|
int err;
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "Tunnel attr %d out of range max %d",
|
|
|
|
type, OVS_TUNNEL_KEY_ATTR_MAX);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2015-09-12 01:38:28 +00:00
|
|
|
if (!check_attr_len(nla_len(a),
|
|
|
|
ovs_tunnel_key_lens[type].len)) {
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d",
|
2015-01-15 02:53:58 +00:00
|
|
|
type, nla_len(a), ovs_tunnel_key_lens[type].len);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_ID:
|
|
|
|
SW_FLOW_KEY_PUT(match, tun_key.tun_id,
|
|
|
|
nla_get_be64(a), is_mask);
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_KEY_BIT, tun_flags);
|
2013-10-04 01:16:47 +00:00
|
|
|
break;
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
|
2015-08-20 11:56:23 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src,
|
2015-03-29 14:59:26 +00:00
|
|
|
nla_get_in_addr(a), is_mask);
|
2015-10-05 11:09:47 +00:00
|
|
|
ipv4 = true;
|
2013-10-04 01:16:47 +00:00
|
|
|
break;
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
|
2015-08-20 11:56:23 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst,
|
2015-03-29 14:59:26 +00:00
|
|
|
nla_get_in_addr(a), is_mask);
|
2015-10-05 11:09:47 +00:00
|
|
|
ipv4 = true;
|
|
|
|
break;
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_IPV6_SRC:
|
2017-03-15 16:10:47 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src,
|
2015-10-05 11:09:47 +00:00
|
|
|
nla_get_in6_addr(a), is_mask);
|
|
|
|
ipv6 = true;
|
|
|
|
break;
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_IPV6_DST:
|
|
|
|
SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst,
|
|
|
|
nla_get_in6_addr(a), is_mask);
|
|
|
|
ipv6 = true;
|
2013-10-04 01:16:47 +00:00
|
|
|
break;
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_TOS:
|
2015-08-20 11:56:24 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tun_key.tos,
|
2013-10-04 01:16:47 +00:00
|
|
|
nla_get_u8(a), is_mask);
|
|
|
|
break;
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_TTL:
|
2015-08-20 11:56:24 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tun_key.ttl,
|
2013-10-04 01:16:47 +00:00
|
|
|
nla_get_u8(a), is_mask);
|
|
|
|
ttl = true;
|
|
|
|
break;
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_flags);
|
2013-10-04 01:16:47 +00:00
|
|
|
break;
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_CSUM:
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_CSUM_BIT, tun_flags);
|
2013-10-04 01:16:47 +00:00
|
|
|
break;
|
2014-11-06 14:51:24 +00:00
|
|
|
case OVS_TUNNEL_KEY_ATTR_TP_SRC:
|
|
|
|
SW_FLOW_KEY_PUT(match, tun_key.tp_src,
|
|
|
|
nla_get_be16(a), is_mask);
|
|
|
|
break;
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_TP_DST:
|
|
|
|
SW_FLOW_KEY_PUT(match, tun_key.tp_dst,
|
|
|
|
nla_get_be16(a), is_mask);
|
|
|
|
break;
|
2014-10-03 22:35:30 +00:00
|
|
|
case OVS_TUNNEL_KEY_ATTR_OAM:
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_OAM_BIT, tun_flags);
|
2014-10-03 22:35:30 +00:00
|
|
|
break;
|
2014-10-03 22:35:33 +00:00
|
|
|
case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
|
2015-01-15 02:53:59 +00:00
|
|
|
if (opts_type) {
|
|
|
|
OVS_NLERR(log, "Multiple metadata blocks provided");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2014-11-06 15:03:05 +00:00
|
|
|
err = genev_tun_opt_from_nlattr(a, match, is_mask, log);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2014-10-03 22:35:33 +00:00
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_flags);
|
2015-01-15 02:53:59 +00:00
|
|
|
opts_type = type;
|
|
|
|
break;
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
|
|
|
|
if (opts_type) {
|
|
|
|
OVS_NLERR(log, "Multiple metadata blocks provided");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = vxlan_tun_opt_from_nlattr(a, match, is_mask, log);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_flags);
|
2015-01-15 02:53:59 +00:00
|
|
|
opts_type = type;
|
2014-10-03 22:35:33 +00:00
|
|
|
break;
|
2017-03-16 15:51:28 +00:00
|
|
|
case OVS_TUNNEL_KEY_ATTR_PAD:
|
|
|
|
break;
|
2018-01-25 21:20:11 +00:00
|
|
|
case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
|
|
|
|
if (opts_type) {
|
|
|
|
OVS_NLERR(log, "Multiple metadata blocks provided");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = erspan_tun_opt_from_nlattr(a, match, is_mask,
|
|
|
|
log);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_flags);
|
2018-01-25 21:20:11 +00:00
|
|
|
opts_type = type;
|
|
|
|
break;
|
2019-03-28 04:43:23 +00:00
|
|
|
case OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE:
|
|
|
|
info_bridge_mode = true;
|
|
|
|
ipv4 = true;
|
|
|
|
break;
|
2013-10-04 01:16:47 +00:00
|
|
|
default:
|
2015-10-05 11:09:47 +00:00
|
|
|
OVS_NLERR(log, "Unknown IP tunnel attribute %d",
|
2014-10-03 22:35:33 +00:00
|
|
|
type);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
SW_FLOW_KEY_BITMAP_COPY(match, tun_key.tun_flags, tun_flags,
|
|
|
|
__IP_TUNNEL_FLAG_NUM, is_mask);
|
2015-10-05 11:09:46 +00:00
|
|
|
if (is_mask)
|
|
|
|
SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true);
|
|
|
|
else
|
2015-10-05 11:09:47 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tun_proto, ipv6 ? AF_INET6 : AF_INET,
|
|
|
|
false);
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
if (rem > 0) {
|
2015-10-05 11:09:47 +00:00
|
|
|
OVS_NLERR(log, "IP tunnel attribute has %d unknown bytes.",
|
2014-11-06 15:03:05 +00:00
|
|
|
rem);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2015-10-05 11:09:47 +00:00
|
|
|
if (ipv4 && ipv6) {
|
|
|
|
OVS_NLERR(log, "Mixed IPv4 and IPv6 tunnel attributes");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
if (!is_mask) {
|
2015-10-05 11:09:47 +00:00
|
|
|
if (!ipv4 && !ipv6) {
|
|
|
|
OVS_NLERR(log, "IP tunnel dst address not specified");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2019-03-28 04:43:23 +00:00
|
|
|
if (ipv4) {
|
|
|
|
if (info_bridge_mode) {
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__clear_bit(IP_TUNNEL_KEY_BIT, tun_flags);
|
|
|
|
|
2019-03-28 04:43:23 +00:00
|
|
|
if (match->key->tun_key.u.ipv4.src ||
|
|
|
|
match->key->tun_key.u.ipv4.dst ||
|
|
|
|
match->key->tun_key.tp_src ||
|
|
|
|
match->key->tun_key.tp_dst ||
|
|
|
|
match->key->tun_key.ttl ||
|
|
|
|
match->key->tun_key.tos ||
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
!ip_tunnel_flags_empty(tun_flags)) {
|
2019-03-28 04:43:23 +00:00
|
|
|
OVS_NLERR(log, "IPv4 tun info is not correct");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
} else if (!match->key->tun_key.u.ipv4.dst) {
|
|
|
|
OVS_NLERR(log, "IPv4 tunnel dst address is zero");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
}
|
2015-10-05 11:09:47 +00:00
|
|
|
if (ipv6 && ipv6_addr_any(&match->key->tun_key.u.ipv6.dst)) {
|
|
|
|
OVS_NLERR(log, "IPv6 tunnel dst address is zero");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2019-03-28 04:43:23 +00:00
|
|
|
if (!ttl && !info_bridge_mode) {
|
2015-10-05 11:09:47 +00:00
|
|
|
OVS_NLERR(log, "IP tunnel TTL not specified.");
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-15 02:53:59 +00:00
|
|
|
return opts_type;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vxlan_opt_to_nlattr(struct sk_buff *skb,
|
|
|
|
const void *tun_opts, int swkey_tun_opts_len)
|
|
|
|
{
|
2015-07-21 08:44:06 +00:00
|
|
|
const struct vxlan_metadata *opts = tun_opts;
|
2015-01-15 02:53:59 +00:00
|
|
|
struct nlattr *nla;
|
|
|
|
|
2019-04-26 09:13:06 +00:00
|
|
|
nla = nla_nest_start_noflag(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS);
|
2015-01-15 02:53:59 +00:00
|
|
|
if (!nla)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
|
|
|
if (nla_put_u32(skb, OVS_VXLAN_EXT_GBP, opts->gbp) < 0)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
|
|
|
nla_nest_end(skb, nla);
|
2013-10-04 01:16:47 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-10-05 11:09:47 +00:00
|
|
|
/*
 * Append the OVS_TUNNEL_KEY_ATTR_* attributes describing @output (and any
 * tunnel options) directly to @skb, without opening a nest.
 *
 * Attributes are emitted in this order, each only when its condition holds:
 *   ID (IP_TUNNEL_KEY_BIT set), then -- if @mode has IP_TUNNEL_INFO_BRIDGE --
 *   only IPV4_INFO_BRIDGE and nothing else; otherwise the non-zero src/dst
 *   addresses for @tun_proto (AF_INET or AF_INET6), TOS (if non-zero),
 *   TTL (always), DONT_FRAGMENT, CSUM, TP_SRC/TP_DST (if non-zero), OAM,
 *   and finally the GENEVE/VXLAN/ERSPAN options when @swkey_tun_opts_len
 *   is non-zero.
 *
 * Returns 0 on success or -EMSGSIZE as soon as any put fails.
 */
static int __ip_tun_to_nlattr(struct sk_buff *skb,
			      const struct ip_tunnel_key *output,
			      const void *tun_opts, int swkey_tun_opts_len,
			      unsigned short tun_proto, u8 mode)
{
	if (test_bit(IP_TUNNEL_KEY_BIT, output->tun_flags)) {
		if (nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id,
				 OVS_TUNNEL_KEY_ATTR_PAD))
			return -EMSGSIZE;
	}

	/* In bridge mode only the flag is reported; stop right after it. */
	if (mode & IP_TUNNEL_INFO_BRIDGE) {
		if (nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE))
			return -EMSGSIZE;
		return 0;
	}

	/* Endpoint addresses; any other protocol family emits none. */
	if (tun_proto == AF_INET) {
		if (output->u.ipv4.src) {
			if (nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC,
					    output->u.ipv4.src))
				return -EMSGSIZE;
		}
		if (output->u.ipv4.dst) {
			if (nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST,
					    output->u.ipv4.dst))
				return -EMSGSIZE;
		}
	} else if (tun_proto == AF_INET6) {
		if (!ipv6_addr_any(&output->u.ipv6.src)) {
			if (nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_SRC,
					     &output->u.ipv6.src))
				return -EMSGSIZE;
		}
		if (!ipv6_addr_any(&output->u.ipv6.dst)) {
			if (nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_DST,
					     &output->u.ipv6.dst))
				return -EMSGSIZE;
		}
	}

	if (output->tos) {
		if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos))
			return -EMSGSIZE;
	}

	/* TTL is written unconditionally, even when it is zero. */
	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl))
		return -EMSGSIZE;

	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, output->tun_flags)) {
		if (nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
			return -EMSGSIZE;
	}

	if (test_bit(IP_TUNNEL_CSUM_BIT, output->tun_flags)) {
		if (nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
			return -EMSGSIZE;
	}

	if (output->tp_src) {
		if (nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_SRC,
				 output->tp_src))
			return -EMSGSIZE;
	}

	if (output->tp_dst) {
		if (nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_DST,
				 output->tp_dst))
			return -EMSGSIZE;
	}

	if (test_bit(IP_TUNNEL_OAM_BIT, output->tun_flags)) {
		if (nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
			return -EMSGSIZE;
	}

	if (!swkey_tun_opts_len)
		return 0;

	/*
	 * Tunnel options: each option type is tried in turn.  A type whose
	 * flag bit is clear, or whose put succeeds, falls through to the
	 * next check.
	 */
	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, output->tun_flags) &&
	    nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
		    swkey_tun_opts_len, tun_opts))
		return -EMSGSIZE;

	if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, output->tun_flags) &&
	    vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
		return -EMSGSIZE;

	if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, output->tun_flags) &&
	    nla_put(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
		    swkey_tun_opts_len, tun_opts))
		return -EMSGSIZE;

	return 0;
}
|
|
|
|
|
2015-10-05 11:09:47 +00:00
|
|
|
/*
 * Emit a tunnel key wrapped in an OVS_KEY_ATTR_TUNNEL nest.
 *
 * Opens the nest, delegates the contents to __ip_tun_to_nlattr() with the
 * same arguments, and closes the nest.
 *
 * Returns 0 on success, -EMSGSIZE if the nest cannot be opened, or the
 * error returned by __ip_tun_to_nlattr().  When filling the nest fails the
 * nest is cancelled, so @skb does not keep a partially populated
 * OVS_KEY_ATTR_TUNNEL attribute.
 */
static int ip_tun_to_nlattr(struct sk_buff *skb,
			    const struct ip_tunnel_key *output,
			    const void *tun_opts, int swkey_tun_opts_len,
			    unsigned short tun_proto, u8 mode)
{
	struct nlattr *nla;
	int err;

	nla = nla_nest_start_noflag(skb, OVS_KEY_ATTR_TUNNEL);
	if (!nla)
		return -EMSGSIZE;

	err = __ip_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len,
				 tun_proto, mode);
	if (err) {
		/* Drop the nest header and anything written inside it. */
		nla_nest_cancel(skb, nla);
		return err;
	}

	nla_nest_end(skb, nla);
	return 0;
}
|
|
|
|
|
2015-10-23 01:17:16 +00:00
|
|
|
int ovs_nla_put_tunnel_info(struct sk_buff *skb,
|
|
|
|
struct ip_tunnel_info *tun_info)
|
2014-11-06 14:51:24 +00:00
|
|
|
{
|
2015-10-24 13:54:12 +00:00
|
|
|
return __ip_tun_to_nlattr(skb, &tun_info->key,
|
|
|
|
ip_tunnel_info_opts(tun_info),
|
|
|
|
tun_info->options_len,
|
2019-03-28 04:43:23 +00:00
|
|
|
ip_tunnel_info_af(tun_info), tun_info->mode);
|
2014-11-06 14:51:24 +00:00
|
|
|
}
|
|
|
|
|
2016-09-07 16:56:59 +00:00
|
|
|
/*
 * Store the VLAN TPID and TCI found in @a into @match.
 *
 * The TCI comes from OVS_KEY_ATTR_VLAN and the TPID from
 * OVS_KEY_ATTR_ETHERTYPE; either is taken as 0 when its attribute is
 * absent.  The pair goes into eth.cvlan when @inner is true and into
 * eth.vlan otherwise; @is_mask is passed to SW_FLOW_KEY_PUT unchanged.
 *
 * Always returns 0.
 */
static int encode_vlan_from_nlattrs(struct sw_flow_match *match,
				    const struct nlattr *a[],
				    bool is_mask, bool inner)
{
	const struct nlattr *vlan_attr = a[OVS_KEY_ATTR_VLAN];
	const struct nlattr *type_attr = a[OVS_KEY_ATTR_ETHERTYPE];
	__be16 tci = vlan_attr ? nla_get_be16(vlan_attr) : 0;
	__be16 tpid = type_attr ? nla_get_be16(type_attr) : 0;

	if (likely(!inner)) {
		SW_FLOW_KEY_PUT(match, eth.vlan.tpid, tpid, is_mask);
		SW_FLOW_KEY_PUT(match, eth.vlan.tci, tci, is_mask);
		return 0;
	}

	SW_FLOW_KEY_PUT(match, eth.cvlan.tpid, tpid, is_mask);
	SW_FLOW_KEY_PUT(match, eth.cvlan.tci, tci, is_mask);
	return 0;
}
|
|
|
|
|
|
|
|
static int validate_vlan_from_nlattrs(const struct sw_flow_match *match,
|
|
|
|
u64 key_attrs, bool inner,
|
|
|
|
const struct nlattr **a, bool log)
|
|
|
|
{
|
|
|
|
__be16 tci = 0;
|
|
|
|
|
|
|
|
if (!((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
|
|
|
|
(key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
|
|
|
|
eth_type_vlan(nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE])))) {
|
|
|
|
/* Not a VLAN. */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
|
|
|
|
(key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
|
|
|
|
OVS_NLERR(log, "Invalid %s frame", (inner) ? "C-VLAN" : "VLAN");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (a[OVS_KEY_ATTR_VLAN])
|
|
|
|
tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
|
|
|
|
|
2018-11-08 17:44:50 +00:00
|
|
|
if (!(tci & htons(VLAN_CFI_MASK))) {
|
2016-09-07 16:56:59 +00:00
|
|
|
if (tci) {
|
2018-11-08 17:44:50 +00:00
|
|
|
OVS_NLERR(log, "%s TCI does not have VLAN_CFI_MASK bit set.",
|
2016-09-07 16:56:59 +00:00
|
|
|
(inner) ? "C-VLAN" : "VLAN");
|
|
|
|
return -EINVAL;
|
|
|
|
} else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) {
|
|
|
|
/* Corner case for truncated VLAN header. */
|
|
|
|
OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.",
|
|
|
|
(inner) ? "C-VLAN" : "VLAN");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match,
|
|
|
|
u64 key_attrs, bool inner,
|
|
|
|
const struct nlattr **a, bool log)
|
|
|
|
{
|
|
|
|
__be16 tci = 0;
|
|
|
|
__be16 tpid = 0;
|
|
|
|
bool encap_valid = !!(match->key->eth.vlan.tci &
|
2018-11-08 17:44:50 +00:00
|
|
|
htons(VLAN_CFI_MASK));
|
2016-09-07 16:56:59 +00:00
|
|
|
bool i_encap_valid = !!(match->key->eth.cvlan.tci &
|
2018-11-08 17:44:50 +00:00
|
|
|
htons(VLAN_CFI_MASK));
|
2016-09-07 16:56:59 +00:00
|
|
|
|
|
|
|
if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) {
|
|
|
|
/* Not a VLAN. */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((!inner && !encap_valid) || (inner && !i_encap_valid)) {
|
|
|
|
OVS_NLERR(log, "Encap mask attribute is set for non-%s frame.",
|
|
|
|
(inner) ? "C-VLAN" : "VLAN");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (a[OVS_KEY_ATTR_VLAN])
|
|
|
|
tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
|
|
|
|
|
|
|
|
if (a[OVS_KEY_ATTR_ETHERTYPE])
|
|
|
|
tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
|
|
|
|
|
|
|
|
if (tpid != htons(0xffff)) {
|
|
|
|
OVS_NLERR(log, "Must have an exact match on %s TPID (mask=%x).",
|
|
|
|
(inner) ? "C-VLAN" : "VLAN", ntohs(tpid));
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2018-11-08 17:44:50 +00:00
|
|
|
if (!(tci & htons(VLAN_CFI_MASK))) {
|
|
|
|
OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_CFI_MASK bit.",
|
2016-09-07 16:56:59 +00:00
|
|
|
(inner) ? "C-VLAN" : "VLAN");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __parse_vlan_from_nlattrs(struct sw_flow_match *match,
|
|
|
|
u64 *key_attrs, bool inner,
|
|
|
|
const struct nlattr **a, bool is_mask,
|
|
|
|
bool log)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
const struct nlattr *encap;
|
|
|
|
|
|
|
|
if (!is_mask)
|
|
|
|
err = validate_vlan_from_nlattrs(match, *key_attrs, inner,
|
|
|
|
a, log);
|
|
|
|
else
|
|
|
|
err = validate_vlan_mask_from_nlattrs(match, *key_attrs, inner,
|
|
|
|
a, log);
|
|
|
|
if (err <= 0)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
err = encode_vlan_from_nlattrs(match, a, is_mask, inner);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
*key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
|
|
|
|
*key_attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
|
|
|
|
*key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
|
|
|
|
|
|
|
|
encap = a[OVS_KEY_ATTR_ENCAP];
|
|
|
|
|
|
|
|
if (!is_mask)
|
|
|
|
err = parse_flow_nlattrs(encap, a, key_attrs, log);
|
|
|
|
else
|
|
|
|
err = parse_flow_mask_nlattrs(encap, a, key_attrs, log);
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int parse_vlan_from_nlattrs(struct sw_flow_match *match,
|
|
|
|
u64 *key_attrs, const struct nlattr **a,
|
|
|
|
bool is_mask, bool log)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
bool encap_valid = false;
|
|
|
|
|
|
|
|
err = __parse_vlan_from_nlattrs(match, key_attrs, false, a,
|
|
|
|
is_mask, log);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2018-11-08 17:44:50 +00:00
|
|
|
encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_CFI_MASK));
|
2016-09-07 16:56:59 +00:00
|
|
|
if (encap_valid) {
|
|
|
|
err = __parse_vlan_from_nlattrs(match, key_attrs, true, a,
|
|
|
|
is_mask, log);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-11-10 15:28:22 +00:00
|
|
|
static int parse_eth_type_from_nlattrs(struct sw_flow_match *match,
|
|
|
|
u64 *attrs, const struct nlattr **a,
|
|
|
|
bool is_mask, bool log)
|
|
|
|
{
|
|
|
|
__be16 eth_type;
|
|
|
|
|
|
|
|
eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
|
|
|
|
if (is_mask) {
|
|
|
|
/* Always exact match EtherType. */
|
|
|
|
eth_type = htons(0xffff);
|
|
|
|
} else if (!eth_proto_is_802_3(eth_type)) {
|
|
|
|
OVS_NLERR(log, "EtherType %x is less than min %x",
|
|
|
|
ntohs(eth_type), ETH_P_802_3_MIN);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
|
|
|
|
*attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-08-26 18:31:52 +00:00
|
|
|
static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
|
|
|
|
u64 *attrs, const struct nlattr **a,
|
|
|
|
bool is_mask, bool log)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
2016-11-10 15:28:22 +00:00
|
|
|
u8 mac_proto = MAC_PROTO_ETHERNET;
|
|
|
|
|
2014-09-16 02:37:25 +00:00
|
|
|
if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) {
|
|
|
|
u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]);
|
|
|
|
|
|
|
|
SW_FLOW_KEY_PUT(match, ovs_flow_hash, hash_val, is_mask);
|
|
|
|
*attrs &= ~(1 << OVS_KEY_ATTR_DP_HASH);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (*attrs & (1 << OVS_KEY_ATTR_RECIRC_ID)) {
|
|
|
|
u32 recirc_id = nla_get_u32(a[OVS_KEY_ATTR_RECIRC_ID]);
|
|
|
|
|
|
|
|
SW_FLOW_KEY_PUT(match, recirc_id, recirc_id, is_mask);
|
|
|
|
*attrs &= ~(1 << OVS_KEY_ATTR_RECIRC_ID);
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
|
|
|
|
SW_FLOW_KEY_PUT(match, phy.priority,
|
|
|
|
nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask);
|
|
|
|
*attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
|
|
|
|
u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
|
|
|
|
|
2014-10-06 12:08:38 +00:00
|
|
|
if (is_mask) {
|
2013-10-04 01:16:47 +00:00
|
|
|
in_port = 0xffffffff; /* Always exact match in_port. */
|
2014-10-06 12:08:38 +00:00
|
|
|
} else if (in_port >= DP_MAX_PORTS) {
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "Port %d exceeds max allowable %d",
|
2014-10-06 12:08:38 +00:00
|
|
|
in_port, DP_MAX_PORTS);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
2014-10-06 12:08:38 +00:00
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
|
|
|
|
*attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
|
|
|
|
} else if (!is_mask) {
|
|
|
|
SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
|
|
|
|
uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
|
|
|
|
|
|
|
|
SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask);
|
|
|
|
*attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
|
|
|
|
}
|
|
|
|
if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
|
2015-10-05 11:09:47 +00:00
|
|
|
if (ip_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
|
|
|
|
is_mask, log) < 0)
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
*attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
|
|
|
|
}
|
2015-08-26 18:31:48 +00:00
|
|
|
|
|
|
|
if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) &&
|
2015-08-26 18:31:52 +00:00
|
|
|
ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) {
|
2015-10-06 18:00:00 +00:00
|
|
|
u32 ct_state = nla_get_u32(a[OVS_KEY_ATTR_CT_STATE]);
|
2015-08-26 18:31:48 +00:00
|
|
|
|
2015-10-20 02:18:57 +00:00
|
|
|
if (ct_state & ~CT_SUPPORTED_MASK) {
|
2015-10-06 18:00:00 +00:00
|
|
|
OVS_NLERR(log, "ct_state flags %08x unsupported",
|
2015-10-06 17:59:59 +00:00
|
|
|
ct_state);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2015-08-26 18:31:48 +00:00
|
|
|
|
2017-02-09 19:22:01 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask);
|
2015-08-26 18:31:48 +00:00
|
|
|
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE);
|
|
|
|
}
|
|
|
|
if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) &&
|
2015-08-26 18:31:52 +00:00
|
|
|
ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) {
|
2015-08-26 18:31:48 +00:00
|
|
|
u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]);
|
|
|
|
|
2017-02-09 19:22:01 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask);
|
2015-08-26 18:31:48 +00:00
|
|
|
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE);
|
|
|
|
}
|
2015-08-26 18:31:49 +00:00
|
|
|
if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) &&
|
2015-08-26 18:31:52 +00:00
|
|
|
ovs_ct_verify(net, OVS_KEY_ATTR_CT_MARK)) {
|
2015-08-26 18:31:49 +00:00
|
|
|
u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]);
|
|
|
|
|
|
|
|
SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask);
|
|
|
|
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK);
|
|
|
|
}
|
2015-10-01 22:00:37 +00:00
|
|
|
if (*attrs & (1 << OVS_KEY_ATTR_CT_LABELS) &&
|
|
|
|
ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABELS)) {
|
|
|
|
const struct ovs_key_ct_labels *cl;
|
2015-08-26 18:31:52 +00:00
|
|
|
|
2015-10-01 22:00:37 +00:00
|
|
|
cl = nla_data(a[OVS_KEY_ATTR_CT_LABELS]);
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, ct.labels, cl->ct_labels,
|
2015-08-26 18:31:52 +00:00
|
|
|
sizeof(*cl), is_mask);
|
2015-10-01 22:00:37 +00:00
|
|
|
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS);
|
2015-08-26 18:31:52 +00:00
|
|
|
}
|
openvswitch: Add original direction conntrack tuple to sw_flow_key.
Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key. The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry. This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.
The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.
The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple. This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.
When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state. While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards. If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change. When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.
It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information. If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.
The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields. Hence, the IP addresses are overlaid in union with ARP
and ND fields. This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets. ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-09 19:21:59 +00:00
|
|
|
if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) {
|
|
|
|
const struct ovs_key_ct_tuple_ipv4 *ct;
|
|
|
|
|
|
|
|
ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]);
|
|
|
|
|
|
|
|
SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
|
2017-02-09 19:22:01 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask);
|
openvswitch: Add original direction conntrack tuple to sw_flow_key.
Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key. The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry. This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.
The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.
The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple. This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.
When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state. While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards. If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change. When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.
It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information. If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.
The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields. Hence, the IP addresses are overlaid in union with ARP
and ND fields. This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets. ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-09 19:21:59 +00:00
|
|
|
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4);
|
|
|
|
}
|
|
|
|
if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) {
|
|
|
|
const struct ovs_key_ct_tuple_ipv6 *ct;
|
|
|
|
|
|
|
|
ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]);
|
|
|
|
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src,
|
|
|
|
sizeof(match->key->ipv6.ct_orig.src),
|
|
|
|
is_mask);
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst,
|
|
|
|
sizeof(match->key->ipv6.ct_orig.dst),
|
|
|
|
is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
|
2017-02-09 19:22:01 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask);
|
openvswitch: Add original direction conntrack tuple to sw_flow_key.
Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key. The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry. This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.
The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.
The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple. This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.
When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state. While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards. If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change. When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.
It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information. If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.
The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields. Hence, the IP addresses are overlaid in union with ARP
and ND fields. This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets. ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-09 19:21:59 +00:00
|
|
|
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
|
|
|
|
}
|
2016-11-10 15:28:18 +00:00
|
|
|
|
2016-11-10 15:28:22 +00:00
|
|
|
/* For layer 3 packets the Ethernet type is provided
|
|
|
|
* and treated as metadata but no MAC addresses are provided.
|
|
|
|
*/
|
|
|
|
if (!(*attrs & (1ULL << OVS_KEY_ATTR_ETHERNET)) &&
|
|
|
|
(*attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE)))
|
|
|
|
mac_proto = MAC_PROTO_NONE;
|
|
|
|
|
2016-11-10 15:28:18 +00:00
|
|
|
/* Always exact match mac_proto */
|
2016-11-10 15:28:22 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, mac_proto, is_mask ? 0xff : mac_proto, is_mask);
|
|
|
|
|
|
|
|
if (mac_proto == MAC_PROTO_NONE)
|
|
|
|
return parse_eth_type_from_nlattrs(match, attrs, a, is_mask,
|
|
|
|
log);
|
2016-11-10 15:28:18 +00:00
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-11-07 13:07:02 +00:00
|
|
|
int nsh_hdr_from_nlattr(const struct nlattr *attr,
|
|
|
|
struct nshhdr *nh, size_t size)
|
|
|
|
{
|
|
|
|
struct nlattr *a;
|
|
|
|
int rem;
|
|
|
|
u8 flags = 0;
|
|
|
|
u8 ttl = 0;
|
|
|
|
int mdlen = 0;
|
|
|
|
|
|
|
|
/* validate_nsh has check this, so we needn't do duplicate check here
|
|
|
|
*/
|
|
|
|
if (size < NSH_BASE_HDR_LEN)
|
|
|
|
return -ENOBUFS;
|
|
|
|
|
|
|
|
nla_for_each_nested(a, attr, rem) {
|
|
|
|
int type = nla_type(a);
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case OVS_NSH_KEY_ATTR_BASE: {
|
|
|
|
const struct ovs_nsh_key_base *base = nla_data(a);
|
|
|
|
|
|
|
|
flags = base->flags;
|
|
|
|
ttl = base->ttl;
|
|
|
|
nh->np = base->np;
|
|
|
|
nh->mdtype = base->mdtype;
|
|
|
|
nh->path_hdr = base->path_hdr;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case OVS_NSH_KEY_ATTR_MD1:
|
|
|
|
mdlen = nla_len(a);
|
|
|
|
if (mdlen > size - NSH_BASE_HDR_LEN)
|
|
|
|
return -ENOBUFS;
|
|
|
|
memcpy(&nh->md1, nla_data(a), mdlen);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OVS_NSH_KEY_ATTR_MD2:
|
|
|
|
mdlen = nla_len(a);
|
|
|
|
if (mdlen > size - NSH_BASE_HDR_LEN)
|
|
|
|
return -ENOBUFS;
|
|
|
|
memcpy(&nh->md2, nla_data(a), mdlen);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* nsh header length = NSH_BASE_HDR_LEN + mdlen */
|
|
|
|
nh->ver_flags_ttl_len = 0;
|
|
|
|
nsh_set_flags_ttl_len(nh, flags, ttl, NSH_BASE_HDR_LEN + mdlen);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int nsh_key_from_nlattr(const struct nlattr *attr,
|
|
|
|
struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask)
|
|
|
|
{
|
|
|
|
struct nlattr *a;
|
|
|
|
int rem;
|
|
|
|
|
|
|
|
/* validate_nsh has check this, so we needn't do duplicate check here
|
|
|
|
*/
|
|
|
|
nla_for_each_nested(a, attr, rem) {
|
|
|
|
int type = nla_type(a);
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case OVS_NSH_KEY_ATTR_BASE: {
|
|
|
|
const struct ovs_nsh_key_base *base = nla_data(a);
|
|
|
|
const struct ovs_nsh_key_base *base_mask = base + 1;
|
|
|
|
|
|
|
|
nsh->base = *base;
|
|
|
|
nsh_mask->base = *base_mask;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case OVS_NSH_KEY_ATTR_MD1: {
|
|
|
|
const struct ovs_nsh_key_md1 *md1 = nla_data(a);
|
|
|
|
const struct ovs_nsh_key_md1 *md1_mask = md1 + 1;
|
|
|
|
|
|
|
|
memcpy(nsh->context, md1->context, sizeof(*md1));
|
|
|
|
memcpy(nsh_mask->context, md1_mask->context,
|
|
|
|
sizeof(*md1_mask));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case OVS_NSH_KEY_ATTR_MD2:
|
|
|
|
/* Not supported yet */
|
|
|
|
return -ENOTSUPP;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Validate the nested NSH attributes in @attr and store the BASE and MD1
 * fields into @match.
 *
 * @attr:        nested OVS_NSH_KEY_ATTR_* attributes.
 * @match:       destination; the key or the mask is written depending on
 *               @is_mask.
 * @is_mask:     true when @attr describes a mask.
 * @is_push_nsh: true when parsing on behalf of a push_nsh action; must not
 *               be combined with @is_mask.  MD2 is accepted only in this
 *               mode, and both a BASE and a metadata attribute are then
 *               required.
 * @log:         passed to OVS_NLERR() to enable or suppress every error
 *               message emitted here.
 *
 * Returns 0 on success, -ENOTSUPP for MD2 outside push_nsh, and -EINVAL for
 * any other validation failure.
 */
static int nsh_key_put_from_nlattr(const struct nlattr *attr,
				   struct sw_flow_match *match, bool is_mask,
				   bool is_push_nsh, bool log)
{
	struct nlattr *a;
	int rem;
	bool has_base = false;
	bool has_md1 = false;
	bool has_md2 = false;
	u8 mdtype = 0;
	int mdlen = 0;

	if (WARN_ON(is_push_nsh && is_mask))
		return -EINVAL;

	nla_for_each_nested(a, attr, rem) {
		int type = nla_type(a);
		int i;

		/* Range-check before indexing ovs_nsh_key_attr_lens[]. */
		if (type > OVS_NSH_KEY_ATTR_MAX) {
			OVS_NLERR(log, "nsh attr %d is out of range max %d",
				  type, OVS_NSH_KEY_ATTR_MAX);
			return -EINVAL;
		}

		if (!check_attr_len(nla_len(a),
				    ovs_nsh_key_attr_lens[type].len)) {
			OVS_NLERR(log,
				  "nsh attr %d has unexpected len %d expected %d",
				  type, nla_len(a),
				  ovs_nsh_key_attr_lens[type].len);
			return -EINVAL;
		}

		switch (type) {
		case OVS_NSH_KEY_ATTR_BASE: {
			const struct ovs_nsh_key_base *base = nla_data(a);

			has_base = true;
			mdtype = base->mdtype;
			SW_FLOW_KEY_PUT(match, nsh.base.flags,
					base->flags, is_mask);
			SW_FLOW_KEY_PUT(match, nsh.base.ttl,
					base->ttl, is_mask);
			SW_FLOW_KEY_PUT(match, nsh.base.mdtype,
					base->mdtype, is_mask);
			SW_FLOW_KEY_PUT(match, nsh.base.np,
					base->np, is_mask);
			SW_FLOW_KEY_PUT(match, nsh.base.path_hdr,
					base->path_hdr, is_mask);
			break;
		}
		case OVS_NSH_KEY_ATTR_MD1: {
			const struct ovs_nsh_key_md1 *md1 = nla_data(a);

			has_md1 = true;
			for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++)
				SW_FLOW_KEY_PUT(match, nsh.context[i],
						md1->context[i], is_mask);
			break;
		}
		case OVS_NSH_KEY_ATTR_MD2:
			if (!is_push_nsh) /* Not supported MD type 2 yet */
				return -ENOTSUPP;

			has_md2 = true;
			mdlen = nla_len(a);
			if (mdlen > NSH_CTX_HDRS_MAX_LEN || mdlen <= 0) {
				/* mdtype is still 0 here unless a BASE
				 * attribute came earlier in the stream.
				 */
				OVS_NLERR(log,
					  "Invalid MD length %d for MD type %d",
					  mdlen, mdtype);
				return -EINVAL;
			}
			break;
		default:
			OVS_NLERR(log, "Unknown nsh attribute %d",
				  type);
			return -EINVAL;
		}
	}

	if (rem > 0) {
		OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem);
		return -EINVAL;
	}

	if (has_md1 && has_md2) {
		OVS_NLERR(log,
			  "invalid nsh attribute: md1 and md2 are exclusive.");
		return -EINVAL;
	}

	if (!is_mask) {
		/* The metadata attribute must agree with the BASE mdtype. */
		if ((has_md1 && mdtype != NSH_M_TYPE1) ||
		    (has_md2 && mdtype != NSH_M_TYPE2)) {
			OVS_NLERR(log, "nsh attribute has unmatched MD type %d.",
				  mdtype);
			return -EINVAL;
		}

		if (is_push_nsh &&
		    (!has_base || (!has_md1 && !has_md2))) {
			OVS_NLERR(log,
				  "push_nsh: missing base or metadata attributes");
			return -EINVAL;
		}
	}

	return 0;
}
|
|
|
|
|
2015-08-26 18:31:52 +00:00
|
|
|
static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
|
|
|
|
u64 attrs, const struct nlattr **a,
|
|
|
|
bool is_mask, bool log)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
2015-08-26 18:31:52 +00:00
|
|
|
err = metadata_from_nlattrs(net, match, &attrs, a, is_mask, log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) {
|
|
|
|
const struct ovs_key_ethernet *eth_key;
|
|
|
|
|
|
|
|
eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, eth.src,
|
|
|
|
eth_key->eth_src, ETH_ALEN, is_mask);
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, eth.dst,
|
|
|
|
eth_key->eth_dst, ETH_ALEN, is_mask);
|
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
|
|
|
|
|
2016-11-10 15:28:22 +00:00
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
|
|
|
|
/* VLAN attribute is always parsed before getting here since it
|
|
|
|
* may occur multiple times.
|
|
|
|
*/
|
|
|
|
OVS_NLERR(log, "VLAN attribute unexpected.");
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2016-11-10 15:28:22 +00:00
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
|
|
|
|
err = parse_eth_type_from_nlattrs(match, &attrs, a, is_mask,
|
|
|
|
log);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
} else if (!is_mask) {
|
|
|
|
SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
|
|
|
|
}
|
|
|
|
} else if (!match->key->eth.type) {
|
|
|
|
OVS_NLERR(log, "Either Ethernet header or EtherType is required.");
|
|
|
|
return -EINVAL;
|
2013-10-04 01:16:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
|
|
|
|
const struct ovs_key_ipv4 *ipv4_key;
|
|
|
|
|
|
|
|
ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
|
|
|
|
if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) {
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "IPv4 frag type %d is out of range max %d",
|
|
|
|
ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
SW_FLOW_KEY_PUT(match, ip.proto,
|
|
|
|
ipv4_key->ipv4_proto, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ip.tos,
|
|
|
|
ipv4_key->ipv4_tos, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ip.ttl,
|
|
|
|
ipv4_key->ipv4_ttl, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ip.frag,
|
|
|
|
ipv4_key->ipv4_frag, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ipv4.addr.src,
|
|
|
|
ipv4_key->ipv4_src, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
|
|
|
|
ipv4_key->ipv4_dst, is_mask);
|
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_IPV6)) {
|
|
|
|
const struct ovs_key_ipv6 *ipv6_key;
|
|
|
|
|
|
|
|
ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
|
|
|
|
if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) {
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "IPv6 frag type %d is out of range max %d",
|
|
|
|
ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
2014-11-11 22:36:30 +00:00
|
|
|
|
2014-11-19 21:54:49 +00:00
|
|
|
if (!is_mask && ipv6_key->ipv6_label & htonl(0xFFF00000)) {
|
2017-08-11 11:26:26 +00:00
|
|
|
OVS_NLERR(log, "IPv6 flow label %x is out of range (max=%x)",
|
2014-11-11 22:36:30 +00:00
|
|
|
ntohl(ipv6_key->ipv6_label), (1 << 20) - 1);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, ipv6.label,
|
|
|
|
ipv6_key->ipv6_label, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ip.proto,
|
|
|
|
ipv6_key->ipv6_proto, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ip.tos,
|
|
|
|
ipv6_key->ipv6_tclass, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ip.ttl,
|
|
|
|
ipv6_key->ipv6_hlimit, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ip.frag,
|
|
|
|
ipv6_key->ipv6_frag, is_mask);
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src,
|
|
|
|
ipv6_key->ipv6_src,
|
|
|
|
sizeof(match->key->ipv6.addr.src),
|
|
|
|
is_mask);
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst,
|
|
|
|
ipv6_key->ipv6_dst,
|
|
|
|
sizeof(match->key->ipv6.addr.dst),
|
|
|
|
is_mask);
|
|
|
|
|
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
|
|
|
|
}
|
|
|
|
|
2022-02-24 00:54:09 +00:00
|
|
|
if (attrs & (1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS)) {
|
|
|
|
const struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key;
|
|
|
|
|
|
|
|
ipv6_exthdrs_key = nla_data(a[OVS_KEY_ATTR_IPV6_EXTHDRS]);
|
|
|
|
|
|
|
|
SW_FLOW_KEY_PUT(match, ipv6.exthdrs,
|
|
|
|
ipv6_exthdrs_key->hdrs, is_mask);
|
|
|
|
|
|
|
|
attrs &= ~(1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS);
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_ARP)) {
|
|
|
|
const struct ovs_key_arp *arp_key;
|
|
|
|
|
|
|
|
arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
|
|
|
|
if (!is_mask && (arp_key->arp_op & htons(0xff00))) {
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "Unknown ARP opcode (opcode=%d).",
|
2013-10-04 01:16:47 +00:00
|
|
|
arp_key->arp_op);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
SW_FLOW_KEY_PUT(match, ipv4.addr.src,
|
|
|
|
arp_key->arp_sip, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
|
|
|
|
arp_key->arp_tip, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, ip.proto,
|
|
|
|
ntohs(arp_key->arp_op), is_mask);
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha,
|
|
|
|
arp_key->arp_sha, ETH_ALEN, is_mask);
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha,
|
|
|
|
arp_key->arp_tha, ETH_ALEN, is_mask);
|
|
|
|
|
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_ARP);
|
|
|
|
}
|
|
|
|
|
2017-11-07 13:07:02 +00:00
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_NSH)) {
|
|
|
|
if (nsh_key_put_from_nlattr(a[OVS_KEY_ATTR_NSH], match,
|
|
|
|
is_mask, false, log) < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_NSH);
|
|
|
|
}
|
|
|
|
|
2014-10-06 12:05:13 +00:00
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_MPLS)) {
|
|
|
|
const struct ovs_key_mpls *mpls_key;
|
2019-11-04 01:57:44 +00:00
|
|
|
u32 hdr_len;
|
|
|
|
u32 label_count, label_count_mask, i;
|
2014-10-06 12:05:13 +00:00
|
|
|
|
|
|
|
mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]);
|
2019-11-04 01:57:44 +00:00
|
|
|
hdr_len = nla_len(a[OVS_KEY_ATTR_MPLS]);
|
|
|
|
label_count = hdr_len / sizeof(struct ovs_key_mpls);
|
|
|
|
|
|
|
|
if (label_count == 0 || label_count > MPLS_LABEL_DEPTH ||
|
|
|
|
hdr_len % sizeof(struct ovs_key_mpls))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
label_count_mask = GENMASK(label_count - 1, 0);
|
|
|
|
|
|
|
|
for (i = 0 ; i < label_count; i++)
|
|
|
|
SW_FLOW_KEY_PUT(match, mpls.lse[i],
|
|
|
|
mpls_key[i].mpls_lse, is_mask);
|
|
|
|
|
|
|
|
SW_FLOW_KEY_PUT(match, mpls.num_labels_mask,
|
|
|
|
label_count_mask, is_mask);
|
2014-10-06 12:05:13 +00:00
|
|
|
|
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_MPLS);
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
|
|
|
|
const struct ovs_key_tcp *tcp_key;
|
|
|
|
|
|
|
|
tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
|
2014-05-05 16:54:49 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tp.src, tcp_key->tcp_src, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, tp.dst, tcp_key->tcp_dst, is_mask);
|
2013-10-04 01:16:47 +00:00
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_TCP);
|
|
|
|
}
|
|
|
|
|
2013-10-23 08:44:59 +00:00
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) {
|
2014-09-08 05:11:08 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tp.flags,
|
|
|
|
nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
|
|
|
|
is_mask);
|
2013-10-23 08:44:59 +00:00
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS);
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_UDP)) {
|
|
|
|
const struct ovs_key_udp *udp_key;
|
|
|
|
|
|
|
|
udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
|
2014-05-05 16:54:49 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tp.src, udp_key->udp_src, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, tp.dst, udp_key->udp_dst, is_mask);
|
2013-10-04 01:16:47 +00:00
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_UDP);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_SCTP)) {
|
|
|
|
const struct ovs_key_sctp *sctp_key;
|
|
|
|
|
|
|
|
sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]);
|
2014-05-05 16:54:49 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tp.src, sctp_key->sctp_src, is_mask);
|
|
|
|
SW_FLOW_KEY_PUT(match, tp.dst, sctp_key->sctp_dst, is_mask);
|
2013-10-04 01:16:47 +00:00
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_SCTP);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
|
|
|
|
const struct ovs_key_icmp *icmp_key;
|
|
|
|
|
|
|
|
icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
|
2014-05-05 16:54:49 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tp.src,
|
2013-10-04 01:16:47 +00:00
|
|
|
htons(icmp_key->icmp_type), is_mask);
|
2014-05-05 16:54:49 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tp.dst,
|
2013-10-04 01:16:47 +00:00
|
|
|
htons(icmp_key->icmp_code), is_mask);
|
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) {
|
|
|
|
const struct ovs_key_icmpv6 *icmpv6_key;
|
|
|
|
|
|
|
|
icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
|
2014-05-05 16:54:49 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tp.src,
|
2013-10-04 01:16:47 +00:00
|
|
|
htons(icmpv6_key->icmpv6_type), is_mask);
|
2014-05-05 16:54:49 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, tp.dst,
|
2013-10-04 01:16:47 +00:00
|
|
|
htons(icmpv6_key->icmpv6_code), is_mask);
|
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (attrs & (1 << OVS_KEY_ATTR_ND)) {
|
|
|
|
const struct ovs_key_nd *nd_key;
|
|
|
|
|
|
|
|
nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target,
|
|
|
|
nd_key->nd_target,
|
|
|
|
sizeof(match->key->ipv6.nd.target),
|
|
|
|
is_mask);
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll,
|
|
|
|
nd_key->nd_sll, ETH_ALEN, is_mask);
|
|
|
|
SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll,
|
|
|
|
nd_key->nd_tll, ETH_ALEN, is_mask);
|
|
|
|
attrs &= ~(1 << OVS_KEY_ATTR_ND);
|
|
|
|
}
|
|
|
|
|
2014-10-06 12:08:38 +00:00
|
|
|
if (attrs != 0) {
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "Unknown key attributes %llx",
|
2014-10-06 12:08:38 +00:00
|
|
|
(unsigned long long)attrs);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
2014-10-06 12:08:38 +00:00
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-01-15 02:53:58 +00:00
|
|
|
/* Overwrite the payload of every attribute nested inside 'attr' with 'val'.
 *
 * 'tbl' is the length table describing the attributes at this nesting level.
 * An attribute that 'tbl' marks as OVS_ATTR_NESTED is descended into, using
 * that entry's 'next' table when one is given and 'tbl' itself otherwise.
 * The table is looked up afresh for each attribute, so siblings that follow
 * a nested attribute are still interpreted with 'tbl' and not with the
 * sub-table used for the recursion.
 */
static void nlattr_set(struct nlattr *attr, u8 val,
		       const struct ovs_len_tbl *tbl)
{
	struct nlattr *cur;
	int remaining;

	/* The nlattr stream should already have been validated */
	nla_for_each_nested(cur, attr, remaining) {
		const struct ovs_len_tbl *entry = &tbl[nla_type(cur)];

		if (entry->len != OVS_ATTR_NESTED)
			memset(nla_data(cur), val, nla_len(cur));
		else
			nlattr_set(cur, val, entry->next ? entry->next : tbl);

		/* Limit the conntrack state bits to CT_SUPPORTED_MASK. */
		if (nla_type(cur) == OVS_KEY_ATTR_CT_STATE)
			*(u32 *)nla_data(cur) &= CT_SUPPORTED_MASK;
	}
}
|
|
|
|
|
|
|
|
/* Fill the payload of every attribute nested in 'attr' with 'val', walking
 * the stream with the top-level 'ovs_key_lens' table.
 */
static void mask_set_nlattr(struct nlattr *attr, u8 val)
{
	nlattr_set(attr, val, ovs_key_lens);
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* ovs_nla_get_match - parses Netlink attributes into a flow key and
|
|
|
|
* mask. In case the 'mask' is NULL, the flow is treated as exact match
|
|
|
|
* flow. Otherwise, it is treated as a wildcarded flow, except the mask
|
|
|
|
* does not include any don't care bit.
|
2015-08-26 18:31:52 +00:00
|
|
|
* @net: Used to determine per-namespace field support.
|
2013-10-04 01:16:47 +00:00
|
|
|
* @match: receives the extracted flow match information.
|
2020-07-12 23:15:09 +00:00
|
|
|
* @nla_key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
|
2013-10-04 01:16:47 +00:00
|
|
|
* sequence. The fields should of the packet that triggered the creation
|
|
|
|
* of this flow.
|
2020-07-12 23:15:09 +00:00
|
|
|
* @nla_mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_*
|
|
|
|
* Netlink attribute specifies the mask field of the wildcarded flow.
|
2014-11-06 15:03:05 +00:00
|
|
|
* @log: Boolean to allow kernel error logging. Normally true, but when
|
|
|
|
* probing for feature compatibility this should be passed in as false to
|
|
|
|
* suppress unnecessary error logging.
|
2013-10-04 01:16:47 +00:00
|
|
|
*/
|
2015-08-26 18:31:52 +00:00
|
|
|
int ovs_nla_get_match(struct net *net, struct sw_flow_match *match,
|
2014-10-19 19:03:40 +00:00
|
|
|
const struct nlattr *nla_key,
|
2014-11-06 15:03:05 +00:00
|
|
|
const struct nlattr *nla_mask,
|
|
|
|
bool log)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
|
|
|
const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
|
2014-10-17 04:55:45 +00:00
|
|
|
struct nlattr *newmask = NULL;
|
2013-10-04 01:16:47 +00:00
|
|
|
u64 key_attrs = 0;
|
|
|
|
u64 mask_attrs = 0;
|
|
|
|
int err;
|
|
|
|
|
2014-11-06 15:03:05 +00:00
|
|
|
err = parse_flow_nlattrs(nla_key, a, &key_attrs, log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2016-09-07 16:56:59 +00:00
|
|
|
err = parse_vlan_from_nlattrs(match, &key_attrs, a, false, log);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2015-08-26 18:31:52 +00:00
|
|
|
err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2014-10-19 19:03:40 +00:00
|
|
|
if (match->mask) {
|
|
|
|
if (!nla_mask) {
|
|
|
|
/* Create an exact match mask. We need to set to 0xff
|
|
|
|
* all the 'match->mask' fields that have been touched
|
|
|
|
* in 'match->key'. We cannot simply memset
|
|
|
|
* 'match->mask', because padding bytes and fields not
|
|
|
|
* specified in 'match->key' should be left to 0.
|
|
|
|
* Instead, we use a stream of netlink attributes,
|
|
|
|
* copied from 'key' and set to 0xff.
|
|
|
|
* ovs_key_from_nlattrs() will take care of filling
|
|
|
|
* 'match->mask' appropriately.
|
|
|
|
*/
|
|
|
|
newmask = kmemdup(nla_key,
|
|
|
|
nla_total_size(nla_len(nla_key)),
|
|
|
|
GFP_KERNEL);
|
|
|
|
if (!newmask)
|
|
|
|
return -ENOMEM;
|
2014-10-17 04:55:45 +00:00
|
|
|
|
2014-10-19 19:03:40 +00:00
|
|
|
mask_set_nlattr(newmask, 0xff);
|
2014-10-17 04:55:45 +00:00
|
|
|
|
2014-10-19 19:03:40 +00:00
|
|
|
/* The userspace does not send tunnel attributes that
|
|
|
|
* are 0, but we should not wildcard them nonetheless.
|
|
|
|
*/
|
2015-10-05 11:09:46 +00:00
|
|
|
if (match->key->tun_proto)
|
2014-10-19 19:03:40 +00:00
|
|
|
SW_FLOW_KEY_MEMSET_FIELD(match, tun_key,
|
|
|
|
0xff, true);
|
2014-10-17 04:55:45 +00:00
|
|
|
|
2014-10-19 19:03:40 +00:00
|
|
|
nla_mask = newmask;
|
|
|
|
}
|
2014-10-17 04:55:45 +00:00
|
|
|
|
2014-11-06 15:03:05 +00:00
|
|
|
err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs, log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
2014-10-17 04:55:45 +00:00
|
|
|
goto free_newmask;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2014-10-19 19:03:40 +00:00
|
|
|
/* Always match on tci. */
|
2016-09-07 16:56:59 +00:00
|
|
|
SW_FLOW_KEY_PUT(match, eth.vlan.tci, htons(0xffff), true);
|
|
|
|
SW_FLOW_KEY_PUT(match, eth.cvlan.tci, htons(0xffff), true);
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2016-09-07 16:56:59 +00:00
|
|
|
err = parse_vlan_from_nlattrs(match, &mask_attrs, a, true, log);
|
|
|
|
if (err)
|
|
|
|
goto free_newmask;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2015-08-26 18:31:52 +00:00
|
|
|
err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true,
|
|
|
|
log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
2014-10-17 04:55:45 +00:00
|
|
|
goto free_newmask;
|
2013-10-04 01:16:47 +00:00
|
|
|
}
|
|
|
|
|
2014-11-06 15:03:05 +00:00
|
|
|
if (!match_validate(match, key_attrs, mask_attrs, log))
|
2014-10-17 04:55:45 +00:00
|
|
|
err = -EINVAL;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2014-10-17 04:55:45 +00:00
|
|
|
free_newmask:
|
|
|
|
kfree(newmask);
|
|
|
|
return err;
|
2013-10-04 01:16:47 +00:00
|
|
|
}
|
|
|
|
|
2015-01-22 00:42:52 +00:00
|
|
|
/* Returns the payload length of the UFID attribute 'attr', or 0 when 'attr'
 * is NULL or its length lies outside 1..MAX_UFID_LENGTH.  An out-of-range
 * length is reported through OVS_NLERR() subject to 'log'.
 */
static size_t get_ufid_len(const struct nlattr *attr, bool log)
{
	size_t ufid_len;

	if (!attr)
		return 0;

	ufid_len = nla_len(attr);
	if (ufid_len >= 1 && ufid_len <= MAX_UFID_LENGTH)
		return ufid_len;

	OVS_NLERR(log, "ufid size %u bytes exceeds the range (1, %d)",
		  nla_len(attr), MAX_UFID_LENGTH);
	return 0;
}
|
|
|
|
|
|
|
|
/* Initializes 'flow->ufid', returning true if 'attr' contains a valid UFID,
|
|
|
|
* or false otherwise.
|
|
|
|
*/
|
|
|
|
bool ovs_nla_get_ufid(struct sw_flow_id *sfid, const struct nlattr *attr,
|
|
|
|
bool log)
|
|
|
|
{
|
|
|
|
sfid->ufid_len = get_ufid_len(attr, log);
|
|
|
|
if (sfid->ufid_len)
|
|
|
|
memcpy(sfid->ufid, nla_data(attr), sfid->ufid_len);
|
|
|
|
|
|
|
|
return sfid->ufid_len;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Fills in the flow identifier 'sfid'.
 *
 * If 'ufid' holds a valid UFID it is stored in 'sfid' and nothing else is
 * done.  Otherwise a heap copy of 'key' is attached as 'sfid->unmasked_key'.
 * Nothing in this function releases that copy.
 *
 * Returns 0 on success or -ENOMEM if the key copy cannot be allocated, in
 * which case 'sfid->unmasked_key' is not modified.
 */
int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid,
			   const struct sw_flow_key *key, bool log)
{
	struct sw_flow_key *new_key;

	if (ovs_nla_get_ufid(sfid, ufid, log))
		return 0;

	/* If UFID was not provided, use unmasked key. */
	new_key = kmemdup(key, sizeof(*key), GFP_KERNEL);
	if (!new_key)
		return -ENOMEM;

	sfid->unmasked_key = new_key;

	return 0;
}
|
|
|
|
|
|
|
|
/* Returns the u32 UFID flags carried by 'attr', with 0 passed as the default
 * value to nla_get_u32_default() (presumably used when 'attr' is absent).
 */
u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
{
	return nla_get_u32_default(attr, 0);
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
/**
|
|
|
|
* ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
|
openvswitch: Add original direction conntrack tuple to sw_flow_key.
Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key. The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry. This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.
The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.
The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple. This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.
When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state. While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards. If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change. When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.
It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information. If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.
The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields. Hence, the IP addresses are overlaid in union with ARP
and ND fields. This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets. ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-09 19:21:59 +00:00
|
|
|
* @net: Network namespace.
|
|
|
|
* @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack
|
|
|
|
* metadata.
|
|
|
|
* @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink
|
|
|
|
* attributes.
|
|
|
|
* @attrs: Bit mask for the netlink attributes included in @a.
|
2014-11-06 15:03:05 +00:00
|
|
|
* @log: Boolean to allow kernel error logging. Normally true, but when
|
|
|
|
* probing for feature compatibility this should be passed in as false to
|
|
|
|
* suppress unnecessary error logging.
|
2013-10-04 01:16:47 +00:00
|
|
|
*
|
|
|
|
* This parses a series of Netlink attributes that form a flow key, which must
|
|
|
|
* take the same form accepted by flow_from_nlattrs(), but only enough of it to
|
|
|
|
* get the metadata, that is, the parts of the flow key that cannot be
|
|
|
|
* extracted from the packet itself.
|
openvswitch: Add original direction conntrack tuple to sw_flow_key.
Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key. The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry. This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.
The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.
The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple. This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.
When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state. While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards. If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change. When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.
It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information. If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.
The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields. Hence, the IP addresses are overlaid in union with ARP
and ND fields. This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets. ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-09 19:21:59 +00:00
|
|
|
*
|
|
|
|
* This must be called before the packet key fields are filled in 'key'.
|
2013-10-04 01:16:47 +00:00
|
|
|
*/
|
|
|
|
|
openvswitch: Add original direction conntrack tuple to sw_flow_key.
Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key. The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry. This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.
The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.
The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple. This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.
When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state. While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards. If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change. When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.
It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information. If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.
The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields. Hence, the IP addresses are overlaid in union with ARP
and ND fields. This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets. ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-09 19:21:59 +00:00
|
|
|
int ovs_nla_get_flow_metadata(struct net *net,
|
|
|
|
const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
|
|
|
|
u64 attrs, struct sw_flow_key *key, bool log)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
2014-09-16 02:20:31 +00:00
|
|
|
struct sw_flow_match match;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
memset(&match, 0, sizeof(match));
|
2014-09-16 02:20:31 +00:00
|
|
|
match.key = key;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2017-02-09 19:22:01 +00:00
|
|
|
key->ct_state = 0;
|
|
|
|
key->ct_zone = 0;
|
|
|
|
key->ct_orig_proto = 0;
|
2015-08-26 18:31:48 +00:00
|
|
|
memset(&key->ct, 0, sizeof(key->ct));
|
openvswitch: Add original direction conntrack tuple to sw_flow_key.
Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key. The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry. This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.
The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.
The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple. This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.
When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state. While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards. If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change. When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.
It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information. If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.
The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields. Hence, the IP addresses are overlaid in union with ARP
and ND fields. This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets. ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-09 19:21:59 +00:00
|
|
|
memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig));
|
|
|
|
memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig));
|
|
|
|
|
2014-09-16 02:20:31 +00:00
|
|
|
key->phy.in_port = DP_MAX_PORTS;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2015-08-26 18:31:52 +00:00
|
|
|
return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
|
2013-10-04 01:16:47 +00:00
|
|
|
}
|
|
|
|
|
2016-09-07 16:56:59 +00:00
|
|
|
static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh,
|
|
|
|
bool is_mask)
|
|
|
|
{
|
|
|
|
__be16 eth_type = !is_mask ? vh->tpid : htons(0xffff);
|
|
|
|
|
|
|
|
if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
|
|
|
|
nla_put_be16(skb, OVS_KEY_ATTR_VLAN, vh->tci))
|
|
|
|
return -EMSGSIZE;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-11-07 13:07:02 +00:00
|
|
|
/* Serializes the NSH key 'nsh' into 'skb' as a nested OVS_KEY_ATTR_NSH
 * attribute.
 *
 * The nest always carries OVS_NSH_KEY_ATTR_BASE.  OVS_NSH_KEY_ATTR_MD1 is
 * added as well when serializing a mask or when the base header's mdtype is
 * NSH_M_TYPE1.
 *
 * Returns 0 on success or -EMSGSIZE if the nest could not be started or an
 * attribute could not be added.
 */
static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask,
			     struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, OVS_KEY_ATTR_NSH);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put(skb, OVS_NSH_KEY_ATTR_BASE, sizeof(nsh->base), &nsh->base))
		return -EMSGSIZE;

	if ((is_mask || nsh->base.mdtype == NSH_M_TYPE1) &&
	    nla_put(skb, OVS_NSH_KEY_ATTR_MD1,
		    sizeof(nsh->context), nsh->context))
		return -EMSGSIZE;

	/* Don't support MD type 2 yet */

	nla_nest_end(skb, nest);

	return 0;
}
|
|
|
|
|
2015-01-22 00:42:48 +00:00
|
|
|
static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
|
|
|
|
const struct sw_flow_key *output, bool is_mask,
|
|
|
|
struct sk_buff *skb)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
|
|
|
struct ovs_key_ethernet *eth_key;
|
2016-09-07 16:56:59 +00:00
|
|
|
struct nlattr *nla;
|
|
|
|
struct nlattr *encap = NULL;
|
|
|
|
struct nlattr *in_encap = NULL;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2014-09-16 02:37:25 +00:00
|
|
|
if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
|
|
|
if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
2015-10-05 11:09:46 +00:00
|
|
|
if ((swkey->tun_proto || is_mask)) {
|
2015-01-15 02:53:57 +00:00
|
|
|
const void *opts = NULL;
|
2014-10-03 22:35:33 +00:00
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
if (ip_tunnel_is_options_present(output->tun_key.tun_flags))
|
2015-01-15 02:53:57 +00:00
|
|
|
opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len);
|
2014-10-03 22:35:33 +00:00
|
|
|
|
2015-10-05 11:09:47 +00:00
|
|
|
if (ip_tun_to_nlattr(skb, &output->tun_key, opts,
|
2019-03-28 04:43:23 +00:00
|
|
|
swkey->tun_opts_len, swkey->tun_proto, 0))
|
2014-10-03 22:35:33 +00:00
|
|
|
goto nla_put_failure;
|
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
if (swkey->phy.in_port == DP_MAX_PORTS) {
|
|
|
|
if (is_mask && (output->phy.in_port == 0xffff))
|
|
|
|
if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
|
|
|
|
goto nla_put_failure;
|
|
|
|
} else {
|
|
|
|
u16 upper_u16;
|
|
|
|
upper_u16 = !is_mask ? 0 : 0xffff;
|
|
|
|
|
|
|
|
if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT,
|
|
|
|
(upper_u16 << 16) | output->phy.in_port))
|
|
|
|
goto nla_put_failure;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
openvswitch: Add original direction conntrack tuple to sw_flow_key.
Add the fields of the conntrack original direction 5-tuple to struct
sw_flow_key. The new fields are initially marked as non-existent, and
are populated whenever a conntrack action is executed and either finds
or generates a conntrack entry. This means that these fields exist
for all packets that were not rejected by conntrack as untrackable.
The original tuple fields in the sw_flow_key are filled from the
original direction tuple of the conntrack entry relating to the
current packet, or from the original direction tuple of the master
conntrack entry, if the current conntrack entry has a master.
Generally, expected connections of connections having an assigned
helper (e.g., FTP), have a master conntrack entry.
The main purpose of the new conntrack original tuple fields is to
allow matching on them for policy decision purposes, with the premise
that the admissibility of tracked connections reply packets (as well
as original direction packets), and both direction packets of any
related connections may be based on ACL rules applying to the master
connection's original direction 5-tuple. This also makes it easier to
make policy decisions when the actual packet headers might have been
transformed by NAT, as the original direction 5-tuple represents the
packet headers before any such transformation.
When using the original direction 5-tuple the admissibility of return
and/or related packets need not be based on the mere existence of a
conntrack entry, allowing separation of admission policy from the
established conntrack state. While existence of a conntrack entry is
required for admission of the return or related packets, policy
changes can render connections that were initially admitted to be
rejected or dropped afterwards. If the admission of the return and
related packets was based on mere conntrack state (e.g., connection
being in an established state), a policy change that would make the
connection rejected or dropped would need to find and delete all
conntrack entries affected by such a change. When using the original
direction 5-tuple matching the affected conntrack entries can be
allowed to time out instead, as the established state of the
connection would not need to be the basis for packet admission any
more.
It should be noted that the directionality of related connections may
be the same or different than that of the master connection, and
neither the original direction 5-tuple nor the conntrack state bits
carry this information. If needed, the directionality of the master
connection can be stored in master's conntrack mark or labels, which
are automatically inherited by the expected related connections.
The fact that neither ARP nor ND packets are trackable by conntrack
allows mutual exclusion between ARP/ND and the new conntrack original
tuple fields. Hence, the IP addresses are overlaid in union with ARP
and ND fields. This allows the sw_flow_key to not grow much due to
this patch, but it also means that we must be careful to never use the
new key fields with ARP or ND packets. ARP is easy to distinguish and
keep mutually exclusive based on the ethernet type, but ND being an
ICMPv6 protocol requires a bit more attention.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-09 19:21:59 +00:00
|
|
|
if (ovs_ct_put_key(swkey, output, skb))
|
2015-08-26 18:31:48 +00:00
|
|
|
goto nla_put_failure;
|
|
|
|
|
2016-11-10 15:28:22 +00:00
|
|
|
if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) {
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
|
|
|
|
if (!nla)
|
2013-10-04 01:16:47 +00:00
|
|
|
goto nla_put_failure;
|
2016-09-07 16:56:59 +00:00
|
|
|
|
2016-11-10 15:28:22 +00:00
|
|
|
eth_key = nla_data(nla);
|
|
|
|
ether_addr_copy(eth_key->eth_src, output->eth.src);
|
|
|
|
ether_addr_copy(eth_key->eth_dst, output->eth.dst);
|
|
|
|
|
|
|
|
if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) {
|
|
|
|
if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask))
|
2016-09-07 16:56:59 +00:00
|
|
|
goto nla_put_failure;
|
2019-04-26 09:13:06 +00:00
|
|
|
encap = nla_nest_start_noflag(skb, OVS_KEY_ATTR_ENCAP);
|
2016-11-10 15:28:22 +00:00
|
|
|
if (!swkey->eth.vlan.tci)
|
2016-09-07 16:56:59 +00:00
|
|
|
goto unencap;
|
2016-11-10 15:28:22 +00:00
|
|
|
|
|
|
|
if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) {
|
|
|
|
if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask))
|
|
|
|
goto nla_put_failure;
|
2019-04-26 09:13:06 +00:00
|
|
|
in_encap = nla_nest_start_noflag(skb,
|
|
|
|
OVS_KEY_ATTR_ENCAP);
|
2016-11-10 15:28:22 +00:00
|
|
|
if (!swkey->eth.cvlan.tci)
|
|
|
|
goto unencap;
|
|
|
|
}
|
2016-09-07 16:56:59 +00:00
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2016-11-10 15:28:22 +00:00
|
|
|
if (swkey->eth.type == htons(ETH_P_802_2)) {
|
|
|
|
/*
|
|
|
|
* Ethertype 802.2 is represented in the netlink with omitted
|
|
|
|
* OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
|
|
|
|
* 0xffff in the mask attribute. Ethertype can also
|
|
|
|
* be wildcarded.
|
|
|
|
*/
|
|
|
|
if (is_mask && output->eth.type)
|
|
|
|
if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
|
|
|
|
output->eth.type))
|
|
|
|
goto nla_put_failure;
|
|
|
|
goto unencap;
|
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
|
|
|
|
goto nla_put_failure;
|
|
|
|
|
2016-09-07 16:56:59 +00:00
|
|
|
if (eth_type_vlan(swkey->eth.type)) {
|
|
|
|
/* There are 3 VLAN tags, we don't know anything about the rest
|
|
|
|
* of the packet, so truncate here.
|
|
|
|
*/
|
|
|
|
WARN_ON_ONCE(!(encap && in_encap));
|
|
|
|
goto unencap;
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
if (swkey->eth.type == htons(ETH_P_IP)) {
|
|
|
|
struct ovs_key_ipv4 *ipv4_key;
|
|
|
|
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
|
|
|
|
if (!nla)
|
|
|
|
goto nla_put_failure;
|
|
|
|
ipv4_key = nla_data(nla);
|
|
|
|
ipv4_key->ipv4_src = output->ipv4.addr.src;
|
|
|
|
ipv4_key->ipv4_dst = output->ipv4.addr.dst;
|
|
|
|
ipv4_key->ipv4_proto = output->ip.proto;
|
|
|
|
ipv4_key->ipv4_tos = output->ip.tos;
|
|
|
|
ipv4_key->ipv4_ttl = output->ip.ttl;
|
|
|
|
ipv4_key->ipv4_frag = output->ip.frag;
|
|
|
|
} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
|
|
|
|
struct ovs_key_ipv6 *ipv6_key;
|
2022-02-24 00:54:09 +00:00
|
|
|
struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
|
|
|
|
if (!nla)
|
|
|
|
goto nla_put_failure;
|
|
|
|
ipv6_key = nla_data(nla);
|
|
|
|
memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src,
|
|
|
|
sizeof(ipv6_key->ipv6_src));
|
|
|
|
memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst,
|
|
|
|
sizeof(ipv6_key->ipv6_dst));
|
|
|
|
ipv6_key->ipv6_label = output->ipv6.label;
|
|
|
|
ipv6_key->ipv6_proto = output->ip.proto;
|
|
|
|
ipv6_key->ipv6_tclass = output->ip.tos;
|
|
|
|
ipv6_key->ipv6_hlimit = output->ip.ttl;
|
|
|
|
ipv6_key->ipv6_frag = output->ip.frag;
|
2022-02-24 00:54:09 +00:00
|
|
|
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6_EXTHDRS,
|
|
|
|
sizeof(*ipv6_exthdrs_key));
|
|
|
|
if (!nla)
|
|
|
|
goto nla_put_failure;
|
|
|
|
ipv6_exthdrs_key = nla_data(nla);
|
|
|
|
ipv6_exthdrs_key->hdrs = output->ipv6.exthdrs;
|
2017-11-07 13:07:02 +00:00
|
|
|
} else if (swkey->eth.type == htons(ETH_P_NSH)) {
|
|
|
|
if (nsh_key_to_nlattr(&output->nsh, is_mask, skb))
|
|
|
|
goto nla_put_failure;
|
2013-10-04 01:16:47 +00:00
|
|
|
} else if (swkey->eth.type == htons(ETH_P_ARP) ||
|
|
|
|
swkey->eth.type == htons(ETH_P_RARP)) {
|
|
|
|
struct ovs_key_arp *arp_key;
|
|
|
|
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
|
|
|
|
if (!nla)
|
|
|
|
goto nla_put_failure;
|
|
|
|
arp_key = nla_data(nla);
|
|
|
|
memset(arp_key, 0, sizeof(struct ovs_key_arp));
|
|
|
|
arp_key->arp_sip = output->ipv4.addr.src;
|
|
|
|
arp_key->arp_tip = output->ipv4.addr.dst;
|
|
|
|
arp_key->arp_op = htons(output->ip.proto);
|
2014-02-18 19:15:45 +00:00
|
|
|
ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha);
|
|
|
|
ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha);
|
2014-10-06 12:05:13 +00:00
|
|
|
} else if (eth_p_mpls(swkey->eth.type)) {
|
2019-11-04 01:57:44 +00:00
|
|
|
u8 i, num_labels;
|
2014-10-06 12:05:13 +00:00
|
|
|
struct ovs_key_mpls *mpls_key;
|
|
|
|
|
2019-11-04 01:57:44 +00:00
|
|
|
num_labels = hweight_long(output->mpls.num_labels_mask);
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS,
|
|
|
|
num_labels * sizeof(*mpls_key));
|
2014-10-06 12:05:13 +00:00
|
|
|
if (!nla)
|
|
|
|
goto nla_put_failure;
|
2019-11-04 01:57:44 +00:00
|
|
|
|
2014-10-06 12:05:13 +00:00
|
|
|
mpls_key = nla_data(nla);
|
2019-11-04 01:57:44 +00:00
|
|
|
for (i = 0; i < num_labels; i++)
|
|
|
|
mpls_key[i].mpls_lse = output->mpls.lse[i];
|
2013-10-04 01:16:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if ((swkey->eth.type == htons(ETH_P_IP) ||
|
|
|
|
swkey->eth.type == htons(ETH_P_IPV6)) &&
|
|
|
|
swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
|
|
|
|
|
|
|
|
if (swkey->ip.proto == IPPROTO_TCP) {
|
|
|
|
struct ovs_key_tcp *tcp_key;
|
|
|
|
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
|
|
|
|
if (!nla)
|
|
|
|
goto nla_put_failure;
|
|
|
|
tcp_key = nla_data(nla);
|
2014-05-05 16:54:49 +00:00
|
|
|
tcp_key->tcp_src = output->tp.src;
|
|
|
|
tcp_key->tcp_dst = output->tp.dst;
|
|
|
|
if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS,
|
|
|
|
output->tp.flags))
|
|
|
|
goto nla_put_failure;
|
2013-10-04 01:16:47 +00:00
|
|
|
} else if (swkey->ip.proto == IPPROTO_UDP) {
|
|
|
|
struct ovs_key_udp *udp_key;
|
|
|
|
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
|
|
|
|
if (!nla)
|
|
|
|
goto nla_put_failure;
|
|
|
|
udp_key = nla_data(nla);
|
2014-05-05 16:54:49 +00:00
|
|
|
udp_key->udp_src = output->tp.src;
|
|
|
|
udp_key->udp_dst = output->tp.dst;
|
2013-10-04 01:16:47 +00:00
|
|
|
} else if (swkey->ip.proto == IPPROTO_SCTP) {
|
|
|
|
struct ovs_key_sctp *sctp_key;
|
|
|
|
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key));
|
|
|
|
if (!nla)
|
|
|
|
goto nla_put_failure;
|
|
|
|
sctp_key = nla_data(nla);
|
2014-05-05 16:54:49 +00:00
|
|
|
sctp_key->sctp_src = output->tp.src;
|
|
|
|
sctp_key->sctp_dst = output->tp.dst;
|
2013-10-04 01:16:47 +00:00
|
|
|
} else if (swkey->eth.type == htons(ETH_P_IP) &&
|
|
|
|
swkey->ip.proto == IPPROTO_ICMP) {
|
|
|
|
struct ovs_key_icmp *icmp_key;
|
|
|
|
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
|
|
|
|
if (!nla)
|
|
|
|
goto nla_put_failure;
|
|
|
|
icmp_key = nla_data(nla);
|
2014-05-05 16:54:49 +00:00
|
|
|
icmp_key->icmp_type = ntohs(output->tp.src);
|
|
|
|
icmp_key->icmp_code = ntohs(output->tp.dst);
|
2013-10-04 01:16:47 +00:00
|
|
|
} else if (swkey->eth.type == htons(ETH_P_IPV6) &&
|
|
|
|
swkey->ip.proto == IPPROTO_ICMPV6) {
|
|
|
|
struct ovs_key_icmpv6 *icmpv6_key;
|
|
|
|
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
|
|
|
|
sizeof(*icmpv6_key));
|
|
|
|
if (!nla)
|
|
|
|
goto nla_put_failure;
|
|
|
|
icmpv6_key = nla_data(nla);
|
2014-05-05 16:54:49 +00:00
|
|
|
icmpv6_key->icmpv6_type = ntohs(output->tp.src);
|
|
|
|
icmpv6_key->icmpv6_code = ntohs(output->tp.dst);
|
2013-10-04 01:16:47 +00:00
|
|
|
|
openvswitch: Fixed nd target mask field in the flow dump.
IPv6 nd target mask was not getting populated in flow dump.
In the function __ovs_nla_put_key the icmp code mask field was checked
instead of icmp code key field to classify the flow as neighbour discovery.
ufid:bdfbe3e5-60c2-43b0-a5ff-dfcac1c37328, recirc_id(0),dp_hash(0/0),
skb_priority(0/0),in_port(ovs-nm1),skb_mark(0/0),ct_state(0/0),
ct_zone(0/0),ct_mark(0/0),ct_label(0/0),
eth(src=00:00:00:00:00:00/00:00:00:00:00:00,
dst=00:00:00:00:00:00/00:00:00:00:00:00),
eth_type(0x86dd),
ipv6(src=::/::,dst=::/::,label=0/0,proto=58,tclass=0/0,hlimit=0/0,frag=no),
icmpv6(type=135,code=0),
nd(target=2001::2/::,
sll=00:00:00:00:00:00/00:00:00:00:00:00,
tll=00:00:00:00:00:00/00:00:00:00:00:00),
packets:10, bytes:860, used:0.504s, dp:ovs, actions:ovs-nm2
Fixes: e64457191a25 (openvswitch: Restructure datapath.c and flow.c)
Signed-off-by: Martin Varghese <martin.varghese@nokia.com>
Link: https://lore.kernel.org/r/20220328054148.3057-1-martinvarghesenokia@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-03-28 05:41:48 +00:00
|
|
|
if (swkey->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
|
|
|
|
swkey->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
|
2013-10-04 01:16:47 +00:00
|
|
|
struct ovs_key_nd *nd_key;
|
|
|
|
|
|
|
|
nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
|
|
|
|
if (!nla)
|
|
|
|
goto nla_put_failure;
|
|
|
|
nd_key = nla_data(nla);
|
|
|
|
memcpy(nd_key->nd_target, &output->ipv6.nd.target,
|
|
|
|
sizeof(nd_key->nd_target));
|
2014-02-18 19:15:45 +00:00
|
|
|
ether_addr_copy(nd_key->nd_sll, output->ipv6.nd.sll);
|
|
|
|
ether_addr_copy(nd_key->nd_tll, output->ipv6.nd.tll);
|
2013-10-04 01:16:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
unencap:
|
2016-09-07 16:56:59 +00:00
|
|
|
if (in_encap)
|
|
|
|
nla_nest_end(skb, in_encap);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (encap)
|
|
|
|
nla_nest_end(skb, encap);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
nla_put_failure:
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
|
|
|
|
2015-01-22 00:42:48 +00:00
|
|
|
/* Serializes a flow key (or a flow mask) into 'skb' as a single nested
 * netlink attribute of type 'attr'.
 *
 * 'swkey' and 'output' are handed unchanged to __ovs_nla_put_key(), which
 * tests fields of 'swkey' to choose the attributes to emit and copies the
 * values from 'output'.  'is_mask' tells it that 'output' holds mask values.
 *
 * Returns 0 on success, -EMSGSIZE if the nest cannot be opened, or the error
 * returned by __ovs_nla_put_key().  On failure the partially built nest is
 * cancelled, so 'skb' is trimmed back to where it was on entry instead of
 * carrying a nested attribute whose length was never finalized.
 */
int ovs_nla_put_key(const struct sw_flow_key *swkey,
		    const struct sw_flow_key *output, int attr, bool is_mask,
		    struct sk_buff *skb)
{
	struct nlattr *nla;
	int err;

	nla = nla_nest_start_noflag(skb, attr);
	if (!nla)
		return -EMSGSIZE;

	err = __ovs_nla_put_key(swkey, output, is_mask, skb);
	if (err) {
		/* Drop everything written since the nest was opened. */
		nla_nest_cancel(skb, nla);
		return err;
	}

	nla_nest_end(skb, nla);
	return 0;
}
|
|
|
|
|
|
|
|
/* Called with ovs_mutex or RCU read lock. */
|
2015-01-22 00:42:52 +00:00
|
|
|
int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
if (ovs_identifier_is_ufid(&flow->id))
|
|
|
|
return nla_put(skb, OVS_FLOW_ATTR_UFID, flow->id.ufid_len,
|
|
|
|
flow->id.ufid);
|
|
|
|
|
|
|
|
return ovs_nla_put_key(flow->id.unmasked_key, flow->id.unmasked_key,
|
|
|
|
OVS_FLOW_ATTR_KEY, false, skb);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Called with ovs_mutex or RCU read lock. */
|
|
|
|
int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb)
|
2015-01-22 00:42:48 +00:00
|
|
|
{
|
2015-02-12 17:58:48 +00:00
|
|
|
return ovs_nla_put_key(&flow->key, &flow->key,
|
2015-01-22 00:42:48 +00:00
|
|
|
OVS_FLOW_ATTR_KEY, false, skb);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Called with ovs_mutex or RCU read lock. */
|
|
|
|
int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return ovs_nla_put_key(&flow->key, &flow->mask->key,
|
|
|
|
OVS_FLOW_ATTR_MASK, true, skb);
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
#define MAX_ACTIONS_BUFSIZE (32 * 1024)
|
|
|
|
|
2017-11-25 14:02:12 +00:00
|
|
|
static struct sw_flow_actions *nla_alloc_flow_actions(int size)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
|
|
|
struct sw_flow_actions *sfa;
|
|
|
|
|
2017-11-25 14:02:12 +00:00
|
|
|
WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE);
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2022-10-18 09:06:33 +00:00
|
|
|
sfa = kmalloc(kmalloc_size_roundup(sizeof(*sfa) + size), GFP_KERNEL);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (!sfa)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
sfa->actions_len = 0;
|
|
|
|
return sfa;
|
|
|
|
}
|
|
|
|
|
2022-04-04 15:43:45 +00:00
|
|
|
static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len);
|
|
|
|
|
|
|
|
static void ovs_nla_free_check_pkt_len_action(const struct nlattr *action)
|
|
|
|
{
|
|
|
|
const struct nlattr *a;
|
|
|
|
int rem;
|
|
|
|
|
|
|
|
nla_for_each_nested(a, action, rem) {
|
|
|
|
switch (nla_type(a)) {
|
|
|
|
case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL:
|
|
|
|
case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER:
|
|
|
|
ovs_nla_free_nested_actions(nla_data(a), nla_len(a));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ovs_nla_free_clone_action(const struct nlattr *action)
|
|
|
|
{
|
|
|
|
const struct nlattr *a = nla_data(action);
|
|
|
|
int rem = nla_len(action);
|
|
|
|
|
|
|
|
switch (nla_type(a)) {
|
|
|
|
case OVS_CLONE_ATTR_EXEC:
|
|
|
|
/* The real list of actions follows this attribute. */
|
|
|
|
a = nla_next(a, &rem);
|
|
|
|
ovs_nla_free_nested_actions(a, rem);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ovs_nla_free_dec_ttl_action(const struct nlattr *action)
|
|
|
|
{
|
|
|
|
const struct nlattr *a = nla_data(action);
|
|
|
|
|
|
|
|
switch (nla_type(a)) {
|
|
|
|
case OVS_DEC_TTL_ATTR_ACTION:
|
|
|
|
ovs_nla_free_nested_actions(nla_data(a), nla_len(a));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ovs_nla_free_sample_action(const struct nlattr *action)
|
|
|
|
{
|
|
|
|
const struct nlattr *a = nla_data(action);
|
|
|
|
int rem = nla_len(action);
|
|
|
|
|
|
|
|
switch (nla_type(a)) {
|
|
|
|
case OVS_SAMPLE_ATTR_ARG:
|
|
|
|
/* The real list of actions follows this attribute. */
|
|
|
|
a = nla_next(a, &rem);
|
|
|
|
ovs_nla_free_nested_actions(a, rem);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-21 08:44:03 +00:00
|
|
|
static void ovs_nla_free_set_action(const struct nlattr *a)
|
|
|
|
{
|
|
|
|
const struct nlattr *ovs_key = nla_data(a);
|
|
|
|
struct ovs_tunnel_info *ovs_tun;
|
|
|
|
|
|
|
|
switch (nla_type(ovs_key)) {
|
|
|
|
case OVS_KEY_ATTR_TUNNEL_INFO:
|
|
|
|
ovs_tun = nla_data(ovs_key);
|
|
|
|
dst_release((struct dst_entry *)ovs_tun->tun_dst);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-04-04 15:43:45 +00:00
|
|
|
static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len)
|
2015-07-21 08:44:03 +00:00
|
|
|
{
|
|
|
|
const struct nlattr *a;
|
|
|
|
int rem;
|
|
|
|
|
2022-04-04 15:43:45 +00:00
|
|
|
/* Whenever new actions are added, the need to update this
|
|
|
|
* function should be considered.
|
|
|
|
*/
|
2024-07-04 08:56:56 +00:00
|
|
|
BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 25);
|
2022-04-04 15:43:45 +00:00
|
|
|
|
|
|
|
if (!actions)
|
2015-07-21 08:44:03 +00:00
|
|
|
return;
|
|
|
|
|
2022-04-04 15:43:45 +00:00
|
|
|
nla_for_each_attr(a, actions, len, rem) {
|
2015-07-21 08:44:03 +00:00
|
|
|
switch (nla_type(a)) {
|
2022-04-04 15:43:45 +00:00
|
|
|
case OVS_ACTION_ATTR_CHECK_PKT_LEN:
|
|
|
|
ovs_nla_free_check_pkt_len_action(a);
|
2015-07-21 08:44:03 +00:00
|
|
|
break;
|
2022-04-04 15:43:45 +00:00
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_CLONE:
|
|
|
|
ovs_nla_free_clone_action(a);
|
|
|
|
break;
|
|
|
|
|
2015-08-26 18:31:48 +00:00
|
|
|
case OVS_ACTION_ATTR_CT:
|
|
|
|
ovs_ct_free_action(a);
|
|
|
|
break;
|
2022-04-04 15:43:45 +00:00
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_DEC_TTL:
|
|
|
|
ovs_nla_free_dec_ttl_action(a);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_SAMPLE:
|
|
|
|
ovs_nla_free_sample_action(a);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_SET:
|
|
|
|
ovs_nla_free_set_action(a);
|
|
|
|
break;
|
2015-07-21 08:44:03 +00:00
|
|
|
}
|
|
|
|
}
|
2022-04-04 15:43:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
|
|
|
|
{
|
|
|
|
if (!sf_acts)
|
|
|
|
return;
|
2015-07-21 08:44:03 +00:00
|
|
|
|
2022-04-04 15:43:45 +00:00
|
|
|
ovs_nla_free_nested_actions(sf_acts->actions, sf_acts->actions_len);
|
2015-07-21 08:44:03 +00:00
|
|
|
kfree(sf_acts);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __ovs_nla_free_flow_actions(struct rcu_head *head)
|
|
|
|
{
|
|
|
|
ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu));
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
/* Schedules 'sf_acts' to be freed after the next RCU grace period.
|
|
|
|
* The caller must hold rcu_read_lock for this to be sensible. */
|
2015-07-21 08:44:03 +00:00
|
|
|
void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
2015-07-21 08:44:03 +00:00
|
|
|
call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions);
|
2013-10-04 01:16:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Reserves NLA_ALIGN(attr_len) bytes at the end of the action list in '*sfa'
 * and returns a pointer to the start of the reserved space.
 *
 * If the current allocation (as reported by ksize()) has no room, a larger
 * buffer is allocated: at least the bytes needed, otherwise double the
 * current size, capped at MAX_ACTIONS_BUFSIZE.  The existing actions and
 * 'orig_len' are copied over, the old buffer is kfree()d and '*sfa' is
 * updated to the new one, so callers must not keep pointers into the old
 * buffer across this call.
 *
 * Returns ERR_PTR(-EMSGSIZE) (after logging through OVS_NLERR when 'log' is
 * set) if the request cannot fit within MAX_ACTIONS_BUFSIZE, or the error
 * from nla_alloc_flow_actions().  On error '*sfa' is left untouched.
 */
static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
				       int attr_len, bool log)
{
	size_t req_size = NLA_ALIGN(attr_len);
	int next_offset = offsetof(struct sw_flow_actions, actions) +
			  (*sfa)->actions_len;
	struct sw_flow_actions *grown;
	int grown_size;

	/* Reallocate only when the tail of the current buffer is too small. */
	if (req_size > (ksize(*sfa) - next_offset)) {
		grown_size = max(next_offset + req_size, ksize(*sfa) * 2);

		if (grown_size > MAX_ACTIONS_BUFSIZE) {
			if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) {
				OVS_NLERR(log, "Flow action size exceeds max %u",
					  MAX_ACTIONS_BUFSIZE);
				return ERR_PTR(-EMSGSIZE);
			}
			/* Doubling overshoots the cap, but the request fits. */
			grown_size = MAX_ACTIONS_BUFSIZE;
		}

		grown = nla_alloc_flow_actions(grown_size);
		if (IS_ERR(grown))
			return ERR_CAST(grown);

		memcpy(grown->actions, (*sfa)->actions, (*sfa)->actions_len);
		grown->actions_len = (*sfa)->actions_len;
		grown->orig_len = (*sfa)->orig_len;
		kfree(*sfa);
		*sfa = grown;
	}

	(*sfa)->actions_len += req_size;
	return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
}
|
|
|
|
|
2014-10-03 22:35:31 +00:00
|
|
|
/* Appends one netlink attribute of type 'attrtype' with a 'len'-byte payload
 * to the action list in '*sfa', growing the buffer through reserve_sfa_size()
 * if needed.
 *
 * When 'data' is non-NULL it is copied into the payload; when it is NULL the
 * payload bytes are left as they are.  The alignment padding after the
 * payload is always zeroed.
 *
 * Returns the new attribute, or the ERR_PTR() from reserve_sfa_size().
 */
static struct nlattr *__add_action(struct sw_flow_actions **sfa,
				   int attrtype, void *data, int len, bool log)
{
	int attr_size = nla_attr_size(len);
	struct nlattr *attr;

	attr = reserve_sfa_size(sfa, attr_size, log);
	if (IS_ERR(attr))
		return attr;

	attr->nla_type = attrtype;
	attr->nla_len = attr_size;

	if (data)
		memcpy(nla_data(attr), data, len);
	memset((unsigned char *) attr + attr->nla_len, 0, nla_padlen(len));

	return attr;
}
|
|
|
|
|
2015-08-26 18:31:48 +00:00
|
|
|
/* Appends an action attribute to '*sfa' via __add_action() and reduces the
 * result to an errno-style value: 0 on success, or the negative error that
 * was encoded in the returned pointer.
 */
int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, void *data,
		       int len, bool log)
{
	return PTR_ERR_OR_ZERO(__add_action(sfa, attrtype, data, len, log));
}
|
|
|
|
|
|
|
|
/* Opens a nested action of type 'attrtype' by appending an attribute header
 * with an empty payload.
 *
 * Returns the offset of that header within the action list (the value of
 * actions_len before it was appended), to be passed later to
 * add_nested_action_end(), or the error from ovs_nla_add_action() if the
 * header could not be added.
 */
static inline int add_nested_action_start(struct sw_flow_actions **sfa,
					  int attrtype, bool log)
{
	/* Must be sampled before the header is appended. */
	int start = (*sfa)->actions_len;
	int err = ovs_nla_add_action(sfa, attrtype, NULL, 0, log);

	return err ? err : start;
}
|
|
|
|
|
|
|
|
static inline void add_nested_action_end(struct sw_flow_actions *sfa,
|
|
|
|
int st_offset)
|
|
|
|
{
|
|
|
|
struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions +
|
|
|
|
st_offset);
|
|
|
|
|
|
|
|
a->nla_len = sfa->actions_len - st_offset;
|
|
|
|
}
|
|
|
|
|
2015-08-26 18:31:48 +00:00
|
|
|
static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
|
2014-10-06 12:05:13 +00:00
|
|
|
const struct sw_flow_key *key,
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datapath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitations: First, there is a 3-level
nesting restriction, enforced at flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoids recursive calls only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check and instead implements a run-time recursion limit
check, with recursion avoidance similar to that of the 'recirc' action.
This optimization solves both issues #1 and #2 above.
One related optimization attempts to avoid copying the flow key as
long as the enclosed actions do not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
struct sw_flow_actions **sfa,
|
2019-11-04 01:57:44 +00:00
|
|
|
__be16 eth_type, __be16 vlan_tci,
|
2024-02-07 13:24:15 +00:00
|
|
|
u32 mpls_label_count, bool log,
|
|
|
|
u32 depth);
|
2014-10-06 12:05:13 +00:00
|
|
|
|
2015-08-26 18:31:48 +00:00
|
|
|
static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datapath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitations: First, there is a 3-level
nesting restriction, enforced at flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoids recursive calls only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check and instead implements a run-time recursion limit
check, with recursion avoidance similar to that of the 'recirc' action.
This optimization solves both issues #1 and #2 above.
One related optimization attempts to avoid copying the flow key as
long as the enclosed actions do not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
const struct sw_flow_key *key,
|
2014-10-06 12:05:13 +00:00
|
|
|
struct sw_flow_actions **sfa,
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datapath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitations: First, there is a 3-level
nesting restriction, enforced at flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoids recursive calls only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check and instead implements a run-time recursion limit
check, with recursion avoidance similar to that of the 'recirc' action.
This optimization solves both issues #1 and #2 above.
One related optimization attempts to avoid copying the flow key as
long as the enclosed actions do not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
__be16 eth_type, __be16 vlan_tci,
|
2024-02-07 13:24:15 +00:00
|
|
|
u32 mpls_label_count, bool log, bool last,
|
|
|
|
u32 depth)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
|
|
|
const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
|
|
|
|
const struct nlattr *probability, *actions;
|
|
|
|
const struct nlattr *a;
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datapath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitations: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoids recursive calls only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check and instead implements a run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solves both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed do not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
int rem, start, err;
|
|
|
|
struct sample_arg arg;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
memset(attrs, 0, sizeof(attrs));
|
|
|
|
nla_for_each_nested(a, attr, rem) {
|
|
|
|
int type = nla_type(a);
|
|
|
|
if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
|
|
|
|
return -EINVAL;
|
|
|
|
attrs[type] = a;
|
|
|
|
}
|
|
|
|
if (rem)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
|
|
|
|
if (!probability || nla_len(probability) != sizeof(u32))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
|
|
|
|
if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* validation done, copy sample action. */
|
2014-11-06 15:03:05 +00:00
|
|
|
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (start < 0)
|
|
|
|
return start;
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datapath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitations: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoids recursive calls only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check and instead implements a run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solves both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed do not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
|
|
|
|
/* When both skb and flow may be changed, put the sample
|
|
|
|
* into a deferred fifo. On the other hand, if only skb
|
|
|
|
* may be modified, the actions can be executed in place.
|
|
|
|
*
|
|
|
|
* Do this analysis at the flow installation time.
|
|
|
|
* Set 'arg.exec' to true if the actions can be
|
|
|
|
* executed without being deferred.
|
|
|
|
*
|
|
|
|
* If the sample is the last action, it can always be executed
|
|
|
|
* rather than deferred.
|
|
|
|
*/
|
|
|
|
arg.exec = last || !actions_may_change_flow(actions);
|
|
|
|
arg.probability = nla_get_u32(probability);
|
|
|
|
|
|
|
|
err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_ARG, &arg, sizeof(arg),
|
|
|
|
log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datapath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitations: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoids recursive calls only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check and instead implements a run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solves both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed do not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
err = __ovs_nla_copy_actions(net, actions, key, sfa,
|
2024-02-07 13:24:15 +00:00
|
|
|
eth_type, vlan_tci, mpls_label_count, log,
|
|
|
|
depth + 1);
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datapath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitations: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoids recursive calls only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check and instead implements a run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solves both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed do not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
add_nested_action_end(*sfa, start);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
static int validate_and_copy_dec_ttl(struct net *net,
|
|
|
|
const struct nlattr *attr,
|
|
|
|
const struct sw_flow_key *key,
|
|
|
|
struct sw_flow_actions **sfa,
|
|
|
|
__be16 eth_type, __be16 vlan_tci,
|
2024-02-07 13:24:15 +00:00
|
|
|
u32 mpls_label_count, bool log,
|
|
|
|
u32 depth)
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
{
|
2020-11-24 12:34:44 +00:00
|
|
|
const struct nlattr *attrs[OVS_DEC_TTL_ATTR_MAX + 1];
|
|
|
|
int start, action_start, err, rem;
|
|
|
|
const struct nlattr *a, *actions;
|
|
|
|
|
|
|
|
memset(attrs, 0, sizeof(attrs));
|
|
|
|
nla_for_each_nested(a, attr, rem) {
|
|
|
|
int type = nla_type(a);
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
|
2020-11-24 12:34:44 +00:00
|
|
|
/* Ignore unknown attributes to be future proof. */
|
|
|
|
if (type > OVS_DEC_TTL_ATTR_MAX)
|
|
|
|
continue;
|
|
|
|
|
2021-01-13 13:50:00 +00:00
|
|
|
if (!type || attrs[type]) {
|
|
|
|
OVS_NLERR(log, "Duplicate or invalid key (type %d).",
|
|
|
|
type);
|
2020-11-24 12:34:44 +00:00
|
|
|
return -EINVAL;
|
2021-01-13 13:50:00 +00:00
|
|
|
}
|
2020-11-24 12:34:44 +00:00
|
|
|
|
|
|
|
attrs[type] = a;
|
|
|
|
}
|
|
|
|
|
2021-01-13 13:50:00 +00:00
|
|
|
if (rem) {
|
|
|
|
OVS_NLERR(log, "Message has %d unknown bytes.", rem);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2020-11-24 12:34:44 +00:00
|
|
|
actions = attrs[OVS_DEC_TTL_ATTR_ACTION];
|
2021-01-13 13:50:00 +00:00
|
|
|
if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) {
|
|
|
|
OVS_NLERR(log, "Missing valid actions attribute.");
|
2020-11-24 12:34:44 +00:00
|
|
|
return -EINVAL;
|
2021-01-13 13:50:00 +00:00
|
|
|
}
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
|
|
|
|
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_DEC_TTL, log);
|
|
|
|
if (start < 0)
|
|
|
|
return start;
|
|
|
|
|
2020-11-24 12:34:44 +00:00
|
|
|
action_start = add_nested_action_start(sfa, OVS_DEC_TTL_ATTR_ACTION, log);
|
|
|
|
if (action_start < 0)
|
2020-12-04 11:43:14 +00:00
|
|
|
return action_start;
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
|
2020-11-24 12:34:44 +00:00
|
|
|
err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type,
|
2024-02-07 13:24:15 +00:00
|
|
|
vlan_tci, mpls_label_count, log,
|
|
|
|
depth + 1);
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2020-11-24 12:34:44 +00:00
|
|
|
add_nested_action_end(*sfa, action_start);
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
add_nested_action_end(*sfa, start);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-07-02 15:18:03 +00:00
|
|
|
static int validate_and_copy_clone(struct net *net,
|
|
|
|
const struct nlattr *attr,
|
|
|
|
const struct sw_flow_key *key,
|
|
|
|
struct sw_flow_actions **sfa,
|
|
|
|
__be16 eth_type, __be16 vlan_tci,
|
2024-02-07 13:24:15 +00:00
|
|
|
u32 mpls_label_count, bool log, bool last,
|
|
|
|
u32 depth)
|
2018-07-02 15:18:03 +00:00
|
|
|
{
|
|
|
|
int start, err;
|
|
|
|
u32 exec;
|
|
|
|
|
|
|
|
if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log);
|
|
|
|
if (start < 0)
|
|
|
|
return start;
|
|
|
|
|
|
|
|
exec = last || !actions_may_change_flow(attr);
|
|
|
|
|
|
|
|
err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec,
|
|
|
|
sizeof(exec), log);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
err = __ovs_nla_copy_actions(net, attr, key, sfa,
|
2024-02-07 13:24:15 +00:00
|
|
|
eth_type, vlan_tci, mpls_label_count, log,
|
|
|
|
depth + 1);
|
2018-07-02 15:18:03 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
add_nested_action_end(*sfa, start);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
void ovs_match_init(struct sw_flow_match *match,
|
|
|
|
struct sw_flow_key *key,
|
2016-09-19 20:51:00 +00:00
|
|
|
bool reset_key,
|
2013-10-04 01:16:47 +00:00
|
|
|
struct sw_flow_mask *mask)
|
|
|
|
{
|
|
|
|
memset(match, 0, sizeof(*match));
|
|
|
|
match->key = key;
|
|
|
|
match->mask = mask;
|
|
|
|
|
2016-09-19 20:51:00 +00:00
|
|
|
if (reset_key)
|
|
|
|
memset(key, 0, sizeof(*key));
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
if (mask) {
|
|
|
|
memset(&mask->key, 0, sizeof(mask->key));
|
|
|
|
mask->range.start = mask->range.end = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-15 02:53:57 +00:00
|
|
|
static int validate_geneve_opts(struct sw_flow_key *key)
|
|
|
|
{
|
|
|
|
struct geneve_opt *option;
|
|
|
|
int opts_len = key->tun_opts_len;
|
|
|
|
bool crit_opt = false;
|
|
|
|
|
|
|
|
option = (struct geneve_opt *)TUN_METADATA_OPTS(key, key->tun_opts_len);
|
|
|
|
while (opts_len > 0) {
|
|
|
|
int len;
|
|
|
|
|
|
|
|
if (opts_len < sizeof(*option))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
len = sizeof(*option) + option->length * 4;
|
|
|
|
if (len > opts_len)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE);
|
|
|
|
|
|
|
|
option = (struct geneve_opt *)((u8 *)option + len);
|
|
|
|
opts_len -= len;
|
2018-01-17 21:10:28 +00:00
|
|
|
}
|
2015-01-15 02:53:57 +00:00
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
if (crit_opt)
|
|
|
|
__set_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_key.tun_flags);
|
2015-01-15 02:53:57 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
static int validate_and_copy_set_tun(const struct nlattr *attr,
|
2014-11-06 15:03:05 +00:00
|
|
|
struct sw_flow_actions **sfa, bool log)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
IP_TUNNEL_DECLARE_FLAGS(dst_opt_type) = { };
|
2013-10-04 01:16:47 +00:00
|
|
|
struct sw_flow_match match;
|
|
|
|
struct sw_flow_key key;
|
2015-07-21 08:44:03 +00:00
|
|
|
struct metadata_dst *tun_dst;
|
2015-07-21 08:43:54 +00:00
|
|
|
struct ip_tunnel_info *tun_info;
|
2015-07-21 08:44:03 +00:00
|
|
|
struct ovs_tunnel_info *ovs_tun;
|
2014-10-03 22:35:31 +00:00
|
|
|
struct nlattr *a;
|
2015-02-11 10:23:38 +00:00
|
|
|
int err = 0, start, opts_type;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2016-09-19 20:51:00 +00:00
|
|
|
ovs_match_init(&match, &key, true, NULL);
|
2015-10-05 11:09:47 +00:00
|
|
|
opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log);
|
2015-01-15 02:53:59 +00:00
|
|
|
if (opts_type < 0)
|
|
|
|
return opts_type;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2014-10-03 22:35:33 +00:00
|
|
|
if (key.tun_opts_len) {
|
2015-01-15 02:53:59 +00:00
|
|
|
switch (opts_type) {
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
|
|
|
|
err = validate_geneve_opts(&key);
|
|
|
|
if (err < 0)
|
|
|
|
return err;
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
|
|
|
|
__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, dst_opt_type);
|
2015-01-15 02:53:59 +00:00
|
|
|
break;
|
|
|
|
case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, dst_opt_type);
|
2015-01-15 02:53:59 +00:00
|
|
|
break;
|
2018-01-25 21:20:11 +00:00
|
|
|
case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 15:23:53 +00:00
|
|
|
__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, dst_opt_type);
|
2018-01-25 21:20:11 +00:00
|
|
|
break;
|
2015-01-15 02:53:59 +00:00
|
|
|
}
|
2018-01-17 21:10:28 +00:00
|
|
|
}
|
2014-10-03 22:35:33 +00:00
|
|
|
|
2014-11-06 15:03:05 +00:00
|
|
|
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (start < 0)
|
|
|
|
return start;
|
|
|
|
|
2017-06-23 20:11:58 +00:00
|
|
|
tun_dst = metadata_dst_alloc(key.tun_opts_len, METADATA_IP_TUNNEL,
|
|
|
|
GFP_KERNEL);
|
|
|
|
|
2015-07-21 08:44:03 +00:00
|
|
|
if (!tun_dst)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2016-02-12 14:43:57 +00:00
|
|
|
err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL);
|
|
|
|
if (err) {
|
|
|
|
dst_release((struct dst_entry *)tun_dst);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2014-10-03 22:35:31 +00:00
|
|
|
a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
|
2015-07-21 08:44:03 +00:00
|
|
|
sizeof(*ovs_tun), log);
|
|
|
|
if (IS_ERR(a)) {
|
|
|
|
dst_release((struct dst_entry *)tun_dst);
|
2014-10-03 22:35:31 +00:00
|
|
|
return PTR_ERR(a);
|
2015-07-21 08:44:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ovs_tun = nla_data(a);
|
|
|
|
ovs_tun->tun_dst = tun_dst;
|
2014-10-03 22:35:31 +00:00
|
|
|
|
2015-07-21 08:44:03 +00:00
|
|
|
tun_info = &tun_dst->u.tun_info;
|
|
|
|
tun_info->mode = IP_TUNNEL_INFO_TX;
|
2015-10-05 11:09:46 +00:00
|
|
|
if (key.tun_proto == AF_INET6)
|
|
|
|
tun_info->mode |= IP_TUNNEL_INFO_IPV6;
|
2019-03-28 04:43:23 +00:00
|
|
|
else if (key.tun_proto == AF_INET && key.tun_key.u.ipv4.dst == 0)
|
|
|
|
tun_info->mode |= IP_TUNNEL_INFO_BRIDGE;
|
2015-07-21 08:43:54 +00:00
|
|
|
tun_info->key = key.tun_key;
|
2014-10-03 22:35:31 +00:00
|
|
|
|
2015-08-31 01:09:38 +00:00
|
|
|
/* We need to store the options in the action itself since
|
|
|
|
* everything else will go away after flow setup. We can append
|
|
|
|
* it to tun_info and then point there.
|
|
|
|
*/
|
|
|
|
ip_tunnel_info_opts_set(tun_info,
|
|
|
|
TUN_METADATA_OPTS(&key, key.tun_opts_len),
|
2018-06-27 04:39:36 +00:00
|
|
|
key.tun_opts_len, dst_opt_type);
|
2013-10-04 01:16:47 +00:00
|
|
|
add_nested_action_end(*sfa, start);
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2017-11-07 13:07:02 +00:00
|
|
|
static bool validate_nsh(const struct nlattr *attr, bool is_mask,
|
|
|
|
bool is_push_nsh, bool log)
|
|
|
|
{
|
|
|
|
struct sw_flow_match match;
|
|
|
|
struct sw_flow_key key;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
ovs_match_init(&match, &key, true, NULL);
|
|
|
|
ret = nsh_key_put_from_nlattr(attr, &match, is_mask,
|
|
|
|
is_push_nsh, log);
|
|
|
|
return !ret;
|
|
|
|
}
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
/* Return false if there are any non-masked bits set.
|
|
|
|
* Mask follows data immediately, before any netlink padding.
|
|
|
|
*/
|
|
|
|
static bool validate_masked(u8 *data, int len)
|
|
|
|
{
|
|
|
|
u8 *mask = data + len;
|
|
|
|
|
|
|
|
while (len--)
|
|
|
|
if (*data++ & ~*mask++)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
static int validate_set(const struct nlattr *a,
|
|
|
|
const struct sw_flow_key *flow_key,
|
2016-11-10 15:28:22 +00:00
|
|
|
struct sw_flow_actions **sfa, bool *skip_copy,
|
|
|
|
u8 mac_proto, __be16 eth_type, bool masked, bool log)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
|
|
|
const struct nlattr *ovs_key = nla_data(a);
|
|
|
|
int key_type = nla_type(ovs_key);
|
2015-02-05 21:40:49 +00:00
|
|
|
size_t key_len;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
/* There can be only one key in a action */
|
|
|
|
if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
key_len = nla_len(ovs_key);
|
|
|
|
if (masked)
|
|
|
|
key_len /= 2;
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
if (key_type > OVS_KEY_ATTR_MAX ||
|
2015-09-12 01:38:28 +00:00
|
|
|
!check_attr_len(key_len, ovs_key_lens[key_type].len))
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
if (masked && !validate_masked(nla_data(ovs_key), key_len))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
switch (key_type) {
|
|
|
|
case OVS_KEY_ATTR_PRIORITY:
|
|
|
|
case OVS_KEY_ATTR_SKB_MARK:
|
2015-08-26 18:31:49 +00:00
|
|
|
case OVS_KEY_ATTR_CT_MARK:
|
2015-10-01 22:00:37 +00:00
|
|
|
case OVS_KEY_ATTR_CT_LABELS:
|
2013-10-04 01:16:47 +00:00
|
|
|
break;
|
|
|
|
|
2016-11-10 15:28:22 +00:00
|
|
|
case OVS_KEY_ATTR_ETHERNET:
|
|
|
|
if (mac_proto != MAC_PROTO_ETHERNET)
|
|
|
|
return -EINVAL;
|
2016-12-20 01:06:33 +00:00
|
|
|
break;
|
2016-11-10 15:28:22 +00:00
|
|
|
|
2020-02-20 06:23:09 +00:00
|
|
|
case OVS_KEY_ATTR_TUNNEL: {
|
|
|
|
int err;
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
if (masked)
|
|
|
|
return -EINVAL; /* Masked tunnel set not supported. */
|
|
|
|
|
|
|
|
*skip_copy = true;
|
2014-11-06 15:03:05 +00:00
|
|
|
err = validate_and_copy_set_tun(a, sfa, log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
2020-02-20 06:23:09 +00:00
|
|
|
}
|
|
|
|
case OVS_KEY_ATTR_IPV4: {
|
|
|
|
const struct ovs_key_ipv4 *ipv4_key;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2014-10-06 12:05:13 +00:00
|
|
|
if (eth_type != htons(ETH_P_IP))
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
ipv4_key = nla_data(ovs_key);
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
if (masked) {
|
|
|
|
const struct ovs_key_ipv4 *mask = ipv4_key + 1;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
/* Non-writeable fields. */
|
|
|
|
if (mask->ipv4_proto || mask->ipv4_frag)
|
|
|
|
return -EINVAL;
|
|
|
|
} else {
|
|
|
|
if (ipv4_key->ipv4_proto != flow_key->ip.proto)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (ipv4_key->ipv4_frag != flow_key->ip.frag)
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
break;
|
2020-02-20 06:23:09 +00:00
|
|
|
}
|
|
|
|
case OVS_KEY_ATTR_IPV6: {
|
|
|
|
const struct ovs_key_ipv6 *ipv6_key;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2014-10-06 12:05:13 +00:00
|
|
|
if (eth_type != htons(ETH_P_IPV6))
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
ipv6_key = nla_data(ovs_key);
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
if (masked) {
|
|
|
|
const struct ovs_key_ipv6 *mask = ipv6_key + 1;
|
|
|
|
|
|
|
|
/* Non-writeable fields. */
|
|
|
|
if (mask->ipv6_proto || mask->ipv6_frag)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* Invalid bits in the flow label mask? */
|
|
|
|
if (ntohl(mask->ipv6_label) & 0xFFF00000)
|
|
|
|
return -EINVAL;
|
|
|
|
} else {
|
|
|
|
if (ipv6_key->ipv6_proto != flow_key->ip.proto)
|
|
|
|
return -EINVAL;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
if (ipv6_key->ipv6_frag != flow_key->ip.frag)
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
break;
|
2020-02-20 06:23:09 +00:00
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
case OVS_KEY_ATTR_TCP:
|
2015-02-05 21:40:49 +00:00
|
|
|
if ((eth_type != htons(ETH_P_IP) &&
|
|
|
|
eth_type != htons(ETH_P_IPV6)) ||
|
|
|
|
flow_key->ip.proto != IPPROTO_TCP)
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
break;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
case OVS_KEY_ATTR_UDP:
|
2015-02-05 21:40:49 +00:00
|
|
|
if ((eth_type != htons(ETH_P_IP) &&
|
|
|
|
eth_type != htons(ETH_P_IPV6)) ||
|
|
|
|
flow_key->ip.proto != IPPROTO_UDP)
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
break;
|
2014-10-06 12:05:13 +00:00
|
|
|
|
|
|
|
case OVS_KEY_ATTR_MPLS:
|
|
|
|
if (!eth_p_mpls(eth_type))
|
|
|
|
return -EINVAL;
|
|
|
|
break;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
case OVS_KEY_ATTR_SCTP:
|
2015-02-05 21:40:49 +00:00
|
|
|
if ((eth_type != htons(ETH_P_IP) &&
|
|
|
|
eth_type != htons(ETH_P_IPV6)) ||
|
|
|
|
flow_key->ip.proto != IPPROTO_SCTP)
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
break;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2017-11-07 13:07:02 +00:00
|
|
|
case OVS_KEY_ATTR_NSH:
|
|
|
|
if (eth_type != htons(ETH_P_NSH))
|
|
|
|
return -EINVAL;
|
|
|
|
if (!validate_nsh(nla_data(a), masked, false, log))
|
|
|
|
return -EINVAL;
|
|
|
|
break;
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
/* Convert non-masked non-tunnel set actions to masked set actions. */
|
|
|
|
if (!masked && key_type != OVS_KEY_ATTR_TUNNEL) {
|
|
|
|
int start, len = key_len * 2;
|
|
|
|
struct nlattr *at;
|
|
|
|
|
|
|
|
*skip_copy = true;
|
|
|
|
|
|
|
|
start = add_nested_action_start(sfa,
|
|
|
|
OVS_ACTION_ATTR_SET_TO_MASKED,
|
|
|
|
log);
|
|
|
|
if (start < 0)
|
|
|
|
return start;
|
|
|
|
|
|
|
|
at = __add_action(sfa, key_type, NULL, len, log);
|
|
|
|
if (IS_ERR(at))
|
|
|
|
return PTR_ERR(at);
|
|
|
|
|
|
|
|
memcpy(nla_data(at), nla_data(ovs_key), key_len); /* Key. */
|
|
|
|
memset(nla_data(at) + key_len, 0xff, key_len); /* Mask. */
|
|
|
|
/* Clear non-writeable bits from otherwise writeable fields. */
|
|
|
|
if (key_type == OVS_KEY_ATTR_IPV6) {
|
|
|
|
struct ovs_key_ipv6 *mask = nla_data(at) + key_len;
|
|
|
|
|
|
|
|
mask->ipv6_label &= htonl(0x000FFFFF);
|
|
|
|
}
|
|
|
|
add_nested_action_end(*sfa, start);
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int validate_userspace(const struct nlattr *attr)
|
|
|
|
{
|
|
|
|
static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
|
|
|
|
[OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
|
|
|
|
[OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC },
|
2014-11-06 14:51:24 +00:00
|
|
|
[OVS_USERSPACE_ATTR_EGRESS_TUN_PORT] = {.type = NLA_U32 },
|
2013-10-04 01:16:47 +00:00
|
|
|
};
|
|
|
|
struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
|
|
|
|
int error;
|
|
|
|
|
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 12:07:28 +00:00
|
|
|
error = nla_parse_nested_deprecated(a, OVS_USERSPACE_ATTR_MAX, attr,
|
|
|
|
userspace_policy, NULL);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
if (!a[OVS_USERSPACE_ATTR_PID] ||
|
|
|
|
!nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-03-26 00:43:46 +00:00
|
|
|
static const struct nla_policy cpl_policy[OVS_CHECK_PKT_LEN_ATTR_MAX + 1] = {
|
|
|
|
[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN] = {.type = NLA_U16 },
|
|
|
|
[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER] = {.type = NLA_NESTED },
|
|
|
|
[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL] = {.type = NLA_NESTED },
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Validate a check_pkt_len action and copy it into @sfa.
 *
 * The action must carry a non-zero OVS_CHECK_PKT_LEN_ATTR_PKT_LEN and both
 * nested action lists; anything else is rejected with -EINVAL.  The copy
 * written to @sfa is an OVS_ACTION_ATTR_CHECK_PKT_LEN container holding, in
 * this order: an OVS_CHECK_PKT_LEN_ATTR_ARG (struct check_pkt_len_arg), the
 * "less or equal" action list and the "greater" action list.  Both lists
 * are validated and copied by __ovs_nla_copy_actions() at @depth + 1.
 *
 * Each exec_for_* flag in the argument is true when @last is true or when
 * actions_may_change_flow() returns false for that list.
 *
 * Returns 0 on success or the first negative errno reported by a helper.
 */
static int validate_and_copy_check_pkt_len(struct net *net,
					   const struct nlattr *attr,
					   const struct sw_flow_key *key,
					   struct sw_flow_actions **sfa,
					   __be16 eth_type, __be16 vlan_tci,
					   u32 mpls_label_count,
					   bool log, bool last, u32 depth)
{
	const struct nlattr *if_greater, *if_less_eq, *pkt_len;
	struct nlattr *tb[OVS_CHECK_PKT_LEN_ATTR_MAX + 1];
	struct check_pkt_len_arg arg;
	int outer, inner;
	int err;

	err = nla_parse_deprecated_strict(tb, OVS_CHECK_PKT_LEN_ATTR_MAX,
					  nla_data(attr), nla_len(attr),
					  cpl_policy, NULL);
	if (err)
		return err;

	pkt_len = tb[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN];
	if (!pkt_len || !nla_get_u16(pkt_len))
		return -EINVAL;

	if_less_eq = tb[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL];
	if_greater = tb[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER];

	/* Both nested action lists are mandatory. */
	if (!(if_greater && if_less_eq))
		return -EINVAL;

	/* Validation done; open the outer container and start copying. */
	outer = add_nested_action_start(sfa, OVS_ACTION_ATTR_CHECK_PKT_LEN,
					log);
	if (outer < 0)
		return outer;

	arg.pkt_len = nla_get_u16(pkt_len);
	arg.exec_for_lesser_equal =
		last || !actions_may_change_flow(if_less_eq);
	arg.exec_for_greater =
		last || !actions_may_change_flow(if_greater);

	err = ovs_nla_add_action(sfa, OVS_CHECK_PKT_LEN_ATTR_ARG, &arg,
				 sizeof(arg), log);
	if (err)
		return err;

	/* Actions for packets whose length is less than or equal. */
	inner = add_nested_action_start(sfa,
		OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL, log);
	if (inner < 0)
		return inner;

	err = __ovs_nla_copy_actions(net, if_less_eq, key, sfa,
				     eth_type, vlan_tci, mpls_label_count, log,
				     depth + 1);
	if (err)
		return err;

	add_nested_action_end(*sfa, inner);

	/* Actions for packets whose length is greater. */
	inner = add_nested_action_start(sfa,
		OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER, log);
	if (inner < 0)
		return inner;

	err = __ovs_nla_copy_actions(net, if_greater, key, sfa,
				     eth_type, vlan_tci, mpls_label_count, log,
				     depth + 1);
	if (err)
		return err;

	add_nested_action_end(*sfa, inner);
	add_nested_action_end(*sfa, outer);

	return 0;
}
|
|
|
|
|
2024-07-04 08:56:56 +00:00
|
|
|
static int validate_psample(const struct nlattr *attr)
|
|
|
|
{
|
|
|
|
static const struct nla_policy policy[OVS_PSAMPLE_ATTR_MAX + 1] = {
|
|
|
|
[OVS_PSAMPLE_ATTR_GROUP] = { .type = NLA_U32 },
|
|
|
|
[OVS_PSAMPLE_ATTR_COOKIE] = {
|
|
|
|
.type = NLA_BINARY,
|
|
|
|
.len = OVS_PSAMPLE_COOKIE_MAX_SIZE,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
struct nlattr *a[OVS_PSAMPLE_ATTR_MAX + 1];
|
|
|
|
int err;
|
|
|
|
|
|
|
|
if (!IS_ENABLED(CONFIG_PSAMPLE))
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
err = nla_parse_nested(a, OVS_PSAMPLE_ATTR_MAX, attr, policy, NULL);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
return a[OVS_PSAMPLE_ATTR_GROUP] ? 0 : -EINVAL;
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
static int copy_action(const struct nlattr *from,
|
2014-11-06 15:03:05 +00:00
|
|
|
struct sw_flow_actions **sfa, bool log)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
|
|
|
int totlen = NLA_ALIGN(from->nla_len);
|
|
|
|
struct nlattr *to;
|
|
|
|
|
2014-11-06 15:03:05 +00:00
|
|
|
to = reserve_sfa_size(sfa, from->nla_len, log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (IS_ERR(to))
|
|
|
|
return PTR_ERR(to);
|
|
|
|
|
|
|
|
memcpy(to, from, totlen);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-08-26 18:31:48 +00:00
|
|
|
static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
|
2014-10-06 12:05:13 +00:00
|
|
|
const struct sw_flow_key *key,
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
struct sw_flow_actions **sfa,
|
2019-11-04 01:57:44 +00:00
|
|
|
__be16 eth_type, __be16 vlan_tci,
|
2024-02-07 13:24:15 +00:00
|
|
|
u32 mpls_label_count, bool log,
|
|
|
|
u32 depth)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
2016-11-10 15:28:22 +00:00
|
|
|
u8 mac_proto = ovs_key_mac_proto(key);
|
2013-10-04 01:16:47 +00:00
|
|
|
const struct nlattr *a;
|
|
|
|
int rem, err;
|
|
|
|
|
2024-02-07 13:24:15 +00:00
|
|
|
if (depth > OVS_COPY_ACTIONS_MAX_DEPTH)
|
|
|
|
return -EOVERFLOW;
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
nla_for_each_nested(a, attr, rem) {
|
|
|
|
/* Expected argument lengths, (u32)-1 for variable length. */
|
|
|
|
static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
|
|
|
|
[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
|
2014-09-16 02:37:25 +00:00
|
|
|
[OVS_ACTION_ATTR_RECIRC] = sizeof(u32),
|
2013-10-04 01:16:47 +00:00
|
|
|
[OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
|
2014-10-06 12:05:13 +00:00
|
|
|
[OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls),
|
|
|
|
[OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16),
|
2013-10-04 01:16:47 +00:00
|
|
|
[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
|
|
|
|
[OVS_ACTION_ATTR_POP_VLAN] = 0,
|
|
|
|
[OVS_ACTION_ATTR_SET] = (u32)-1,
|
2015-02-05 21:40:49 +00:00
|
|
|
[OVS_ACTION_ATTR_SET_MASKED] = (u32)-1,
|
2014-09-16 02:37:25 +00:00
|
|
|
[OVS_ACTION_ATTR_SAMPLE] = (u32)-1,
|
2015-08-26 18:31:48 +00:00
|
|
|
[OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
|
|
|
|
[OVS_ACTION_ATTR_CT] = (u32)-1,
|
2017-10-10 20:54:44 +00:00
|
|
|
[OVS_ACTION_ATTR_CT_CLEAR] = 0,
|
2016-06-10 18:49:33 +00:00
|
|
|
[OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
|
2016-11-10 15:28:23 +00:00
|
|
|
[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
|
|
|
|
[OVS_ACTION_ATTR_POP_ETH] = 0,
|
2017-11-07 13:07:02 +00:00
|
|
|
[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
|
|
|
|
[OVS_ACTION_ATTR_POP_NSH] = 0,
|
2017-11-10 20:09:43 +00:00
|
|
|
[OVS_ACTION_ATTR_METER] = sizeof(u32),
|
2018-07-02 15:18:03 +00:00
|
|
|
[OVS_ACTION_ATTR_CLONE] = (u32)-1,
|
2019-03-26 00:43:46 +00:00
|
|
|
[OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1,
|
2019-12-21 03:20:46 +00:00
|
|
|
[OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls),
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
[OVS_ACTION_ATTR_DEC_TTL] = (u32)-1,
|
2023-08-11 14:12:50 +00:00
|
|
|
[OVS_ACTION_ATTR_DROP] = sizeof(u32),
|
2024-07-04 08:56:56 +00:00
|
|
|
[OVS_ACTION_ATTR_PSAMPLE] = (u32)-1,
|
2013-10-04 01:16:47 +00:00
|
|
|
};
|
|
|
|
const struct ovs_action_push_vlan *vlan;
|
|
|
|
int type = nla_type(a);
|
|
|
|
bool skip_copy;
|
|
|
|
|
|
|
|
if (type > OVS_ACTION_ATTR_MAX ||
|
|
|
|
(action_lens[type] != nla_len(a) &&
|
|
|
|
action_lens[type] != (u32)-1))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
skip_copy = false;
|
|
|
|
switch (type) {
|
|
|
|
case OVS_ACTION_ATTR_UNSPEC:
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_USERSPACE:
|
|
|
|
err = validate_userspace(a);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_OUTPUT:
|
|
|
|
if (nla_get_u32(a) >= DP_MAX_PORTS)
|
|
|
|
return -EINVAL;
|
|
|
|
break;
|
|
|
|
|
2016-06-10 18:49:33 +00:00
|
|
|
case OVS_ACTION_ATTR_TRUNC: {
|
|
|
|
const struct ovs_action_trunc *trunc = nla_data(a);
|
|
|
|
|
|
|
|
if (trunc->max_len < ETH_HLEN)
|
|
|
|
return -EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-09-16 02:37:25 +00:00
|
|
|
case OVS_ACTION_ATTR_HASH: {
|
|
|
|
const struct ovs_action_hash *act_hash = nla_data(a);
|
|
|
|
|
|
|
|
switch (act_hash->hash_alg) {
|
|
|
|
case OVS_HASH_ALG_L4:
|
net: openvswitch: add support for l4 symmetric hashing
Since its introduction, the ovs module execute_hash action allowed
hash algorithms other than the skb->l4_hash to be used. However,
additional hash algorithms were not implemented. This means flows
requiring different hash distributions weren't able to use the
kernel datapath.
Now, introduce support for symmetric hashing algorithm as an
alternative hash supported by the ovs module using the flow
dissector.
Output of flow using l4_sym hash:
recirc_id(0),in_port(3),eth(),eth_type(0x0800),
ipv4(dst=64.0.0.0/192.0.0.0,proto=6,frag=no), packets:30473425,
bytes:45902883702, used:0.000s, flags:SP.,
actions:hash(sym_l4(0)),recirc(0xd)
Some performance testing with no GRO/GSO, two veths, single flow:
hash(l4(0)): 4.35 GBits/s
hash(l4_sym(0)): 4.24 GBits/s
Signed-off-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-06-09 13:59:55 +00:00
|
|
|
fallthrough;
|
|
|
|
case OVS_HASH_ALG_SYM_L4:
|
2014-09-16 02:37:25 +00:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_POP_VLAN:
|
2016-11-10 15:28:22 +00:00
|
|
|
if (mac_proto != MAC_PROTO_ETHERNET)
|
|
|
|
return -EINVAL;
|
2014-10-06 12:05:13 +00:00
|
|
|
vlan_tci = htons(0);
|
2013-10-04 01:16:47 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_PUSH_VLAN:
|
2016-11-10 15:28:22 +00:00
|
|
|
if (mac_proto != MAC_PROTO_ETHERNET)
|
|
|
|
return -EINVAL;
|
2013-10-04 01:16:47 +00:00
|
|
|
vlan = nla_data(a);
|
2016-09-07 16:56:59 +00:00
|
|
|
if (!eth_type_vlan(vlan->vlan_tpid))
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
2018-11-08 17:44:50 +00:00
|
|
|
if (!(vlan->vlan_tci & htons(VLAN_CFI_MASK)))
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
2014-10-06 12:05:13 +00:00
|
|
|
vlan_tci = vlan->vlan_tci;
|
2013-10-04 01:16:47 +00:00
|
|
|
break;
|
|
|
|
|
2014-09-16 02:37:25 +00:00
|
|
|
case OVS_ACTION_ATTR_RECIRC:
|
|
|
|
break;
|
|
|
|
|
2019-12-21 03:20:46 +00:00
|
|
|
case OVS_ACTION_ATTR_ADD_MPLS: {
|
|
|
|
const struct ovs_action_add_mpls *mpls = nla_data(a);
|
|
|
|
|
|
|
|
if (!eth_p_mpls(mpls->mpls_ethertype))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (mpls->tun_flags & OVS_MPLS_L3_TUNNEL_FLAG_MASK) {
|
|
|
|
if (vlan_tci & htons(VLAN_CFI_MASK) ||
|
|
|
|
(eth_type != htons(ETH_P_IP) &&
|
|
|
|
eth_type != htons(ETH_P_IPV6) &&
|
|
|
|
eth_type != htons(ETH_P_ARP) &&
|
|
|
|
eth_type != htons(ETH_P_RARP) &&
|
|
|
|
!eth_p_mpls(eth_type)))
|
|
|
|
return -EINVAL;
|
|
|
|
mpls_label_count++;
|
|
|
|
} else {
|
|
|
|
if (mac_proto == MAC_PROTO_ETHERNET) {
|
|
|
|
mpls_label_count = 1;
|
|
|
|
mac_proto = MAC_PROTO_NONE;
|
|
|
|
} else {
|
|
|
|
mpls_label_count++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
eth_type = mpls->mpls_ethertype;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-10-06 12:05:13 +00:00
|
|
|
case OVS_ACTION_ATTR_PUSH_MPLS: {
|
|
|
|
const struct ovs_action_push_mpls *mpls = nla_data(a);
|
|
|
|
|
|
|
|
if (!eth_p_mpls(mpls->mpls_ethertype))
|
|
|
|
return -EINVAL;
|
|
|
|
/* Prohibit push MPLS other than to a white list
|
|
|
|
* for packets that have a known tag order.
|
|
|
|
*/
|
2018-11-08 17:44:50 +00:00
|
|
|
if (vlan_tci & htons(VLAN_CFI_MASK) ||
|
2014-10-06 12:05:13 +00:00
|
|
|
(eth_type != htons(ETH_P_IP) &&
|
|
|
|
eth_type != htons(ETH_P_IPV6) &&
|
|
|
|
eth_type != htons(ETH_P_ARP) &&
|
|
|
|
eth_type != htons(ETH_P_RARP) &&
|
|
|
|
!eth_p_mpls(eth_type)))
|
|
|
|
return -EINVAL;
|
|
|
|
eth_type = mpls->mpls_ethertype;
|
2019-11-04 01:57:44 +00:00
|
|
|
mpls_label_count++;
|
2014-10-06 12:05:13 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2019-11-04 01:57:44 +00:00
|
|
|
case OVS_ACTION_ATTR_POP_MPLS: {
|
|
|
|
__be16 proto;
|
2018-11-08 17:44:50 +00:00
|
|
|
if (vlan_tci & htons(VLAN_CFI_MASK) ||
|
2014-10-06 12:05:13 +00:00
|
|
|
!eth_p_mpls(eth_type))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2019-11-04 01:57:44 +00:00
|
|
|
/* Disallow subsequent L2.5+ set actions and mpls_pop
|
|
|
|
* actions once the last MPLS label in the packet is
|
2022-09-07 04:03:46 +00:00
|
|
|
* popped as there is no check here to ensure that
|
2019-11-04 01:57:44 +00:00
|
|
|
* the new eth type is valid and thus set actions could
|
|
|
|
* write off the end of the packet or otherwise corrupt
|
|
|
|
* it.
|
2014-10-06 12:05:13 +00:00
|
|
|
*
|
|
|
|
* Support for these actions is planned using packet
|
|
|
|
* recirculation.
|
|
|
|
*/
|
2019-11-04 01:57:44 +00:00
|
|
|
proto = nla_get_be16(a);
|
2019-12-21 03:20:46 +00:00
|
|
|
|
|
|
|
if (proto == htons(ETH_P_TEB) &&
|
|
|
|
mac_proto != MAC_PROTO_NONE)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2019-11-04 01:57:44 +00:00
|
|
|
mpls_label_count--;
|
|
|
|
|
|
|
|
if (!eth_p_mpls(proto) || !mpls_label_count)
|
|
|
|
eth_type = htons(0);
|
|
|
|
else
|
|
|
|
eth_type = proto;
|
|
|
|
|
2014-10-06 12:05:13 +00:00
|
|
|
break;
|
2019-11-04 01:57:44 +00:00
|
|
|
}
|
2014-10-06 12:05:13 +00:00
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
case OVS_ACTION_ATTR_SET:
|
2014-10-06 12:05:13 +00:00
|
|
|
err = validate_set(a, key, sfa,
|
2016-11-10 15:28:22 +00:00
|
|
|
&skip_copy, mac_proto, eth_type,
|
|
|
|
false, log);
|
2015-02-05 21:40:49 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_SET_MASKED:
|
|
|
|
err = validate_set(a, key, sfa,
|
2016-11-10 15:28:22 +00:00
|
|
|
&skip_copy, mac_proto, eth_type,
|
|
|
|
true, log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
|
|
|
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
case OVS_ACTION_ATTR_SAMPLE: {
|
|
|
|
bool last = nla_is_last(a, rem);
|
|
|
|
|
|
|
|
err = validate_and_copy_sample(net, a, key, sfa,
|
|
|
|
eth_type, vlan_tci,
|
2019-11-04 01:57:44 +00:00
|
|
|
mpls_label_count,
|
2024-02-07 13:24:15 +00:00
|
|
|
log, last, depth);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
skip_copy = true;
|
|
|
|
break;
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2015-08-26 18:31:48 +00:00
|
|
|
case OVS_ACTION_ATTR_CT:
|
|
|
|
err = ovs_ct_copy_action(net, a, key, sfa, log);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
skip_copy = true;
|
|
|
|
break;
|
|
|
|
|
2017-10-10 20:54:44 +00:00
|
|
|
case OVS_ACTION_ATTR_CT_CLEAR:
|
|
|
|
break;
|
|
|
|
|
2016-11-10 15:28:23 +00:00
|
|
|
case OVS_ACTION_ATTR_PUSH_ETH:
|
|
|
|
/* Disallow pushing an Ethernet header if one
|
|
|
|
* is already present */
|
|
|
|
if (mac_proto != MAC_PROTO_NONE)
|
|
|
|
return -EINVAL;
|
2018-10-31 17:52:03 +00:00
|
|
|
mac_proto = MAC_PROTO_ETHERNET;
|
2016-11-10 15:28:23 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_POP_ETH:
|
|
|
|
if (mac_proto != MAC_PROTO_ETHERNET)
|
|
|
|
return -EINVAL;
|
2018-11-08 17:44:50 +00:00
|
|
|
if (vlan_tci & htons(VLAN_CFI_MASK))
|
2016-11-10 15:28:23 +00:00
|
|
|
return -EINVAL;
|
2018-10-31 17:52:03 +00:00
|
|
|
mac_proto = MAC_PROTO_NONE;
|
2016-11-10 15:28:23 +00:00
|
|
|
break;
|
|
|
|
|
2017-11-07 13:07:02 +00:00
|
|
|
case OVS_ACTION_ATTR_PUSH_NSH:
|
|
|
|
if (mac_proto != MAC_PROTO_ETHERNET) {
|
|
|
|
u8 next_proto;
|
|
|
|
|
|
|
|
next_proto = tun_p_from_eth_p(eth_type);
|
|
|
|
if (!next_proto)
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
mac_proto = MAC_PROTO_NONE;
|
|
|
|
if (!validate_nsh(nla_data(a), false, true, true))
|
|
|
|
return -EINVAL;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_POP_NSH: {
|
|
|
|
__be16 inner_proto;
|
|
|
|
|
|
|
|
if (eth_type != htons(ETH_P_NSH))
|
|
|
|
return -EINVAL;
|
|
|
|
inner_proto = tun_p_to_eth_p(key->nsh.base.np);
|
|
|
|
if (!inner_proto)
|
|
|
|
return -EINVAL;
|
|
|
|
if (key->nsh.base.np == TUN_P_ETHERNET)
|
|
|
|
mac_proto = MAC_PROTO_ETHERNET;
|
|
|
|
else
|
|
|
|
mac_proto = MAC_PROTO_NONE;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2017-11-10 20:09:43 +00:00
|
|
|
case OVS_ACTION_ATTR_METER:
|
|
|
|
/* Non-existent meters are simply ignored. */
|
|
|
|
break;
|
|
|
|
|
2018-07-02 15:18:03 +00:00
|
|
|
case OVS_ACTION_ATTR_CLONE: {
|
|
|
|
bool last = nla_is_last(a, rem);
|
|
|
|
|
|
|
|
err = validate_and_copy_clone(net, a, key, sfa,
|
|
|
|
eth_type, vlan_tci,
|
2019-11-04 01:57:44 +00:00
|
|
|
mpls_label_count,
|
2024-02-07 13:24:15 +00:00
|
|
|
log, last, depth);
|
2018-07-02 15:18:03 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
skip_copy = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2019-03-26 00:43:46 +00:00
|
|
|
case OVS_ACTION_ATTR_CHECK_PKT_LEN: {
|
|
|
|
bool last = nla_is_last(a, rem);
|
|
|
|
|
|
|
|
err = validate_and_copy_check_pkt_len(net, a, key, sfa,
|
|
|
|
eth_type,
|
2019-11-04 01:57:44 +00:00
|
|
|
vlan_tci,
|
|
|
|
mpls_label_count,
|
2024-02-07 13:24:15 +00:00
|
|
|
log, last,
|
|
|
|
depth);
|
2019-03-26 00:43:46 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
skip_copy = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
case OVS_ACTION_ATTR_DEC_TTL:
|
|
|
|
err = validate_and_copy_dec_ttl(net, a, key, sfa,
|
|
|
|
eth_type, vlan_tci,
|
2024-02-07 13:24:15 +00:00
|
|
|
mpls_label_count, log,
|
|
|
|
depth);
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
skip_copy = true;
|
|
|
|
break;
|
|
|
|
|
2023-08-11 14:12:50 +00:00
|
|
|
case OVS_ACTION_ATTR_DROP:
|
|
|
|
if (!nla_is_last(a, rem))
|
|
|
|
return -EINVAL;
|
|
|
|
break;
|
|
|
|
|
2024-07-04 08:56:56 +00:00
|
|
|
case OVS_ACTION_ATTR_PSAMPLE:
|
|
|
|
err = validate_psample(a);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
default:
|
2014-11-06 15:03:05 +00:00
|
|
|
OVS_NLERR(log, "Unknown Action type %d", type);
|
2013-10-04 01:16:47 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
if (!skip_copy) {
|
2014-11-06 15:03:05 +00:00
|
|
|
err = copy_action(a, sfa, log);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rem > 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
/* 'key' must be the masked key. */
|
2015-08-26 18:31:48 +00:00
|
|
|
int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
|
2014-10-06 12:05:13 +00:00
|
|
|
const struct sw_flow_key *key,
|
2014-11-06 15:03:05 +00:00
|
|
|
struct sw_flow_actions **sfa, bool log)
|
2014-10-06 12:05:13 +00:00
|
|
|
{
|
2014-10-19 18:19:51 +00:00
|
|
|
int err;
|
2019-11-04 01:57:44 +00:00
|
|
|
u32 mpls_label_count = 0;
|
2014-10-19 18:19:51 +00:00
|
|
|
|
2017-11-25 14:02:12 +00:00
|
|
|
*sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE));
|
2014-10-19 18:19:51 +00:00
|
|
|
if (IS_ERR(*sfa))
|
|
|
|
return PTR_ERR(*sfa);
|
|
|
|
|
2019-11-04 01:57:44 +00:00
|
|
|
if (eth_p_mpls(key->eth.type))
|
|
|
|
mpls_label_count = hweight_long(key->mpls.num_labels_mask);
|
|
|
|
|
2015-08-26 18:31:44 +00:00
|
|
|
(*sfa)->orig_len = nla_len(attr);
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type,
|
2024-02-07 13:24:15 +00:00
|
|
|
key->eth.vlan.tci, mpls_label_count, log,
|
|
|
|
0);
|
2014-10-19 18:19:51 +00:00
|
|
|
if (err)
|
2015-07-21 08:44:03 +00:00
|
|
|
ovs_nla_free_flow_actions(*sfa);
|
2014-10-19 18:19:51 +00:00
|
|
|
|
|
|
|
return err;
|
2014-10-06 12:05:13 +00:00
|
|
|
}
|
|
|
|
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datapath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitations: First, there is a 3-level
nesting restriction, enforced at flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoids a recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
static int sample_action_to_attr(const struct nlattr *attr,
|
|
|
|
struct sk_buff *skb)
|
2013-10-04 01:16:47 +00:00
|
|
|
{
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
struct nlattr *start, *ac_start = NULL, *sample_arg;
|
|
|
|
int err = 0, rem = nla_len(attr);
|
|
|
|
const struct sample_arg *arg;
|
|
|
|
struct nlattr *actions;
|
2013-10-04 01:16:47 +00:00
|
|
|
|
2019-04-26 09:13:06 +00:00
|
|
|
start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SAMPLE);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (!start)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
sample_arg = nla_data(attr);
|
|
|
|
arg = nla_data(sample_arg);
|
|
|
|
actions = nla_next(sample_arg, &rem);
|
2013-10-04 01:16:47 +00:00
|
|
|
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
if (nla_put_u32(skb, OVS_SAMPLE_ATTR_PROBABILITY, arg->probability)) {
|
|
|
|
err = -EMSGSIZE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2019-04-26 09:13:06 +00:00
|
|
|
ac_start = nla_nest_start_noflag(skb, OVS_SAMPLE_ATTR_ACTIONS);
|
openvswitch: Optimize sample action for the clone use cases
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.
While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.
The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.
One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.
Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-20 23:32:29 +00:00
|
|
|
if (!ac_start) {
|
|
|
|
err = -EMSGSIZE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = ovs_nla_put_actions(actions, rem, skb);
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (err) {
|
|
|
|
nla_nest_cancel(skb, ac_start);
|
|
|
|
nla_nest_cancel(skb, start);
|
|
|
|
} else {
|
|
|
|
nla_nest_end(skb, ac_start);
|
|
|
|
nla_nest_end(skb, start);
|
2013-10-04 01:16:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2018-07-02 15:18:03 +00:00
|
|
|
static int clone_action_to_attr(const struct nlattr *attr,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct nlattr *start;
|
|
|
|
int err = 0, rem = nla_len(attr);
|
|
|
|
|
2019-04-26 09:13:06 +00:00
|
|
|
start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CLONE);
|
2018-07-02 15:18:03 +00:00
|
|
|
if (!start)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
net: openvswitch: don't send internal clone attribute to the userspace.
'OVS_CLONE_ATTR_EXEC' is an internal attribute that is used for
performance optimization inside the kernel. It's added by the kernel
while parsing user-provided actions and should not be sent during the
flow dump as it's not part of the uAPI.
The issue doesn't cause any significant problems to the ovs-vswitchd
process, because reported actions are not really used in the
application lifecycle and only supposed to be shown to a human via
ovs-dpctl flow dump. However, the action list is still incorrect
and causes the following error if the user wants to look at the
datapath flows:
# ovs-dpctl add-dp system@ovs-system
# ovs-dpctl add-flow "<flow match>" "clone(ct(commit),0)"
# ovs-dpctl dump-flows
<flow match>, packets:0, bytes:0, used:never,
actions:clone(bad length 4, expected -1 for: action0(01 00 00 00),
ct(commit),0)
With the fix:
# ovs-dpctl dump-flows
<flow match>, packets:0, bytes:0, used:never,
actions:clone(ct(commit),0)
Additionally fixed an incorrect attribute name in the comment.
Fixes: b233504033db ("openvswitch: kernel datapath clone action")
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Acked-by: Aaron Conole <aconole@redhat.com>
Link: https://lore.kernel.org/r/20220404104150.2865736-1-i.maximets@ovn.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-04-04 10:41:50 +00:00
|
|
|
/* Skipping the OVS_CLONE_ATTR_EXEC that is always the first attribute. */
|
|
|
|
attr = nla_next(nla_data(attr), &rem);
|
|
|
|
err = ovs_nla_put_actions(attr, rem, skb);
|
2018-07-02 15:18:03 +00:00
|
|
|
|
|
|
|
if (err)
|
|
|
|
nla_nest_cancel(skb, start);
|
|
|
|
else
|
|
|
|
nla_nest_end(skb, start);
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2019-03-26 00:43:46 +00:00
|
|
|
static int check_pkt_len_action_to_attr(const struct nlattr *attr,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct nlattr *start, *ac_start = NULL;
|
|
|
|
const struct check_pkt_len_arg *arg;
|
|
|
|
const struct nlattr *a, *cpl_arg;
|
|
|
|
int err = 0, rem = nla_len(attr);
|
|
|
|
|
2019-04-26 09:13:06 +00:00
|
|
|
start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CHECK_PKT_LEN);
|
2019-03-26 00:43:46 +00:00
|
|
|
if (!start)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
|
|
|
/* The first nested attribute in 'attr' is always
|
|
|
|
* 'OVS_CHECK_PKT_LEN_ATTR_ARG'.
|
|
|
|
*/
|
|
|
|
cpl_arg = nla_data(attr);
|
|
|
|
arg = nla_data(cpl_arg);
|
|
|
|
|
|
|
|
if (nla_put_u16(skb, OVS_CHECK_PKT_LEN_ATTR_PKT_LEN, arg->pkt_len)) {
|
|
|
|
err = -EMSGSIZE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Second nested attribute in 'attr' is always
|
|
|
|
* 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'.
|
|
|
|
*/
|
|
|
|
a = nla_next(cpl_arg, &rem);
|
2019-04-26 09:13:06 +00:00
|
|
|
ac_start = nla_nest_start_noflag(skb,
|
|
|
|
OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL);
|
2019-03-26 00:43:46 +00:00
|
|
|
if (!ac_start) {
|
|
|
|
err = -EMSGSIZE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
|
|
|
|
if (err) {
|
|
|
|
nla_nest_cancel(skb, ac_start);
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
nla_nest_end(skb, ac_start);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Third nested attribute in 'attr' is always
|
|
|
|
* OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER.
|
|
|
|
*/
|
|
|
|
a = nla_next(a, &rem);
|
2019-04-26 09:13:06 +00:00
|
|
|
ac_start = nla_nest_start_noflag(skb,
|
|
|
|
OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER);
|
2019-03-26 00:43:46 +00:00
|
|
|
if (!ac_start) {
|
|
|
|
err = -EMSGSIZE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
|
|
|
|
if (err) {
|
|
|
|
nla_nest_cancel(skb, ac_start);
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
nla_nest_end(skb, ac_start);
|
|
|
|
}
|
|
|
|
|
|
|
|
nla_nest_end(skb, start);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
nla_nest_cancel(skb, start);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
static int dec_ttl_action_to_attr(const struct nlattr *attr,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
2020-11-24 12:34:44 +00:00
|
|
|
struct nlattr *start, *action_start;
|
|
|
|
const struct nlattr *a;
|
|
|
|
int err = 0, rem;
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
|
|
|
|
start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL);
|
|
|
|
if (!start)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
2020-11-24 12:34:44 +00:00
|
|
|
nla_for_each_attr(a, nla_data(attr), nla_len(attr), rem) {
|
|
|
|
switch (nla_type(a)) {
|
|
|
|
case OVS_DEC_TTL_ATTR_ACTION:
|
|
|
|
|
|
|
|
action_start = nla_nest_start_noflag(skb, OVS_DEC_TTL_ATTR_ACTION);
|
|
|
|
if (!action_start) {
|
|
|
|
err = -EMSGSIZE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
|
|
|
|
if (err)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
nla_nest_end(skb, action_start);
|
|
|
|
break;
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
|
2020-11-24 12:34:44 +00:00
|
|
|
default:
|
|
|
|
/* Ignore all other option to be future compatible */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
nla_nest_end(skb, start);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
nla_nest_cancel(skb, start);
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
const struct nlattr *ovs_key = nla_data(a);
|
|
|
|
int key_type = nla_type(ovs_key);
|
|
|
|
struct nlattr *start;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
switch (key_type) {
|
2014-10-03 22:35:31 +00:00
|
|
|
case OVS_KEY_ATTR_TUNNEL_INFO: {
|
2015-07-21 08:44:03 +00:00
|
|
|
struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key);
|
|
|
|
struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info;
|
2014-10-03 22:35:31 +00:00
|
|
|
|
2019-04-26 09:13:06 +00:00
|
|
|
start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SET);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (!start)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
2015-12-18 10:43:15 +00:00
|
|
|
err = ip_tun_to_nlattr(skb, &tun_info->key,
|
|
|
|
ip_tunnel_info_opts(tun_info),
|
|
|
|
tun_info->options_len,
|
2019-03-28 04:43:23 +00:00
|
|
|
ip_tunnel_info_af(tun_info), tun_info->mode);
|
2013-10-04 01:16:47 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
nla_nest_end(skb, start);
|
|
|
|
break;
|
2014-10-03 22:35:31 +00:00
|
|
|
}
|
2013-10-04 01:16:47 +00:00
|
|
|
default:
|
|
|
|
if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
|
|
|
|
return -EMSGSIZE;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
static int masked_set_action_to_set_action_attr(const struct nlattr *a,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
const struct nlattr *ovs_key = nla_data(a);
|
2015-03-03 02:49:56 +00:00
|
|
|
struct nlattr *nla;
|
2015-02-05 21:40:49 +00:00
|
|
|
size_t key_len = nla_len(ovs_key) / 2;
|
|
|
|
|
|
|
|
/* Revert the conversion we did from a non-masked set action to
|
|
|
|
* masked set action.
|
|
|
|
*/
|
2019-04-26 09:13:06 +00:00
|
|
|
nla = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SET);
|
2015-03-03 02:49:56 +00:00
|
|
|
if (!nla)
|
2015-02-05 21:40:49 +00:00
|
|
|
return -EMSGSIZE;
|
|
|
|
|
2015-03-03 02:49:56 +00:00
|
|
|
if (nla_put(skb, nla_type(ovs_key), key_len, nla_data(ovs_key)))
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
|
|
|
nla_nest_end(skb, nla);
|
2015-02-05 21:40:49 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
const struct nlattr *a;
|
|
|
|
int rem, err;
|
|
|
|
|
|
|
|
nla_for_each_attr(a, attr, len, rem) {
|
|
|
|
int type = nla_type(a);
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case OVS_ACTION_ATTR_SET:
|
|
|
|
err = set_action_to_attr(a, skb);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
|
|
|
|
2015-02-05 21:40:49 +00:00
|
|
|
case OVS_ACTION_ATTR_SET_TO_MASKED:
|
|
|
|
err = masked_set_action_to_set_action_attr(a, skb);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
case OVS_ACTION_ATTR_SAMPLE:
|
|
|
|
err = sample_action_to_attr(a, skb);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
2015-08-26 18:31:48 +00:00
|
|
|
|
|
|
|
case OVS_ACTION_ATTR_CT:
|
|
|
|
err = ovs_ct_action_to_attr(nla_data(a), skb);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
|
|
|
|
2018-07-02 15:18:03 +00:00
|
|
|
case OVS_ACTION_ATTR_CLONE:
|
|
|
|
err = clone_action_to_attr(a, skb);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
|
|
|
|
2019-03-26 00:43:46 +00:00
|
|
|
case OVS_ACTION_ATTR_CHECK_PKT_LEN:
|
|
|
|
err = check_pkt_len_action_to_attr(a, skb);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
|
|
|
|
openvswitch: add TTL decrement action
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, drop it
or execute an action passed via a nested attribute.
The default TTL expired action is to drop the packet.
Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
Tested with a corresponding change in the userspace:
# ovs-dpctl dump-flows
in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1
in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2
in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
# ping -c1 192.168.0.2 -t 42
IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
# ping -c1 192.168.0.2 -t 120
IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
# ping -c1 192.168.0.2 -t 1
#
Co-developed-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-15 13:20:56 +00:00
|
|
|
case OVS_ACTION_ATTR_DEC_TTL:
|
|
|
|
err = dec_ttl_action_to_attr(a, skb);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
break;
|
|
|
|
|
2013-10-04 01:16:47 +00:00
|
|
|
default:
|
|
|
|
if (nla_put(skb, type, nla_len(a), nla_data(a)))
|
|
|
|
return -EMSGSIZE;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|