Merge branch 'Add IP-TFS mode to xfrm'

Christian Hopps says:

====================
* Summary of Changes:

This patchset adds a new xfrm mode implementing on-demand IP-TFS. IP-TFS
(AggFrag encapsulation) has been standardized in RFC 9347.

  Link: https://www.rfc-editor.org/rfc/rfc9347.txt

This feature supports demand-driven (i.e., non-constant send rate)
IP-TFS to take advantage of the AGGFRAG ESP payload encapsulation. This
payload type supports aggregation and fragmentation of the inner IP
packet stream, which in turn yields higher small-packet bandwidth as
well as reducing MTU/PMTU issues. Congestion control is unimplemented
as the send rate is demand driven rather than constant.

To allow loading this functionality as a module, a set of callbacks
(struct xfrm_mode_cbs) has been added to xfrm as well; a registration
sketch follows this summary.

Patchset Structure:
-------------------

The first 5 commits change the net and xfrm infrastructure to support
the callbacks and add more generic IP-TFS definitions that may be used
outside the actual IP-TFS implementation.

  - xfrm: config: add CONFIG_XFRM_IPTFS
  - include: uapi: protocol number and packet structs for AGGFRAG in ESP
  - xfrm: netlink: add config (netlink) options
  - xfrm: add mode_cbs module functionality
  - xfrm: add generic iptfs defines and functionality

The last 10 commits constitute the IP-TFS implementation, built up in
layers to make review easier. The first 9 of these apply to a single
file, `net/xfrm/xfrm_iptfs.c`; the last commit adds a new tracepoint
header file along with the use of these new tracepoint calls.

  - xfrm: iptfs: add new iptfs xfrm mode impl
  - xfrm: iptfs: add user packet (tunnel ingress) handling
  - xfrm: iptfs: share page fragments of inner packets
  - xfrm: iptfs: add fragmenting of larger than MTU user packets
  - xfrm: iptfs: add basic receive packet (tunnel egress) handling
  - xfrm: iptfs: handle received fragmented inner packets
  - xfrm: iptfs: add reusing received skb for the tunnel egress packet
  - xfrm: iptfs: add skb-fragment sharing code
  - xfrm: iptfs: handle reordering of received packets
  - xfrm: iptfs: add tracepoint functionality
====================
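
For orientation, a minimal registration sketch (illustrative only, not
code from this series; the example_* names are placeholders): a mode
module built on the new xfrm_mode_cbs hooks registers its callback
table when loaded and unregisters it on unload.

  #include <linux/module.h>
  #include <net/xfrm.h>

  static int example_init_state(struct xfrm_state *x)
  {
          /* allocate per-SA mode state and hang it off x->mode_data */
          return 0;
  }

  static void example_destroy_state(struct xfrm_state *x)
  {
          /* free whatever init_state attached to x->mode_data */
  }

  static const struct xfrm_mode_cbs example_mode_cbs = {
          .owner         = THIS_MODULE,
          .init_state    = example_init_state,
          .destroy_state = example_destroy_state,
          /* .user_init, .input, .output, etc. omitted in this sketch */
  };

  static int __init example_mode_init(void)
  {
          return xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &example_mode_cbs);
  }
  module_init(example_mode_init);

  static void __exit example_mode_exit(void)
  {
          xfrm_unregister_mode_cbs(XFRM_MODE_IPTFS);
  }
  module_exit(example_mode_exit);

  MODULE_LICENSE("GPL");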

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Steffen Klassert 2024-12-09 10:09:40 +01:00
commit 59af653a69
21 changed files with 3292 additions and 19 deletions

View File

@ -38,6 +38,7 @@
#define XFRM_PROTO_COMP 108
#define XFRM_PROTO_IPIP 4
#define XFRM_PROTO_IPV6 41
#define XFRM_PROTO_IPTFS IPPROTO_AGGFRAG
#define XFRM_PROTO_ROUTING IPPROTO_ROUTING
#define XFRM_PROTO_DSTOPTS IPPROTO_DSTOPTS
@ -213,6 +214,7 @@ struct xfrm_state {
u16 family;
xfrm_address_t saddr;
int header_len;
int enc_hdr_len;
int trailer_len;
u32 extra_flags;
struct xfrm_mark smark;
@ -303,6 +305,9 @@ struct xfrm_state {
* interpreted by xfrm_type methods. */
void *data;
u8 dir;
const struct xfrm_mode_cbs *mode_cbs;
void *mode_data;
};
static inline struct net *xs_net(struct xfrm_state *x)
@ -460,6 +465,45 @@ struct xfrm_type_offload {
int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family);
void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
/**
* struct xfrm_mode_cbs - XFRM mode callbacks
* @owner: module owner or NULL
* @init_state: Add/init mode specific state in `xfrm_state *x`
* @clone_state: Copy mode specific values from `orig` to new state `x`
* @destroy_state: Cleanup mode specific state from `xfrm_state *x`
* @user_init: Process mode specific netlink attributes from user
* @copy_to_user: Add netlink attributes to `attrs` based on state in `x`
* @sa_len: Return space required to store mode specific netlink attributes
* @get_inner_mtu: Return avail payload space after removing encap overhead
* @input: Process received packet from SA using mode
* @output: Output given packet using mode
* @prepare_output: Add mode specific encapsulation to packet in skb. On return
* `transport_header` should point at ESP header, `network_header` should
* point at outer IP header and `mac_header` should point at the
* protocol/nexthdr field of the outer IP.
*
* One should examine and understand the specific uses of these callbacks in
* xfrm for further detail on how and when these functions are called. RTSL.
*/
struct xfrm_mode_cbs {
struct module *owner;
int (*init_state)(struct xfrm_state *x);
int (*clone_state)(struct xfrm_state *x, struct xfrm_state *orig);
void (*destroy_state)(struct xfrm_state *x);
int (*user_init)(struct net *net, struct xfrm_state *x,
struct nlattr **attrs,
struct netlink_ext_ack *extack);
int (*copy_to_user)(struct xfrm_state *x, struct sk_buff *skb);
unsigned int (*sa_len)(const struct xfrm_state *x);
u32 (*get_inner_mtu)(struct xfrm_state *x, int outer_mtu);
int (*input)(struct xfrm_state *x, struct sk_buff *skb);
int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
int (*prepare_output)(struct xfrm_state *x, struct sk_buff *skb);
};
int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs);
void xfrm_unregister_mode_cbs(u8 mode);
static inline int xfrm_af2proto(unsigned int family)
{
switch(family) {

View File

@ -79,6 +79,8 @@ enum {
#define IPPROTO_MPLS IPPROTO_MPLS
IPPROTO_ETHERNET = 143, /* Ethernet-within-IPv6 Encapsulation */
#define IPPROTO_ETHERNET IPPROTO_ETHERNET
IPPROTO_AGGFRAG = 144, /* AGGFRAG in ESP (RFC 9347) */
#define IPPROTO_AGGFRAG IPPROTO_AGGFRAG
IPPROTO_RAW = 255, /* Raw IP packets */
#define IPPROTO_RAW IPPROTO_RAW
IPPROTO_SMC = 256, /* Shared Memory Communications */

View File

@ -137,6 +137,22 @@ struct ip_beet_phdr {
__u8 reserved;
};
struct ip_iptfs_hdr {
__u8 subtype; /* 0*: basic, 1: CC */
__u8 flags;
__be16 block_offset;
};
struct ip_iptfs_cc_hdr {
__u8 subtype; /* 0: basic, 1*: CC */
__u8 flags;
__be16 block_offset;
__be32 loss_rate;
__be64 rtt_adelay_xdelay;
__be32 tval;
__be32 techo;
};
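/*
 * Illustrative sketch (not part of this patch): every IP-TFS ESP payload
 * begins with the 4-byte AGGFRAG basic header above.  Per RFC 9347,
 * block_offset (network byte order) is the number of payload octets,
 * counted from the end of this header, that still belong to an inner
 * packet continued from the previous payload; 0 means the payload starts
 * on an inner-packet boundary.  The helper name is hypothetical.
 */
static inline u16 example_aggfrag_block_offset(const void *esp_payload)
{
        const struct ip_iptfs_hdr *h = esp_payload;

        /* basic (non-CC) subtype assumed; a real parser checks h->subtype */
        return ntohs(h->block_offset);
}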
/* index values for the variables in ipv4_devconf */
enum
{

View File

@ -14,7 +14,8 @@ enum {
IPSEC_MODE_ANY = 0, /* We do not support this for SA */
IPSEC_MODE_TRANSPORT = 1,
IPSEC_MODE_TUNNEL = 2,
IPSEC_MODE_BEET = 3
IPSEC_MODE_BEET = 3,
IPSEC_MODE_IPTFS = 4
};
enum {

View File

@ -339,6 +339,8 @@ enum
LINUX_MIB_XFRMACQUIREERROR, /* XfrmAcquireError */
LINUX_MIB_XFRMOUTSTATEDIRERROR, /* XfrmOutStateDirError */
LINUX_MIB_XFRMINSTATEDIRERROR, /* XfrmInStateDirError */
LINUX_MIB_XFRMINIPTFSERROR, /* XfrmInIptfsError */
LINUX_MIB_XFRMOUTNOQSPACE, /* XfrmOutNoQueueSpace */
__LINUX_MIB_XFRMMAX
};

View File

@ -158,7 +158,8 @@ enum {
#define XFRM_MODE_ROUTEOPTIMIZATION 2
#define XFRM_MODE_IN_TRIGGER 3
#define XFRM_MODE_BEET 4
#define XFRM_MODE_MAX 5
#define XFRM_MODE_IPTFS 5
#define XFRM_MODE_MAX 6
/* Netlink configuration messages. */
enum {
@ -323,6 +324,12 @@ enum xfrm_attr_type_t {
XFRMA_SA_DIR, /* __u8 */
XFRMA_NAT_KEEPALIVE_INTERVAL, /* __u32 in seconds for NAT keepalive */
XFRMA_SA_PCPU, /* __u32 */
XFRMA_IPTFS_DROP_TIME, /* __u32 in: usec to wait for next seq */
XFRMA_IPTFS_REORDER_WINDOW, /* __u16 in: reorder window size (pkts) */
XFRMA_IPTFS_DONT_FRAG, /* out: don't use fragmentation */
XFRMA_IPTFS_INIT_DELAY, /* __u32 out: initial packet wait delay (usec) */
XFRMA_IPTFS_MAX_QSIZE, /* __u32 out: max ingress queue size (octets) */
XFRMA_IPTFS_PKT_SIZE, /* __u32 out: size of outer packet, 0 for PMTU */
__XFRMA_MAX
#define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */

View File

@ -816,7 +816,8 @@ int esp_input_done2(struct sk_buff *skb, int err)
}
skb_pull_rcsum(skb, hlen);
if (x->props.mode == XFRM_MODE_TUNNEL)
if (x->props.mode == XFRM_MODE_TUNNEL ||
x->props.mode == XFRM_MODE_IPTFS)
skb_reset_transport_header(skb);
else
skb_set_transport_header(skb, -ihl);

View File

@ -859,7 +859,8 @@ int esp6_input_done2(struct sk_buff *skb, int err)
skb_postpull_rcsum(skb, skb_network_header(skb),
skb_network_header_len(skb));
skb_pull_rcsum(skb, hlen);
if (x->props.mode == XFRM_MODE_TUNNEL)
if (x->props.mode == XFRM_MODE_TUNNEL ||
x->props.mode == XFRM_MODE_IPTFS)
skb_reset_transport_header(skb);
else
skb_set_transport_header(skb, -hdr_len);

View File

@ -112,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode)
return true;
}
return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL;
return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL ||
mode == XFRM_MODE_IPTFS;
}
static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,

View File

@ -135,6 +135,22 @@ config NET_KEY_MIGRATE
If unsure, say N.
config XFRM_IPTFS
tristate "IPsec IP-TFS/AGGFRAG (RFC 9347) encapsulation support"
depends on XFRM
help
Information on the IP-TFS/AGGFRAG encapsulation can be found
in RFC 9347. This feature supports demand driven (i.e.,
non-constant send rate) IP-TFS to take advantage of the
AGGFRAG ESP payload encapsulation. This payload type
supports aggregation and fragmentation of the inner IP
packet stream which in turn yields higher small-packet
bandwidth as well as reducing MTU/PMTU issues. Congestion
control is unimplemented as the send rate is demand driven
rather than constant.
If unsure, say N.
config XFRM_ESPINTCP
bool

View File

@ -21,5 +21,6 @@ obj-$(CONFIG_XFRM_USER) += xfrm_user.o
obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
obj-$(CONFIG_XFRM_IPTFS) += xfrm_iptfs.o
obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o

218
net/xfrm/trace_iptfs.h Normal file
View File

@ -0,0 +1,218 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* xfrm_trace_iptfs.h
*
* August 12 2023, Christian Hopps <chopps@labn.net>
*
* Copyright (c) 2023, LabN Consulting, L.L.C.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM iptfs
#if !defined(_TRACE_IPTFS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IPTFS_H
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/tracepoint.h>
#include <net/ip.h>
struct xfrm_iptfs_data;
TRACE_EVENT(iptfs_egress_recv,
TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u16 blkoff),
TP_ARGS(skb, xtfs, blkoff),
TP_STRUCT__entry(__field(struct sk_buff *, skb)
__field(void *, head)
__field(void *, head_pg_addr)
__field(void *, pg0addr)
__field(u32, skb_len)
__field(u32, data_len)
__field(u32, headroom)
__field(u32, tailroom)
__field(u32, tail)
__field(u32, end)
__field(u32, pg0off)
__field(u8, head_frag)
__field(u8, frag_list)
__field(u8, nr_frags)
__field(u16, blkoff)),
TP_fast_assign(__entry->skb = skb;
__entry->head = skb->head;
__entry->skb_len = skb->len;
__entry->data_len = skb->data_len;
__entry->headroom = skb_headroom(skb);
__entry->tailroom = skb_tailroom(skb);
__entry->tail = (u32)skb->tail;
__entry->end = (u32)skb->end;
__entry->head_frag = skb->head_frag;
__entry->frag_list = (bool)skb_shinfo(skb)->frag_list;
__entry->nr_frags = skb_shinfo(skb)->nr_frags;
__entry->blkoff = blkoff;
__entry->head_pg_addr = page_address(virt_to_head_page(skb->head));
__entry->pg0addr = (__entry->nr_frags
? page_address(netmem_to_page(skb_shinfo(skb)->frags[0].netmem))
: NULL);
__entry->pg0off = (__entry->nr_frags
? skb_shinfo(skb)->frags[0].offset
: 0);
),
TP_printk("EGRESS: skb=%p len=%u data_len=%u headroom=%u head_frag=%u frag_list=%u nr_frags=%u blkoff=%u\n\t\ttailroom=%u tail=%u end=%u head=%p hdpgaddr=%p pg0->addr=%p pg0->data=%p pg0->off=%u",
__entry->skb, __entry->skb_len, __entry->data_len, __entry->headroom,
__entry->head_frag, __entry->frag_list, __entry->nr_frags, __entry->blkoff,
__entry->tailroom, __entry->tail, __entry->end, __entry->head,
__entry->head_pg_addr, __entry->pg0addr, __entry->pg0addr + __entry->pg0off,
__entry->pg0off)
)
DECLARE_EVENT_CLASS(iptfs_ingress_preq_event,
TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs,
u32 pmtu, u8 was_gso),
TP_ARGS(skb, xtfs, pmtu, was_gso),
TP_STRUCT__entry(__field(struct sk_buff *, skb)
__field(u32, skb_len)
__field(u32, data_len)
__field(u32, pmtu)
__field(u32, queue_size)
__field(u32, proto_seq)
__field(u8, proto)
__field(u8, was_gso)
),
TP_fast_assign(__entry->skb = skb;
__entry->skb_len = skb->len;
__entry->data_len = skb->data_len;
__entry->queue_size =
xtfs->cfg.max_queue_size - xtfs->queue_size;
__entry->proto = __trace_ip_proto(ip_hdr(skb));
__entry->proto_seq = __trace_ip_proto_seq(ip_hdr(skb));
__entry->pmtu = pmtu;
__entry->was_gso = was_gso;
),
TP_printk("INGRPREQ: skb=%p len=%u data_len=%u qsize=%u proto=%u proto_seq=%u pmtu=%u was_gso=%u",
__entry->skb, __entry->skb_len, __entry->data_len,
__entry->queue_size, __entry->proto, __entry->proto_seq,
__entry->pmtu, __entry->was_gso));
DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_enqueue,
TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso),
TP_ARGS(skb, xtfs, pmtu, was_gso));
DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_no_queue_space,
TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso),
TP_ARGS(skb, xtfs, pmtu, was_gso));
DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_too_big,
TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso),
TP_ARGS(skb, xtfs, pmtu, was_gso));
DECLARE_EVENT_CLASS(iptfs_ingress_postq_event,
TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, struct iphdr *iph),
TP_ARGS(skb, mtu, blkoff, iph),
TP_STRUCT__entry(__field(struct sk_buff *, skb)
__field(u32, skb_len)
__field(u32, data_len)
__field(u32, mtu)
__field(u32, proto_seq)
__field(u16, blkoff)
__field(u8, proto)),
TP_fast_assign(__entry->skb = skb;
__entry->skb_len = skb->len;
__entry->data_len = skb->data_len;
__entry->mtu = mtu;
__entry->blkoff = blkoff;
__entry->proto = iph ? __trace_ip_proto(iph) : 0;
__entry->proto_seq = iph ? __trace_ip_proto_seq(iph) : 0;
),
TP_printk("INGRPSTQ: skb=%p len=%u data_len=%u mtu=%u blkoff=%u proto=%u proto_seq=%u",
__entry->skb, __entry->skb_len, __entry->data_len, __entry->mtu,
__entry->blkoff, __entry->proto, __entry->proto_seq));
DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_dequeue,
TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
struct iphdr *iph),
TP_ARGS(skb, mtu, blkoff, iph));
DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_fragmenting,
TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
struct iphdr *iph),
TP_ARGS(skb, mtu, blkoff, iph));
DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_final_fragment,
TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
struct iphdr *iph),
TP_ARGS(skb, mtu, blkoff, iph));
DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_toobig,
TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
struct iphdr *iph),
TP_ARGS(skb, mtu, blkoff, iph));
TRACE_EVENT(iptfs_ingress_nth_peek,
TP_PROTO(struct sk_buff *skb, u32 remaining),
TP_ARGS(skb, remaining),
TP_STRUCT__entry(__field(struct sk_buff *, skb)
__field(u32, skb_len)
__field(u32, remaining)),
TP_fast_assign(__entry->skb = skb;
__entry->skb_len = skb->len;
__entry->remaining = remaining;
),
TP_printk("INGRPSTQ: NTHPEEK: skb=%p len=%u remaining=%u",
__entry->skb, __entry->skb_len, __entry->remaining));
TRACE_EVENT(iptfs_ingress_nth_add, TP_PROTO(struct sk_buff *skb, u8 share_ok),
TP_ARGS(skb, share_ok),
TP_STRUCT__entry(__field(struct sk_buff *, skb)
__field(u32, skb_len)
__field(u32, data_len)
__field(u8, share_ok)
__field(u8, head_frag)
__field(u8, pp_recycle)
__field(u8, cloned)
__field(u8, shared)
__field(u8, nr_frags)
__field(u8, frag_list)
),
TP_fast_assign(__entry->skb = skb;
__entry->skb_len = skb->len;
__entry->data_len = skb->data_len;
__entry->share_ok = share_ok;
__entry->head_frag = skb->head_frag;
__entry->pp_recycle = skb->pp_recycle;
__entry->cloned = skb_cloned(skb);
__entry->shared = skb_shared(skb);
__entry->nr_frags = skb_shinfo(skb)->nr_frags;
__entry->frag_list = (bool)skb_shinfo(skb)->frag_list;
),
TP_printk("INGRPSTQ: NTHADD: skb=%p len=%u data_len=%u share_ok=%u head_frag=%u pp_recycle=%u cloned=%u shared=%u nr_frags=%u frag_list=%u",
__entry->skb, __entry->skb_len, __entry->data_len, __entry->share_ok,
__entry->head_frag, __entry->pp_recycle, __entry->cloned, __entry->shared,
__entry->nr_frags, __entry->frag_list));
DECLARE_EVENT_CLASS(iptfs_timer_event,
TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
TP_ARGS(xtfs, time_val),
TP_STRUCT__entry(__field(u64, time_val)
__field(u64, set_time)),
TP_fast_assign(__entry->time_val = time_val;
__entry->set_time = xtfs->iptfs_settime;
),
TP_printk("TIMER: set_time=%llu time_val=%llu",
__entry->set_time, __entry->time_val));
DEFINE_EVENT(iptfs_timer_event, iptfs_timer_start,
TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
TP_ARGS(xtfs, time_val));
DEFINE_EVENT(iptfs_timer_event, iptfs_timer_expire,
TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
TP_ARGS(xtfs, time_val));
#endif /* _TRACE_IPTFS_H */
/* This part must be outside protection */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../net/xfrm
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace_iptfs
#include <trace/define_trace.h>
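/*
 * Illustrative usage sketch (not part of this header): exactly one .c file,
 * presumably net/xfrm/xfrm_iptfs.c in this series, emits the tracepoint
 * bodies by defining CREATE_TRACE_POINTS before including this header:
 *
 *	#define CREATE_TRACE_POINTS
 *	#include "trace_iptfs.h"
 *
 * after which the generated trace_*() helpers can be called from the code
 * paths, e.g. trace_iptfs_timer_start(xtfs, time_val) when arming the send
 * timer, or trace_iptfs_egress_recv(skb, xtfs, blkoff) on tunnel egress.
 * Once the module is loaded, the events appear under
 * /sys/kernel/tracing/events/iptfs/.
 */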

View File

@ -284,9 +284,15 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src)
case XFRMA_SA_DIR:
case XFRMA_NAT_KEEPALIVE_INTERVAL:
case XFRMA_SA_PCPU:
case XFRMA_IPTFS_DROP_TIME:
case XFRMA_IPTFS_REORDER_WINDOW:
case XFRMA_IPTFS_DONT_FRAG:
case XFRMA_IPTFS_INIT_DELAY:
case XFRMA_IPTFS_MAX_QSIZE:
case XFRMA_IPTFS_PKT_SIZE:
return xfrm_nla_cpy(dst, src, nla_len(src));
default:
BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU);
BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE);
pr_warn_once("unsupported nla_type %d\n", src->nla_type);
return -EOPNOTSUPP;
}
@ -441,7 +447,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla,
int err;
if (type > XFRMA_MAX) {
BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU);
BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE);
NL_SET_ERR_MSG(extack, "Bad attribute");
return -EOPNOTSUPP;
}

View File

@ -42,7 +42,8 @@ static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb,
skb->transport_header = skb->network_header + hsize;
skb_reset_mac_len(skb);
pskb_pull(skb, skb->mac_len + x->props.header_len);
pskb_pull(skb,
skb->mac_len + x->props.header_len - x->props.enc_hdr_len);
}
static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb,
@ -68,6 +69,7 @@ static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb,
static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb)
{
switch (x->outer_mode.encap) {
case XFRM_MODE_IPTFS:
case XFRM_MODE_TUNNEL:
if (x->outer_mode.family == AF_INET)
return __xfrm_mode_tunnel_prep(x, skb,

View File

@ -446,6 +446,9 @@ static int xfrm_inner_mode_input(struct xfrm_state *x,
WARN_ON_ONCE(1);
break;
default:
if (x->mode_cbs && x->mode_cbs->input)
return x->mode_cbs->input(x, skb);
WARN_ON_ONCE(1);
break;
}
@ -453,6 +456,10 @@ static int xfrm_inner_mode_input(struct xfrm_state *x,
return -EOPNOTSUPP;
}
/* NOTE: encap_type - In addition to the normal (non-negative) values for
* encap_type, a negative value of -1 or -2 can be used to resume/restart this
* function after a previous invocation early terminated for async operation.
*/
int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
{
const struct xfrm_state_afinfo *afinfo;
@ -489,6 +496,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
family = x->props.family;
/* An encap_type of -2 indicates reconstructed inner packet */
if (encap_type == -2)
goto resume_decapped;
/* An encap_type of -1 indicates async resumption. */
if (encap_type == -1) {
async = 1;
@ -679,11 +690,14 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
XFRM_MODE_SKB_CB(skb)->protocol = nexthdr;
if (xfrm_inner_mode_input(x, skb)) {
err = xfrm_inner_mode_input(x, skb);
if (err == -EINPROGRESS)
return 0;
else if (err) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
goto drop;
}
resume_decapped:
if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) {
decaps = 1;
break;
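/*
 * Illustrative note (not part of this diff): a mode ->input() hook that
 * consumes the outer AGGFRAG payload for reassembly returns -EINPROGRESS
 * above, telling xfrm_input() that the skb has been taken.  Each inner
 * packet it later reconstructs is presumably fed back through
 *
 *	xfrm_input(inner_skb, 0, 0, -2);
 *
 * so that processing resumes at the resume_decapped label, with the SA
 * already attached via the skb's sec_path.
 */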

2764
net/xfrm/xfrm_iptfs.c Normal file

File diff suppressed because it is too large.

View File

@ -472,6 +472,8 @@ static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb)
WARN_ON_ONCE(1);
break;
default:
if (x->mode_cbs && x->mode_cbs->prepare_output)
return x->mode_cbs->prepare_output(x, skb);
WARN_ON_ONCE(1);
break;
}
@ -675,6 +677,10 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x)
return;
}
if (x->outer_mode.encap == XFRM_MODE_IPTFS) {
xo->inner_ipproto = IPPROTO_AGGFRAG;
return;
}
/* non-Tunnel Mode */
if (!skb->encapsulation)

View File

@ -2497,6 +2497,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];
if (tmpl->mode == XFRM_MODE_TUNNEL ||
tmpl->mode == XFRM_MODE_IPTFS ||
tmpl->mode == XFRM_MODE_BEET) {
remote = &tmpl->id.daddr;
local = &tmpl->saddr;
@ -2748,6 +2749,9 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
dst1->input = dst_discard;
if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) {
dst1->output = xfrm[i]->mode_cbs->output;
} else {
rcu_read_lock();
afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
if (likely(afinfo))
@ -2755,6 +2759,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
else
dst1->output = dst_discard_out;
rcu_read_unlock();
}
xdst_prev = xdst;
@ -3290,7 +3295,8 @@ struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
ok:
xfrm_pols_put(pols, drop_pols);
if (dst && dst->xfrm &&
dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
(dst->xfrm->props.mode == XFRM_MODE_TUNNEL ||
dst->xfrm->props.mode == XFRM_MODE_IPTFS))
dst->flags |= DST_XFRM_TUNNEL;
return dst;
@ -4519,6 +4525,7 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm
switch (t->mode) {
case XFRM_MODE_TUNNEL:
case XFRM_MODE_BEET:
case XFRM_MODE_IPTFS:
if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
m->old_family) &&
xfrm_addr_equal(&t->saddr, &m->old_saddr,
@ -4561,7 +4568,8 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
continue;
n++;
if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
pol->xfrm_vec[i].mode != XFRM_MODE_BEET)
pol->xfrm_vec[i].mode != XFRM_MODE_BEET &&
pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS)
continue;
/* update endpoints */
memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,

View File

@ -43,6 +43,8 @@ static const struct snmp_mib xfrm_mib_list[] = {
SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR),
SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR),
SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR),
SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR),
SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE),
SNMP_MIB_SENTINEL
};

View File

@ -467,6 +467,11 @@ static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = {
.flags = XFRM_MODE_FLAG_TUNNEL,
.family = AF_INET,
},
[XFRM_MODE_IPTFS] = {
.encap = XFRM_MODE_IPTFS,
.flags = XFRM_MODE_FLAG_TUNNEL,
.family = AF_INET,
},
};
static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = {
@ -488,6 +493,11 @@ static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = {
.flags = XFRM_MODE_FLAG_TUNNEL,
.family = AF_INET6,
},
[XFRM_MODE_IPTFS] = {
.encap = XFRM_MODE_IPTFS,
.flags = XFRM_MODE_FLAG_TUNNEL,
.family = AF_INET6,
},
};
static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
@ -515,6 +525,60 @@ static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
return NULL;
}
static const struct xfrm_mode_cbs __rcu *xfrm_mode_cbs_map[XFRM_MODE_MAX];
static DEFINE_SPINLOCK(xfrm_mode_cbs_map_lock);
int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs)
{
if (mode >= XFRM_MODE_MAX)
return -EINVAL;
spin_lock_bh(&xfrm_mode_cbs_map_lock);
rcu_assign_pointer(xfrm_mode_cbs_map[mode], mode_cbs);
spin_unlock_bh(&xfrm_mode_cbs_map_lock);
return 0;
}
EXPORT_SYMBOL(xfrm_register_mode_cbs);
void xfrm_unregister_mode_cbs(u8 mode)
{
if (mode >= XFRM_MODE_MAX)
return;
spin_lock_bh(&xfrm_mode_cbs_map_lock);
RCU_INIT_POINTER(xfrm_mode_cbs_map[mode], NULL);
spin_unlock_bh(&xfrm_mode_cbs_map_lock);
synchronize_rcu();
}
EXPORT_SYMBOL(xfrm_unregister_mode_cbs);
static const struct xfrm_mode_cbs *xfrm_get_mode_cbs(u8 mode)
{
const struct xfrm_mode_cbs *cbs;
bool try_load = true;
if (mode >= XFRM_MODE_MAX)
return NULL;
retry:
rcu_read_lock();
cbs = rcu_dereference(xfrm_mode_cbs_map[mode]);
if (cbs && !try_module_get(cbs->owner))
cbs = NULL;
rcu_read_unlock();
if (mode == XFRM_MODE_IPTFS && !cbs && try_load) {
request_module("xfrm-iptfs");
try_load = false;
goto retry;
}
return cbs;
}
void xfrm_state_free(struct xfrm_state *x)
{
kmem_cache_free(xfrm_state_cache, x);
@ -523,6 +587,8 @@ EXPORT_SYMBOL(xfrm_state_free);
static void ___xfrm_state_destroy(struct xfrm_state *x)
{
if (x->mode_cbs && x->mode_cbs->destroy_state)
x->mode_cbs->destroy_state(x);
hrtimer_cancel(&x->mtimer);
del_timer_sync(&x->rtimer);
kfree(x->aead);
@ -682,6 +748,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
x->replay_maxdiff = 0;
x->pcpu_num = UINT_MAX;
spin_lock_init(&x->lock);
x->mode_data = NULL;
}
return x;
}
@ -1945,6 +2012,12 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
x->new_mapping_sport = 0;
x->dir = orig->dir;
x->mode_cbs = orig->mode_cbs;
if (x->mode_cbs && x->mode_cbs->clone_state) {
if (x->mode_cbs->clone_state(x, orig))
goto error;
}
return x;
error:
@ -2271,6 +2344,7 @@ static int __xfrm6_state_sort_cmp(const void *p)
#endif
case XFRM_MODE_TUNNEL:
case XFRM_MODE_BEET:
case XFRM_MODE_IPTFS:
return 4;
}
return 5;
@ -2297,6 +2371,7 @@ static int __xfrm6_tmpl_sort_cmp(const void *p)
#endif
case XFRM_MODE_TUNNEL:
case XFRM_MODE_BEET:
case XFRM_MODE_IPTFS:
return 3;
}
return 4;
@ -2986,6 +3061,9 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
case XFRM_MODE_TUNNEL:
break;
default:
if (x->mode_cbs && x->mode_cbs->get_inner_mtu)
return x->mode_cbs->get_inner_mtu(x, mtu);
WARN_ON_ONCE(1);
break;
}
@ -3086,6 +3164,12 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
}
}
x->mode_cbs = xfrm_get_mode_cbs(x->props.mode);
if (x->mode_cbs) {
if (x->mode_cbs->init_state)
err = x->mode_cbs->init_state(x);
module_put(x->mode_cbs->owner);
}
error:
return err;
}

View File

@ -301,6 +301,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode");
goto out;
}
if ((attrs[XFRMA_IPTFS_DROP_TIME] ||
attrs[XFRMA_IPTFS_REORDER_WINDOW] ||
attrs[XFRMA_IPTFS_DONT_FRAG] ||
attrs[XFRMA_IPTFS_INIT_DELAY] ||
attrs[XFRMA_IPTFS_MAX_QSIZE] ||
attrs[XFRMA_IPTFS_PKT_SIZE]) &&
p->mode != XFRM_MODE_IPTFS) {
NL_SET_ERR_MSG(extack, "IP-TFS options can only be used in IP-TFS mode");
goto out;
}
break;
case IPPROTO_COMP:
@ -373,6 +383,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
case XFRM_MODE_ROUTEOPTIMIZATION:
case XFRM_MODE_BEET:
break;
case XFRM_MODE_IPTFS:
if (p->id.proto != IPPROTO_ESP) {
NL_SET_ERR_MSG(extack, "IP-TFS mode only supported with ESP");
goto out;
}
if (sa_dir == 0) {
NL_SET_ERR_MSG(extack, "IP-TFS mode requires in or out direction attribute");
goto out;
}
break;
default:
NL_SET_ERR_MSG(extack, "Unsupported mode");
@ -421,6 +441,18 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
goto out;
}
if (attrs[XFRMA_IPTFS_DROP_TIME]) {
NL_SET_ERR_MSG(extack, "IP-TFS drop time should not be set for output SA");
err = -EINVAL;
goto out;
}
if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) {
NL_SET_ERR_MSG(extack, "IP-TFS reorder window should not be set for output SA");
err = -EINVAL;
goto out;
}
if (attrs[XFRMA_REPLAY_VAL]) {
struct xfrm_replay_state *replay;
@ -458,6 +490,30 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
}
}
if (attrs[XFRMA_IPTFS_DONT_FRAG]) {
NL_SET_ERR_MSG(extack, "IP-TFS don't fragment should not be set for input SA");
err = -EINVAL;
goto out;
}
if (attrs[XFRMA_IPTFS_INIT_DELAY]) {
NL_SET_ERR_MSG(extack, "IP-TFS initial delay should not be set for input SA");
err = -EINVAL;
goto out;
}
if (attrs[XFRMA_IPTFS_MAX_QSIZE]) {
NL_SET_ERR_MSG(extack, "IP-TFS max queue size should not be set for input SA");
err = -EINVAL;
goto out;
}
if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
NL_SET_ERR_MSG(extack, "IP-TFS packet size should not be set for input SA");
err = -EINVAL;
goto out;
}
}
if (!sa_dir && attrs[XFRMA_SA_PCPU]) {
@ -886,6 +942,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
goto error;
}
if (x->mode_cbs && x->mode_cbs->user_init) {
err = x->mode_cbs->user_init(net, x, attrs, extack);
if (err)
goto error;
}
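/*
 * Illustrative sketch (not part of this patch): a ->user_init() hook, called
 * from the block above, typically pulls its configuration out of the new
 * XFRMA_IPTFS_* attributes.  The config struct and field names below are
 * placeholders, not the real xfrm_iptfs.c definitions.
 */
struct example_iptfs_cfg {
        u32  max_queue_size;    /* octets */
        u32  drop_time_us;      /* usec to wait for a missing sequence number */
        bool dont_frag;
};

static int example_iptfs_user_init(struct net *net, struct xfrm_state *x,
                                   struct nlattr **attrs,
                                   struct netlink_ext_ack *extack)
{
        struct example_iptfs_cfg *cfg = x->mode_data;   /* set up by ->init_state() */

        if (attrs[XFRMA_IPTFS_MAX_QSIZE])
                cfg->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]);
        if (attrs[XFRMA_IPTFS_DROP_TIME])
                cfg->drop_time_us = nla_get_u32(attrs[XFRMA_IPTFS_DROP_TIME]);
        cfg->dont_frag = !!attrs[XFRMA_IPTFS_DONT_FRAG]; /* NLA_FLAG: presence == set */
        return 0;
}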
return x;
error:
@ -1301,6 +1363,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
if (ret)
goto out;
}
if (x->mode_cbs && x->mode_cbs->copy_to_user)
ret = x->mode_cbs->copy_to_user(x, skb);
if (ret)
goto out;
if (x->mapping_maxage) {
ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage);
if (ret)
@ -1958,6 +2024,8 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family,
return -EINVAL;
}
break;
case XFRM_MODE_IPTFS:
break;
default:
if (ut[i].family != prev_family) {
NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change");
@ -3220,6 +3288,12 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT),
[XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 },
[XFRMA_SA_PCPU] = { .type = NLA_U32 },
[XFRMA_IPTFS_DROP_TIME] = { .type = NLA_U32 },
[XFRMA_IPTFS_REORDER_WINDOW] = { .type = NLA_U16 },
[XFRMA_IPTFS_DONT_FRAG] = { .type = NLA_FLAG },
[XFRMA_IPTFS_INIT_DELAY] = { .type = NLA_U32 },
[XFRMA_IPTFS_MAX_QSIZE] = { .type = NLA_U32 },
[XFRMA_IPTFS_PKT_SIZE] = { .type = NLA_U32 },
};
EXPORT_SYMBOL_GPL(xfrma_policy);
@ -3554,6 +3628,9 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
if (x->nat_keepalive_interval)
l += nla_total_size(sizeof(x->nat_keepalive_interval));
if (x->mode_cbs && x->mode_cbs->sa_len)
l += x->mode_cbs->sa_len(x);
return l;
}
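/*
 * Illustrative sketch (not part of this patch): ->sa_len() and
 * ->copy_to_user() work as a pair -- sa_len() reserves netlink message space
 * for the mode's attributes and copy_to_user() emits them when the SA is
 * dumped.  The attribute choice and helper names here are examples only.
 */
static unsigned int example_iptfs_sa_len(const struct xfrm_state *x)
{
        /* one u32 attribute in this sketch */
        return nla_total_size(sizeof(u32));
}

static int example_iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
{
        u32 qsize = 0;  /* in real code: taken from the mode state at x->mode_data */

        return nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, qsize);
}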