linux-stable/include/net/sctp/structs.h

2211 lines
67 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SCTP kernel implementation
* (C) Copyright IBM Corp. 2001, 2004
* Copyright (c) 1999-2000 Cisco, Inc.
* Copyright (c) 1999-2001 Motorola, Inc.
* Copyright (c) 2001 Intel Corp.
*
* This file is part of the SCTP kernel implementation
*
* Please send any bug reports or fixes you make to the
* email addresses:
* lksctp developers <linux-sctp@vger.kernel.org>
*
* Written or modified by:
* Randall Stewart <randall@sctp.chicago.il.us>
* Ken Morneau <kmorneau@cisco.com>
* Qiaobing Xie <qxie1@email.mot.com>
* La Monte H.P. Yarroll <piggy@acm.org>
* Karl Knutson <karl@athena.chicago.il.us>
* Jon Grimm <jgrimm@us.ibm.com>
* Xingang Guo <xingang.guo@intel.com>
* Hui Huang <hui.huang@nokia.com>
* Sridhar Samudrala <sri@us.ibm.com>
* Daisy Chang <daisyc@us.ibm.com>
* Dajiang Zhang <dajiang.zhang@nokia.com>
* Ardelle Fan <ardelle.fan@intel.com>
* Ryan Layer <rmlayer@us.ibm.com>
* Anup Pemmaiah <pemmaiah@cc.usu.edu>
* Kevin Gao <kevin.gao@intel.com>
*/
#ifndef __sctp_structs_h__
#define __sctp_structs_h__
#include <linux/ktime.h>
#include <linux/generic-radix-tree.h>
#include <linux/rhashtable-types.h>
#include <linux/socket.h> /* linux/in.h needs this!! */
#include <linux/in.h> /* We get struct sockaddr_in. */
#include <linux/in6.h> /* We get struct in6_addr */
#include <linux/ipv6.h>
#include <asm/param.h> /* We get MAXHOSTNAMELEN. */
#include <linux/atomic.h> /* This gets us atomic counters. */
#include <linux/skbuff.h> /* We need sk_buff_head. */
#include <linux/workqueue.h> /* We need tq_struct. */
#include <linux/sctp.h> /* We need sctp* header structs. */
#include <net/sctp/auth.h> /* We need auth specific structs */
#include <net/ip.h> /* For inet_skb_parm */
/* A convenience structure for handling sockaddr structures.
* We should wean ourselves off this.
*/
union sctp_addr {
struct sockaddr_in v4;
struct sockaddr_in6 v6;
struct sockaddr sa;
};
/* Forward declarations for data structures. */
struct sctp_globals;
struct sctp_endpoint;
struct sctp_association;
struct sctp_transport;
struct sctp_packet;
struct sctp_chunk;
struct sctp_inq;
struct sctp_outq;
struct sctp_bind_addr;
struct sctp_ulpq;
struct sctp_ep_common;
struct crypto_shash;
struct sctp_stream;
#include <net/sctp/tsnmap.h>
#include <net/sctp/ulpevent.h>
#include <net/sctp/ulpqueue.h>
#include <net/sctp/stream_interleave.h>
/* Structures useful for managing bind/connect. */
struct sctp_bind_bucket {
unsigned short port;
signed char fastreuse;
signed char fastreuseport;
kuid_t fastuid;
struct hlist_node node;
struct hlist_head owner;
struct net *net;
};
struct sctp_bind_hashbucket {
spinlock_t lock;
struct hlist_head chain;
};
/* Used for hashing all associations. */
struct sctp_hashbucket {
rwlock_t lock;
struct hlist_head chain;
} __attribute__((__aligned__(8)));
/* The SCTP globals structure. */
extern struct sctp_globals {
/* This is a list of groups of functions for each address
* family that we support.
*/
struct list_head address_families;
/* This is the hash of all endpoints. */
struct sctp_hashbucket *ep_hashtable;
/* This is the sctp port control hash. */
struct sctp_bind_hashbucket *port_hashtable;
/* This is the hash of all transports. */
struct rhltable transport_hashtable;
/* Sizes of above hashtables. */
int ep_hashsize;
int port_hashsize;
/* Default initialization values to be applied to new associations. */
__u16 max_instreams;
__u16 max_outstreams;
/* Flag to indicate whether computing and verifying checksum
* is disabled. */
bool checksum_disable;
} sctp_globals;
#define sctp_max_instreams (sctp_globals.max_instreams)
#define sctp_max_outstreams (sctp_globals.max_outstreams)
#define sctp_address_families (sctp_globals.address_families)
#define sctp_ep_hashsize (sctp_globals.ep_hashsize)
#define sctp_ep_hashtable (sctp_globals.ep_hashtable)
#define sctp_port_hashsize (sctp_globals.port_hashsize)
#define sctp_port_hashtable (sctp_globals.port_hashtable)
#define sctp_transport_hashtable (sctp_globals.transport_hashtable)
#define sctp_checksum_disable (sctp_globals.checksum_disable)
/* SCTP Socket type: UDP or TCP style. */
enum sctp_socket_type {
SCTP_SOCKET_UDP = 0,
SCTP_SOCKET_UDP_HIGH_BANDWIDTH,
SCTP_SOCKET_TCP
};
/* Per socket SCTP information. */
struct sctp_sock {
/* inet_sock has to be the first member of sctp_sock */
struct inet_sock inet;
/* What kind of a socket is this? */
enum sctp_socket_type type;
/* PF_ family specific functions. */
struct sctp_pf *pf;
/* Access to HMAC transform. */
struct crypto_shash *hmac;
char *sctp_hmac_alg;
/* What is our base endpointer? */
struct sctp_endpoint *ep;
struct sctp_bind_bucket *bind_hash;
/* Various Socket Options. */
__u16 default_stream;
__u32 default_ppid;
__u16 default_flags;
__u32 default_context;
__u32 default_timetolive;
__u32 default_rcv_context;
int max_burst;
/* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
* the destination address every heartbeat interval. This value
* will be inherited by all new associations.
*/
__u32 hbinterval;
__u32 probe_interval;
__be16 udp_port;
__be16 encap_port;
/* This is the max_retrans value for new associations. */
__u16 pathmaxrxt;
__u32 flowlabel;
__u8 dscp;
__u16 pf_retrans;
__u16 ps_retrans;
/* The initial Path MTU to use for new associations. */
__u32 pathmtu;
/* The default SACK delay timeout for new associations. */
__u32 sackdelay;
__u32 sackfreq;
/* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */
__u32 param_flags;
__u32 default_ss;
struct sctp_rtoinfo rtoinfo;
struct sctp_paddrparams paddrparam;
struct sctp_assocparams assocparams;
sctp: Define usercopy region in SCTP proto slab cache The SCTP socket event notification subscription information need to be copied to/from userspace. In support of usercopy hardening, this patch defines a region in the struct proto slab cache in which userspace copy operations are allowed. Additionally moves the usercopy fields to be adjacent for the region to cover both. example usage trace: net/sctp/socket.c: sctp_getsockopt_events(...): ... copy_to_user(..., &sctp_sk(sk)->subscribe, len) sctp_setsockopt_events(...): ... copy_from_user(&sctp_sk(sk)->subscribe, ..., optlen) sctp_getsockopt_initmsg(...): ... copy_to_user(..., &sctp_sk(sk)->initmsg, len) This region is known as the slab cache's usercopy region. Slab caches can now check that each dynamically sized copy operation involving cache-managed memory falls entirely within the slab's usercopy region. This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY whitelisting code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Signed-off-by: David Windsor <dave@nullcore.net> [kees: split from network patch, move struct members adjacent] [kees: add SCTPv6 struct whitelist, provide usage trace] Cc: Vlad Yasevich <vyasevich@gmail.com> Cc: Neil Horman <nhorman@tuxdriver.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: linux-sctp@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook <keescook@chromium.org>
2017-08-24 23:57:57 +00:00
/*
* These two structures must be grouped together for the usercopy
* whitelist region.
*/
__u16 subscribe;
sctp: Define usercopy region in SCTP proto slab cache The SCTP socket event notification subscription information need to be copied to/from userspace. In support of usercopy hardening, this patch defines a region in the struct proto slab cache in which userspace copy operations are allowed. Additionally moves the usercopy fields to be adjacent for the region to cover both. example usage trace: net/sctp/socket.c: sctp_getsockopt_events(...): ... copy_to_user(..., &sctp_sk(sk)->subscribe, len) sctp_setsockopt_events(...): ... copy_from_user(&sctp_sk(sk)->subscribe, ..., optlen) sctp_getsockopt_initmsg(...): ... copy_to_user(..., &sctp_sk(sk)->initmsg, len) This region is known as the slab cache's usercopy region. Slab caches can now check that each dynamically sized copy operation involving cache-managed memory falls entirely within the slab's usercopy region. This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY whitelisting code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Signed-off-by: David Windsor <dave@nullcore.net> [kees: split from network patch, move struct members adjacent] [kees: add SCTPv6 struct whitelist, provide usage trace] Cc: Vlad Yasevich <vyasevich@gmail.com> Cc: Neil Horman <nhorman@tuxdriver.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: linux-sctp@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook <keescook@chromium.org>
2017-08-24 23:57:57 +00:00
struct sctp_initmsg initmsg;
int user_frag;
__u32 autoclose;
__u32 adaptation_ind;
__u32 pd_point;
__u16 nodelay:1,
sctp: add pf_expose per netns and sock and asoc As said in rfc7829, section 3, point 12: The SCTP stack SHOULD expose the PF state of its destination addresses to the ULP as well as provide the means to notify the ULP of state transitions of its destination addresses from active to PF, and vice versa. However, it is recommended that an SCTP stack implementing SCTP-PF also allows for the ULP to be kept ignorant of the PF state of its destinations and the associated state transitions, thus allowing for retention of the simpler state transition model of [RFC4960] in the ULP. Not only does it allow to expose the PF state to ULP, but also allow to ignore sctp-pf to ULP. So this patch is to add pf_expose per netns, sock and asoc. And in sctp_assoc_control_transport(), ulp_notify will be set to false if asoc->expose is not 'enabled' in next patch. It also allows a user to change pf_expose per netns by sysctl, and pf_expose per sock and asoc will be initialized with it. Note that pf_expose also works for SCTP_GET_PEER_ADDR_INFO sockopt, to not allow a user to query the state of a sctp-pf peer address when pf_expose is 'disabled', as said in section 7.3. v1->v2: - Fix a build warning noticed by Nathan Chancellor. v2->v3: - set pf_expose to UNUSED by default to keep compatible with old applications. v3->v4: - add a new entry for pf_expose on ip-sysctl.txt, as Marcelo suggested. - change this patch to 1/5, and move sctp_assoc_control_transport change into 2/5, as Marcelo suggested. - use SCTP_PF_EXPOSE_UNSET instead of SCTP_PF_EXPOSE_UNUSED, and set SCTP_PF_EXPOSE_UNSET to 0 in enum, as Marcelo suggested. Signed-off-by: Xin Long <lucien.xin@gmail.com> Acked-by: Neil Horman <nhorman@tuxdriver.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-08 05:20:32 +00:00
pf_expose:2,
sctp: add support for SCTP_REUSE_PORT sockopt This feature is actually already supported by sk->sk_reuse which can be set by socket level opt SO_REUSEADDR. But it's not working exactly as RFC6458 demands in section 8.1.27, like: - This option only supports one-to-one style SCTP sockets - This socket option must not be used after calling bind() or sctp_bindx(). Besides, SCTP_REUSE_PORT sockopt should be provided for user's programs. Otherwise, the programs with SCTP_REUSE_PORT from other systems will not work in linux. To separate it from the socket level version, this patch adds 'reuse' in sctp_sock and it works pretty much as sk->sk_reuse, but with some extra setup limitations that are needed when it is being enabled. "It should be noted that the behavior of the socket-level socket option to reuse ports and/or addresses for SCTP sockets is unspecified", so it leaves SO_REUSEADDR as is for the compatibility. Note that the name SCTP_REUSE_PORT is somewhat confusing, as its functionality is nearly identical to SO_REUSEADDR, but with some extra restrictions. Here it uses 'reuse' in sctp_sock instead of 'reuseport'. As for sk->sk_reuseport support for SCTP, it will be added in another patch. Thanks to Neil to make this clear. v1->v2: - add sctp_sk->reuse to separate it from the socket level version. v2->v3: - improve changelog according to Marcelo's suggestion. Acked-by: Neil Horman <nhorman@tuxdriver.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-06-28 07:31:00 +00:00
reuse:1,
disable_fragments:1,
v4mapped:1,
frag_interleave:1,
recvrcvinfo:1,
recvnxtinfo:1,
data_ready_signalled:1;
atomic_t pd_mode;
/* Fields after this point will be skipped on copies, like on accept
* and peeloff operations
*/
/* Receive to here while partial delivery is in effect. */
struct sk_buff_head pd_lobby;
sctp: fix ASCONF list handling ->auto_asconf_splist is per namespace and mangled by functions like sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization. Also, the call to inet_sk_copy_descendant() was backuping ->auto_asconf_list through the copy but was not honoring ->do_auto_asconf, which could lead to list corruption if it was different between both sockets. This commit thus fixes the list handling by using ->addr_wq_lock spinlock to protect the list. A special handling is done upon socket creation and destruction for that. Error handlig on sctp_init_sock() will never return an error after having initialized asconf, so sctp_destroy_sock() can be called without addrq_wq_lock. The lock now will be take on sctp_close_sock(), before locking the socket, so we don't do it in inverse order compared to sctp_addr_wq_timeout_handler(). Instead of taking the lock on sctp_sock_migrate() for copying and restoring the list values, it's preferred to avoid rewritting it by implementing sctp_copy_descendant(). Issue was found with a test application that kept flipping sysctl default_auto_asconf on and off, but one could trigger it by issuing simultaneous setsockopt() calls on multiple sockets or by creating/destroying sockets fast enough. This is only triggerable locally. Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).") Reported-by: Ji Jianwen <jiji@redhat.com> Suggested-by: Neil Horman <nhorman@tuxdriver.com> Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-06-12 13:16:41 +00:00
struct list_head auto_asconf_list;
int do_auto_asconf;
};
#define sctp_sk(ptr) container_of_const(ptr, struct sctp_sock, inet.sk)
static inline struct sock *sctp_opt2sk(const struct sctp_sock *sp)
{
return (struct sock *)sp;
}
#if IS_ENABLED(CONFIG_IPV6)
struct sctp6_sock {
struct sctp_sock sctp;
struct ipv6_pinfo inet6;
};
#endif /* CONFIG_IPV6 */
/* This is our APPLICATION-SPECIFIC state cookie.
* THIS IS NOT DICTATED BY THE SPECIFICATION.
*/
/* These are the parts of an association which we send in the cookie.
* Most of these are straight out of:
* RFC2960 12.2 Parameters necessary per association (i.e. the TCB)
*
*/
struct sctp_cookie {
/* My : Tag expected in every inbound packet and sent
* Verification: in the INIT or INIT ACK chunk.
* Tag :
*/
__u32 my_vtag;
/* Peer's : Tag expected in every outbound packet except
* Verification: in the INIT chunk.
* Tag :
*/
__u32 peer_vtag;
/* The rest of these are not from the spec, but really need to
* be in the cookie.
*/
/* My Tie Tag : Assist in discovering a restarting association. */
__u32 my_ttag;
/* Peer's Tie Tag: Assist in discovering a restarting association. */
__u32 peer_ttag;
/* When does this cookie expire? */
ktime_t expiration;
/* Number of inbound/outbound streams which are set
* and negotiated during the INIT process.
*/
__u16 sinit_num_ostreams;
__u16 sinit_max_instreams;
/* This is the first sequence number I used. */
__u32 initial_tsn;
/* This holds the originating address of the INIT packet. */
union sctp_addr peer_addr;
/* IG Section 2.35.3
* Include the source port of the INIT-ACK
*/
__u16 my_port;
__u8 prsctp_capable;
/* Padding for future use */
__u8 padding;
__u32 adaptation_ind;
__u8 auth_random[sizeof(struct sctp_paramhdr) +
SCTP_AUTH_RANDOM_LENGTH];
__u8 auth_hmacs[SCTP_AUTH_NUM_HMACS * sizeof(__u16) + 2];
__u8 auth_chunks[sizeof(struct sctp_paramhdr) + SCTP_AUTH_MAX_CHUNKS];
/* This is a shim for my peer's INIT packet, followed by
* a copy of the raw address list of the association.
* The length of the raw address list is saved in the
* raw_addr_list_len field, which will be used at the time when
* the association TCB is re-constructed from the cookie.
*/
__u32 raw_addr_list_len;
/* struct sctp_init_chunk peer_init[]; */
};
/* The format of our cookie that we send to our peer. */
struct sctp_signed_cookie {
__u8 signature[SCTP_SECRET_SIZE];
__u32 __pad; /* force sctp_cookie alignment to 64 bits */
struct sctp_cookie c;
} __packed;
/* This is another convenience type to allocate memory for address
* params for the maximum size and pass such structures around
* internally.
*/
union sctp_addr_param {
struct sctp_paramhdr p;
struct sctp_ipv4addr_param v4;
struct sctp_ipv6addr_param v6;
};
/* A convenience type to allow walking through the various
* parameters and avoid casting all over the place.
*/
union sctp_params {
void *v;
struct sctp_paramhdr *p;
struct sctp_cookie_preserve_param *life;
struct sctp_hostname_param *dns;
struct sctp_cookie_param *cookie;
struct sctp_supported_addrs_param *sat;
struct sctp_ipv4addr_param *v4;
struct sctp_ipv6addr_param *v6;
union sctp_addr_param *addr;
struct sctp_adaptation_ind_param *aind;
struct sctp_supported_ext_param *ext;
struct sctp_random_param *random;
struct sctp_chunks_param *chunks;
struct sctp_hmac_algo_param *hmac_algo;
struct sctp_addip_param *addip;
};
/* RFC 2960. Section 3.3.5 Heartbeat.
* Heartbeat Information: variable length
* The Sender-specific Heartbeat Info field should normally include
* information about the sender's current time when this HEARTBEAT
* chunk is sent and the destination transport address to which this
* HEARTBEAT is sent (see Section 8.3).
*/
struct sctp_sender_hb_info {
struct sctp_paramhdr param_hdr;
union sctp_addr daddr;
unsigned long sent_at;
__u64 hb_nonce;
__u32 probe_size;
};
int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
gfp_t gfp);
int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid);
void sctp_stream_free(struct sctp_stream *stream);
void sctp_stream_clear(struct sctp_stream *stream);
void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new);
/* What is the current SSN number for this stream? */
#define sctp_ssn_peek(stream, type, sid) \
(sctp_stream_##type((stream), (sid))->ssn)
/* Return the next SSN number for this stream. */
#define sctp_ssn_next(stream, type, sid) \
(sctp_stream_##type((stream), (sid))->ssn++)
/* Skip over this ssn and all below. */
#define sctp_ssn_skip(stream, type, sid, ssn) \
(sctp_stream_##type((stream), (sid))->ssn = ssn + 1)
/* What is the current MID number for this stream? */
#define sctp_mid_peek(stream, type, sid) \
(sctp_stream_##type((stream), (sid))->mid)
/* Return the next MID number for this stream. */
#define sctp_mid_next(stream, type, sid) \
(sctp_stream_##type((stream), (sid))->mid++)
/* Skip over this mid and all below. */
#define sctp_mid_skip(stream, type, sid, mid) \
(sctp_stream_##type((stream), (sid))->mid = mid + 1)
/* What is the current MID_uo number for this stream? */
#define sctp_mid_uo_peek(stream, type, sid) \
(sctp_stream_##type((stream), (sid))->mid_uo)
/* Return the next MID_uo number for this stream. */
#define sctp_mid_uo_next(stream, type, sid) \
(sctp_stream_##type((stream), (sid))->mid_uo++)
/*
* Pointers to address related SCTP functions.
* (i.e. things that depend on the address family.)
*/
struct sctp_af {
int (*sctp_xmit) (struct sk_buff *skb,
struct sctp_transport *);
int (*setsockopt) (struct sock *sk,
int level,
int optname,
sockptr_t optval,
unsigned int optlen);
int (*getsockopt) (struct sock *sk,
int level,
int optname,
char __user *optval,
int __user *optlen);
void (*get_dst) (struct sctp_transport *t,
union sctp_addr *saddr,
struct flowi *fl,
struct sock *sk);
void (*get_saddr) (struct sctp_sock *sk,
struct sctp_transport *t,
struct flowi *fl);
void (*copy_addrlist) (struct list_head *,
struct net_device *);
int (*cmp_addr) (const union sctp_addr *addr1,
const union sctp_addr *addr2);
void (*addr_copy) (union sctp_addr *dst,
union sctp_addr *src);
void (*from_skb) (union sctp_addr *,
struct sk_buff *skb,
int saddr);
void (*from_sk) (union sctp_addr *,
struct sock *sk);
bool (*from_addr_param) (union sctp_addr *,
union sctp_addr_param *,
__be16 port, int iif);
int (*to_addr_param) (const union sctp_addr *,
union sctp_addr_param *);
int (*addr_valid) (union sctp_addr *,
struct sctp_sock *,
const struct sk_buff *);
enum sctp_scope (*scope)(union sctp_addr *);
void (*inaddr_any) (union sctp_addr *, __be16);
int (*is_any) (const union sctp_addr *);
int (*available) (union sctp_addr *,
struct sctp_sock *);
int (*skb_iif) (const struct sk_buff *sk);
int (*skb_sdif)(const struct sk_buff *sk);
int (*is_ce) (const struct sk_buff *sk);
void (*seq_dump_addr)(struct seq_file *seq,
union sctp_addr *addr);
void (*ecn_capable)(struct sock *sk);
__u16 net_header_len;
int sockaddr_len;
int (*ip_options_len)(struct sock *sk);
sa_family_t sa_family;
struct list_head list;
};
struct sctp_af *sctp_get_af_specific(sa_family_t);
int sctp_register_af(struct sctp_af *);
/* Protocol family functions. */
struct sctp_pf {
void (*event_msgname)(struct sctp_ulpevent *, char *, int *);
void (*skb_msgname) (struct sk_buff *, char *, int *);
int (*af_supported) (sa_family_t, struct sctp_sock *);
int (*cmp_addr) (const union sctp_addr *,
const union sctp_addr *,
struct sctp_sock *);
int (*bind_verify) (struct sctp_sock *, union sctp_addr *);
int (*send_verify) (struct sctp_sock *, union sctp_addr *);
int (*supported_addrs)(const struct sctp_sock *, __be16 *);
struct sock *(*create_accept_sk) (struct sock *sk,
net: Work around lockdep limitation in sockets that use sockets Lockdep issues a circular dependency warning when AFS issues an operation through AF_RXRPC from a context in which the VFS/VM holds the mmap_sem. The theory lockdep comes up with is as follows: (1) If the pagefault handler decides it needs to read pages from AFS, it calls AFS with mmap_sem held and AFS begins an AF_RXRPC call, but creating a call requires the socket lock: mmap_sem must be taken before sk_lock-AF_RXRPC (2) afs_open_socket() opens an AF_RXRPC socket and binds it. rxrpc_bind() binds the underlying UDP socket whilst holding its socket lock. inet_bind() takes its own socket lock: sk_lock-AF_RXRPC must be taken before sk_lock-AF_INET (3) Reading from a TCP socket into a userspace buffer might cause a fault and thus cause the kernel to take the mmap_sem, but the TCP socket is locked whilst doing this: sk_lock-AF_INET must be taken before mmap_sem However, lockdep's theory is wrong in this instance because it deals only with lock classes and not individual locks. The AF_INET lock in (2) isn't really equivalent to the AF_INET lock in (3) as the former deals with a socket entirely internal to the kernel that never sees userspace. This is a limitation in the design of lockdep. Fix the general case by: (1) Double up all the locking keys used in sockets so that one set are used if the socket is created by userspace and the other set is used if the socket is created by the kernel. (2) Store the kern parameter passed to sk_alloc() in a variable in the sock struct (sk_kern_sock). This informs sock_lock_init(), sock_init_data() and sk_clone_lock() as to the lock keys to be used. Note that the child created by sk_clone_lock() inherits the parent's kern setting. (3) Add a 'kern' parameter to ->accept() that is analogous to the one passed in to ->create() that distinguishes whether kernel_accept() or sys_accept4() was the caller and can be passed to sk_alloc(). Note that a lot of accept functions merely dequeue an already allocated socket. I haven't touched these as the new socket already exists before we get the parameter. Note also that there are a couple of places where I've made the accepted socket unconditionally kernel-based: irda_accept() rds_rcp_accept_one() tcp_accept_from_sock() because they follow a sock_create_kern() and accept off of that. Whilst creating this, I noticed that lustre and ocfs don't create sockets through sock_create_kern() and thus they aren't marked as for-kernel, though they appear to be internal. I wonder if these should do that so that they use the new set of lock keys. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 08:09:05 +00:00
struct sctp_association *asoc,
bool kern);
sctp: Fixup v4mapped behaviour to comply with Sock API The SCTP socket extensions API document describes the v4mapping option as follows: 8.1.15. Set/Clear IPv4 Mapped Addresses (SCTP_I_WANT_MAPPED_V4_ADDR) This socket option is a Boolean flag which turns on or off the mapping of IPv4 addresses. If this option is turned on, then IPv4 addresses will be mapped to V6 representation. If this option is turned off, then no mapping will be done of V4 addresses and a user will receive both PF_INET6 and PF_INET type addresses on the socket. See [RFC3542] for more details on mapped V6 addresses. This description isn't really in line with what the code does though. Introduce addr_to_user (renamed addr_v4map), which should be called before any sockaddr is passed back to user space. The new function places the sockaddr into the correct format depending on the SCTP_I_WANT_MAPPED_V4_ADDR option. Audit all places that touched v4mapped and either sanely construct a v4 or v6 address then call addr_to_user, or drop the unnecessary v4mapped check entirely. Audit all places that call addr_to_user and verify they are on a sycall return path. Add a custom getname that formats the address properly. Several bugs are addressed: - SCTP_I_WANT_MAPPED_V4_ADDR=0 often returned garbage for addresses to user space - The addr_len returned from recvmsg was not correct when returning AF_INET on a v6 socket - flowlabel and scope_id were not zerod when promoting a v4 to v6 - Some syscalls like bind and connect behaved differently depending on v4mapped Tested bind, getpeername, getsockname, connect, and recvmsg for proper behaviour in v4mapped = 1 and 0 cases. Signed-off-by: Neil Horman <nhorman@tuxdriver.com> Tested-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com> Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-30 18:40:53 +00:00
int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr);
void (*to_sk_saddr)(union sctp_addr *, struct sock *sk);
void (*to_sk_daddr)(union sctp_addr *, struct sock *sk);
void (*copy_ip_options)(struct sock *sk, struct sock *newsk);
struct sctp_af *af;
};
/* Structure to track chunk fragments that have been acked, but peer
* fragments of the same message have not.
*/
struct sctp_datamsg {
/* Chunks waiting to be submitted to lower layer. */
struct list_head chunks;
/* Reference counting. */
refcount_t refcnt;
/* When is this message no longer interesting to the peer? */
unsigned long expires_at;
/* Did the message fail to send? */
int send_error;
u8 send_failed:1,
can_delay:1, /* should this message be Nagle delayed */
abandoned:1; /* should this message be abandoned */
};
struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *,
struct sctp_sndrcvinfo *,
struct iov_iter *);
void sctp_datamsg_free(struct sctp_datamsg *);
void sctp_datamsg_put(struct sctp_datamsg *);
void sctp_chunk_fail(struct sctp_chunk *, int error);
int sctp_chunk_abandoned(struct sctp_chunk *);
/* RFC2960 1.4 Key Terms
*
* o Chunk: A unit of information within an SCTP packet, consisting of
* a chunk header and chunk-specific content.
*
* As a matter of convenience, we remember the SCTP common header for
* each chunk as well as a few other header pointers...
*/
struct sctp_chunk {
struct list_head list;
refcount_t refcnt;
/* How many times this chunk have been sent, for prsctp RTX policy */
int sent_count;
sctp: introduce stream scheduler foundations This patch introduces the hooks necessary to do stream scheduling, as per RFC Draft ndata. It also introduces the first scheduler, which is what we do today but now factored out: first come first served (FCFS). With stream scheduling now we have to track which chunk was enqueued on which stream and be able to select another other than the in front of the main outqueue. So we introduce a list on sctp_stream_out_ext structure for this purpose. We reuse sctp_chunk->transmitted_list space for the list above, as the chunk cannot belong to the two lists at the same time. By using the union in there, we can have distinct names for these moments. sctp_sched_ops are the operations expected to be implemented by each scheduler. The dequeueing is a bit particular to this implementation but it is to match how we dequeue packets today. We first dequeue and then check if it fits the packet and if not, we requeue it at head. Thus why we don't have a peek operation but have dequeue_done instead, which is called once the chunk can be safely considered as transmitted. The check removed from sctp_outq_flush is now performed by sctp_stream_outq_migrate, which is only called during assoc setup. (sctp_sendmsg() also checks for it) The only operation that is foreseen but not yet added here is a way to signalize that a new packet is starting or that the packet is done, for round robin scheduler per packet, but is intentionally left to the patch that actually implements it. Support for I-DATA chunks, also described in this RFC, with user message interleaving is straightforward as it just requires the schedulers to probe for the feature and ignore datamsg boundaries when dequeueing. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-03 22:20:13 +00:00
union {
/* This is our link to the per-transport transmitted list. */
struct list_head transmitted_list;
/* List in specific stream outq */
struct list_head stream_list;
};
/* This field is used by chunks that hold fragmented data.
* For the first fragment this is the list that holds the rest of
* fragments. For the remaining fragments, this is the link to the
* frag_list maintained in the first fragment.
*/
struct list_head frag_list;
/* This points to the sk_buff containing the actual data. */
struct sk_buff *skb;
union {
/* In case of GSO packets, this will store the head one */
struct sk_buff *head_skb;
/* In case of auth enabled, this will point to the shkey */
struct sctp_shared_key *shkey;
};
/* These are the SCTP headers by reverse order in a packet.
* Note that some of these may happen more than once. In that
* case, we point at the "current" one, whatever that means
* for that level of header.
*/
/* We point this at the FIRST TLV parameter to chunk_hdr. */
union sctp_params param_hdr;
union {
__u8 *v;
struct sctp_datahdr *data_hdr;
struct sctp_inithdr *init_hdr;
struct sctp_sackhdr *sack_hdr;
struct sctp_heartbeathdr *hb_hdr;
struct sctp_sender_hb_info *hbs_hdr;
struct sctp_shutdownhdr *shutdown_hdr;
struct sctp_signed_cookie *cookie_hdr;
struct sctp_ecnehdr *ecne_hdr;
struct sctp_cwrhdr *ecn_cwr_hdr;
struct sctp_errhdr *err_hdr;
struct sctp_addiphdr *addip_hdr;
struct sctp_fwdtsn_hdr *fwdtsn_hdr;
struct sctp_authhdr *auth_hdr;
struct sctp_idatahdr *idata_hdr;
struct sctp_ifwdtsn_hdr *ifwdtsn_hdr;
} subh;
__u8 *chunk_end;
struct sctp_chunkhdr *chunk_hdr;
struct sctphdr *sctp_hdr;
/* This needs to be recoverable for SCTP_SEND_FAILED events. */
struct sctp_sndrcvinfo sinfo;
/* Which association does this belong to? */
struct sctp_association *asoc;
/* What endpoint received this chunk? */
struct sctp_ep_common *rcvr;
/* We fill this in if we are calculating RTT. */
unsigned long sent_at;
/* What is the origin IP address for this chunk? */
union sctp_addr source;
/* Destination address for this chunk. */
union sctp_addr dest;
/* For outbound message, track all fragments for SEND_FAILED. */
struct sctp_datamsg *msg;
/* For an inbound chunk, this tells us where it came from.
* For an outbound chunk, it tells us where we'd like it to
* go. It is NULL if we have no preference.
*/
struct sctp_transport *transport;
/* SCTP-AUTH: For the special case inbound processing of COOKIE-ECHO
* we need save a pointer to the AUTH chunk, since the SCTP-AUTH
* spec violates the principle premis that all chunks are processed
* in order.
*/
struct sk_buff *auth_chunk;
#define SCTP_CAN_FRTX 0x0
#define SCTP_NEED_FRTX 0x1
#define SCTP_DONT_FRTX 0x2
__u16 rtt_in_progress:1, /* This chunk used for RTT calc? */
has_tsn:1, /* Does this chunk have a TSN yet? */
has_ssn:1, /* Does this chunk have a SSN yet? */
#define has_mid has_ssn
singleton:1, /* Only chunk in the packet? */
end_of_packet:1, /* Last chunk in the packet? */
ecn_ce_done:1, /* Have we processed the ECN CE bit? */
pdiscard:1, /* Discard the whole packet now? */
tsn_gap_acked:1, /* Is this chunk acked by a GAP ACK? */
data_accepted:1, /* At least 1 chunk accepted */
auth:1, /* IN: was auth'ed | OUT: needs auth */
has_asconf:1, /* IN: have seen an asconf before */
pmtu_probe:1, /* Used by PLPMTUD, can be set in s HB chunk */
tsn_missing_report:2, /* Data chunk missing counter. */
fast_retransmit:2; /* Is this chunk fast retransmitted? */
};
#define sctp_chunk_retransmitted(chunk) (chunk->sent_count > 1)
void sctp_chunk_hold(struct sctp_chunk *);
void sctp_chunk_put(struct sctp_chunk *);
int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len,
struct iov_iter *from);
void sctp_chunk_free(struct sctp_chunk *);
void *sctp_addto_chunk(struct sctp_chunk *, int len, const void *data);
struct sctp_chunk *sctp_chunkify(struct sk_buff *,
const struct sctp_association *,
struct sock *, gfp_t gfp);
void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *,
union sctp_addr *);
const union sctp_addr *sctp_source(const struct sctp_chunk *chunk);
static inline __u16 sctp_chunk_stream_no(struct sctp_chunk *ch)
{
return ntohs(ch->subh.data_hdr->stream);
}
enum {
SCTP_ADDR_NEW, /* new address added to assoc/ep */
SCTP_ADDR_SRC, /* address can be used as source */
SCTP_ADDR_DEL, /* address about to be deleted */
};
/* This is a structure for holding either an IPv6 or an IPv4 address. */
struct sctp_sockaddr_entry {
struct list_head list;
struct rcu_head rcu;
union sctp_addr a;
__u8 state;
__u8 valid;
};
#define SCTP_ADDRESS_TICK_DELAY 500
/* This structure holds lists of chunks as we are assembling for
* transmission.
*/
struct sctp_packet {
/* These are the SCTP header values (host order) for the packet. */
__u16 source_port;
__u16 destination_port;
__u32 vtag;
/* This contains the payload chunks. */
struct list_head chunk_list;
/* This is the overhead of the sctp and ip headers. */
size_t overhead;
/* This is the total size of all chunks INCLUDING padding. */
size_t size;
/* This is the maximum size this packet may have */
size_t max_size;
/* The packet is destined for this transport address.
* The function we finally use to pass down to the next lower
* layer lives in the transport structure.
*/
struct sctp_transport *transport;
/* pointer to the auth chunk for this packet */
struct sctp_chunk *auth;
u8 has_cookie_echo:1, /* This packet contains a COOKIE-ECHO chunk. */
has_sack:1, /* This packet contains a SACK chunk. */
has_auth:1, /* This packet contains an AUTH chunk */
has_data:1, /* This packet contains at least 1 DATA chunk */
ipfragok:1; /* So let ip fragment this packet */
};
void sctp_packet_init(struct sctp_packet *, struct sctp_transport *,
__u16 sport, __u16 dport);
void sctp_packet_config(struct sctp_packet *, __u32 vtag, int);
enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet,
struct sctp_chunk *chunk,
int one_packet, gfp_t gfp);
enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet,
struct sctp_chunk *chunk);
int sctp_packet_transmit(struct sctp_packet *, gfp_t);
void sctp_packet_free(struct sctp_packet *);
static inline int sctp_packet_empty(struct sctp_packet *packet)
{
return packet->size == packet->overhead;
}
/* This represents a remote transport address.
* For local transport addresses, we just use union sctp_addr.
*
* RFC2960 Section 1.4 Key Terms
*
* o Transport address: A Transport Address is traditionally defined
* by Network Layer address, Transport Layer protocol and Transport
* Layer port number. In the case of SCTP running over IP, a
* transport address is defined by the combination of an IP address
* and an SCTP port number (where SCTP is the Transport protocol).
*
* RFC2960 Section 7.1 SCTP Differences from TCP Congestion control
*
* o The sender keeps a separate congestion control parameter set for
* each of the destination addresses it can send to (not each
* source-destination pair but for each destination). The parameters
* should decay if the address is not used for a long enough time
* period.
*
*/
struct sctp_transport {
/* A list of transports. */
struct list_head transports;
struct rhlist_head node;
/* Reference counting. */
refcount_t refcnt;
/* RTO-Pending : A flag used to track if one of the DATA
* chunks sent to this address is currently being
* used to compute a RTT. If this flag is 0,
* the next DATA chunk sent to this destination
* should be used to compute a RTT and this flag
* should be set. Every time the RTT
* calculation completes (i.e. the DATA chunk
* is SACK'd) clear this flag.
*/
__u32 rto_pending:1,
/*
* hb_sent : a flag that signals that we have a pending
* heartbeat.
*/
hb_sent:1,
/* Is the Path MTU update pending on this transport */
pmtu_pending:1,
dst_pending_confirm:1, /* need to confirm neighbour */
/* Has this transport moved the ctsn since we last sacked */
sack_generation:1;
u32 dst_cookie;
sctp: be more restrictive in transport selection on bundled sacks It was noticed recently that when we send data on a transport, its possible that we might bundle a sack that arrived on a different transport. While this isn't a major problem, it does go against the SHOULD requirement in section 6.4 of RFC 2960: An endpoint SHOULD transmit reply chunks (e.g., SACK, HEARTBEAT ACK, etc.) to the same destination transport address from which it received the DATA or control chunk to which it is replying. This rule should also be followed if the endpoint is bundling DATA chunks together with the reply chunk. This patch seeks to correct that. It restricts the bundling of sack operations to only those transports which have moved the ctsn of the association forward since the last sack. By doing this we guarantee that we only bundle outbound saks on a transport that has received a chunk since the last sack. This brings us into stricter compliance with the RFC. Vlad had initially suggested that we strictly allow only sack bundling on the transport that last moved the ctsn forward. While this makes sense, I was concerned that doing so prevented us from bundling in the case where we had received chunks that moved the ctsn on multiple transports. In those cases, the RFC allows us to select any of the transports having received chunks to bundle the sack on. so I've modified the approach to allow for that, by adding a state variable to each transport that tracks weather it has moved the ctsn since the last sack. This I think keeps our behavior (and performance), close enough to our current profile that I think we can do this without a sysctl knob to enable/disable it. Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Vlad Yaseivch <vyasevich@gmail.com> CC: David S. Miller <davem@davemloft.net> CC: linux-sctp@vger.kernel.org Reported-by: Michele Baldessari <michele@redhat.com> Reported-by: sorin serban <sserban@redhat.com> Acked-by: Vlad Yasevich <vyasevich@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-06-30 03:04:26 +00:00
struct flowi fl;
/* This is the peer's IP address and port. */
union sctp_addr ipaddr;
/* These are the functions we call to handle LLP stuff. */
struct sctp_af *af_specific;
/* Which association do we belong to? */
struct sctp_association *asoc;
/* RFC2960
*
* 12.3 Per Transport Address Data
*
* For each destination transport address in the peer's
* address list derived from the INIT or INIT ACK chunk, a
* number of data elements needs to be maintained including:
*/
/* RTO : The current retransmission timeout value. */
unsigned long rto;
__u32 rtt; /* This is the most recent RTT. */
/* RTTVAR : The current RTT variation. */
__u32 rttvar;
/* SRTT : The current smoothed round trip time. */
__u32 srtt;
/*
* These are the congestion stats.
*/
/* cwnd : The current congestion window. */
__u32 cwnd; /* This is the actual cwnd. */
/* ssthresh : The current slow start threshold value. */
__u32 ssthresh;
/* partial : The tracking method for increase of cwnd when in
* bytes acked : congestion avoidance mode (see Section 6.2.2)
*/
__u32 partial_bytes_acked;
/* Data that has been sent, but not acknowledged. */
__u32 flight_size;
__u32 burst_limited; /* Holds old cwnd when max.burst is applied */
/* Destination */
struct dst_entry *dst;
/* Source address. */
union sctp_addr saddr;
/* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
* the destination address every heartbeat interval.
*/
unsigned long hbinterval;
unsigned long probe_interval;
/* SACK delay timeout */
unsigned long sackdelay;
__u32 sackfreq;
atomic_t mtu_info;
/* When was the last time that we heard from this transport? We use
* this to pick new active and retran paths.
*/
ktime_t last_time_heard;
sctp: avoid refreshing heartbeat timer too often Currently on high rate SCTP streams the heartbeat timer refresh can consume quite a lot of resources as timer updates are costly and it contains a random factor, which a) is also costly and b) invalidates mod_timer() optimization for not editing a timer to the same value. It may even cause the timer to be slightly advanced, for no good reason. As suggested by David Laight this patch now removes this timer update from hot path by leaving the timer on and re-evaluating upon its expiration if the heartbeat is still needed or not, similarly to what is done for TCP. If it's not needed anymore the timer is re-scheduled to the new timeout, considering the time already elapsed. For this, we now record the last tx timestamp per transport, updated in the same spots as hb timer was restarted on tx. Also split up sctp_transport_reset_timers into sctp_transport_reset_t3_rtx and sctp_transport_reset_hb_timer, so we can re-arm T3 without re-arming the heartbeat one. On loopback with MTU of 65535 and data chunks with 1636, so that we have a considerable amount of chunks without stressing system calls, netperf -t SCTP_STREAM -l 30, perf looked like this before: Samples: 103K of event 'cpu-clock', Event count (approx.): 25833000000 Overhead Command Shared Object Symbol + 6,15% netperf [kernel.vmlinux] [k] copy_user_enhanced_fast_string - 5,43% netperf [kernel.vmlinux] [k] _raw_write_unlock_irqrestore - _raw_write_unlock_irqrestore - 96,54% _raw_spin_unlock_irqrestore - 36,14% mod_timer + 97,24% sctp_transport_reset_timers + 2,76% sctp_do_sm + 33,65% __wake_up_sync_key + 28,77% sctp_ulpq_tail_event + 1,40% del_timer - 1,84% mod_timer + 99,03% sctp_transport_reset_timers + 0,97% sctp_do_sm + 1,50% sctp_ulpq_tail_event And after this patch, now with netperf -l 60: Samples: 230K of event 'cpu-clock', Event count (approx.): 57707250000 Overhead Command Shared Object Symbol + 5,65% netperf [kernel.vmlinux] [k] memcpy_erms + 5,59% netperf [kernel.vmlinux] [k] copy_user_enhanced_fast_string - 5,05% netperf [kernel.vmlinux] [k] _raw_spin_unlock_irqrestore - _raw_spin_unlock_irqrestore + 49,89% __wake_up_sync_key + 45,68% sctp_ulpq_tail_event - 2,85% mod_timer + 76,51% sctp_transport_reset_t3_rtx + 23,49% sctp_do_sm + 1,55% del_timer + 2,50% netperf [sctp] [k] sctp_datamsg_from_user + 2,26% netperf [sctp] [k] sctp_sendmsg Throughput-wise, from 6800mbps without the patch to 7050mbps with it, ~3.7%. Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-06 18:15:19 +00:00
/* When was the last time that we sent a chunk using this
* transport? We use this to check for idle transports
*/
unsigned long last_time_sent;
/* Last time(in jiffies) when cwnd is reduced due to the congestion
* indication based on ECNE chunk.
*/
unsigned long last_time_ecne_reduced;
__be16 encap_port;
/* This is the max_retrans value for the transport and will
* be initialized from the assocs value. This can be changed
* using the SCTP_SET_PEER_ADDR_PARAMS socket option.
*/
__u16 pathmaxrxt;
__u32 flowlabel;
__u8 dscp;
/* This is the partially failed retrans value for the transport
* and will be initialized from the assocs value. This can be changed
* using the SCTP_PEER_ADDR_THLDS socket option
*/
__u16 pf_retrans;
/* Used for primary path switchover. */
__u16 ps_retrans;
/* PMTU : The current known path MTU. */
__u32 pathmtu;
/* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */
__u32 param_flags;
/* The number of times INIT has been sent on this transport. */
int init_sent_count;
/* state : The current state of this destination,
* : i.e. SCTP_ACTIVE, SCTP_INACTIVE, SCTP_UNKNOWN.
*/
int state;
/* These are the error stats for this destination. */
/* Error count : The current error count for this destination. */
unsigned short error_count;
/* Per : A timer used by each destination.
* Destination :
* Timer :
*
* [Everywhere else in the text this is called T3-rtx. -ed]
*/
struct timer_list T3_rtx_timer;
/* Heartbeat timer is per destination. */
struct timer_list hb_timer;
sctp: Fix a race between ICMP protocol unreachable and connect() ICMP protocol unreachable handling completely disregarded the fact that the user may have locked the socket. It proceeded to destroy the association, even though the user may have held the lock and had a ref on the association. This resulted in the following: Attempt to release alive inet socket f6afcc00 ========================= [ BUG: held lock freed! ] ------------------------- somenu/2672 is freeing memory f6afcc00-f6afcfff, with a lock still held there! (sk_lock-AF_INET){+.+.+.}, at: [<c122098a>] sctp_connect+0x13/0x4c 1 lock held by somenu/2672: #0: (sk_lock-AF_INET){+.+.+.}, at: [<c122098a>] sctp_connect+0x13/0x4c stack backtrace: Pid: 2672, comm: somenu Not tainted 2.6.32-telco #55 Call Trace: [<c1232266>] ? printk+0xf/0x11 [<c1038553>] debug_check_no_locks_freed+0xce/0xff [<c10620b4>] kmem_cache_free+0x21/0x66 [<c1185f25>] __sk_free+0x9d/0xab [<c1185f9c>] sk_free+0x1c/0x1e [<c1216e38>] sctp_association_put+0x32/0x89 [<c1220865>] __sctp_connect+0x36d/0x3f4 [<c122098a>] ? sctp_connect+0x13/0x4c [<c102d073>] ? autoremove_wake_function+0x0/0x33 [<c12209a8>] sctp_connect+0x31/0x4c [<c11d1e80>] inet_dgram_connect+0x4b/0x55 [<c11834fa>] sys_connect+0x54/0x71 [<c103a3a2>] ? lock_release_non_nested+0x88/0x239 [<c1054026>] ? might_fault+0x42/0x7c [<c1054026>] ? might_fault+0x42/0x7c [<c11847ab>] sys_socketcall+0x6d/0x178 [<c10da994>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c1002959>] syscall_call+0x7/0xb This was because the sctp_wait_for_connect() would aqcure the socket lock and then proceed to release the last reference count on the association, thus cause the fully destruction path to finish freeing the socket. The simplest solution is to start a very short timer in case the socket is owned by user. When the timer expires, we can do some verification and be able to do the release properly. Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-05-06 07:56:07 +00:00
/* Timer to handle ICMP proto unreachable envets */
struct timer_list proto_unreach_timer;
/* Timer to handler reconf chunk rtx */
struct timer_list reconf_timer;
/* Timer to send a probe HB packet for PLPMTUD */
struct timer_list probe_timer;
/* Since we're using per-destination retransmission timers
* (see above), we're also using per-destination "transmitted"
* queues. This probably ought to be a private struct
* accessible only within the outqueue, but it's not, yet.
*/
struct list_head transmitted;
/* We build bundle-able packets for this transport here. */
struct sctp_packet packet;
/* This is the list of transports that have chunks to send. */
struct list_head send_ready;
/* State information saved for SFR_CACC algorithm. The key
* idea in SFR_CACC is to maintain state at the sender on a
* per-destination basis when a changeover happens.
* char changeover_active;
* char cycling_changeover;
* __u32 next_tsn_at_change;
* char cacc_saw_newack;
*/
struct {
/* An unsigned integer, which stores the next TSN to be
* used by the sender, at the moment of changeover.
*/
__u32 next_tsn_at_change;
/* A flag which indicates the occurrence of a changeover */
char changeover_active;
/* A flag which indicates whether the change of primary is
* the first switch to this destination address during an
* active switch.
*/
char cycling_changeover;
/* A temporary flag, which is used during the processing of
* a SACK to estimate the causative TSN(s)'s group.
*/
char cacc_saw_newack;
} cacc;
struct {
__u16 pmtu;
__u16 probe_size;
__u16 probe_high;
__u8 probe_count;
__u8 state;
} pl; /* plpmtud related */
/* 64-bit random number sent with heartbeat. */
__u64 hb_nonce;
struct rcu_head rcu;
};
struct sctp_transport *sctp_transport_new(struct net *, const union sctp_addr *,
gfp_t);
void sctp_transport_set_owner(struct sctp_transport *,
struct sctp_association *);
void sctp_transport_route(struct sctp_transport *, union sctp_addr *,
struct sctp_sock *);
void sctp_transport_pmtu(struct sctp_transport *, struct sock *sk);
void sctp_transport_free(struct sctp_transport *);
sctp: avoid refreshing heartbeat timer too often Currently on high rate SCTP streams the heartbeat timer refresh can consume quite a lot of resources as timer updates are costly and it contains a random factor, which a) is also costly and b) invalidates mod_timer() optimization for not editing a timer to the same value. It may even cause the timer to be slightly advanced, for no good reason. As suggested by David Laight this patch now removes this timer update from hot path by leaving the timer on and re-evaluating upon its expiration if the heartbeat is still needed or not, similarly to what is done for TCP. If it's not needed anymore the timer is re-scheduled to the new timeout, considering the time already elapsed. For this, we now record the last tx timestamp per transport, updated in the same spots as hb timer was restarted on tx. Also split up sctp_transport_reset_timers into sctp_transport_reset_t3_rtx and sctp_transport_reset_hb_timer, so we can re-arm T3 without re-arming the heartbeat one. On loopback with MTU of 65535 and data chunks with 1636, so that we have a considerable amount of chunks without stressing system calls, netperf -t SCTP_STREAM -l 30, perf looked like this before: Samples: 103K of event 'cpu-clock', Event count (approx.): 25833000000 Overhead Command Shared Object Symbol + 6,15% netperf [kernel.vmlinux] [k] copy_user_enhanced_fast_string - 5,43% netperf [kernel.vmlinux] [k] _raw_write_unlock_irqrestore - _raw_write_unlock_irqrestore - 96,54% _raw_spin_unlock_irqrestore - 36,14% mod_timer + 97,24% sctp_transport_reset_timers + 2,76% sctp_do_sm + 33,65% __wake_up_sync_key + 28,77% sctp_ulpq_tail_event + 1,40% del_timer - 1,84% mod_timer + 99,03% sctp_transport_reset_timers + 0,97% sctp_do_sm + 1,50% sctp_ulpq_tail_event And after this patch, now with netperf -l 60: Samples: 230K of event 'cpu-clock', Event count (approx.): 57707250000 Overhead Command Shared Object Symbol + 5,65% netperf [kernel.vmlinux] [k] memcpy_erms + 5,59% netperf [kernel.vmlinux] [k] copy_user_enhanced_fast_string - 5,05% netperf [kernel.vmlinux] [k] _raw_spin_unlock_irqrestore - _raw_spin_unlock_irqrestore + 49,89% __wake_up_sync_key + 45,68% sctp_ulpq_tail_event - 2,85% mod_timer + 76,51% sctp_transport_reset_t3_rtx + 23,49% sctp_do_sm + 1,55% del_timer + 2,50% netperf [sctp] [k] sctp_datamsg_from_user + 2,26% netperf [sctp] [k] sctp_sendmsg Throughput-wise, from 6800mbps without the patch to 7050mbps with it, ~3.7%. Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-06 18:15:19 +00:00
void sctp_transport_reset_t3_rtx(struct sctp_transport *);
void sctp_transport_reset_hb_timer(struct sctp_transport *);
void sctp_transport_reset_reconf_timer(struct sctp_transport *transport);
void sctp_transport_reset_probe_timer(struct sctp_transport *transport);
void sctp_transport_reset_raise_timer(struct sctp_transport *transport);
int sctp_transport_hold(struct sctp_transport *);
void sctp_transport_put(struct sctp_transport *);
void sctp_transport_update_rto(struct sctp_transport *, __u32);
void sctp_transport_raise_cwnd(struct sctp_transport *, __u32, __u32);
void sctp_transport_lower_cwnd(struct sctp_transport *t,
enum sctp_lower_cwnd reason);
void sctp_transport_burst_limited(struct sctp_transport *);
void sctp_transport_burst_reset(struct sctp_transport *);
unsigned long sctp_transport_timeout(struct sctp_transport *);
void sctp_transport_reset(struct sctp_transport *t);
sctp: fix the handling of ICMP Frag Needed for too small MTUs syzbot reported a hang involving SCTP, on which it kept flooding dmesg with the message: [ 246.742374] sctp: sctp_transport_update_pmtu: Reported pmtu 508 too low, using default minimum of 512 That happened because whenever SCTP hits an ICMP Frag Needed, it tries to adjust to the new MTU and triggers an immediate retransmission. But it didn't consider the fact that MTUs smaller than the SCTP minimum MTU allowed (512) would not cause the PMTU to change, and issued the retransmission anyway (thus leading to another ICMP Frag Needed, and so on). As IPv4 (ip_rt_min_pmtu=556) and IPv6 (IPV6_MIN_MTU=1280) minimum MTU are higher than that, sctp_transport_update_pmtu() is changed to re-fetch the PMTU that got set after our request, and with that, detect if there was an actual change or not. The fix, thus, skips the immediate retransmission if the received ICMP resulted in no change, in the hope that SCTP will select another path. Note: The value being used for the minimum MTU (512, SCTP_DEFAULT_MINSEGMENT) is not right and instead it should be (576, SCTP_MIN_PMTU), but such change belongs to another patch. Changes from v1: - do not disable PMTU discovery, in the light of commit 06ad391919b2 ("[SCTP] Don't disable PMTU discovery when mtu is small") and as suggested by Xin Long. - changed the way to break the rtx loop by detecting if the icmp resulted in a change or not Changes from v2: none See-also: https://lkml.org/lkml/2017/12/22/811 Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-05 13:17:18 +00:00
bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu);
void sctp_transport_immediate_rtx(struct sctp_transport *);
void sctp_transport_dst_release(struct sctp_transport *t);
void sctp_transport_dst_confirm(struct sctp_transport *t);
void sctp_transport_pl_send(struct sctp_transport *t);
bool sctp_transport_pl_recv(struct sctp_transport *t);
/* This is the structure we use to queue packets as they come into
* SCTP. We write packets to it and read chunks from it.
*/
struct sctp_inq {
/* This is actually a queue of sctp_chunk each
* containing a partially decoded packet.
*/
struct list_head in_chunk_list;
/* This is the packet which is currently off the in queue and is
* being worked on through the inbound chunk processing.
*/
struct sctp_chunk *in_progress;
/* This is the delayed task to finish delivering inbound
* messages.
*/
struct work_struct immediate;
};
void sctp_inq_init(struct sctp_inq *);
void sctp_inq_free(struct sctp_inq *);
void sctp_inq_push(struct sctp_inq *, struct sctp_chunk *packet);
struct sctp_chunk *sctp_inq_pop(struct sctp_inq *);
struct sctp_chunkhdr *sctp_inq_peek(struct sctp_inq *);
void sctp_inq_set_th_handler(struct sctp_inq *, work_func_t);
/* This is the structure we use to hold outbound chunks. You push
* chunks in and they automatically pop out the other end as bundled
* packets (it calls (*output_handler)()).
*
* This structure covers sections 6.3, 6.4, 6.7, 6.8, 6.10, 7., 8.1,
* and 8.2 of the v13 draft.
*
* It handles retransmissions. The connection to the timeout portion
* of the state machine is through sctp_..._timeout() and timeout_handler.
*
* If you feed it SACKs, it will eat them.
*
* If you give it big chunks, it will fragment them.
*
* It assigns TSN's to data chunks. This happens at the last possible
* instant before transmission.
*
* When free()'d, it empties itself out via output_handler().
*/
struct sctp_outq {
struct sctp_association *asoc;
/* Data pending that has never been transmitted. */
struct list_head out_chunk_list;
sctp: introduce stream scheduler foundations This patch introduces the hooks necessary to do stream scheduling, as per RFC Draft ndata. It also introduces the first scheduler, which is what we do today but now factored out: first come first served (FCFS). With stream scheduling now we have to track which chunk was enqueued on which stream and be able to select another other than the in front of the main outqueue. So we introduce a list on sctp_stream_out_ext structure for this purpose. We reuse sctp_chunk->transmitted_list space for the list above, as the chunk cannot belong to the two lists at the same time. By using the union in there, we can have distinct names for these moments. sctp_sched_ops are the operations expected to be implemented by each scheduler. The dequeueing is a bit particular to this implementation but it is to match how we dequeue packets today. We first dequeue and then check if it fits the packet and if not, we requeue it at head. Thus why we don't have a peek operation but have dequeue_done instead, which is called once the chunk can be safely considered as transmitted. The check removed from sctp_outq_flush is now performed by sctp_stream_outq_migrate, which is only called during assoc setup. (sctp_sendmsg() also checks for it) The only operation that is foreseen but not yet added here is a way to signalize that a new packet is starting or that the packet is done, for round robin scheduler per packet, but is intentionally left to the patch that actually implements it. Support for I-DATA chunks, also described in this RFC, with user message interleaving is straightforward as it just requires the schedulers to probe for the feature and ignore datamsg boundaries when dequeueing. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-03 22:20:13 +00:00
/* Stream scheduler being used */
struct sctp_sched_ops *sched;
unsigned int out_qlen; /* Total length of queued data chunks. */
/* Error of send failed, may used in SCTP_SEND_FAILED event. */
unsigned int error;
/* These are control chunks we want to send. */
struct list_head control_chunk_list;
/* These are chunks that have been sacked but are above the
* CTSN, or cumulative tsn ack point.
*/
struct list_head sacked;
/* Put chunks on this list to schedule them for
* retransmission.
*/
struct list_head retransmit;
/* Put chunks on this list to save them for FWD TSN processing as
* they were abandoned.
*/
struct list_head abandoned;
/* How many unackd bytes do we have in-flight? */
__u32 outstanding_bytes;
/* Are we doing fast-rtx on this queue */
char fast_rtx;
/* Corked? */
char cork;
};
void sctp_outq_init(struct sctp_association *, struct sctp_outq *);
void sctp_outq_teardown(struct sctp_outq *);
void sctp_outq_free(struct sctp_outq*);
void sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t);
int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *);
int sctp_outq_is_empty(const struct sctp_outq *);
void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
enum sctp_retransmit_reason reason);
void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8);
void sctp_outq_uncork(struct sctp_outq *, gfp_t gfp);
void sctp_prsctp_prune(struct sctp_association *asoc,
struct sctp_sndrcvinfo *sinfo, int msg_len);
void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
/* Uncork and flush an outqueue. */
static inline void sctp_outq_cork(struct sctp_outq *q)
{
q->cork = 1;
}
/* SCTP skb control block.
* sctp_input_cb is currently used on rx and sock rx queue
*/
struct sctp_input_cb {
union {
struct inet_skb_parm h4;
#if IS_ENABLED(CONFIG_IPV6)
struct inet6_skb_parm h6;
#endif
} header;
struct sctp_chunk *chunk;
struct sctp_af *af;
__be16 encap_port;
};
#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0]))
struct sctp_output_cb {
struct sk_buff *last;
};
#define SCTP_OUTPUT_CB(__skb) ((struct sctp_output_cb *)&((__skb)->cb[0]))
sctp: allow GSO frags to access the chunk too SCTP will try to access original IP headers on sctp_recvmsg in order to copy the addresses used. There are also other places that do similar access to IP or even SCTP headers. But after 90017accff61 ("sctp: Add GSO support") they aren't always there because they are only present in the header skb. SCTP handles the queueing of incoming data by cloning the incoming skb and limiting to only the relevant payload. This clone has its cb updated to something different and it's then queued on socket rx queue. Thus we need to fix this in two moments. For rx path, not related to socket queue yet, this patch uses a partially copied sctp_input_cb to such GSO frags. This restores the ability to access the headers for this part of the code. Regarding the socket rx queue, it removes iif member from sctp_event and also add a chunk pointer on it. With these changes we're always able to reach the headers again. The biggest change here is that now the sctp_chunk struct and the original skb are only freed after the application consumed the buffer. Note however that the original payload was already like this due to the skb cloning. For iif, SCTP's IPv4 code doesn't use it, so no change is necessary. IPv6 now can fetch it directly from original's IPv6 CB as the original skb is still accessible. In the future we probably can simplify sctp_v*_skb_iif() stuff, as sctp_v4_skb_iif() was called but it's return value not used, and now it's not even called, but such cleanup is out of scope for this change. Fixes: 90017accff61 ("sctp: Add GSO support") Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-13 18:08:57 +00:00
static inline const struct sk_buff *sctp_gso_headskb(const struct sk_buff *skb)
{
const struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
return chunk->head_skb ? : skb;
}
/* These bind address data fields common between endpoints and associations */
struct sctp_bind_addr {
/* RFC 2960 12.1 Parameters necessary for the SCTP instance
*
* SCTP Port: The local SCTP port number the endpoint is
* bound to.
*/
__u16 port;
/* RFC 2960 12.1 Parameters necessary for the SCTP instance
*
* Address List: The list of IP addresses that this instance
* has bound. This information is passed to one's
* peer(s) in INIT and INIT ACK chunks.
*/
struct list_head address_list;
};
void sctp_bind_addr_init(struct sctp_bind_addr *, __u16 port);
void sctp_bind_addr_free(struct sctp_bind_addr *);
int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest,
const struct sctp_bind_addr *src,
enum sctp_scope scope, gfp_t gfp,
int flags);
int sctp_bind_addr_dup(struct sctp_bind_addr *dest,
const struct sctp_bind_addr *src,
gfp_t gfp);
int sctp_add_bind_addr(struct sctp_bind_addr *, union sctp_addr *,
int new_size, __u8 addr_state, gfp_t gfp);
int sctp_del_bind_addr(struct sctp_bind_addr *, union sctp_addr *);
int sctp_bind_addr_match(struct sctp_bind_addr *, const union sctp_addr *,
struct sctp_sock *);
int sctp_bind_addr_conflict(struct sctp_bind_addr *, const union sctp_addr *,
struct sctp_sock *, struct sctp_sock *);
int sctp_bind_addr_state(const struct sctp_bind_addr *bp,
const union sctp_addr *addr);
int sctp_bind_addrs_check(struct sctp_sock *sp,
struct sctp_sock *sp2, int cnt2);
union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp,
const union sctp_addr *addrs,
int addrcnt,
struct sctp_sock *opt);
union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
int *addrs_len,
gfp_t gfp);
int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw, int len,
__u16 port, gfp_t gfp);
enum sctp_scope sctp_scope(const union sctp_addr *addr);
int sctp_in_scope(struct net *net, const union sctp_addr *addr,
const enum sctp_scope scope);
int sctp_is_any(struct sock *sk, const union sctp_addr *addr);
int sctp_is_ep_boundall(struct sock *sk);
/* What type of endpoint? */
enum sctp_endpoint_type {
SCTP_EP_TYPE_SOCKET,
SCTP_EP_TYPE_ASSOCIATION,
};
/*
* A common base class to bridge the implementation view of a
* socket (usually listening) endpoint versus an association's
* local endpoint.
* This common structure is useful for several purposes:
* 1) Common interface for lookup routines.
* a) Subfunctions work for either endpoint or association
* b) Single interface to lookup allows hiding the lookup lock rather
* than acquiring it externally.
* 2) Common interface for the inbound chunk handling/state machine.
* 3) Common object handling routines for reference counting, etc.
* 4) Disentangle association lookup from endpoint lookup, where we
* do not have to find our endpoint to find our association.
*
*/
struct sctp_ep_common {
/* Runtime type information. What kind of endpoint is this? */
enum sctp_endpoint_type type;
/* Some fields to help us manage this object.
* refcnt - Reference count access to this object.
* dead - Do not attempt to use this object.
*/
refcount_t refcnt;
bool dead;
/* What socket does this endpoint belong to? */
struct sock *sk;
sctp: cache netns in sctp_ep_common This patch is to fix a data-race reported by syzbot: BUG: KCSAN: data-race in sctp_assoc_migrate / sctp_hash_obj write to 0xffff8880b67c0020 of 8 bytes by task 18908 on cpu 1: sctp_assoc_migrate+0x1a6/0x290 net/sctp/associola.c:1091 sctp_sock_migrate+0x8aa/0x9b0 net/sctp/socket.c:9465 sctp_accept+0x3c8/0x470 net/sctp/socket.c:4916 inet_accept+0x7f/0x360 net/ipv4/af_inet.c:734 __sys_accept4+0x224/0x430 net/socket.c:1754 __do_sys_accept net/socket.c:1795 [inline] __se_sys_accept net/socket.c:1792 [inline] __x64_sys_accept+0x4e/0x60 net/socket.c:1792 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffff8880b67c0020 of 8 bytes by task 12003 on cpu 0: sctp_hash_obj+0x4f/0x2d0 net/sctp/input.c:894 rht_key_get_hash include/linux/rhashtable.h:133 [inline] rht_key_hashfn include/linux/rhashtable.h:159 [inline] rht_head_hashfn include/linux/rhashtable.h:174 [inline] head_hashfn lib/rhashtable.c:41 [inline] rhashtable_rehash_one lib/rhashtable.c:245 [inline] rhashtable_rehash_chain lib/rhashtable.c:276 [inline] rhashtable_rehash_table lib/rhashtable.c:316 [inline] rht_deferred_worker+0x468/0xab0 lib/rhashtable.c:420 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 It was caused by rhashtable access asoc->base.sk when sctp_assoc_migrate is changing its value. However, what rhashtable wants is netns from asoc base.sk, and for an asoc, its netns won't change once set. So we can simply fix it by caching netns since created. Fixes: d6c0256a60e6 ("sctp: add the rhashtable apis for sctp global transport hashtable") Reported-by: syzbot+e3b35fe7918ff0ee474e@syzkaller.appspotmail.com Signed-off-by: Xin Long <lucien.xin@gmail.com> Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
2019-11-23 03:56:49 +00:00
/* Cache netns and it won't change once set */
struct net *net;
/* This is where we receive inbound chunks. */
struct sctp_inq inqueue;
/* This substructure includes the defining parameters of the
* endpoint:
* bind_addr.port is our shared port number.
* bind_addr.address_list is our set of local IP addresses.
*/
struct sctp_bind_addr bind_addr;
};
/* RFC Section 1.4 Key Terms
*
* o SCTP endpoint: The logical sender/receiver of SCTP packets. On a
* multi-homed host, an SCTP endpoint is represented to its peers as a
* combination of a set of eligible destination transport addresses to
* which SCTP packets can be sent and a set of eligible source
* transport addresses from which SCTP packets can be received.
* All transport addresses used by an SCTP endpoint must use the
* same port number, but can use multiple IP addresses. A transport
* address used by an SCTP endpoint must not be used by another
* SCTP endpoint. In other words, a transport address is unique
* to an SCTP endpoint.
*
* From an implementation perspective, each socket has one of these.
* A TCP-style socket will have exactly one association on one of
* these. An UDP-style socket will have multiple associations hanging
* off one of these.
*/
struct sctp_endpoint {
/* Common substructure for endpoint and association. */
struct sctp_ep_common base;
/* Fields to help us manage our entries in the hash tables. */
struct hlist_node node;
int hashent;
/* Associations: A list of current associations and mappings
* to the data consumers for each association. This
* may be in the form of a hash table or other
* implementation dependent structure. The data
* consumers may be process identification
* information such as file descriptors, named pipe
* pointer, or table pointers dependent on how SCTP
* is implemented.
*/
/* This is really a list of struct sctp_association entries. */
struct list_head asocs;
/* Secret Key: A secret key used by this endpoint to compute
* the MAC. This SHOULD be a cryptographic quality
* random number with a sufficient length.
* Discussion in [RFC1750] can be helpful in
* selection of the key.
*/
__u8 secret_key[SCTP_SECRET_SIZE];
/* digest: This is a digest of the sctp cookie. This field is
* only used on the receive path when we try to validate
* that the cookie has not been tampered with. We put
* this here so we pre-allocate this once and can re-use
* on every receive.
*/
__u8 *digest;
/* sendbuf acct. policy. */
__u32 sndbuf_policy;
/* rcvbuf acct. policy. */
__u32 rcvbuf_policy;
/* SCTP AUTH: array of the HMACs that will be allocated
* we need this per association so that we don't serialize
*/
struct crypto_shash **auth_hmacs;
/* SCTP-AUTH: hmacs for the endpoint encoded into parameter */
struct sctp_hmac_algo_param *auth_hmacs_list;
/* SCTP-AUTH: chunks to authenticate encoded into parameter */
struct sctp_chunks_param *auth_chunk_list;
/* SCTP-AUTH: endpoint shared keys */
struct list_head endpoint_shared_keys;
__u16 active_key_id;
__u8 ecn_enable:1,
auth_enable:1,
intl_enable:1,
prsctp_enable:1,
asconf_enable:1,
reconf_enable:1;
__u8 strreset_enable;
sctp: use call_rcu to free endpoint This patch is to delay the endpoint free by calling call_rcu() to fix another use-after-free issue in sctp_sock_dump(): BUG: KASAN: use-after-free in __lock_acquire+0x36d9/0x4c20 Call Trace: __lock_acquire+0x36d9/0x4c20 kernel/locking/lockdep.c:3218 lock_acquire+0x1ed/0x520 kernel/locking/lockdep.c:3844 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline] _raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:168 spin_lock_bh include/linux/spinlock.h:334 [inline] __lock_sock+0x203/0x350 net/core/sock.c:2253 lock_sock_nested+0xfe/0x120 net/core/sock.c:2774 lock_sock include/net/sock.h:1492 [inline] sctp_sock_dump+0x122/0xb20 net/sctp/diag.c:324 sctp_for_each_transport+0x2b5/0x370 net/sctp/socket.c:5091 sctp_diag_dump+0x3ac/0x660 net/sctp/diag.c:527 __inet_diag_dump+0xa8/0x140 net/ipv4/inet_diag.c:1049 inet_diag_dump+0x9b/0x110 net/ipv4/inet_diag.c:1065 netlink_dump+0x606/0x1080 net/netlink/af_netlink.c:2244 __netlink_dump_start+0x59a/0x7c0 net/netlink/af_netlink.c:2352 netlink_dump_start include/linux/netlink.h:216 [inline] inet_diag_handler_cmd+0x2ce/0x3f0 net/ipv4/inet_diag.c:1170 __sock_diag_cmd net/core/sock_diag.c:232 [inline] sock_diag_rcv_msg+0x31d/0x410 net/core/sock_diag.c:263 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2477 sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:274 This issue occurs when asoc is peeled off and the old sk is freed after getting it by asoc->base.sk and before calling lock_sock(sk). To prevent the sk free, as a holder of the sk, ep should be alive when calling lock_sock(). This patch uses call_rcu() and moves sock_put and ep free into sctp_endpoint_destroy_rcu(), so that it's safe to try to hold the ep under rcu_read_lock in sctp_transport_traverse_process(). If sctp_endpoint_hold() returns true, it means this ep is still alive and we have held it and can continue to dump it; If it returns false, it means this ep is dead and can be freed after rcu_read_unlock, and we should skip it. In sctp_sock_dump(), after locking the sk, if this ep is different from tsp->asoc->ep, it means during this dumping, this asoc was peeled off before calling lock_sock(), and the sk should be skipped; If this ep is the same with tsp->asoc->ep, it means no peeloff happens on this asoc, and due to lock_sock, no peeloff will happen either until release_sock. Note that delaying endpoint free won't delay the port release, as the port release happens in sctp_endpoint_destroy() before calling call_rcu(). Also, freeing endpoint by call_rcu() makes it safe to access the sk by asoc->base.sk in sctp_assocs_seq_show() and sctp_rcv(). Thanks Jones to bring this issue up. v1->v2: - improve the changelog. - add kfree(ep) into sctp_endpoint_destroy_rcu(), as Jakub noticed. Reported-by: syzbot+9276d76e83e3bcde6c99@syzkaller.appspotmail.com Reported-by: Lee Jones <lee.jones@linaro.org> Fixes: d25adbeb0cdb ("sctp: fix an use-after-free issue in sctp_sock_dump") Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-23 18:04:30 +00:00
struct rcu_head rcu;
};
/* Recover the outer endpoint structure. */
static inline struct sctp_endpoint *sctp_ep(struct sctp_ep_common *base)
{
struct sctp_endpoint *ep;
ep = container_of(base, struct sctp_endpoint, base);
return ep;
}
/* These are function signatures for manipulating endpoints. */
struct sctp_endpoint *sctp_endpoint_new(struct sock *, gfp_t);
void sctp_endpoint_free(struct sctp_endpoint *);
void sctp_endpoint_put(struct sctp_endpoint *);
sctp: use call_rcu to free endpoint This patch is to delay the endpoint free by calling call_rcu() to fix another use-after-free issue in sctp_sock_dump(): BUG: KASAN: use-after-free in __lock_acquire+0x36d9/0x4c20 Call Trace: __lock_acquire+0x36d9/0x4c20 kernel/locking/lockdep.c:3218 lock_acquire+0x1ed/0x520 kernel/locking/lockdep.c:3844 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline] _raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:168 spin_lock_bh include/linux/spinlock.h:334 [inline] __lock_sock+0x203/0x350 net/core/sock.c:2253 lock_sock_nested+0xfe/0x120 net/core/sock.c:2774 lock_sock include/net/sock.h:1492 [inline] sctp_sock_dump+0x122/0xb20 net/sctp/diag.c:324 sctp_for_each_transport+0x2b5/0x370 net/sctp/socket.c:5091 sctp_diag_dump+0x3ac/0x660 net/sctp/diag.c:527 __inet_diag_dump+0xa8/0x140 net/ipv4/inet_diag.c:1049 inet_diag_dump+0x9b/0x110 net/ipv4/inet_diag.c:1065 netlink_dump+0x606/0x1080 net/netlink/af_netlink.c:2244 __netlink_dump_start+0x59a/0x7c0 net/netlink/af_netlink.c:2352 netlink_dump_start include/linux/netlink.h:216 [inline] inet_diag_handler_cmd+0x2ce/0x3f0 net/ipv4/inet_diag.c:1170 __sock_diag_cmd net/core/sock_diag.c:232 [inline] sock_diag_rcv_msg+0x31d/0x410 net/core/sock_diag.c:263 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2477 sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:274 This issue occurs when asoc is peeled off and the old sk is freed after getting it by asoc->base.sk and before calling lock_sock(sk). To prevent the sk free, as a holder of the sk, ep should be alive when calling lock_sock(). This patch uses call_rcu() and moves sock_put and ep free into sctp_endpoint_destroy_rcu(), so that it's safe to try to hold the ep under rcu_read_lock in sctp_transport_traverse_process(). If sctp_endpoint_hold() returns true, it means this ep is still alive and we have held it and can continue to dump it; If it returns false, it means this ep is dead and can be freed after rcu_read_unlock, and we should skip it. In sctp_sock_dump(), after locking the sk, if this ep is different from tsp->asoc->ep, it means during this dumping, this asoc was peeled off before calling lock_sock(), and the sk should be skipped; If this ep is the same with tsp->asoc->ep, it means no peeloff happens on this asoc, and due to lock_sock, no peeloff will happen either until release_sock. Note that delaying endpoint free won't delay the port release, as the port release happens in sctp_endpoint_destroy() before calling call_rcu(). Also, freeing endpoint by call_rcu() makes it safe to access the sk by asoc->base.sk in sctp_assocs_seq_show() and sctp_rcv(). Thanks Jones to bring this issue up. v1->v2: - improve the changelog. - add kfree(ep) into sctp_endpoint_destroy_rcu(), as Jakub noticed. Reported-by: syzbot+9276d76e83e3bcde6c99@syzkaller.appspotmail.com Reported-by: Lee Jones <lee.jones@linaro.org> Fixes: d25adbeb0cdb ("sctp: fix an use-after-free issue in sctp_sock_dump") Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-23 18:04:30 +00:00
int sctp_endpoint_hold(struct sctp_endpoint *ep);
void sctp_endpoint_add_asoc(struct sctp_endpoint *, struct sctp_association *);
struct sctp_association *sctp_endpoint_lookup_assoc(
const struct sctp_endpoint *ep,
const union sctp_addr *paddr,
struct sctp_transport **);
bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
const union sctp_addr *paddr);
2022-11-16 20:01:20 +00:00
struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep,
struct net *net,
const union sctp_addr *laddr,
int dif, int sdif);
bool sctp_has_association(struct net *net, const union sctp_addr *laddr,
2022-11-16 20:01:20 +00:00
const union sctp_addr *paddr, int dif, int sdif);
net: sctp: cache auth_enable per endpoint Currently, it is possible to create an SCTP socket, then switch auth_enable via sysctl setting to 1 and crash the system on connect: Oops[#1]: CPU: 0 PID: 0 Comm: swapper Not tainted 3.14.1-mipsgit-20140415 #1 task: ffffffff8056ce80 ti: ffffffff8055c000 task.ti: ffffffff8055c000 [...] Call Trace: [<ffffffff8043c4e8>] sctp_auth_asoc_set_default_hmac+0x68/0x80 [<ffffffff8042b300>] sctp_process_init+0x5e0/0x8a4 [<ffffffff8042188c>] sctp_sf_do_5_1B_init+0x234/0x34c [<ffffffff804228c8>] sctp_do_sm+0xb4/0x1e8 [<ffffffff80425a08>] sctp_endpoint_bh_rcv+0x1c4/0x214 [<ffffffff8043af68>] sctp_rcv+0x588/0x630 [<ffffffff8043e8e8>] sctp6_rcv+0x10/0x24 [<ffffffff803acb50>] ip6_input+0x2c0/0x440 [<ffffffff8030fc00>] __netif_receive_skb_core+0x4a8/0x564 [<ffffffff80310650>] process_backlog+0xb4/0x18c [<ffffffff80313cbc>] net_rx_action+0x12c/0x210 [<ffffffff80034254>] __do_softirq+0x17c/0x2ac [<ffffffff800345e0>] irq_exit+0x54/0xb0 [<ffffffff800075a4>] ret_from_irq+0x0/0x4 [<ffffffff800090ec>] rm7k_wait_irqoff+0x24/0x48 [<ffffffff8005e388>] cpu_startup_entry+0xc0/0x148 [<ffffffff805a88b0>] start_kernel+0x37c/0x398 Code: dd0900b8 000330f8 0126302d <dcc60000> 50c0fff1 0047182a a48306a0 03e00008 00000000 ---[ end trace b530b0551467f2fd ]--- Kernel panic - not syncing: Fatal exception in interrupt What happens while auth_enable=0 in that case is, that ep->auth_hmacs is initialized to NULL in sctp_auth_init_hmacs() when endpoint is being created. After that point, if an admin switches over to auth_enable=1, the machine can crash due to NULL pointer dereference during reception of an INIT chunk. When we enter sctp_process_init() via sctp_sf_do_5_1B_init() in order to respond to an INIT chunk, the INIT verification succeeds and while we walk and process all INIT params via sctp_process_param() we find that net->sctp.auth_enable is set, therefore do not fall through, but invoke sctp_auth_asoc_set_default_hmac() instead, and thus, dereference what we have set to NULL during endpoint initialization phase. The fix is to make auth_enable immutable by caching its value during endpoint initialization, so that its original value is being carried along until destruction. The bug seems to originate from the very first days. Fix in joint work with Daniel Borkmann. Reported-by: Joshua Kinard <kumba@gentoo.org> Signed-off-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Acked-by: Neil Horman <nhorman@tuxdriver.com> Tested-by: Joshua Kinard <kumba@gentoo.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-17 15:26:50 +00:00
int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
const struct sctp_association *asoc,
enum sctp_cid cid, struct sctp_init_chunk *peer_init,
struct sctp_chunk *chunk, struct sctp_chunk **err_chunk);
int sctp_process_init(struct sctp_association *, struct sctp_chunk *chunk,
const union sctp_addr *peer,
struct sctp_init_chunk *init, gfp_t gfp);
__u32 sctp_generate_tag(const struct sctp_endpoint *);
__u32 sctp_generate_tsn(const struct sctp_endpoint *);
struct sctp_inithdr_host {
__u32 init_tag;
__u32 a_rwnd;
__u16 num_outbound_streams;
__u16 num_inbound_streams;
__u32 initial_tsn;
};
sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-03 22:20:16 +00:00
struct sctp_stream_priorities {
/* List of priorities scheduled */
struct list_head prio_sched;
/* List of streams scheduled */
struct list_head active;
/* The next stream in line */
sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-03 22:20:16 +00:00
struct sctp_stream_out_ext *next;
__u16 prio;
__u16 users;
sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-03 22:20:16 +00:00
};
struct sctp_stream_out_ext {
__u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
__u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
sctp: introduce stream scheduler foundations This patch introduces the hooks necessary to do stream scheduling, as per RFC Draft ndata. It also introduces the first scheduler, which is what we do today but now factored out: first come first served (FCFS). With stream scheduling now we have to track which chunk was enqueued on which stream and be able to select another other than the in front of the main outqueue. So we introduce a list on sctp_stream_out_ext structure for this purpose. We reuse sctp_chunk->transmitted_list space for the list above, as the chunk cannot belong to the two lists at the same time. By using the union in there, we can have distinct names for these moments. sctp_sched_ops are the operations expected to be implemented by each scheduler. The dequeueing is a bit particular to this implementation but it is to match how we dequeue packets today. We first dequeue and then check if it fits the packet and if not, we requeue it at head. Thus why we don't have a peek operation but have dequeue_done instead, which is called once the chunk can be safely considered as transmitted. The check removed from sctp_outq_flush is now performed by sctp_stream_outq_migrate, which is only called during assoc setup. (sctp_sendmsg() also checks for it) The only operation that is foreseen but not yet added here is a way to signalize that a new packet is starting or that the packet is done, for round robin scheduler per packet, but is intentionally left to the patch that actually implements it. Support for I-DATA chunks, also described in this RFC, with user message interleaving is straightforward as it just requires the schedulers to probe for the feature and ignore datamsg boundaries when dequeueing. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-03 22:20:13 +00:00
struct list_head outq; /* chunks enqueued by this stream */
sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-03 22:20:16 +00:00
union {
struct {
/* Scheduled streams list */
struct list_head prio_list;
struct sctp_stream_priorities *prio_head;
};
/* Fields used by RR scheduler */
struct {
struct list_head rr_list;
};
struct {
struct list_head fc_list;
__u32 fc_length;
__u16 fc_weight;
};
sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-03 22:20:16 +00:00
};
};
struct sctp_stream_out {
union {
__u32 mid;
__u16 ssn;
};
__u32 mid_uo;
struct sctp_stream_out_ext *ext;
__u8 state;
};
struct sctp_stream_in {
union {
__u32 mid;
__u16 ssn;
};
__u32 mid_uo;
__u32 fsn;
__u32 fsn_uo;
char pd_mode;
char pd_mode_uo;
};
struct sctp_stream {
GENRADIX(struct sctp_stream_out) out;
GENRADIX(struct sctp_stream_in) in;
__u16 outcnt;
__u16 incnt;
sctp: introduce stream scheduler foundations This patch introduces the hooks necessary to do stream scheduling, as per RFC Draft ndata. It also introduces the first scheduler, which is what we do today but now factored out: first come first served (FCFS). With stream scheduling now we have to track which chunk was enqueued on which stream and be able to select another other than the in front of the main outqueue. So we introduce a list on sctp_stream_out_ext structure for this purpose. We reuse sctp_chunk->transmitted_list space for the list above, as the chunk cannot belong to the two lists at the same time. By using the union in there, we can have distinct names for these moments. sctp_sched_ops are the operations expected to be implemented by each scheduler. The dequeueing is a bit particular to this implementation but it is to match how we dequeue packets today. We first dequeue and then check if it fits the packet and if not, we requeue it at head. Thus why we don't have a peek operation but have dequeue_done instead, which is called once the chunk can be safely considered as transmitted. The check removed from sctp_outq_flush is now performed by sctp_stream_outq_migrate, which is only called during assoc setup. (sctp_sendmsg() also checks for it) The only operation that is foreseen but not yet added here is a way to signalize that a new packet is starting or that the packet is done, for round robin scheduler per packet, but is intentionally left to the patch that actually implements it. Support for I-DATA chunks, also described in this RFC, with user message interleaving is straightforward as it just requires the schedulers to probe for the feature and ignore datamsg boundaries when dequeueing. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-03 22:20:13 +00:00
/* Current stream being sent, if any */
struct sctp_stream_out *out_curr;
sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-03 22:20:16 +00:00
union {
/* Fields used by priority scheduler */
struct {
/* List of priorities scheduled */
struct list_head prio_list;
};
/* Fields used by RR scheduler */
struct {
/* List of streams scheduled */
struct list_head rr_list;
/* The next stream in line */
struct sctp_stream_out_ext *rr_next;
};
struct {
struct list_head fc_list;
};
sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-03 22:20:16 +00:00
};
struct sctp_stream_interleave *si;
};
static inline struct sctp_stream_out *sctp_stream_out(
struct sctp_stream *stream,
__u16 sid)
{
return genradix_ptr(&stream->out, sid);
}
static inline struct sctp_stream_in *sctp_stream_in(
struct sctp_stream *stream,
__u16 sid)
{
return genradix_ptr(&stream->in, sid);
}
#define SCTP_SO(s, i) sctp_stream_out((s), (i))
#define SCTP_SI(s, i) sctp_stream_in((s), (i))
#define SCTP_STREAM_CLOSED 0x00
#define SCTP_STREAM_OPEN 0x01
static inline __u16 sctp_datachk_len(const struct sctp_stream *stream)
{
return stream->si->data_chunk_len;
}
static inline __u16 sctp_datahdr_len(const struct sctp_stream *stream)
{
return stream->si->data_chunk_len - sizeof(struct sctp_chunkhdr);
}
static inline __u16 sctp_ftsnchk_len(const struct sctp_stream *stream)
{
return stream->si->ftsn_chunk_len;
}
static inline __u16 sctp_ftsnhdr_len(const struct sctp_stream *stream)
{
return stream->si->ftsn_chunk_len - sizeof(struct sctp_chunkhdr);
}
sctp: Add support to per-association statistics via a new SCTP_GET_ASSOC_STATS call The current SCTP stack is lacking a mechanism to have per association statistics. This is an implementation modeled after OpenSolaris' SCTP_GET_ASSOC_STATS. Userspace part will follow on lksctp if/when there is a general ACK on this. V4: - Move ipackets++ before q->immediate.func() for consistency reasons - Move sctp_max_rto() at the end of sctp_transport_update_rto() to avoid returning bogus RTO values - return asoc->rto_min when max_obs_rto value has not changed V3: - Increase ictrlchunks in sctp_assoc_bh_rcv() as well - Move ipackets++ to sctp_inq_push() - return 0 when no rto updates took place since the last call V2: - Implement partial retrieval of stat struct to cope for future expansion - Kill the rtxpackets counter as it cannot be precise anyway - Rename outseqtsns to outofseqtsns to make it clearer that these are out of sequence unexpected TSNs - Move asoc->ipackets++ under a lock to avoid potential miscounts - Fold asoc->opackets++ into the already existing asoc check - Kill unneeded (q->asoc) test when increasing rtxchunks - Do not count octrlchunks if sending failed (SCTP_XMIT_OK != 0) - Don't count SHUTDOWNs as SACKs - Move SCTP_GET_ASSOC_STATS to the private space API - Adjust the len check in sctp_getsockopt_assoc_stats() to allow for future struct growth - Move association statistics in their own struct - Update idupchunks when we send a SACK with dup TSNs - return min_rto in max_rto when RTO has not changed. Also return the transport when max_rto last changed. Signed-off: Michele Baldessari <michele@acksyn.org> Acked-by: Vlad Yasevich <vyasevich@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-12-01 04:49:42 +00:00
/* SCTP_GET_ASSOC_STATS counters */
struct sctp_priv_assoc_stats {
/* Maximum observed rto in the association during subsequent
* observations. Value is set to 0 if no RTO measurement took place
* The transport where the max_rto was observed is returned in
* obs_rto_ipaddr
*/
struct sockaddr_storage obs_rto_ipaddr;
__u64 max_obs_rto;
/* Total In and Out SACKs received and sent */
__u64 isacks;
__u64 osacks;
/* Total In and Out packets received and sent */
__u64 opackets;
__u64 ipackets;
/* Total retransmitted chunks */
__u64 rtxchunks;
/* TSN received > next expected */
__u64 outofseqtsns;
/* Duplicate Chunks received */
__u64 idupchunks;
/* Gap Ack Blocks received */
__u64 gapcnt;
/* Unordered data chunks sent and received */
__u64 ouodchunks;
__u64 iuodchunks;
/* Ordered data chunks sent and received */
__u64 oodchunks;
__u64 iodchunks;
/* Control chunks sent and received */
__u64 octrlchunks;
__u64 ictrlchunks;
};
/* RFC2960
*
* 12. Recommended Transmission Control Block (TCB) Parameters
*
* This section details a recommended set of parameters that should
* be contained within the TCB for an implementation. This section is
* for illustrative purposes and should not be deemed as requirements
* on an implementation or as an exhaustive list of all parameters
* inside an SCTP TCB. Each implementation may need its own additional
* parameters for optimization.
*/
/* Here we have information about each individual association. */
struct sctp_association {
/* A base structure common to endpoint and association.
* In this context, it represents the associations's view
* of the local endpoint of the association.
*/
struct sctp_ep_common base;
/* Associations on the same socket. */
struct list_head asocs;
/* association id. */
sctp_assoc_t assoc_id;
/* This is our parent endpoint. */
struct sctp_endpoint *ep;
/* These are those association elements needed in the cookie. */
struct sctp_cookie c;
/* This is all information about our peer. */
struct {
/* transport_addr_list
*
* Peer : A list of SCTP transport addresses that the
* Transport : peer is bound to. This information is derived
* Address : from the INIT or INIT ACK and is used to
* List : associate an inbound packet with a given
* : association. Normally this information is
* : hashed or keyed for quick lookup and access
* : of the TCB.
* : The list is also initialized with the list
* : of addresses passed with the sctp_connectx()
* : call.
*
* It is a list of SCTP_transport's.
*/
struct list_head transport_addr_list;
/* rwnd
*
* Peer Rwnd : Current calculated value of the peer's rwnd.
*/
__u32 rwnd;
/* transport_count
*
* Peer : A count of the number of peer addresses
* Transport : in the Peer Transport Address List.
* Address :
* Count :
*/
__u16 transport_count;
/* port
* The transport layer port number.
*/
__u16 port;
/* primary_path
*
* Primary : This is the current primary destination
* Path : transport address of the peer endpoint. It
* : may also specify a source transport address
* : on this endpoint.
*
* All of these paths live on transport_addr_list.
*
* At the bakeoffs, we discovered that the intent of
* primaryPath is that it only changes when the ULP
* asks to have it changed. We add the activePath to
* designate the connection we are currently using to
* transmit new data and most control chunks.
*/
struct sctp_transport *primary_path;
/* Cache the primary path address here, when we
* need a an address for msg_name.
*/
union sctp_addr primary_addr;
/* active_path
* The path that we are currently using to
* transmit new data and most control chunks.
*/
struct sctp_transport *active_path;
/* retran_path
*
* RFC2960 6.4 Multi-homed SCTP Endpoints
* ...
* Furthermore, when its peer is multi-homed, an
* endpoint SHOULD try to retransmit a chunk to an
* active destination transport address that is
* different from the last destination address to
* which the DATA chunk was sent.
*/
struct sctp_transport *retran_path;
/* Pointer to last transport I have sent on. */
struct sctp_transport *last_sent_to;
/* This is the last transport I have received DATA on. */
struct sctp_transport *last_data_from;
/*
* Mapping An array of bits or bytes indicating which out of
* Array order TSN's have been received (relative to the
* Last Rcvd TSN). If no gaps exist, i.e. no out of
* order packets have been received, this array
* will be set to all zero. This structure may be
* in the form of a circular buffer or bit array.
*
* Last Rcvd : This is the last TSN received in
* TSN : sequence. This value is set initially by
* : taking the peer's Initial TSN, received in
* : the INIT or INIT ACK chunk, and subtracting
* : one from it.
*
* Throughout most of the specification this is called the
* "Cumulative TSN ACK Point". In this case, we
* ignore the advice in 12.2 in favour of the term
* used in the bulk of the text. This value is hidden
* in tsn_map--we get it by calling sctp_tsnmap_get_ctsn().
*/
struct sctp_tsnmap tsn_map;
/* This mask is used to disable sending the ASCONF chunk
* with specified parameter to peer.
*/
__be16 addip_disabled_mask;
/* These are capabilities which our peer advertised. */
__u16 ecn_capable:1, /* Can peer do ECN? */
ipv4_address:1, /* Peer understands IPv4 addresses? */
ipv6_address:1, /* Peer understands IPv6 addresses? */
asconf_capable:1, /* Does peer support ADDIP? */
prsctp_capable:1, /* Can peer do PR-SCTP? */
reconf_capable:1, /* Can peer do RE-CONFIG? */
intl_capable:1, /* Can peer do INTERLEAVE */
auth_capable:1, /* Is peer doing SCTP-AUTH? */
/* sack_needed:
* This flag indicates if the next received
* packet is to be responded to with a
* SACK. This is initialized to 0. When a packet
* is received sack_cnt is incremented. If this value
* reaches 2 or more, a SACK is sent and the
* value is reset to 0. Note: This is used only
* when no DATA chunks are received out of
* order. When DATA chunks are out of order,
* SACK's are not delayed (see Section 6).
*/
sack_needed:1, /* Do we need to sack the peer? */
sack_generation:1,
zero_window_announced:1;
__u32 sack_cnt;
__u32 adaptation_ind; /* Adaptation Code point. */
struct sctp_inithdr_host i;
void *cookie;
int cookie_len;
/* ADDIP Section 4.2 Upon reception of an ASCONF Chunk.
* C1) ... "Peer-Serial-Number'. This value MUST be initialized to the
* Initial TSN Value minus 1
*/
__u32 addip_serial;
/* SCTP-AUTH: We need to know pears random number, hmac list
* and authenticated chunk list. All that is part of the
* cookie and these are just pointers to those locations
*/
struct sctp_random_param *peer_random;
struct sctp_chunks_param *peer_chunks;
struct sctp_hmac_algo_param *peer_hmacs;
} peer;
/* State : A state variable indicating what state the
* : association is in, i.e. COOKIE-WAIT,
* : COOKIE-ECHOED, ESTABLISHED, SHUTDOWN-PENDING,
* : SHUTDOWN-SENT, SHUTDOWN-RECEIVED, SHUTDOWN-ACK-SENT.
*
* Note: No "CLOSED" state is illustrated since if a
* association is "CLOSED" its TCB SHOULD be removed.
*
* In this implementation we DO have a CLOSED
* state which is used during initiation and shutdown.
*
* State takes values from SCTP_STATE_*.
*/
enum sctp_state state;
/* Overall : The overall association error count.
* Error Count : [Clear this any time I get something.]
*/
int overall_error_count;
/* The cookie life I award for any cookie. */
ktime_t cookie_life;
/* These are the association's initial, max, and min RTO values.
* These values will be initialized by system defaults, but can
* be modified via the SCTP_RTOINFO socket option.
*/
unsigned long rto_initial;
unsigned long rto_max;
unsigned long rto_min;
/* Maximum number of new data packets that can be sent in a burst. */
int max_burst;
/* This is the max_retrans value for the association. This value will
* be initialized from system defaults, but can be
* modified by the SCTP_ASSOCINFO socket option.
*/
int max_retrans;
/* This is the partially failed retrans value for the transport
* and will be initialized from the assocs value. This can be
* changed using the SCTP_PEER_ADDR_THLDS socket option
*/
__u16 pf_retrans;
/* Used for primary path switchover. */
__u16 ps_retrans;
/* Maximum number of times the endpoint will retransmit INIT */
__u16 max_init_attempts;
/* How many times have we resent an INIT? */
__u16 init_retries;
/* The largest timeout or RTO value to use in attempting an INIT */
unsigned long max_init_timeo;
/* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
* the destination address every heartbeat interval. This value
* will be inherited by all new transports.
*/
unsigned long hbinterval;
unsigned long probe_interval;
__be16 encap_port;
/* This is the max_retrans value for new transports in the
* association.
*/
__u16 pathmaxrxt;
__u32 flowlabel;
__u8 dscp;
/* Flag that path mtu update is pending */
__u8 pmtu_pending;
/* Association : The smallest PMTU discovered for all of the
* PMTU : peer's transport addresses.
*/
__u32 pathmtu;
/* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */
__u32 param_flags;
__u32 sackfreq;
/* SACK delay timeout */
unsigned long sackdelay;
unsigned long timeouts[SCTP_NUM_TIMEOUT_TYPES];
struct timer_list timers[SCTP_NUM_TIMEOUT_TYPES];
/* Transport to which SHUTDOWN chunk was last sent. */
struct sctp_transport *shutdown_last_sent_to;
/* Transport to which INIT chunk was last sent. */
struct sctp_transport *init_last_sent_to;
/* How many times have we resent a SHUTDOWN */
int shutdown_retries;
/* Next TSN : The next TSN number to be assigned to a new
* : DATA chunk. This is sent in the INIT or INIT
* : ACK chunk to the peer and incremented each
* : time a DATA chunk is assigned a TSN
* : (normally just prior to transmit or during
* : fragmentation).
*/
__u32 next_tsn;
/*
* Last Rcvd : This is the last TSN received in sequence. This value
* TSN : is set initially by taking the peer's Initial TSN,
* : received in the INIT or INIT ACK chunk, and
* : subtracting one from it.
*
* Most of RFC 2960 refers to this as the Cumulative TSN Ack Point.
*/
__u32 ctsn_ack_point;
/* PR-SCTP Advanced.Peer.Ack.Point */
__u32 adv_peer_ack_point;
/* Highest TSN that is acknowledged by incoming SACKs. */
__u32 highest_sacked;
/* TSN marking the fast recovery exit point */
__u32 fast_recovery_exit;
/* Flag to track the current fast recovery state */
__u8 fast_recovery;
/* The number of unacknowledged data chunks. Reported through
* the SCTP_STATUS sockopt.
*/
__u16 unack_data;
/* The total number of data chunks that we've had to retransmit
* as the result of a T3 timer expiration
*/
__u32 rtx_data_chunks;
/* This is the association's receive buffer space. This value is used
* to set a_rwnd field in an INIT or a SACK chunk.
*/
__u32 rwnd;
/* This is the last advertised value of rwnd over a SACK chunk. */
__u32 a_rwnd;
Revert "net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer" This reverts commit ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") as it introduced a serious performance regression on SCTP over IPv4 and IPv6, though a not as dramatic on the latter. Measurements are on 10Gbit/s with ixgbe NICs. Current state: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.241.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 17:56:21 GMT Connecting to host 192.168.241.3, port 5201 Cookie: Lab200slot2.1397238981.812898.548918 [ 4] local 192.168.241.2 port 38616 connected to 192.168.241.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.09 sec 20.8 MBytes 161 Mbits/sec [ 4] 1.09-2.13 sec 10.8 MBytes 86.8 Mbits/sec [ 4] 2.13-3.15 sec 3.57 MBytes 29.5 Mbits/sec [ 4] 3.15-4.16 sec 4.33 MBytes 35.7 Mbits/sec [ 4] 4.16-6.21 sec 10.4 MBytes 42.7 Mbits/sec [ 4] 6.21-6.21 sec 0.00 Bytes 0.00 bits/sec [ 4] 6.21-7.35 sec 34.6 MBytes 253 Mbits/sec [ 4] 7.35-11.45 sec 22.0 MBytes 45.0 Mbits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-12.51 sec 16.0 MBytes 126 Mbits/sec [ 4] 12.51-13.59 sec 20.3 MBytes 158 Mbits/sec [ 4] 13.59-14.65 sec 13.4 MBytes 107 Mbits/sec [ 4] 14.65-16.79 sec 33.3 MBytes 130 Mbits/sec [ 4] 16.79-16.79 sec 0.00 Bytes 0.00 bits/sec [ 4] 16.79-17.82 sec 5.94 MBytes 48.7 Mbits/sec (etc) [root@Lab200slot2 ~]# iperf3 --sctp -6 -c 2001:db8:0:f101::1 -V -l 1400 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 19:08:41 GMT Connecting to host 2001:db8:0:f101::1, port 5201 Cookie: Lab200slot2.1397243321.714295.2b3f7c [ 4] local 2001:db8:0:f101::2 port 55804 connected to 2001:db8:0:f101::1 port 5201 Starting Test: protocol: SCTP, 1 streams, 1400 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 169 MBytes 1.42 Gbits/sec [ 4] 1.00-2.00 sec 201 MBytes 1.69 Gbits/sec [ 4] 2.00-3.00 sec 188 MBytes 1.58 Gbits/sec [ 4] 3.00-4.00 sec 174 MBytes 1.46 Gbits/sec [ 4] 4.00-5.00 sec 165 MBytes 1.39 Gbits/sec [ 4] 5.00-6.00 sec 199 MBytes 1.67 Gbits/sec [ 4] 6.00-7.00 sec 163 MBytes 1.36 Gbits/sec [ 4] 7.00-8.00 sec 174 MBytes 1.46 Gbits/sec [ 4] 8.00-9.00 sec 193 MBytes 1.62 Gbits/sec [ 4] 9.00-10.00 sec 196 MBytes 1.65 Gbits/sec [ 4] 10.00-11.00 sec 157 MBytes 1.31 Gbits/sec [ 4] 11.00-12.00 sec 175 MBytes 1.47 Gbits/sec [ 4] 12.00-13.00 sec 192 MBytes 1.61 Gbits/sec [ 4] 13.00-14.00 sec 199 MBytes 1.67 Gbits/sec (etc) After patch: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.240.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0+ #1 SMP Mon Apr 14 12:06:40 EDT 2014 x86_64 Time: Mon, 14 Apr 2014 16:40:48 GMT Connecting to host 192.168.240.3, port 5201 Cookie: Lab200slot2.1397493648.413274.65e131 [ 4] local 192.168.240.2 port 50548 connected to 192.168.240.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 1.00-2.00 sec 239 MBytes 2.01 Gbits/sec [ 4] 2.00-3.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 3.00-4.00 sec 239 MBytes 2.00 Gbits/sec [ 4] 4.00-5.00 sec 245 MBytes 2.05 Gbits/sec [ 4] 5.00-6.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 6.00-7.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 7.00-8.00 sec 239 MBytes 2.01 Gbits/sec With the reverted patch applied, the SCTP/IPv4 performance is back to normal on latest upstream for IPv4 and IPv6 and has same throughput as 3.4.2 test kernel, steady and interval reports are smooth again. Fixes: ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") Reported-by: Peter Butler <pbutler@sonusnet.com> Reported-by: Dongsheng Song <dongsheng.song@gmail.com> Reported-by: Fengguang Wu <fengguang.wu@intel.com> Tested-by: Peter Butler <pbutler@sonusnet.com> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Cc: Matija Glavinic Pecotic <matija.glavinic-pecotic.ext@nsn.com> Cc: Alexander Sverdlin <alexander.sverdlin@nsn.com> Cc: Vlad Yasevich <vyasevich@gmail.com> Acked-by: Vlad Yasevich <vyasevich@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-14 19:45:17 +00:00
/* Number of bytes by which the rwnd has slopped. The rwnd is allowed
* to slop over a maximum of the association's frag_point.
*/
__u32 rwnd_over;
/* Keeps treack of rwnd pressure. This happens when we have
* a window, but not receive buffer (i.e small packets). This one
Revert "net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer" This reverts commit ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") as it introduced a serious performance regression on SCTP over IPv4 and IPv6, though a not as dramatic on the latter. Measurements are on 10Gbit/s with ixgbe NICs. Current state: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.241.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 17:56:21 GMT Connecting to host 192.168.241.3, port 5201 Cookie: Lab200slot2.1397238981.812898.548918 [ 4] local 192.168.241.2 port 38616 connected to 192.168.241.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.09 sec 20.8 MBytes 161 Mbits/sec [ 4] 1.09-2.13 sec 10.8 MBytes 86.8 Mbits/sec [ 4] 2.13-3.15 sec 3.57 MBytes 29.5 Mbits/sec [ 4] 3.15-4.16 sec 4.33 MBytes 35.7 Mbits/sec [ 4] 4.16-6.21 sec 10.4 MBytes 42.7 Mbits/sec [ 4] 6.21-6.21 sec 0.00 Bytes 0.00 bits/sec [ 4] 6.21-7.35 sec 34.6 MBytes 253 Mbits/sec [ 4] 7.35-11.45 sec 22.0 MBytes 45.0 Mbits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-12.51 sec 16.0 MBytes 126 Mbits/sec [ 4] 12.51-13.59 sec 20.3 MBytes 158 Mbits/sec [ 4] 13.59-14.65 sec 13.4 MBytes 107 Mbits/sec [ 4] 14.65-16.79 sec 33.3 MBytes 130 Mbits/sec [ 4] 16.79-16.79 sec 0.00 Bytes 0.00 bits/sec [ 4] 16.79-17.82 sec 5.94 MBytes 48.7 Mbits/sec (etc) [root@Lab200slot2 ~]# iperf3 --sctp -6 -c 2001:db8:0:f101::1 -V -l 1400 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 19:08:41 GMT Connecting to host 2001:db8:0:f101::1, port 5201 Cookie: Lab200slot2.1397243321.714295.2b3f7c [ 4] local 2001:db8:0:f101::2 port 55804 connected to 2001:db8:0:f101::1 port 5201 Starting Test: protocol: SCTP, 1 streams, 1400 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 169 MBytes 1.42 Gbits/sec [ 4] 1.00-2.00 sec 201 MBytes 1.69 Gbits/sec [ 4] 2.00-3.00 sec 188 MBytes 1.58 Gbits/sec [ 4] 3.00-4.00 sec 174 MBytes 1.46 Gbits/sec [ 4] 4.00-5.00 sec 165 MBytes 1.39 Gbits/sec [ 4] 5.00-6.00 sec 199 MBytes 1.67 Gbits/sec [ 4] 6.00-7.00 sec 163 MBytes 1.36 Gbits/sec [ 4] 7.00-8.00 sec 174 MBytes 1.46 Gbits/sec [ 4] 8.00-9.00 sec 193 MBytes 1.62 Gbits/sec [ 4] 9.00-10.00 sec 196 MBytes 1.65 Gbits/sec [ 4] 10.00-11.00 sec 157 MBytes 1.31 Gbits/sec [ 4] 11.00-12.00 sec 175 MBytes 1.47 Gbits/sec [ 4] 12.00-13.00 sec 192 MBytes 1.61 Gbits/sec [ 4] 13.00-14.00 sec 199 MBytes 1.67 Gbits/sec (etc) After patch: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.240.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0+ #1 SMP Mon Apr 14 12:06:40 EDT 2014 x86_64 Time: Mon, 14 Apr 2014 16:40:48 GMT Connecting to host 192.168.240.3, port 5201 Cookie: Lab200slot2.1397493648.413274.65e131 [ 4] local 192.168.240.2 port 50548 connected to 192.168.240.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 1.00-2.00 sec 239 MBytes 2.01 Gbits/sec [ 4] 2.00-3.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 3.00-4.00 sec 239 MBytes 2.00 Gbits/sec [ 4] 4.00-5.00 sec 245 MBytes 2.05 Gbits/sec [ 4] 5.00-6.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 6.00-7.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 7.00-8.00 sec 239 MBytes 2.01 Gbits/sec With the reverted patch applied, the SCTP/IPv4 performance is back to normal on latest upstream for IPv4 and IPv6 and has same throughput as 3.4.2 test kernel, steady and interval reports are smooth again. Fixes: ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") Reported-by: Peter Butler <pbutler@sonusnet.com> Reported-by: Dongsheng Song <dongsheng.song@gmail.com> Reported-by: Fengguang Wu <fengguang.wu@intel.com> Tested-by: Peter Butler <pbutler@sonusnet.com> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Cc: Matija Glavinic Pecotic <matija.glavinic-pecotic.ext@nsn.com> Cc: Alexander Sverdlin <alexander.sverdlin@nsn.com> Cc: Vlad Yasevich <vyasevich@gmail.com> Acked-by: Vlad Yasevich <vyasevich@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-14 19:45:17 +00:00
* is releases slowly (1 PMTU at a time ).
*/
__u32 rwnd_press;
/* This is the sndbuf size in use for the association.
* This corresponds to the sndbuf size for the association,
* as specified in the sk->sndbuf.
*/
int sndbuf_used;
/* This is the amount of memory that this association has allocated
* in the receive path at any given time.
*/
atomic_t rmem_alloc;
/* This is the wait queue head for send requests waiting on
* the association sndbuf space.
*/
wait_queue_head_t wait;
/* The message size at which SCTP fragmentation will occur. */
__u32 frag_point;
__u32 user_frag;
/* Counter used to count INIT errors. */
int init_err_counter;
/* Count the number of INIT cycles (for doubling timeout). */
int init_cycle;
/* Default send parameters. */
__u16 default_stream;
__u16 default_flags;
__u32 default_ppid;
__u32 default_context;
__u32 default_timetolive;
/* Default receive parameters */
__u32 default_rcv_context;
/* Stream arrays */
struct sctp_stream stream;
/* All outbound chunks go through this structure. */
struct sctp_outq outqueue;
/* A smart pipe that will handle reordering and fragmentation,
* as well as handle passing events up to the ULP.
*/
struct sctp_ulpq ulpq;
/* Last TSN that caused an ECNE Chunk to be sent. */
__u32 last_ecne_tsn;
/* Last TSN that caused a CWR Chunk to be sent. */
__u32 last_cwr_tsn;
/* How many duplicated TSNs have we seen? */
int numduptsns;
/* These are to support
* "SCTP Extensions for Dynamic Reconfiguration of IP Addresses
* and Enforcement of Flow and Message Limits"
* <draft-ietf-tsvwg-addip-sctp-02.txt>
* or "ADDIP" for short.
*/
/* ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks
*
* R1) One and only one ASCONF Chunk MAY be in transit and
* unacknowledged at any one time. If a sender, after sending
* an ASCONF chunk, decides it needs to transfer another
* ASCONF Chunk, it MUST wait until the ASCONF-ACK Chunk
* returns from the previous ASCONF Chunk before sending a
* subsequent ASCONF. Note this restriction binds each side,
* so at any time two ASCONF may be in-transit on any given
* association (one sent from each endpoint).
*
* [This is our one-and-only-one ASCONF in flight. If we do
* not have an ASCONF in flight, this is NULL.]
*/
struct sctp_chunk *addip_last_asconf;
/* ADDIP Section 5.2 Upon reception of an ASCONF Chunk.
*
* This is needed to implement items E1 - E4 of the updated
* spec. Here is the justification:
*
* Since the peer may bundle multiple ASCONF chunks toward us,
* we now need the ability to cache multiple ACKs. The section
* describes in detail how they are cached and cleaned up.
*/
struct list_head asconf_ack_list;
/* These ASCONF chunks are waiting to be sent.
*
* These chunks can't be pushed to outqueue until receiving
* ASCONF_ACK for the previous ASCONF indicated by
* addip_last_asconf, so as to guarantee that only one ASCONF
* is in flight at any time.
*
* ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks
*
* In defining the ASCONF Chunk transfer procedures, it is
* essential that these transfers MUST NOT cause congestion
* within the network. To achieve this, we place these
* restrictions on the transfer of ASCONF Chunks:
*
* R1) One and only one ASCONF Chunk MAY be in transit and
* unacknowledged at any one time. If a sender, after sending
* an ASCONF chunk, decides it needs to transfer another
* ASCONF Chunk, it MUST wait until the ASCONF-ACK Chunk
* returns from the previous ASCONF Chunk before sending a
* subsequent ASCONF. Note this restriction binds each side,
* so at any time two ASCONF may be in-transit on any given
* association (one sent from each endpoint).
*
*
* [I really think this is EXACTLY the sort of intelligence
* which already resides in sctp_outq. Please move this
* queue and its supporting logic down there. --piggy]
*/
struct list_head addip_chunk_list;
/* ADDIP Section 4.1 ASCONF Chunk Procedures
*
* A2) A serial number should be assigned to the Chunk. The
* serial number SHOULD be a monotonically increasing
* number. The serial number SHOULD be initialized at
* the start of the association to the same value as the
* Initial TSN and every time a new ASCONF chunk is created
* it is incremented by one after assigning the serial number
* to the newly created chunk.
*
* ADDIP
* 3.1.1 Address/Stream Configuration Change Chunk (ASCONF)
*
* Serial Number : 32 bits (unsigned integer)
*
* This value represents a Serial Number for the ASCONF
* Chunk. The valid range of Serial Number is from 0 to
* 4294967295 (2^32 - 1). Serial Numbers wrap back to 0
* after reaching 4294967295.
*/
__u32 addip_serial;
int src_out_of_asoc_ok;
union sctp_addr *asconf_addr_del_pending;
struct sctp_transport *new_transport;
/* SCTP AUTH: list of the endpoint shared keys. These
* keys are provided out of band by the user application
* and can't change during the lifetime of the association
*/
struct list_head endpoint_shared_keys;
/* SCTP AUTH:
* The current generated association shared key (secret)
*/
struct sctp_auth_bytes *asoc_shared_key;
struct sctp_shared_key *shkey;
/* SCTP AUTH: hmac id of the first peer requested algorithm
* that we support.
*/
__u16 default_hmac_id;
__u16 active_key_id;
__u8 need_ecne:1, /* Need to send an ECNE Chunk? */
temp:1, /* Is it a temporary association? */
sctp: add pf_expose per netns and sock and asoc As said in rfc7829, section 3, point 12: The SCTP stack SHOULD expose the PF state of its destination addresses to the ULP as well as provide the means to notify the ULP of state transitions of its destination addresses from active to PF, and vice versa. However, it is recommended that an SCTP stack implementing SCTP-PF also allows for the ULP to be kept ignorant of the PF state of its destinations and the associated state transitions, thus allowing for retention of the simpler state transition model of [RFC4960] in the ULP. Not only does it allow to expose the PF state to ULP, but also allow to ignore sctp-pf to ULP. So this patch is to add pf_expose per netns, sock and asoc. And in sctp_assoc_control_transport(), ulp_notify will be set to false if asoc->expose is not 'enabled' in next patch. It also allows a user to change pf_expose per netns by sysctl, and pf_expose per sock and asoc will be initialized with it. Note that pf_expose also works for SCTP_GET_PEER_ADDR_INFO sockopt, to not allow a user to query the state of a sctp-pf peer address when pf_expose is 'disabled', as said in section 7.3. v1->v2: - Fix a build warning noticed by Nathan Chancellor. v2->v3: - set pf_expose to UNUSED by default to keep compatible with old applications. v3->v4: - add a new entry for pf_expose on ip-sysctl.txt, as Marcelo suggested. - change this patch to 1/5, and move sctp_assoc_control_transport change into 2/5, as Marcelo suggested. - use SCTP_PF_EXPOSE_UNSET instead of SCTP_PF_EXPOSE_UNUSED, and set SCTP_PF_EXPOSE_UNSET to 0 in enum, as Marcelo suggested. Signed-off-by: Xin Long <lucien.xin@gmail.com> Acked-by: Neil Horman <nhorman@tuxdriver.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-08 05:20:32 +00:00
pf_expose:2, /* Expose pf state? */
force_delay:1;
sctp: Add support to per-association statistics via a new SCTP_GET_ASSOC_STATS call The current SCTP stack is lacking a mechanism to have per association statistics. This is an implementation modeled after OpenSolaris' SCTP_GET_ASSOC_STATS. Userspace part will follow on lksctp if/when there is a general ACK on this. V4: - Move ipackets++ before q->immediate.func() for consistency reasons - Move sctp_max_rto() at the end of sctp_transport_update_rto() to avoid returning bogus RTO values - return asoc->rto_min when max_obs_rto value has not changed V3: - Increase ictrlchunks in sctp_assoc_bh_rcv() as well - Move ipackets++ to sctp_inq_push() - return 0 when no rto updates took place since the last call V2: - Implement partial retrieval of stat struct to cope for future expansion - Kill the rtxpackets counter as it cannot be precise anyway - Rename outseqtsns to outofseqtsns to make it clearer that these are out of sequence unexpected TSNs - Move asoc->ipackets++ under a lock to avoid potential miscounts - Fold asoc->opackets++ into the already existing asoc check - Kill unneeded (q->asoc) test when increasing rtxchunks - Do not count octrlchunks if sending failed (SCTP_XMIT_OK != 0) - Don't count SHUTDOWNs as SACKs - Move SCTP_GET_ASSOC_STATS to the private space API - Adjust the len check in sctp_getsockopt_assoc_stats() to allow for future struct growth - Move association statistics in their own struct - Update idupchunks when we send a SACK with dup TSNs - return min_rto in max_rto when RTO has not changed. Also return the transport when max_rto last changed. Signed-off: Michele Baldessari <michele@acksyn.org> Acked-by: Vlad Yasevich <vyasevich@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-12-01 04:49:42 +00:00
__u8 strreset_enable;
__u8 strreset_outstanding; /* request param count on the fly */
__u32 strreset_outseq; /* Update after receiving response */
__u32 strreset_inseq; /* Update after receiving request */
__u32 strreset_result[2]; /* save the results of last 2 responses */
struct sctp_chunk *strreset_chunk; /* save request chunk */
sctp: Add support to per-association statistics via a new SCTP_GET_ASSOC_STATS call The current SCTP stack is lacking a mechanism to have per association statistics. This is an implementation modeled after OpenSolaris' SCTP_GET_ASSOC_STATS. Userspace part will follow on lksctp if/when there is a general ACK on this. V4: - Move ipackets++ before q->immediate.func() for consistency reasons - Move sctp_max_rto() at the end of sctp_transport_update_rto() to avoid returning bogus RTO values - return asoc->rto_min when max_obs_rto value has not changed V3: - Increase ictrlchunks in sctp_assoc_bh_rcv() as well - Move ipackets++ to sctp_inq_push() - return 0 when no rto updates took place since the last call V2: - Implement partial retrieval of stat struct to cope for future expansion - Kill the rtxpackets counter as it cannot be precise anyway - Rename outseqtsns to outofseqtsns to make it clearer that these are out of sequence unexpected TSNs - Move asoc->ipackets++ under a lock to avoid potential miscounts - Fold asoc->opackets++ into the already existing asoc check - Kill unneeded (q->asoc) test when increasing rtxchunks - Do not count octrlchunks if sending failed (SCTP_XMIT_OK != 0) - Don't count SHUTDOWNs as SACKs - Move SCTP_GET_ASSOC_STATS to the private space API - Adjust the len check in sctp_getsockopt_assoc_stats() to allow for future struct growth - Move association statistics in their own struct - Update idupchunks when we send a SACK with dup TSNs - return min_rto in max_rto when RTO has not changed. Also return the transport when max_rto last changed. Signed-off: Michele Baldessari <michele@acksyn.org> Acked-by: Vlad Yasevich <vyasevich@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-12-01 04:49:42 +00:00
struct sctp_priv_assoc_stats stats;
int sent_cnt_removable;
__u16 subscribe;
__u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
__u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
/* Security identifiers from incoming (INIT). These are set by
* security_sctp_assoc_request(). These will only be used by
* SCTP TCP type sockets and peeled off connections as they
* cause a new socket to be generated. security_sctp_sk_clone()
* will then plug these into the new socket.
*/
u32 secid;
u32 peer_secid;
struct rcu_head rcu;
};
/* An eyecatcher for determining if we are really looking at an
* association data structure.
*/
enum {
SCTP_ASSOC_EYECATCHER = 0xa550c123,
};
/* Recover the outer association structure. */
static inline struct sctp_association *sctp_assoc(struct sctp_ep_common *base)
{
struct sctp_association *asoc;
asoc = container_of(base, struct sctp_association, base);
return asoc;
}
/* These are function signatures for manipulating associations. */
struct sctp_association *
sctp_association_new(const struct sctp_endpoint *ep, const struct sock *sk,
enum sctp_scope scope, gfp_t gfp);
void sctp_association_free(struct sctp_association *);
void sctp_association_put(struct sctp_association *);
void sctp_association_hold(struct sctp_association *);
struct sctp_transport *sctp_assoc_choose_alter_transport(
struct sctp_association *, struct sctp_transport *);
void sctp_assoc_update_retran_path(struct sctp_association *);
struct sctp_transport *sctp_assoc_lookup_paddr(const struct sctp_association *,
const union sctp_addr *);
int sctp_assoc_lookup_laddr(struct sctp_association *asoc,
const union sctp_addr *laddr);
struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *,
const union sctp_addr *address,
const gfp_t gfp,
const int peer_state);
void sctp_assoc_del_peer(struct sctp_association *asoc,
const union sctp_addr *addr);
void sctp_assoc_rm_peer(struct sctp_association *asoc,
struct sctp_transport *peer);
void sctp_assoc_control_transport(struct sctp_association *asoc,
struct sctp_transport *transport,
enum sctp_transport_cmd command,
sctp_sn_error_t error);
struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *, __u32);
void sctp_assoc_migrate(struct sctp_association *, struct sock *);
int sctp_assoc_update(struct sctp_association *old,
struct sctp_association *new);
__u32 sctp_association_get_next_tsn(struct sctp_association *);
void sctp_assoc_update_frag_point(struct sctp_association *asoc);
void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu);
void sctp_assoc_sync_pmtu(struct sctp_association *asoc);
Revert "net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer" This reverts commit ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") as it introduced a serious performance regression on SCTP over IPv4 and IPv6, though a not as dramatic on the latter. Measurements are on 10Gbit/s with ixgbe NICs. Current state: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.241.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 17:56:21 GMT Connecting to host 192.168.241.3, port 5201 Cookie: Lab200slot2.1397238981.812898.548918 [ 4] local 192.168.241.2 port 38616 connected to 192.168.241.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.09 sec 20.8 MBytes 161 Mbits/sec [ 4] 1.09-2.13 sec 10.8 MBytes 86.8 Mbits/sec [ 4] 2.13-3.15 sec 3.57 MBytes 29.5 Mbits/sec [ 4] 3.15-4.16 sec 4.33 MBytes 35.7 Mbits/sec [ 4] 4.16-6.21 sec 10.4 MBytes 42.7 Mbits/sec [ 4] 6.21-6.21 sec 0.00 Bytes 0.00 bits/sec [ 4] 6.21-7.35 sec 34.6 MBytes 253 Mbits/sec [ 4] 7.35-11.45 sec 22.0 MBytes 45.0 Mbits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-12.51 sec 16.0 MBytes 126 Mbits/sec [ 4] 12.51-13.59 sec 20.3 MBytes 158 Mbits/sec [ 4] 13.59-14.65 sec 13.4 MBytes 107 Mbits/sec [ 4] 14.65-16.79 sec 33.3 MBytes 130 Mbits/sec [ 4] 16.79-16.79 sec 0.00 Bytes 0.00 bits/sec [ 4] 16.79-17.82 sec 5.94 MBytes 48.7 Mbits/sec (etc) [root@Lab200slot2 ~]# iperf3 --sctp -6 -c 2001:db8:0:f101::1 -V -l 1400 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 19:08:41 GMT Connecting to host 2001:db8:0:f101::1, port 5201 Cookie: Lab200slot2.1397243321.714295.2b3f7c [ 4] local 2001:db8:0:f101::2 port 55804 connected to 2001:db8:0:f101::1 port 5201 Starting Test: protocol: SCTP, 1 streams, 1400 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 169 MBytes 1.42 Gbits/sec [ 4] 1.00-2.00 sec 201 MBytes 1.69 Gbits/sec [ 4] 2.00-3.00 sec 188 MBytes 1.58 Gbits/sec [ 4] 3.00-4.00 sec 174 MBytes 1.46 Gbits/sec [ 4] 4.00-5.00 sec 165 MBytes 1.39 Gbits/sec [ 4] 5.00-6.00 sec 199 MBytes 1.67 Gbits/sec [ 4] 6.00-7.00 sec 163 MBytes 1.36 Gbits/sec [ 4] 7.00-8.00 sec 174 MBytes 1.46 Gbits/sec [ 4] 8.00-9.00 sec 193 MBytes 1.62 Gbits/sec [ 4] 9.00-10.00 sec 196 MBytes 1.65 Gbits/sec [ 4] 10.00-11.00 sec 157 MBytes 1.31 Gbits/sec [ 4] 11.00-12.00 sec 175 MBytes 1.47 Gbits/sec [ 4] 12.00-13.00 sec 192 MBytes 1.61 Gbits/sec [ 4] 13.00-14.00 sec 199 MBytes 1.67 Gbits/sec (etc) After patch: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.240.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0+ #1 SMP Mon Apr 14 12:06:40 EDT 2014 x86_64 Time: Mon, 14 Apr 2014 16:40:48 GMT Connecting to host 192.168.240.3, port 5201 Cookie: Lab200slot2.1397493648.413274.65e131 [ 4] local 192.168.240.2 port 50548 connected to 192.168.240.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 1.00-2.00 sec 239 MBytes 2.01 Gbits/sec [ 4] 2.00-3.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 3.00-4.00 sec 239 MBytes 2.00 Gbits/sec [ 4] 4.00-5.00 sec 245 MBytes 2.05 Gbits/sec [ 4] 5.00-6.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 6.00-7.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 7.00-8.00 sec 239 MBytes 2.01 Gbits/sec With the reverted patch applied, the SCTP/IPv4 performance is back to normal on latest upstream for IPv4 and IPv6 and has same throughput as 3.4.2 test kernel, steady and interval reports are smooth again. Fixes: ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") Reported-by: Peter Butler <pbutler@sonusnet.com> Reported-by: Dongsheng Song <dongsheng.song@gmail.com> Reported-by: Fengguang Wu <fengguang.wu@intel.com> Tested-by: Peter Butler <pbutler@sonusnet.com> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Cc: Matija Glavinic Pecotic <matija.glavinic-pecotic.ext@nsn.com> Cc: Alexander Sverdlin <alexander.sverdlin@nsn.com> Cc: Vlad Yasevich <vyasevich@gmail.com> Acked-by: Vlad Yasevich <vyasevich@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-14 19:45:17 +00:00
void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int);
void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int);
void sctp_assoc_set_primary(struct sctp_association *,
struct sctp_transport *);
void sctp_assoc_del_nonprimary_peers(struct sctp_association *,
struct sctp_transport *);
int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
enum sctp_scope scope, gfp_t gfp);
int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *,
struct sctp_cookie*,
gfp_t gfp);
int sctp_assoc_set_id(struct sctp_association *, gfp_t);
void sctp_assoc_clean_asconf_ack_cache(const struct sctp_association *asoc);
struct sctp_chunk *sctp_assoc_lookup_asconf_ack(
const struct sctp_association *asoc,
__be32 serial);
void sctp_asconf_queue_teardown(struct sctp_association *asoc);
int sctp_cmp_addr_exact(const union sctp_addr *ss1,
const union sctp_addr *ss2);
struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc);
/* A convenience structure to parse out SCTP specific CMSGs. */
struct sctp_cmsgs {
struct sctp_initmsg *init;
struct sctp_sndrcvinfo *srinfo;
struct sctp_sndinfo *sinfo;
struct sctp_prinfo *prinfo;
struct sctp_authinfo *authinfo;
struct msghdr *addrs_msg;
};
/* Structure for tracking memory objects */
struct sctp_dbg_objcnt_entry {
char *label;
atomic_t *counter;
};
#endif /* __sctp_structs_h__ */