2019-05-27 08:55:01 +02:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
|
|
* operating system. INET is implemented using the BSD Socket
|
|
|
|
* interface as the means of communication with the user level.
|
|
|
|
*
|
|
|
|
* Definitions for the UDP module.
|
|
|
|
*
|
|
|
|
* Version: @(#)udp.h 1.0.2 05/07/93
|
|
|
|
*
|
2005-05-05 16:16:16 -07:00
|
|
|
* Authors: Ross Biro
|
2005-04-16 15:20:36 -07:00
|
|
|
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
|
|
|
|
*
|
|
|
|
* Fixes:
|
|
|
|
* Alan Cox : Turned on udp checksums. I don't want to
|
|
|
|
* chase 'memory corruption' bugs that aren't!
|
|
|
|
*/
|
|
|
|
#ifndef _UDP_H
|
|
|
|
#define _UDP_H
|
|
|
|
|
|
|
|
#include <linux/list.h>
|
2011-11-23 20:12:59 -05:00
|
|
|
#include <linux/bug.h>
|
2005-12-27 02:43:12 -02:00
|
|
|
#include <net/inet_sock.h>
|
2023-06-08 19:17:37 +00:00
|
|
|
#include <net/gso.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <net/sock.h>
|
|
|
|
#include <net/snmp.h>
|
2006-11-27 11:10:57 -08:00
|
|
|
#include <net/ip.h>
|
|
|
|
#include <linux/ipv6.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/seq_file.h>
|
2006-10-19 17:23:57 -04:00
|
|
|
#include <linux/poll.h>
|
2020-06-23 15:31:15 -07:00
|
|
|
#include <linux/indirect_call_wrapper.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2006-11-27 11:10:57 -08:00
|
|
|
/**
|
|
|
|
* struct udp_skb_cb - UDP(-Lite) private variables
|
|
|
|
*
|
|
|
|
* @header: private variables used by IPv4/IPv6
|
|
|
|
* @cscov: checksum coverage length (UDP-Lite only)
|
|
|
|
* @partial_cov: if set indicates partial csum coverage
|
|
|
|
*/
|
|
|
|
struct udp_skb_cb {
|
|
|
|
union {
|
|
|
|
struct inet_skb_parm h4;
|
2011-12-10 09:48:31 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2006-11-27 11:10:57 -08:00
|
|
|
struct inet6_skb_parm h6;
|
|
|
|
#endif
|
|
|
|
} header;
|
|
|
|
__u16 cscov;
|
|
|
|
__u8 partial_cov;
|
|
|
|
};
|
|
|
|
#define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb))
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-11-08 10:17:05 +00:00
|
|
|
/**
|
2024-11-14 18:52:05 +08:00
|
|
|
* struct udp_hslot - UDP hash slot used by udp_table.hash/hash4
|
2009-11-08 10:17:05 +00:00
|
|
|
*
|
|
|
|
* @head: head of list of sockets
|
2024-11-14 18:52:05 +08:00
|
|
|
* @nulls_head: head of list of sockets, only used by hash4
|
2009-11-08 10:17:05 +00:00
|
|
|
* @count: number of sockets in 'head' list
|
|
|
|
* @lock: spinlock protecting changes to head/count
|
|
|
|
*/
|
2008-10-29 01:41:45 -07:00
|
|
|
struct udp_hslot {
|
2024-11-14 18:52:05 +08:00
|
|
|
union {
|
|
|
|
struct hlist_head head;
|
|
|
|
/* hash4 uses hlist_nulls to avoid moving wrongly onto another
|
|
|
|
* hlist, because rehash() can happen with lookup().
|
|
|
|
*/
|
|
|
|
struct hlist_nulls_head nulls_head;
|
|
|
|
};
|
2009-11-08 10:17:05 +00:00
|
|
|
int count;
|
2008-10-29 01:41:45 -07:00
|
|
|
spinlock_t lock;
|
2024-11-14 18:52:04 +08:00
|
|
|
} __aligned(2 * sizeof(long));
|
|
|
|
|
|
|
|
/**
|
|
|
|
* struct udp_hslot_main - UDP hash slot used by udp_table.hash2
|
|
|
|
*
|
|
|
|
* @hslot: basic hash slot
|
|
|
|
* @hash4_cnt: number of sockets in hslot4 of the same
|
|
|
|
* (local port, local address)
|
|
|
|
*/
|
|
|
|
struct udp_hslot_main {
|
|
|
|
struct udp_hslot hslot; /* must be the first member */
|
|
|
|
#if !IS_ENABLED(CONFIG_BASE_SMALL)
|
|
|
|
u32 hash4_cnt;
|
|
|
|
#endif
|
|
|
|
} __aligned(2 * sizeof(long));
|
|
|
|
#define UDP_HSLOT_MAIN(__hslot) ((struct udp_hslot_main *)(__hslot))
|
2009-10-07 00:37:59 +00:00
|
|
|
|
2009-11-08 10:17:58 +00:00
|
|
|
/**
 * struct udp_table - the set of UDP socket hash tables
 *
 * @hash:  primary table, keyed on (local port)
 * @hash2: secondary table, keyed on (local port, local address)
 * @hash4: table for connected sockets, keyed on
 *         (local port, local address, remote port, remote address);
 *         not built with CONFIG_BASE_SMALL
 * @mask:  number of slots in the hash tables, minus 1
 * @log:   log2(number of slots in hash table)
 */
struct udp_table {
	struct udp_hslot	*hash;
	struct udp_hslot_main	*hash2;
#if !IS_ENABLED(CONFIG_BASE_SMALL)
	struct udp_hslot	*hash4;
#endif
	unsigned int		mask;
	unsigned int		log;
};
|
|
|
|
extern struct udp_table udp_table;
|
2013-09-23 11:33:36 -07:00
|
|
|
void udp_table_init(struct udp_table *, const char *);
|
2009-10-07 00:37:59 +00:00
|
|
|
static inline struct udp_hslot *udp_hashslot(struct udp_table *table,
|
2024-08-02 13:40:27 +00:00
|
|
|
const struct net *net,
|
|
|
|
unsigned int num)
|
2009-10-07 00:37:59 +00:00
|
|
|
{
|
|
|
|
return &table->hash[udp_hashfn(net, num, table->mask)];
|
|
|
|
}
|
2024-11-14 18:52:04 +08:00
|
|
|
|
2009-11-08 10:17:58 +00:00
|
|
|
/*
|
|
|
|
* For secondary hash, net_hash_mix() is performed before calling
|
|
|
|
* udp_hashslot2(), this explains difference with udp_hashslot()
|
|
|
|
*/
|
|
|
|
static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
|
|
|
|
unsigned int hash)
|
|
|
|
{
|
2024-11-14 18:52:04 +08:00
|
|
|
return &table->hash2[hash & table->mask].hslot;
|
|
|
|
}
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_BASE_SMALL)
|
|
|
|
/* CONFIG_BASE_SMALL: udp_table has no hash4 member, nothing to set up. */
static inline void udp_table_hash4_init(struct udp_table *table)
{
}
|
2024-11-14 18:52:05 +08:00
|
|
|
|
|
|
|
static inline struct udp_hslot *udp_hashslot4(struct udp_table *table,
|
|
|
|
unsigned int hash)
|
|
|
|
{
|
|
|
|
BUILD_BUG();
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* CONFIG_BASE_SMALL: no hash4 table, so no socket is ever hashed in it. */
static inline bool udp_hashed4(const struct sock *sk)
{
	return false;
}
|
|
|
|
|
|
|
|
/* CONFIG_BASE_SMALL: a hash4 slot takes no space because hash4 is absent. */
static inline unsigned int udp_hash4_slot_size(void)
{
	return 0;
}
|
|
|
|
|
|
|
|
/* CONFIG_BASE_SMALL: no hash4 counter exists, always report "none". */
static inline bool udp_has_hash4(const struct udp_hslot *hslot2)
{
	return false;
}
|
|
|
|
|
|
|
|
/* CONFIG_BASE_SMALL: udp_hslot_main carries no hash4_cnt to increment. */
static inline void udp_hash4_inc(struct udp_hslot *hslot2)
{
}
|
|
|
|
|
|
|
|
/* CONFIG_BASE_SMALL: udp_hslot_main carries no hash4_cnt to decrement. */
static inline void udp_hash4_dec(struct udp_hslot *hslot2)
{
}
|
2024-11-14 18:52:04 +08:00
|
|
|
#else /* !CONFIG_BASE_SMALL */
|
|
|
|
|
|
|
|
/* Must be called with table->hash2 initialized */
|
|
|
|
static inline void udp_table_hash4_init(struct udp_table *table)
|
|
|
|
{
|
2024-11-14 18:52:05 +08:00
|
|
|
table->hash4 = (void *)(table->hash2 + (table->mask + 1));
|
|
|
|
for (int i = 0; i <= table->mask; i++) {
|
2024-11-14 18:52:04 +08:00
|
|
|
table->hash2[i].hash4_cnt = 0;
|
2024-11-14 18:52:05 +08:00
|
|
|
|
|
|
|
INIT_HLIST_NULLS_HEAD(&table->hash4[i].nulls_head, i);
|
|
|
|
table->hash4[i].count = 0;
|
|
|
|
spin_lock_init(&table->hash4[i].lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct udp_hslot *udp_hashslot4(struct udp_table *table,
|
|
|
|
unsigned int hash)
|
|
|
|
{
|
|
|
|
return &table->hash4[hash & table->mask];
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool udp_hashed4(const struct sock *sk)
|
|
|
|
{
|
|
|
|
return !hlist_nulls_unhashed(&udp_sk(sk)->udp_lrpa_node);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned int udp_hash4_slot_size(void)
|
|
|
|
{
|
|
|
|
return sizeof(struct udp_hslot);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool udp_has_hash4(const struct udp_hslot *hslot2)
|
|
|
|
{
|
|
|
|
return UDP_HSLOT_MAIN(hslot2)->hash4_cnt;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void udp_hash4_inc(struct udp_hslot *hslot2)
|
|
|
|
{
|
|
|
|
UDP_HSLOT_MAIN(hslot2)->hash4_cnt++;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void udp_hash4_dec(struct udp_hslot *hslot2)
|
|
|
|
{
|
|
|
|
UDP_HSLOT_MAIN(hslot2)->hash4_cnt--;
|
2009-11-08 10:17:58 +00:00
|
|
|
}
|
2024-11-14 18:52:04 +08:00
|
|
|
#endif /* CONFIG_BASE_SMALL */
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
extern struct proto udp_prot;
|
|
|
|
|
2010-11-09 23:24:26 +00:00
|
|
|
extern atomic_long_t udp_memory_allocated;
|
2022-06-08 23:34:08 -07:00
|
|
|
DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
|
2007-12-31 00:29:24 -08:00
|
|
|
|
|
|
|
/* sysctl variables for udp */
|
2010-11-09 23:24:26 +00:00
|
|
|
extern long sysctl_udp_mem[3];
|
2007-12-31 00:29:24 -08:00
|
|
|
extern int sysctl_udp_rmem_min;
|
|
|
|
extern int sysctl_udp_wmem_min;
|
|
|
|
|
2005-12-27 02:43:12 -02:00
|
|
|
struct sk_buff;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2006-11-27 11:10:57 -08:00
|
|
|
/*
|
|
|
|
* Generic checksumming routines for UDP(-Lite) v4 and v6
|
|
|
|
*/
|
2006-11-14 21:40:42 -08:00
|
|
|
static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb)
|
2006-11-27 11:10:57 -08:00
|
|
|
{
|
2014-06-14 23:24:20 -07:00
|
|
|
return (UDP_SKB_CB(skb)->cscov == skb->len ?
|
|
|
|
__skb_checksum_complete(skb) :
|
|
|
|
__skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov));
|
2006-11-27 11:10:57 -08:00
|
|
|
}
|
|
|
|
|
2006-11-20 18:06:37 -08:00
|
|
|
/*
 * Returns 0 when skb_csum_unnecessary() says no check is needed or when
 * __udp_lib_checksum_complete() returns zero; returns 1 otherwise.
 */
static inline int udp_lib_checksum_complete(struct sk_buff *skb)
{
	if (skb_csum_unnecessary(skb))
		return 0;

	return __udp_lib_checksum_complete(skb) != 0;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* udp_csum_outgoing - compute UDPv4/v6 checksum over fragments
|
|
|
|
* @sk: socket we are writing to
|
|
|
|
* @skb: sk_buff containing the filled-in UDP header
|
|
|
|
* (checksum field must be zeroed out)
|
|
|
|
*/
|
2006-11-14 21:35:48 -08:00
|
|
|
static inline __wsum udp_csum_outgoing(struct sock *sk, struct sk_buff *skb)
|
2006-11-27 11:10:57 -08:00
|
|
|
{
|
2007-04-25 18:04:18 -07:00
|
|
|
__wsum csum = csum_partial(skb_transport_header(skb),
|
|
|
|
sizeof(struct udphdr), 0);
|
2006-11-27 11:10:57 -08:00
|
|
|
skb_queue_walk(&sk->sk_write_queue, skb) {
|
|
|
|
csum = csum_add(csum, skb->csum);
|
|
|
|
}
|
|
|
|
return csum;
|
|
|
|
}
|
|
|
|
|
2011-03-01 02:36:48 +00:00
|
|
|
static inline __wsum udp_csum(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
__wsum csum = csum_partial(skb_transport_header(skb),
|
|
|
|
sizeof(struct udphdr), skb->csum);
|
|
|
|
|
|
|
|
for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) {
|
|
|
|
csum = csum_add(csum, skb->csum);
|
|
|
|
}
|
|
|
|
return csum;
|
|
|
|
}
|
|
|
|
|
2014-06-04 17:19:48 -07:00
|
|
|
/*
 * Thin wrapper around csum_tcpudp_magic() with the protocol fixed to
 * IPPROTO_UDP; @len, @saddr, @daddr and @base are passed through.
 */
static inline __sum16 udp_v4_check(int len, __be32 saddr,
				   __be32 daddr, __wsum base)
{
	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base);
}
|
|
|
|
|
|
|
|
void udp_set_csum(bool nocheck, struct sk_buff *skb,
|
|
|
|
__be32 saddr, __be32 daddr, int len);
|
|
|
|
|
2016-04-05 12:41:15 -04:00
|
|
|
static inline void udp_csum_pull_header(struct sk_buff *skb)
|
|
|
|
{
|
2016-05-31 15:22:41 -07:00
|
|
|
if (!skb->csum_valid && skb->ip_summed == CHECKSUM_NONE)
|
|
|
|
skb->csum = csum_partial(skb->data, sizeof(struct udphdr),
|
2016-04-05 12:41:15 -04:00
|
|
|
skb->csum);
|
|
|
|
skb_pull_rcsum(skb, sizeof(struct udphdr));
|
|
|
|
UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr);
|
|
|
|
}
|
|
|
|
|
2020-11-09 15:13:49 -08:00
|
|
|
typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport,
|
2016-04-05 08:22:51 -07:00
|
|
|
__be16 dport);
|
|
|
|
|
tcp/udp: Make early_demux back namespacified.
Commit e21145a9871a ("ipv4: namespacify ip_early_demux sysctl knob") made
it possible to enable/disable early_demux on a per-netns basis. Then, we
introduced two knobs, tcp_early_demux and udp_early_demux, to switch it for
TCP/UDP in commit dddb64bcb346 ("net: Add sysctl to toggle early demux for
tcp and udp"). However, the .proc_handler() was wrong and actually
disabled us from changing the behaviour in each netns.
We can execute early_demux if net.ipv4.ip_early_demux is on and each proto
.early_demux() handler is not NULL. When we toggle (tcp|udp)_early_demux,
the change itself is saved in each netns variable, but the .early_demux()
handler is a global variable, so the handler is switched based on the
init_net's sysctl variable. Thus, netns (tcp|udp)_early_demux knobs have
nothing to do with the logic. Whether we CAN execute proto .early_demux()
is always decided by init_net's sysctl knob, and whether we DO it or not is
by each netns ip_early_demux knob.
This patch namespacifies (tcp|udp)_early_demux again. For now, the users
of the .early_demux() handler are TCP and UDP only, and they are called
directly to avoid retpoline. So, we can remove the .early_demux() handler
from inet6?_protos and need not dereference them in ip6?_rcv_finish_core().
If another proto needs .early_demux(), we can restore it at that time.
Fixes: dddb64bcb346 ("net: Add sysctl to toggle early demux for tcp and udp")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20220713175207.7727-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-07-13 10:52:07 -07:00
|
|
|
void udp_v6_early_demux(struct sk_buff *skb);
|
2021-02-03 15:51:10 +02:00
|
|
|
INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *));
|
|
|
|
|
2018-04-26 13:42:16 -04:00
|
|
|
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
|
2021-01-30 08:13:27 +09:00
|
|
|
netdev_features_t features, bool is_ipv6);
|
2018-04-26 13:42:16 -04:00
|
|
|
|
2022-10-20 19:48:52 +02:00
|
|
|
static inline void udp_lib_init_sock(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
|
|
|
|
skb_queue_head_init(&up->reader_queue);
|
|
|
|
up->forward_threshold = sk->sk_rcvbuf >> 2;
|
|
|
|
set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
|
|
|
|
}
|
|
|
|
|
2006-11-27 11:10:57 -08:00
|
|
|
/* hash routines shared between UDPv4/6 and UDP-Litev4/6 */
|
2016-02-10 11:50:35 -05:00
|
|
|
/*
 * Not meant to be reached: hits BUG() unconditionally. The trailing
 * return only satisfies the int return type.
 */
static inline int udp_lib_hash(struct sock *sk)
{
	BUG();

	return 0;
}
|
|
|
|
|
2013-09-23 11:33:36 -07:00
|
|
|
void udp_lib_unhash(struct sock *sk);
|
ipv4/udp: Add 4-tuple hash for connected socket
Currently, the udp_table has two hash tables, the port hash and portaddr
hash. Usually for UDP servers, all sockets have the same local port and
addr, so they are all on the same hash slot within a reuseport group.
In some applications, UDP servers use connect() to manage clients. In
particular, when firstly receiving from an unseen 4 tuple, a new socket
is created and connect()ed to the remote addr:port, and then the fd is
used exclusively by the client.
Once there are connected sks in a reuseport group, udp has to score all
sks in the same hash2 slot to find the best match. This could be
inefficient with a large number of connections, resulting in high
softirq overhead.
To solve the problem, this patch implements a 4-tuple hash for connected
udp sockets. During connect(), hash4 slot is updated, as well as a
corresponding counter, hash4_cnt, in hslot2. In __udp4_lib_lookup(),
hslot4 will be searched firstly if the counter is non-zero. Otherwise,
hslot2 is used like before. Note that only connected sockets enter this
hash4 path, while un-connected ones are not affected.
hlist_nulls is used for hash4, because we probably move to another hslot
wrongly when lookup with concurrent rehash. Then we check nulls at the
list end to see if we should restart lookup. Because udp does not use
SLAB_TYPESAFE_BY_RCU, we don't need to touch sk_refcnt when lookup.
Stress test results (with 1 cpu fully used) are shown below, in pps:
(1) _un-connected_ socket as server
[a] w/o hash4: 1,825176
[b] w/ hash4: 1,831750 (+0.36%)
(2) 500 _connected_ sockets as server
[c] w/o hash4: 290860 (only 16% of [a])
[d] w/ hash4: 1,889658 (+3.1% compared with [b])
With hash4, compute_score is skipped when lookup, so [d] is slightly
better than [b].
Co-developed-by: Cambda Zhu <cambda@linux.alibaba.com>
Signed-off-by: Cambda Zhu <cambda@linux.alibaba.com>
Co-developed-by: Fred Chen <fred.cc@alibaba-inc.com>
Signed-off-by: Fred Chen <fred.cc@alibaba-inc.com>
Co-developed-by: Yubing Qiu <yubing.qiuyubing@alibaba-inc.com>
Signed-off-by: Yubing Qiu <yubing.qiuyubing@alibaba-inc.com>
Signed-off-by: Philo Lu <lulie@linux.alibaba.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-11-14 18:52:06 +08:00
|
|
|
void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4);
|
2024-11-14 18:52:07 +08:00
|
|
|
u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
|
|
|
|
const __be32 faddr, const __be16 fport);
|
2006-11-27 11:10:57 -08:00
|
|
|
|
|
|
|
/* Close @sk by handing it to sk_common_release(); @timeout is unused. */
static inline void udp_lib_close(struct sock *sk, long timeout)
{
	sk_common_release(sk);
}
|
|
|
|
|
ipv4/udp: Add 4-tuple hash for connected socket
Currently, the udp_table has two hash tables, the port hash and portaddr
hash. Usually for UDP servers, all sockets have the same local port and
addr, so they are all on the same hash slot within a reuseport group.
In some applications, UDP servers use connect() to manage clients. In
particular, when firstly receiving from an unseen 4 tuple, a new socket
is created and connect()ed to the remote addr:port, and then the fd is
used exclusively by the client.
Once there are connected sks in a reuseport group, udp has to score all
sks in the same hash2 slot to find the best match. This could be
inefficient with a large number of connections, resulting in high
softirq overhead.
To solve the problem, this patch implements a 4-tuple hash for connected
udp sockets. During connect(), hash4 slot is updated, as well as a
corresponding counter, hash4_cnt, in hslot2. In __udp4_lib_lookup(),
hslot4 will be searched firstly if the counter is non-zero. Otherwise,
hslot2 is used like before. Note that only connected sockets enter this
hash4 path, while un-connected ones are not affected.
hlist_nulls is used for hash4, because we probably move to another hslot
wrongly when lookup with concurrent rehash. Then we check nulls at the
list end to see if we should restart lookup. Because udp does not use
SLAB_TYPESAFE_BY_RCU, we don't need to touch sk_refcnt when lookup.
Stress test results (with 1 cpu fully used) are shown below, in pps:
(1) _un-connected_ socket as server
[a] w/o hash4: 1,825176
[b] w/ hash4: 1,831750 (+0.36%)
(2) 500 _connected_ sockets as server
[c] w/o hash4: 290860 (only 16% of [a])
[d] w/ hash4: 1,889658 (+3.1% compared with [b])
With hash4, compute_score is skipped when lookup, so [d] is slightly
better than [b].
Co-developed-by: Cambda Zhu <cambda@linux.alibaba.com>
Signed-off-by: Cambda Zhu <cambda@linux.alibaba.com>
Co-developed-by: Fred Chen <fred.cc@alibaba-inc.com>
Signed-off-by: Fred Chen <fred.cc@alibaba-inc.com>
Co-developed-by: Yubing Qiu <yubing.qiuyubing@alibaba-inc.com>
Signed-off-by: Yubing Qiu <yubing.qiuyubing@alibaba-inc.com>
Signed-off-by: Philo Lu <lulie@linux.alibaba.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-11-14 18:52:06 +08:00
|
|
|
/* hash4 routines shared between UDPv4/6 */
|
|
|
|
#if IS_ENABLED(CONFIG_BASE_SMALL)
|
|
|
|
/* CONFIG_BASE_SMALL: hash4 is not built, so hashing into it is a no-op. */
static inline void udp_lib_hash4(struct sock *sk, u16 hash)
{
}
|
|
|
|
|
|
|
|
/* CONFIG_BASE_SMALL: hash4 is not built, so this is a no-op. */
static inline void udp4_hash4(struct sock *sk)
{
}
|
|
|
|
#else /* !CONFIG_BASE_SMALL */
|
|
|
|
void udp_lib_hash4(struct sock *sk, u16 hash);
|
|
|
|
void udp4_hash4(struct sock *sk);
|
|
|
|
#endif /* CONFIG_BASE_SMALL */
|
|
|
|
|
2013-09-23 11:33:36 -07:00
|
|
|
int udp_lib_get_port(struct sock *sk, unsigned short snum,
|
|
|
|
unsigned int hash2_nulladdr);
|
2006-11-27 11:10:57 -08:00
|
|
|
|
2015-02-24 09:17:31 -08:00
|
|
|
u32 udp_flow_hashrnd(void);
|
|
|
|
|
2014-07-01 21:32:39 -07:00
|
|
|
/*
 * Derive a source port in network byte order from the flow hash of @skb,
 * scaled into the range starting at @min with width (@max - @min).
 * When @min >= @max the default local port range of @net is used.
 * @use_eth selects the fallback used when the skb yields no hash.
 */
static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
				       int min, int max, bool use_eth)
{
	u32 hash;
	u64 scaled;

	/* Use default range */
	if (min >= max)
		inet_get_local_port_range(net, &min, &max);

	hash = skb_get_hash(skb);
	if (unlikely(!hash)) {
		/* Can't find a normal hash. If the caller has indicated an
		 * Ethernet packet, hash the first 2 * ETH_ALEN bytes at
		 * skb->data seeded with skb->protocol; otherwise set the
		 * hash to some consistent random value.
		 */
		if (!use_eth)
			hash = udp_flow_hashrnd();
		else
			hash = jhash(skb->data, 2 * ETH_ALEN,
				     (__force u32) skb->protocol);
	}

	/* Since this is being sent on the wire obfuscate hash a bit
	 * to minimize possibility that any useful information to an
	 * attacker is leaked. Only upper 16 bits are relevant in the
	 * computation for 16 bit port value.
	 */
	hash ^= hash << 16;

	/* scale the 32-bit hash onto the width of the port range */
	scaled = (u64) hash * (max - min);

	return htons((scaled >> 32) + min);
}
|
|
|
|
|
2018-06-08 11:35:40 +02:00
|
|
|
static inline int udp_rqueue_get(struct sock *sk)
|
|
|
|
{
|
|
|
|
return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit);
|
|
|
|
}
|
|
|
|
|
2024-08-02 13:40:27 +00:00
|
|
|
static inline bool udp_sk_bound_dev_eq(const struct net *net, int bound_dev_if,
|
net: ensure unbound datagram socket to be chosen when not in a VRF
Ensure an unbound datagram skt is chosen when not in a VRF. The check
for a device match in compute_score() for UDP must be performed when
there is no device match. For this, a failure is returned when there is
no device match. This ensures that bound sockets are never selected,
even if there is no unbound socket.
Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These
packets are currently blocked, as flowi6_oif was set to that of the
master vrf device, and the ipi6_ifindex is that of the slave device.
Allow these packets to be sent by checking the device with ipi6_ifindex
has the same L3 scope as that of the bound device of the skt, which is
the master vrf device. Note that this check always succeeds if the skt
is unbound.
Even though the right datagram skt is now selected by compute_score(),
a different skt is being returned that is bound to the wrong vrf. The
difference between these and stream sockets is the handling of the skt
option for SO_REUSEPORT. While the handling when adding a skt for reuse
correctly checks that the bound device of the skt is a match, the skts
in the hashslot are already incorrect. So for the same hash, a skt for
the wrong vrf may be selected for the required port. The root cause is
that the skt is immediately placed into a slot when it is created,
but when the skt is then bound using SO_BINDTODEVICE, it remains in the
same slot. The solution is to move the skt to the correct slot by
forcing a rehash.
Signed-off-by: Mike Manning <mmanning@vyatta.att-mail.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Tested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-07 15:36:04 +00:00
|
|
|
int dif, int sdif)
|
|
|
|
{
|
|
|
|
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
|
2022-07-18 10:26:43 -07:00
|
|
|
return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_udp_l3mdev_accept),
|
net: ensure unbound datagram socket to be chosen when not in a VRF
Ensure an unbound datagram skt is chosen when not in a VRF. The check
for a device match in compute_score() for UDP must be performed when
there is no device match. For this, a failure is returned when there is
no device match. This ensures that bound sockets are never selected,
even if there is no unbound socket.
Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These
packets are currently blocked, as flowi6_oif was set to that of the
master vrf device, and the ipi6_ifindex is that of the slave device.
Allow these packets to be sent by checking the device with ipi6_ifindex
has the same L3 scope as that of the bound device of the skt, which is
the master vrf device. Note that this check always succeeds if the skt
is unbound.
Even though the right datagram skt is now selected by compute_score(),
a different skt is being returned that is bound to the wrong vrf. The
difference between these and stream sockets is the handling of the skt
option for SO_REUSEPORT. While the handling when adding a skt for reuse
correctly checks that the bound device of the skt is a match, the skts
in the hashslot are already incorrect. So for the same hash, a skt for
the wrong vrf may be selected for the required port. The root cause is
that the skt is immediately placed into a slot when it is created,
but when the skt is then bound using SO_BINDTODEVICE, it remains in the
same slot. The solution is to move the skt to the correct slot by
forcing a rehash.
Signed-off-by: Mike Manning <mmanning@vyatta.att-mail.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Tested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-07 15:36:04 +00:00
|
|
|
bound_dev_if, dif, sdif);
|
|
|
|
#else
|
|
|
|
return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2006-11-27 11:10:57 -08:00
|
|
|
/* net/ipv4/udp.c */
|
tcp/udp: Call inet6_destroy_sock() in IPv6 sk->sk_destruct().
Originally, inet6_sk(sk)->XXX were changed under lock_sock(), so we were
able to clean them up by calling inet6_destroy_sock() during the IPv6 ->
IPv4 conversion by IPV6_ADDRFORM. However, commit 03485f2adcde ("udpv6:
Add lockless sendmsg() support") added a lockless memory allocation path,
which could cause a memory leak:
setsockopt(IPV6_ADDRFORM) sendmsg()
+-----------------------+ +-------+
- do_ipv6_setsockopt(sk, ...) - udpv6_sendmsg(sk, ...)
- sockopt_lock_sock(sk) ^._ called via udpv6_prot
- lock_sock(sk) before WRITE_ONCE()
- WRITE_ONCE(sk->sk_prot, &tcp_prot)
- inet6_destroy_sock() - if (!corkreq)
- sockopt_release_sock(sk) - ip6_make_skb(sk, ...)
- release_sock(sk) ^._ lockless fast path for
the non-corking case
- __ip6_append_data(sk, ...)
- ipv6_local_rxpmtu(sk, ...)
- xchg(&np->rxpmtu, skb)
^._ rxpmtu is never freed.
- goto out_no_dst;
- lock_sock(sk)
For now, rxpmtu is only the case, but not to miss the future change
and a similar bug fixed in commit e27326009a3d ("net: ping6: Fix
memleak in ipv6_renew_options()."), let's set a new function to IPv6
sk->sk_destruct() and call inet6_cleanup_sock() there. Since the
conversion does not change sk->sk_destruct(), we can guarantee that
we can clean up IPv6 resources finally.
We can now remove all inet6_destroy_sock() calls from IPv6 protocol
specific ->destroy() functions, but such changes are invasive to
backport. So they can be posted as a follow-up later for net-next.
Fixes: 03485f2adcde ("udpv6: Add lockless sendmsg() support")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-10-06 11:53:47 -07:00
|
|
|
void udp_destruct_common(struct sock *sk);
|
udp: implement memory accounting helpers
Avoid using the generic helpers.
Use the receive queue spin lock to protect the memory
accounting operation, both on enqueue and on dequeue.
On dequeue perform partial memory reclaiming, trying to
leave a quantum of forward allocated memory.
On enqueue use a custom helper, to allow some optimizations:
- use a plain spin_lock() variant instead of the slightly
costly spin_lock_irqsave(),
- avoid dst_force check, since the calling code has already
dropped the skb dst
- avoid orphaning the skb, since skb_steal_sock() already did
the work for us
The above needs custom memory reclaiming on shutdown, provided
by the udp_destruct_sock().
v5 -> v6:
- don't orphan the skb on enqueue
v4 -> v5:
- replace the mem_lock with the receive queue spin lock
- ensure that the bh is always allowed to enqueue at least
a skb, even if sk_rcvbuf is exceeded
v3 -> v4:
- reworked memory accounting, simplifying the schema
- provide a helper for both memory scheduling and enqueuing
v1 -> v2:
- use a udp specific destructor to perform memory reclaiming
- remove a couple of helpers, unneeded after the above cleanup
- do not reclaim memory on dequeue if not under memory
pressure
- reworked the fwd accounting schema to avoid potential
integer overflow
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-21 13:55:46 +02:00
|
|
|
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
|
|
|
|
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
|
2016-11-04 11:28:59 +01:00
|
|
|
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
|
net: remove noblock parameter from recvmsg() entities
The internal recvmsg() functions have two parameters 'flags' and 'noblock'
that were merged inside skb_recv_datagram(). As a follow up patch to commit
f4b41f062c42 ("net: remove noblock parameter from skb_recv_datagram()")
this patch removes the separate 'noblock' parameter for recvmsg().
Analogue to the referenced patch for skb_recv_datagram() the 'flags' and
'noblock' parameters are unnecessarily split up with e.g.
err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len);
or in
err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len);
instead of simply using only flags all the time and check for MSG_DONTWAIT
where needed (to preserve for the formerly separated no(n)block condition).
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://lore.kernel.org/r/20220411124955.154876-1-socketcan@hartkopp.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-04-11 14:49:55 +02:00
|
|
|
struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off,
|
|
|
|
int *err);
|
2016-11-04 11:28:59 +01:00
|
|
|
/*
 * Convenience wrapper around __skb_recv_udp() for callers that do not
 * supply an offset: a local, zero-initialised one is passed instead.
 * @flags and @err are handed through unchanged.
 */
static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
					   int *err)
{
	int offset = 0;

	return __skb_recv_udp(sk, flags, &offset, err);
}
|
udp: implement memory accounting helpers
Avoid using the generic helpers.
Use the receive queue spin lock to protect the memory
accounting operation, both on enqueue and on dequeue.
On dequeue perform partial memory reclaiming, trying to
leave a quantum of forward allocated memory.
On enqueue use a custom helper, to allow some optimizations:
- use a plain spin_lock() variant instead of the slightly
costly spin_lock_irqsave(),
- avoid dst_force check, since the calling code has already
dropped the skb dst
- avoid orphaning the skb, since skb_steal_sock() already did
the work for us
The above needs custom memory reclaiming on shutdown, provided
by the udp_destruct_sock().
v5 -> v6:
- don't orphan the skb on enqueue
v4 -> v5:
- replace the mem_lock with the receive queue spin lock
- ensure that the bh is always allowed to enqueue at least
a skb, even if sk_rcvbuf is exceeded
v3 -> v4:
- reworked memory accounting, simplifying the schema
- provide a helper for both memory scheduling and enqueuing
v1 -> v2:
- use a udp specific destructor to perform memory reclaiming
- remove a couple of helpers, unneeded after the above cleanup
- do not reclaim memory on dequeue if not under memory
pressure
- reworked the fwd accounting schema to avoid potential
integer overflow
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-21 13:55:46 +02:00
|
|
|
|
2017-09-28 15:51:36 +02:00
|
|
|
int udp_v4_early_demux(struct sk_buff *skb);
|
2017-08-25 14:31:01 +02:00
|
|
|
bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
|
2018-11-08 12:19:21 +01:00
|
|
|
int udp_err(struct sk_buff *, u32);
|
2016-08-23 21:06:33 -07:00
|
|
|
int udp_abort(struct sock *sk, int err);
|
2015-03-02 15:37:48 +08:00
|
|
|
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
|
2023-06-07 19:19:13 +01:00
|
|
|
void udp_splice_eof(struct socket *sock);
|
2013-09-23 11:33:36 -07:00
|
|
|
int udp_push_pending_frames(struct sock *sk);
|
|
|
|
void udp_flush_pending_frames(struct sock *sk);
|
2018-04-26 13:42:20 -04:00
|
|
|
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
|
2013-09-23 11:33:36 -07:00
|
|
|
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
|
|
|
|
int udp_rcv(struct sk_buff *skb);
|
net: ioctl: Use kernel memory on protocol ioctl callbacks
Most of the ioctls to net protocols operates directly on userspace
argument (arg). Usually doing get_user()/put_user() directly in the
ioctl callback. This is not flexible, because it is hard to reuse these
functions without passing userspace buffers.
Change the "struct proto" ioctls to avoid touching userspace memory and
operate on kernel buffers, i.e., all protocol's ioctl callbacks is
adapted to operate on a kernel memory other than on userspace (so, no
more {put,get}_user() and friends being called in the ioctl callback).
This changes the "struct proto" ioctl format in the following way:
int (*ioctl)(struct sock *sk, int cmd,
- unsigned long arg);
+ int *karg);
(Important to say that this patch does not touch the "struct proto_ops"
protocols)
So, the "karg" argument, which is passed to the ioctl callback, is a
pointer allocated to kernel space memory (inside a function wrapper).
This buffer (karg) may contain input argument (copied from userspace in
a prep function) and it might return a value/buffer, which is copied
back to userspace if necessary. There is not one-size-fits-all format
(that is I am using 'may' above), but basically, there are three type of
ioctls:
1) Do not read from userspace, returns a result to userspace
2) Read an input parameter from userspace, and does not return anything
to userspace
3) Read an input from userspace, and return a buffer to userspace.
The default case (1) (where no input parameter is given, and an "int" is
returned to userspace) encompasses more than 90% of the cases, but there
are two other exceptions. Here is a list of exceptions:
* Protocol RAW:
* cmd = SIOCGETVIFCNT:
* input and output = struct sioc_vif_req
* cmd = SIOCGETSGCNT
* input and output = struct sioc_sg_req
* Explanation: for the SIOCGETVIFCNT case, userspace passes the input
argument, which is struct sioc_vif_req. Then the callback populates
the struct, which is copied back to userspace.
* Protocol RAW6:
* cmd = SIOCGETMIFCNT_IN6
* input and output = struct sioc_mif_req6
* cmd = SIOCGETSGCNT_IN6
* input and output = struct sioc_sg_req6
* Protocol PHONET:
* cmd == SIOCPNADDRESOURCE | SIOCPNDELRESOURCE
* input int (4 bytes)
* Nothing is copied back to userspace.
For the exception cases, functions sock_sk_ioctl_inout() will
copy the userspace input, and copy it back to kernel space.
The wrapper that prepare the buffer and put the buffer back to user is
sk_ioctl(), so, instead of calling sk->sk_prot->ioctl(), the callee now
calls sk_ioctl(), which will handle all cases.
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20230609152800.830401-1-leitao@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-06-09 08:27:42 -07:00
|
|
|
int udp_ioctl(struct sock *sk, int cmd, int *karg);
|
udp: implement memory accounting helpers
Avoid using the generic helpers.
Use the receive queue spin lock to protect the memory
accounting operation, both on enqueue and on dequeue.
On dequeue perform partial memory reclaiming, trying to
leave a quantum of forward allocated memory.
On enqueue use a custom helper, to allow some optimizations:
- use a plain spin_lock() variant instead of the slightly
costly spin_lock_irqsave(),
- avoid dst_force check, since the calling code has already
dropped the skb dst
- avoid orphaning the skb, since skb_steal_sock() already did
the work for us
The above needs custom memory reclaiming on shutdown, provided
by the udp_destruct_sock().
v5 -> v6:
- don't orphan the skb on enqueue
v4 -> v5:
- replace the mem_lock with the receive queue spin lock
- ensure that the bh is always allowed to enqueue at least
a skb, even if sk_rcvbuf is exceeded
v3 -> v4:
- reworked memory accounting, simplifying the schema
- provide a helper for both memory scheduling and enqueuing
v1 -> v2:
- use a udp specific destructor to perform memory reclaiming
- remove a couple of helpers, unneeded after the above cleanup
- do not reclaim memory on dequeue if not under memory
pressure
- reworked the fwd accounting schema to avoid potential
integer overflow
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-21 13:55:46 +02:00
|
|
|
int udp_init_sock(struct sock *sk);
|
2018-03-30 15:08:05 -07:00
|
|
|
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
|
2016-10-20 09:39:40 -07:00
|
|
|
int __udp_disconnect(struct sock *sk, int flags);
|
2013-09-23 11:33:36 -07:00
|
|
|
int udp_disconnect(struct sock *sk, int flags);
|
2018-06-28 09:43:44 -07:00
|
|
|
__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
|
2013-09-23 11:33:36 -07:00
|
|
|
struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
|
2014-09-29 20:22:29 -07:00
|
|
|
netdev_features_t features,
|
|
|
|
bool is_ipv6);
|
2013-09-23 11:33:36 -07:00
|
|
|
int udp_lib_getsockopt(struct sock *sk, int level, int optname,
|
|
|
|
char __user *optval, int __user *optlen);
|
|
|
|
int udp_lib_setsockopt(struct sock *sk, int level, int optname,
|
2020-07-23 08:09:04 +02:00
|
|
|
sockptr_t optval, unsigned int optlen,
|
2013-09-23 11:33:36 -07:00
|
|
|
int (*push_pending_frames)(struct sock *));
|
2024-08-02 13:40:27 +00:00
|
|
|
struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport,
|
2013-09-23 11:33:36 -07:00
|
|
|
__be32 daddr, __be16 dport, int dif);
|
2024-08-02 13:40:27 +00:00
|
|
|
struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
|
|
|
|
__be16 sport,
|
2017-08-07 08:44:16 -07:00
|
|
|
__be32 daddr, __be16 dport, int dif, int sdif,
|
2016-01-04 17:41:47 -05:00
|
|
|
struct udp_table *tbl, struct sk_buff *skb);
|
2020-11-09 15:13:49 -08:00
|
|
|
struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
|
2016-04-05 08:22:50 -07:00
|
|
|
__be16 sport, __be16 dport);
|
2024-08-02 13:40:29 +00:00
|
|
|
struct sock *udp6_lib_lookup(const struct net *net,
|
2013-09-23 11:33:36 -07:00
|
|
|
const struct in6_addr *saddr, __be16 sport,
|
|
|
|
const struct in6_addr *daddr, __be16 dport,
|
|
|
|
int dif);
|
2024-08-02 13:40:29 +00:00
|
|
|
struct sock *__udp6_lib_lookup(const struct net *net,
|
2013-09-23 11:33:36 -07:00
|
|
|
const struct in6_addr *saddr, __be16 sport,
|
|
|
|
const struct in6_addr *daddr, __be16 dport,
|
2017-08-07 08:44:20 -07:00
|
|
|
int dif, int sdif, struct udp_table *tbl,
|
2016-01-04 17:41:47 -05:00
|
|
|
struct sk_buff *skb);
|
2020-11-09 15:13:49 -08:00
|
|
|
struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
|
2016-04-05 08:22:50 -07:00
|
|
|
__be16 sport, __be16 dport);
|
2022-06-15 09:20:12 -07:00
|
|
|
int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
|
2008-10-01 07:48:10 -07:00
|
|
|
|
2017-06-26 19:01:50 +02:00
|
|
|
/* UDP uses skb->dev_scratch to cache as much information as possible and avoid
|
|
|
|
* possibly multiple cache miss on dequeue()
|
|
|
|
*/
|
|
|
|
struct udp_dev_scratch {
|
2017-07-25 17:57:47 +02:00
|
|
|
/* skb->truesize and the stateless bit are embedded in a single field;
|
|
|
|
* do not use a bitfield since the compiler emits better/smaller code
|
|
|
|
* this way
|
|
|
|
*/
|
|
|
|
u32 _tsize_state;
|
|
|
|
|
|
|
|
#if BITS_PER_LONG == 64
|
|
|
|
/* len and the bit needed to compute skb_csum_unnecessary
|
|
|
|
* will be on cold cache lines at recvmsg time.
|
|
|
|
* skb->len can be stored on 16 bits since the udp header has been
|
|
|
|
* already validated and pulled.
|
|
|
|
*/
|
2017-06-26 19:01:50 +02:00
|
|
|
u16 len;
|
|
|
|
bool is_linear;
|
|
|
|
bool csum_unnecessary;
|
2017-07-25 17:57:47 +02:00
|
|
|
#endif
|
2017-06-26 19:01:50 +02:00
|
|
|
};
|
|
|
|
|
2017-07-25 17:57:47 +02:00
|
|
|
static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return (struct udp_dev_scratch *)&skb->dev_scratch;
|
|
|
|
}
|
|
|
|
|
|
|
|
#if BITS_PER_LONG == 64
|
2017-06-26 19:01:50 +02:00
|
|
|
static inline unsigned int udp_skb_len(struct sk_buff *skb)
|
|
|
|
{
|
2017-07-25 17:57:47 +02:00
|
|
|
return udp_skb_scratch(skb)->len;
|
2017-06-26 19:01:50 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb)
|
|
|
|
{
|
2017-07-25 17:57:47 +02:00
|
|
|
return udp_skb_scratch(skb)->csum_unnecessary;
|
2017-06-26 19:01:50 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool udp_skb_is_linear(struct sk_buff *skb)
|
|
|
|
{
|
2017-07-25 17:57:47 +02:00
|
|
|
return udp_skb_scratch(skb)->is_linear;
|
2017-06-26 19:01:50 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
static inline unsigned int udp_skb_len(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return skb->len;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Variant built when BITS_PER_LONG != 64: defer to the generic
 * skb_csum_unnecessary() helper instead of a cached flag.
 */
static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb)
{
	bool unnecessary = skb_csum_unnecessary(skb);

	return unnecessary;
}
|
|
|
|
|
|
|
|
/* Variant built when BITS_PER_LONG != 64: the skb is linear exactly when
 * skb_is_nonlinear() says it is not non-linear.
 */
static inline bool udp_skb_is_linear(struct sk_buff *skb)
{
	if (skb_is_nonlinear(skb))
		return false;

	return true;
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static inline int copy_linear_skb(struct sk_buff *skb, int len, int off,
|
|
|
|
struct iov_iter *to)
|
|
|
|
{
|
2024-04-07 02:42:36 -04:00
|
|
|
return copy_to_iter_full(skb->data + off, len, to) ? 0 : -EFAULT;
|
2017-06-26 19:01:50 +02:00
|
|
|
}
|
|
|
|
|
2006-11-27 11:10:57 -08:00
|
|
|
/*
|
|
|
|
* SNMP statistics for UDP and UDP-Lite
|
|
|
|
*/
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-27 16:44:27 -07:00
|
|
|
/* Increment @field through SNMP_INC_STATS(): in @net's udplite_statistics
 * when @is_udplite is true, in @net's udp_statistics otherwise.
 */
#define UDP_INC_STATS(net, field, is_udplite)				\
do {									\
	if (is_udplite) {						\
		SNMP_INC_STATS((net)->mib.udplite_statistics, field);	\
	} else {							\
		SNMP_INC_STATS((net)->mib.udp_statistics, field);	\
	}								\
} while (0)
|
2016-04-27 16:44:30 -07:00
|
|
|
/* Same selection as UDP_INC_STATS() but the increment goes through
 * __SNMP_INC_STATS(): @net's udplite_statistics when @is_udplite is true,
 * @net's udp_statistics otherwise.
 */
#define __UDP_INC_STATS(net, field, is_udplite)				\
do {									\
	if (is_udplite) {						\
		__SNMP_INC_STATS((net)->mib.udplite_statistics, field);	\
	} else {							\
		__SNMP_INC_STATS((net)->mib.udp_statistics, field);	\
	}								\
} while (0)
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2016-04-27 16:44:30 -07:00
|
|
|
/* IPv6 counterpart of __UDP_INC_STATS(): increment @field through
 * __SNMP_INC_STATS() in @net's udplite_stats_in6 when @is_udplite is true,
 * in @net's udp_stats_in6 otherwise.
 */
#define __UDP6_INC_STATS(net, field, is_udplite)			\
do {									\
	if (is_udplite) {						\
		__SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);	\
	} else {							\
		__SNMP_INC_STATS((net)->mib.udp_stats_in6, field);	\
	}								\
} while (0)
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-27 16:44:27 -07:00
|
|
|
/* IPv6 counterpart of UDP_INC_STATS(): increment @field through
 * SNMP_INC_STATS() in @net's udplite_stats_in6 when @is_udplite is true,
 * in @net's udp_stats_in6 otherwise.
 */
#define UDP6_INC_STATS(net, field, is_udplite)				\
do {									\
	if (is_udplite) {						\
		SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);	\
	} else {							\
		SNMP_INC_STATS((net)->mib.udp_stats_in6, field);	\
	}								\
} while (0)
|
2007-12-11 11:30:32 -08:00
|
|
|
|
2011-12-10 09:48:31 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2018-11-07 12:38:33 +01:00
|
|
|
/* Select the MIB that matches @sk, for builds with IPv6 support.
 *
 * @sk:   socket; IS_UDPLITE(sk) chooses between the UDP-Lite and the plain
 *        UDP tables, sock_net(sk) supplies the owning struct net.
 * @ipv4: true selects the IPv4 tables (udp_statistics/udplite_statistics),
 *        false the IPv6 ones (udp_stats_in6/udplite_stats_in6).
 *
 * @ipv4 is wrapped in parentheses so that any expression - including a
 * conditional such as "c ? a : b" - is evaluated as a whole before it is
 * used as the selector. Note that @sk is expanded more than once.
 */
#define __UDPX_MIB(sk, ipv4)						\
({									\
	(ipv4) ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \
				   sock_net(sk)->mib.udp_statistics) :	\
		 (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 : \
				   sock_net(sk)->mib.udp_stats_in6);	\
})
|
2007-12-11 11:30:32 -08:00
|
|
|
#else
|
2018-11-07 12:38:33 +01:00
|
|
|
/* Select the MIB that matches @sk, for builds without IPv6 support: only
 * the IPv4 tables exist, so @ipv4 is accepted for signature parity and
 * never expanded. IS_UDPLITE(sk) picks udplite_statistics over
 * udp_statistics; @sk is expanded more than once.
 */
#define __UDPX_MIB(sk, ipv4)						\
({									\
	IS_UDPLITE(sk)							\
		? sock_net(sk)->mib.udplite_statistics			\
		: sock_net(sk)->mib.udp_statistics;			\
})
|
2007-12-11 11:30:32 -08:00
|
|
|
#endif
|
|
|
|
|
2018-11-07 12:38:33 +01:00
|
|
|
/* Increment @field, through __SNMP_INC_STATS(), in the MIB that
 * __UDPX_MIB() selects for @sk; the IPv4 tables are requested when
 * sk->sk_family equals AF_INET. @sk is expanded more than once.
 */
#define __UDPX_INC_STATS(sk, field)					\
	__SNMP_INC_STATS(__UDPX_MIB(sk, (sk)->sk_family == AF_INET),	\
			 field)
|
|
|
|
|
2018-04-10 21:31:50 +02:00
|
|
|
#ifdef CONFIG_PROC_FS
|
2005-04-16 15:20:36 -07:00
|
|
|
struct udp_seq_afinfo {
|
2011-10-30 06:46:30 +00:00
|
|
|
sa_family_t family;
|
|
|
|
struct udp_table *udp_table;
|
2005-04-16 15:20:36 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
struct udp_iter_state {
|
2008-03-28 18:23:33 -07:00
|
|
|
struct seq_net_private p;
|
2005-04-16 15:20:36 -07:00
|
|
|
int bucket;
|
|
|
|
};
|
|
|
|
|
2018-04-10 21:31:50 +02:00
|
|
|
void *udp_seq_start(struct seq_file *seq, loff_t *pos);
|
|
|
|
void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
|
|
|
|
void udp_seq_stop(struct seq_file *seq, void *v);
|
|
|
|
|
2018-04-10 19:42:55 +02:00
|
|
|
extern const struct seq_operations udp_seq_ops;
|
|
|
|
extern const struct seq_operations udp6_seq_ops;
|
2005-08-16 02:18:02 -03:00
|
|
|
|
2013-09-23 11:33:36 -07:00
|
|
|
int udp4_proc_init(void);
|
|
|
|
void udp4_proc_exit(void);
|
2018-04-10 21:31:50 +02:00
|
|
|
#endif /* CONFIG_PROC_FS */
|
2007-12-31 00:29:24 -08:00
|
|
|
|
2013-09-23 11:33:36 -07:00
|
|
|
int udpv4_offload_init(void);
|
2013-06-08 12:56:03 +02:00
|
|
|
|
2013-09-23 11:33:36 -07:00
|
|
|
void udp_init(void);
|
2009-07-09 08:09:47 +00:00
|
|
|
|
2018-10-05 11:31:40 -04:00
|
|
|
DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
|
2013-09-23 11:33:36 -07:00
|
|
|
void udp_encap_enable(void);
|
2021-02-03 16:54:22 +08:00
|
|
|
void udp_encap_disable(void);
|
2012-04-27 08:24:08 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2018-10-05 11:31:40 -04:00
|
|
|
DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
|
2013-09-23 11:33:36 -07:00
|
|
|
void udpv6_encap_enable(void);
|
2012-04-27 08:24:08 +00:00
|
|
|
#endif
|
2017-03-23 13:34:16 -06:00
|
|
|
|
2018-11-07 12:38:33 +01:00
|
|
|
static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
|
|
|
|
struct sk_buff *skb, bool ipv4)
|
|
|
|
{
|
2019-05-28 12:22:54 -06:00
|
|
|
netdev_features_t features = NETIF_F_SG;
|
2018-11-07 12:38:33 +01:00
|
|
|
struct sk_buff *segs;
|
|
|
|
|
2019-05-28 12:22:54 -06:00
|
|
|
/* Avoid csum recalculation by skb_segment unless userspace explicitly
|
|
|
|
* asks for the final checksum values
|
|
|
|
*/
|
|
|
|
if (!inet_get_convert_csum(sk))
|
|
|
|
features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
|
|
|
|
|
2020-01-29 15:20:17 -05:00
|
|
|
/* UDP segmentation expects packets of type CHECKSUM_PARTIAL or
|
|
|
|
* CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial
|
|
|
|
* packets in udp_gro_complete_segment. As does UDP GSO, verified by
|
|
|
|
* udp_send_skb. But when those packets are looped in dev_loopback_xmit
|
net: multicast: calculate csum of looped-back and forwarded packets
During a testing of an user-space application which transmits UDP
multicast datagrams and utilizes multicast routing to send the UDP
datagrams out of defined network interfaces, I've found a multicast
router does not fill-in UDP checksum into locally produced, looped-back
and forwarded UDP datagrams, if an original output NIC the datagrams
are sent to has UDP TX checksum offload enabled.
The datagrams are sent malformed out of the NIC the datagrams have been
forwarded to.
It is because:
1. If TX checksum offload is enabled on the output NIC, UDP checksum
is not calculated by kernel and is not filled into skb data.
2. dev_loopback_xmit(), which is called solely by
ip_mc_finish_output(), sets skb->ip_summed = CHECKSUM_UNNECESSARY
unconditionally.
3. Since 35fc92a9 ("[NET]: Allow forwarding of ip_summed except
CHECKSUM_COMPLETE"), the ip_summed value is preserved during
forwarding.
4. If ip_summed != CHECKSUM_PARTIAL, checksum is not calculated during
a packet egress.
The minimum fix in dev_loopback_xmit():
1. Preserves skb->ip_summed CHECKSUM_PARTIAL. This is the
case when the original output NIC has TX checksum offload enabled.
The effects are:
a) If the forwarding destination interface supports TX checksum
offloading, the NIC driver is responsible to fill-in the
checksum.
b) If the forwarding destination interface does NOT support TX
checksum offloading, checksums are filled-in by kernel before
skb is submitted to the NIC driver.
c) For local delivery, checksum validation is skipped as in the
case of CHECKSUM_UNNECESSARY, thanks to skb_csum_unnecessary().
2. Translates ip_summed CHECKSUM_NONE to CHECKSUM_UNNECESSARY. It
means, for CHECKSUM_NONE, the behavior is unmodified and is there
to skip a looped-back packet local delivery checksum validation.
Signed-off-by: Cyril Strejc <cyril.strejc@skoda.cz>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-24 22:14:25 +02:00
|
|
|
* their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY.
|
|
|
|
* Reset in this specific case, where PARTIAL is both correct and
|
|
|
|
* required.
|
2020-01-29 15:20:17 -05:00
|
|
|
*/
|
2020-01-27 15:40:31 -05:00
|
|
|
if (skb->pkt_type == PACKET_LOOPBACK)
|
|
|
|
skb->ip_summed = CHECKSUM_PARTIAL;
|
|
|
|
|
2018-11-07 12:38:33 +01:00
|
|
|
/* the GSO CB lays after the UDP one, no need to save and restore any
|
|
|
|
* CB fragment
|
|
|
|
*/
|
2019-05-28 12:22:54 -06:00
|
|
|
segs = __skb_gso_segment(skb, features, false);
|
2019-08-29 19:50:24 +03:00
|
|
|
if (IS_ERR_OR_NULL(segs)) {
|
2018-11-07 12:38:33 +01:00
|
|
|
int segs_nr = skb_shinfo(skb)->gso_segs;
|
|
|
|
|
|
|
|
atomic_add(segs_nr, &sk->sk_drops);
|
|
|
|
SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, segs_nr);
|
|
|
|
kfree_skb(skb);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
consume_skb(skb);
|
|
|
|
return segs;
|
|
|
|
}
|
|
|
|
|
2021-03-30 12:28:49 +02:00
|
|
|
static inline void udp_post_segment_fix_csum(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
/* UDP-lite can't land here - no GRO */
|
|
|
|
WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov);
|
|
|
|
|
|
|
|
/* UDP packets generated with UDP_SEGMENT and traversing:
|
|
|
|
*
|
|
|
|
* UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx)
|
|
|
|
*
|
|
|
|
* can reach an UDP socket with CHECKSUM_NONE, because
|
|
|
|
* __iptunnel_pull_header() converts CHECKSUM_PARTIAL into NONE.
|
|
|
|
* SKB_GSO_UDP_L4 or SKB_GSO_FRAGLIST packets with no UDP tunnel will
|
|
|
|
* have a valid checksum, as the GRO engine validates the UDP csum
|
|
|
|
* before the aggregation and nobody strips such info in between.
|
|
|
|
* Instead of adding another check in the tunnel fastpath, we can force
|
|
|
|
* a valid csum after the segmentation.
|
|
|
|
* Additionally fixup the UDP CB.
|
|
|
|
*/
|
|
|
|
UDP_SKB_CB(skb)->cscov = skb->len;
|
|
|
|
if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid)
|
|
|
|
skb->csum_valid = 1;
|
|
|
|
}
|
|
|
|
|
2021-02-23 10:49:26 -08:00
|
|
|
#ifdef CONFIG_BPF_SYSCALL
|
2020-03-09 11:12:38 +00:00
|
|
|
struct sk_psock;
|
2021-04-06 20:21:11 -07:00
|
|
|
int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
|
2021-02-23 10:49:26 -08:00
|
|
|
#endif
|
2020-03-09 11:12:38 +00:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif /* _UDP_H */
|