2019-05-27 08:55:01 +02:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2013-06-07 05:11:46 +00:00
|
|
|
/*
|
|
|
|
* IPV4 GSO/GRO offload support
|
|
|
|
* Linux INET implementation
|
|
|
|
*
|
|
|
|
* TCPv4 GSO/GRO support
|
|
|
|
*/
|
|
|
|
|
2018-12-14 11:51:59 +01:00
|
|
|
#include <linux/indirect_call_wrapper.h>
|
2013-06-07 05:11:46 +00:00
|
|
|
#include <linux/skbuff.h>
|
2021-11-15 09:05:51 -08:00
|
|
|
#include <net/gro.h>
|
2023-06-08 19:17:37 +00:00
|
|
|
#include <net/gso.h>
|
2013-06-07 05:11:46 +00:00
|
|
|
#include <net/tcp.h>
|
|
|
|
#include <net/protocol.h>
|
|
|
|
|
2014-08-06 15:09:44 -04:00
|
|
|
static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
|
|
|
|
unsigned int seq, unsigned int mss)
|
net-timestamp: TCP timestamping
TCP timestamping extends SO_TIMESTAMPING to bytestreams.
Bytestreams do not have a 1:1 relationship between send() buffers and
network packets. The feature interprets a send call on a bytestream as
a request for a timestamp for the last byte in that send() buffer.
The choice corresponds to a request for a timestamp when all bytes in
the buffer have been sent. That assumption depends on in-order kernel
transmission. This is the common case. That said, it is possible to
construct a traffic shaping tree that would result in reordering.
The guarantee is strong, then, but not ironclad.
This implementation supports send and sendpages (splice). GSO replaces
one large packet with multiple smaller packets. This patch also copies
the option into the correct smaller packet.
This patch does not yet support timestamping on data in an initial TCP
Fast Open SYN, because that takes a very different data path.
If ID generation in ee_data is enabled, bytestream timestamps return a
byte offset, instead of the packet counter for datagrams.
The implementation supports a single timestamp per packet. It silenty
replaces requests for previous timestamps. To avoid missing tstamps,
flush the tcp queue by disabling Nagle, cork and autocork. Missing
tstamps can be detected by offset when the ee_data ID is enabled.
Implementation details:
- On GSO, the timestamping code can be included in the main loop. I
moved it into its own loop to reduce the impact on the common case
to a single branch.
- To avoid leaking the absolute seqno to userspace, the offset
returned in ee_data must always be relative. It is an offset between
an skb and sk field. The first is always set (also for GSO & ACK).
The second must also never be uninitialized. Only allow the ID
option on sockets in the ESTABLISHED state, for which the seqno
is available. Never reset it to zero (instead, move it to the
current seqno when reenabling the option).
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-08-04 22:11:49 -04:00
|
|
|
{
|
|
|
|
while (skb) {
|
2014-08-06 15:09:44 -04:00
|
|
|
if (before(ts_seq, seq + mss)) {
|
|
|
|
skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP;
|
net-timestamp: TCP timestamping
TCP timestamping extends SO_TIMESTAMPING to bytestreams.
Bytestreams do not have a 1:1 relationship between send() buffers and
network packets. The feature interprets a send call on a bytestream as
a request for a timestamp for the last byte in that send() buffer.
The choice corresponds to a request for a timestamp when all bytes in
the buffer have been sent. That assumption depends on in-order kernel
transmission. This is the common case. That said, it is possible to
construct a traffic shaping tree that would result in reordering.
The guarantee is strong, then, but not ironclad.
This implementation supports send and sendpages (splice). GSO replaces
one large packet with multiple smaller packets. This patch also copies
the option into the correct smaller packet.
This patch does not yet support timestamping on data in an initial TCP
Fast Open SYN, because that takes a very different data path.
If ID generation in ee_data is enabled, bytestream timestamps return a
byte offset, instead of the packet counter for datagrams.
The implementation supports a single timestamp per packet. It silenty
replaces requests for previous timestamps. To avoid missing tstamps,
flush the tcp queue by disabling Nagle, cork and autocork. Missing
tstamps can be detected by offset when the ee_data ID is enabled.
Implementation details:
- On GSO, the timestamping code can be included in the main loop. I
moved it into its own loop to reduce the impact on the common case
to a single branch.
- To avoid leaking the absolute seqno to userspace, the offset
returned in ee_data must always be relative. It is an offset between
an skb and sk field. The first is always set (also for GSO & ACK).
The second must also never be uninitialized. Only allow the ID
option on sockets in the ESTABLISHED state, for which the seqno
is available. Never reset it to zero (instead, move it to the
current seqno when reenabling the option).
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-08-04 22:11:49 -04:00
|
|
|
skb_shinfo(skb)->tskey = ts_seq;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
skb = skb->next;
|
|
|
|
seq += mss;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-05-02 10:44:43 +02:00
|
|
|
/* Rewrite one address/port pair of a segment and patch its checksums.
 *
 * @seg:     segment whose headers are being rewritten
 * @oldip:   address field to overwrite (updated in place)
 * @newip:   address value to store
 * @oldport: port field to overwrite (updated in place)
 * @newport: port value to store
 *
 * Nothing is touched when both the address and the port already match.
 * Otherwise the TCP checksum is adjusted for the address and the port
 * change, and the IPv4 header checksum is adjusted for the address change.
 */
static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
				     __be32 *oldip, __be32 newip,
				     __be16 *oldport, __be16 newport)
{
	struct tcphdr *tcph;
	struct iphdr *ip4h;

	if (*oldip != newip || *oldport != newport) {
		tcph = tcp_hdr(seg);
		ip4h = ip_hdr(seg);

		/* TCP checksum: address change first, then port change */
		inet_proto_csum_replace4(&tcph->check, seg, *oldip, newip, true);
		inet_proto_csum_replace2(&tcph->check, seg, *oldport, newport,
					 false);
		*oldport = newport;

		/* IPv4 header checksum only depends on the address */
		csum_replace4(&ip4h->check, *oldip, newip);
		*oldip = newip;
	}
}
|
|
|
|
|
|
|
|
static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
|
|
|
|
{
|
|
|
|
const struct tcphdr *th;
|
|
|
|
const struct iphdr *iph;
|
|
|
|
struct sk_buff *seg;
|
|
|
|
struct tcphdr *th2;
|
|
|
|
struct iphdr *iph2;
|
|
|
|
|
|
|
|
seg = segs;
|
|
|
|
th = tcp_hdr(seg);
|
|
|
|
iph = ip_hdr(seg);
|
|
|
|
th2 = tcp_hdr(seg->next);
|
|
|
|
iph2 = ip_hdr(seg->next);
|
|
|
|
|
|
|
|
if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
|
|
|
|
iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
|
|
|
|
return segs;
|
|
|
|
|
|
|
|
while ((seg = seg->next)) {
|
|
|
|
th2 = tcp_hdr(seg);
|
|
|
|
iph2 = ip_hdr(seg);
|
|
|
|
|
|
|
|
__tcpv4_gso_segment_csum(seg,
|
|
|
|
&iph2->saddr, iph->saddr,
|
|
|
|
&th2->source, th->source);
|
|
|
|
__tcpv4_gso_segment_csum(seg,
|
|
|
|
&iph2->daddr, iph->daddr,
|
|
|
|
&th2->dest, th->dest);
|
|
|
|
}
|
|
|
|
|
|
|
|
return segs;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
|
|
|
|
netdev_features_t features)
|
|
|
|
{
|
|
|
|
skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
|
|
|
|
if (IS_ERR(skb))
|
|
|
|
return skb;
|
|
|
|
|
|
|
|
return __tcpv4_gso_segment_list_csum(skb);
|
|
|
|
}
|
|
|
|
|
2015-02-26 19:08:59 -08:00
|
|
|
static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
|
|
|
|
netdev_features_t features)
|
2014-09-20 14:52:28 -07:00
|
|
|
{
|
2018-01-19 09:29:18 -05:00
|
|
|
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4))
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
2014-09-20 14:52:28 -07:00
|
|
|
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
2024-05-02 10:44:43 +02:00
|
|
|
if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
|
|
|
|
return __tcp4_gso_segment_list(skb, features);
|
|
|
|
|
2014-09-20 14:52:28 -07:00
|
|
|
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
|
|
|
|
const struct iphdr *iph = ip_hdr(skb);
|
|
|
|
struct tcphdr *th = tcp_hdr(skb);
|
|
|
|
|
|
|
|
/* Set up checksum pseudo header, usually expect stack to
|
|
|
|
* have done this already.
|
|
|
|
*/
|
|
|
|
|
|
|
|
th->check = 0;
|
|
|
|
skb->ip_summed = CHECKSUM_PARTIAL;
|
|
|
|
__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
|
|
|
|
}
|
|
|
|
|
|
|
|
return tcp_gso_segment(skb, features);
|
|
|
|
}
|
|
|
|
|
2013-10-18 10:36:17 -07:00
|
|
|
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
|
2013-06-07 05:11:46 +00:00
|
|
|
netdev_features_t features)
|
|
|
|
{
|
|
|
|
struct sk_buff *segs = ERR_PTR(-EINVAL);
|
2013-10-25 17:26:17 -07:00
|
|
|
unsigned int sum_truesize = 0;
|
2013-06-07 05:11:46 +00:00
|
|
|
struct tcphdr *th;
|
|
|
|
unsigned int thlen;
|
|
|
|
unsigned int seq;
|
|
|
|
unsigned int oldlen;
|
|
|
|
unsigned int mss;
|
|
|
|
struct sk_buff *gso_skb = skb;
|
|
|
|
__sum16 newcheck;
|
|
|
|
bool ooo_okay, copy_destructor;
|
2023-06-05 16:16:47 +00:00
|
|
|
__wsum delta;
|
2013-06-07 05:11:46 +00:00
|
|
|
|
|
|
|
th = tcp_hdr(skb);
|
|
|
|
thlen = th->doff * 4;
|
|
|
|
if (thlen < sizeof(*th))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, thlen))
|
|
|
|
goto out;
|
|
|
|
|
2023-06-05 16:16:47 +00:00
|
|
|
oldlen = ~skb->len;
|
2013-06-07 05:11:46 +00:00
|
|
|
__skb_pull(skb, thlen);
|
|
|
|
|
2015-06-11 09:15:15 -07:00
|
|
|
mss = skb_shinfo(skb)->gso_size;
|
2013-06-07 05:11:46 +00:00
|
|
|
if (unlikely(skb->len <= mss))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
|
|
|
|
/* Packet is from an untrusted source, reset gso_segs. */
|
|
|
|
|
|
|
|
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
|
|
|
|
|
|
|
|
segs = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
copy_destructor = gso_skb->destructor == tcp_wfree;
|
|
|
|
ooo_okay = gso_skb->ooo_okay;
|
|
|
|
/* All segments but the first should have ooo_okay cleared */
|
|
|
|
skb->ooo_okay = 0;
|
|
|
|
|
|
|
|
segs = skb_segment(skb, features);
|
|
|
|
if (IS_ERR(segs))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* Only first segment might have ooo_okay set */
|
|
|
|
segs->ooo_okay = ooo_okay;
|
|
|
|
|
2016-09-19 12:58:47 +02:00
|
|
|
/* GSO partial and frag_list segmentation only requires splitting
|
|
|
|
* the frame into an MSS multiple and possibly a remainder, both
|
|
|
|
* cases return a GSO skb. So update the mss now.
|
|
|
|
*/
|
|
|
|
if (skb_is_gso(segs))
|
|
|
|
mss *= skb_shinfo(segs)->gso_segs;
|
|
|
|
|
2023-06-05 16:16:47 +00:00
|
|
|
delta = (__force __wsum)htonl(oldlen + thlen + mss);
|
2013-06-07 05:11:46 +00:00
|
|
|
|
|
|
|
skb = segs;
|
|
|
|
th = tcp_hdr(skb);
|
|
|
|
seq = ntohl(th->seq);
|
|
|
|
|
net-timestamp: TCP timestamping
TCP timestamping extends SO_TIMESTAMPING to bytestreams.
Bytestreams do not have a 1:1 relationship between send() buffers and
network packets. The feature interprets a send call on a bytestream as
a request for a timestamp for the last byte in that send() buffer.
The choice corresponds to a request for a timestamp when all bytes in
the buffer have been sent. That assumption depends on in-order kernel
transmission. This is the common case. That said, it is possible to
construct a traffic shaping tree that would result in reordering.
The guarantee is strong, then, but not ironclad.
This implementation supports send and sendpages (splice). GSO replaces
one large packet with multiple smaller packets. This patch also copies
the option into the correct smaller packet.
This patch does not yet support timestamping on data in an initial TCP
Fast Open SYN, because that takes a very different data path.
If ID generation in ee_data is enabled, bytestream timestamps return a
byte offset, instead of the packet counter for datagrams.
The implementation supports a single timestamp per packet. It silenty
replaces requests for previous timestamps. To avoid missing tstamps,
flush the tcp queue by disabling Nagle, cork and autocork. Missing
tstamps can be detected by offset when the ee_data ID is enabled.
Implementation details:
- On GSO, the timestamping code can be included in the main loop. I
moved it into its own loop to reduce the impact on the common case
to a single branch.
- To avoid leaking the absolute seqno to userspace, the offset
returned in ee_data must always be relative. It is an offset between
an skb and sk field. The first is always set (also for GSO & ACK).
The second must also never be uninitialized. Only allow the ID
option on sockets in the ESTABLISHED state, for which the seqno
is available. Never reset it to zero (instead, move it to the
current seqno when reenabling the option).
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-08-04 22:11:49 -04:00
|
|
|
if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
|
|
|
|
tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
|
|
|
|
|
2023-06-05 16:16:47 +00:00
|
|
|
newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta));
|
2013-06-07 05:11:46 +00:00
|
|
|
|
2016-04-10 21:45:03 -04:00
|
|
|
while (skb->next) {
|
2013-06-07 05:11:46 +00:00
|
|
|
th->fin = th->psh = 0;
|
|
|
|
th->check = newcheck;
|
|
|
|
|
2016-02-05 15:27:49 -08:00
|
|
|
if (skb->ip_summed == CHECKSUM_PARTIAL)
|
|
|
|
gso_reset_checksum(skb, ~th->check);
|
|
|
|
else
|
2014-06-04 17:20:09 -07:00
|
|
|
th->check = gso_make_checksum(skb, ~th->check);
|
2013-06-07 05:11:46 +00:00
|
|
|
|
|
|
|
seq += mss;
|
|
|
|
if (copy_destructor) {
|
|
|
|
skb->destructor = gso_skb->destructor;
|
|
|
|
skb->sk = gso_skb->sk;
|
2013-10-25 17:26:17 -07:00
|
|
|
sum_truesize += skb->truesize;
|
2013-06-07 05:11:46 +00:00
|
|
|
}
|
|
|
|
skb = skb->next;
|
|
|
|
th = tcp_hdr(skb);
|
|
|
|
|
|
|
|
th->seq = htonl(seq);
|
|
|
|
th->cwr = 0;
|
2016-04-10 21:45:03 -04:00
|
|
|
}
|
2013-06-07 05:11:46 +00:00
|
|
|
|
|
|
|
/* Following permits TCP Small Queues to work well with GSO :
|
|
|
|
* The callback to TCP stack will be called at the time last frag
|
|
|
|
* is freed at TX completion, and not right now when gso_skb
|
|
|
|
* is freed by GSO engine
|
|
|
|
*/
|
|
|
|
if (copy_destructor) {
|
tcp: gso: avoid refcount_t warning from tcp_gso_segment()
When a GSO skb of truesize O is segmented into 2 new skbs of truesize N1
and N2, we want to transfer socket ownership to the new fresh skbs.
In order to avoid expensive atomic operations on a cache line subject to
cache bouncing, we replace the sequence :
refcount_add(N1, &sk->sk_wmem_alloc);
refcount_add(N2, &sk->sk_wmem_alloc); // repeated by number of segments
refcount_sub(O, &sk->sk_wmem_alloc);
by a single
refcount_add(sum_of(N) - O, &sk->sk_wmem_alloc);
Problem is :
In some pathological cases, sum(N) - O might be a negative number, and
syzkaller bot was apparently able to trigger this trace [1]
atomic_t was ok with this construct, but we need to take care of the
negative delta with refcount_t
[1]
refcount_t: saturated; leaking memory.
------------[ cut here ]------------
WARNING: CPU: 0 PID: 8404 at lib/refcount.c:77 refcount_add_not_zero+0x198/0x200 lib/refcount.c:77
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 8404 Comm: syz-executor2 Not tainted 4.14.0-rc5-mm1+ #20
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:16 [inline]
dump_stack+0x194/0x257 lib/dump_stack.c:52
panic+0x1e4/0x41c kernel/panic.c:183
__warn+0x1c4/0x1e0 kernel/panic.c:546
report_bug+0x211/0x2d0 lib/bug.c:183
fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:177
do_trap_no_signal arch/x86/kernel/traps.c:211 [inline]
do_trap+0x260/0x390 arch/x86/kernel/traps.c:260
do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:297
do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:310
invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
RIP: 0010:refcount_add_not_zero+0x198/0x200 lib/refcount.c:77
RSP: 0018:ffff8801c606e3a0 EFLAGS: 00010282
RAX: 0000000000000026 RBX: 0000000000001401 RCX: 0000000000000000
RDX: 0000000000000026 RSI: ffffc900036fc000 RDI: ffffed0038c0dc68
RBP: ffff8801c606e430 R08: 0000000000000001 R09: 0000000000000000
R10: ffff8801d97f5eba R11: 0000000000000000 R12: ffff8801d5acf73c
R13: 1ffff10038c0dc75 R14: 00000000ffffffff R15: 00000000fffff72f
refcount_add+0x1b/0x60 lib/refcount.c:101
tcp_gso_segment+0x10d0/0x16b0 net/ipv4/tcp_offload.c:155
tcp4_gso_segment+0xd4/0x310 net/ipv4/tcp_offload.c:51
inet_gso_segment+0x60c/0x11c0 net/ipv4/af_inet.c:1271
skb_mac_gso_segment+0x33f/0x660 net/core/dev.c:2749
__skb_gso_segment+0x35f/0x7f0 net/core/dev.c:2821
skb_gso_segment include/linux/netdevice.h:3971 [inline]
validate_xmit_skb+0x4ba/0xb20 net/core/dev.c:3074
__dev_queue_xmit+0xe49/0x2070 net/core/dev.c:3497
dev_queue_xmit+0x17/0x20 net/core/dev.c:3538
neigh_hh_output include/net/neighbour.h:471 [inline]
neigh_output include/net/neighbour.h:479 [inline]
ip_finish_output2+0xece/0x1460 net/ipv4/ip_output.c:229
ip_finish_output+0x85e/0xd10 net/ipv4/ip_output.c:317
NF_HOOK_COND include/linux/netfilter.h:238 [inline]
ip_output+0x1cc/0x860 net/ipv4/ip_output.c:405
dst_output include/net/dst.h:459 [inline]
ip_local_out+0x95/0x160 net/ipv4/ip_output.c:124
ip_queue_xmit+0x8c6/0x18e0 net/ipv4/ip_output.c:504
tcp_transmit_skb+0x1ab7/0x3840 net/ipv4/tcp_output.c:1137
tcp_write_xmit+0x663/0x4de0 net/ipv4/tcp_output.c:2341
__tcp_push_pending_frames+0xa0/0x250 net/ipv4/tcp_output.c:2513
tcp_push_pending_frames include/net/tcp.h:1722 [inline]
tcp_data_snd_check net/ipv4/tcp_input.c:5050 [inline]
tcp_rcv_established+0x8c7/0x18a0 net/ipv4/tcp_input.c:5497
tcp_v4_do_rcv+0x2ab/0x7d0 net/ipv4/tcp_ipv4.c:1460
sk_backlog_rcv include/net/sock.h:909 [inline]
__release_sock+0x124/0x360 net/core/sock.c:2264
release_sock+0xa4/0x2a0 net/core/sock.c:2776
tcp_sendmsg+0x3a/0x50 net/ipv4/tcp.c:1462
inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763
sock_sendmsg_nosec net/socket.c:632 [inline]
sock_sendmsg+0xca/0x110 net/socket.c:642
___sys_sendmsg+0x31c/0x890 net/socket.c:2048
__sys_sendmmsg+0x1e6/0x5f0 net/socket.c:2138
Fixes: 14afee4b6092 ("net: convert sock.sk_wmem_alloc from atomic_t to refcount_t")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-07 15:15:04 -08:00
|
|
|
int delta;
|
|
|
|
|
2013-06-07 05:11:46 +00:00
|
|
|
swap(gso_skb->sk, skb->sk);
|
|
|
|
swap(gso_skb->destructor, skb->destructor);
|
2013-10-25 17:26:17 -07:00
|
|
|
sum_truesize += skb->truesize;
|
tcp: gso: avoid refcount_t warning from tcp_gso_segment()
When a GSO skb of truesize O is segmented into 2 new skbs of truesize N1
and N2, we want to transfer socket ownership to the new fresh skbs.
In order to avoid expensive atomic operations on a cache line subject to
cache bouncing, we replace the sequence :
refcount_add(N1, &sk->sk_wmem_alloc);
refcount_add(N2, &sk->sk_wmem_alloc); // repeated by number of segments
refcount_sub(O, &sk->sk_wmem_alloc);
by a single
refcount_add(sum_of(N) - O, &sk->sk_wmem_alloc);
Problem is :
In some pathological cases, sum(N) - O might be a negative number, and
syzkaller bot was apparently able to trigger this trace [1]
atomic_t was ok with this construct, but we need to take care of the
negative delta with refcount_t
[1]
refcount_t: saturated; leaking memory.
------------[ cut here ]------------
WARNING: CPU: 0 PID: 8404 at lib/refcount.c:77 refcount_add_not_zero+0x198/0x200 lib/refcount.c:77
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 8404 Comm: syz-executor2 Not tainted 4.14.0-rc5-mm1+ #20
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:16 [inline]
dump_stack+0x194/0x257 lib/dump_stack.c:52
panic+0x1e4/0x41c kernel/panic.c:183
__warn+0x1c4/0x1e0 kernel/panic.c:546
report_bug+0x211/0x2d0 lib/bug.c:183
fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:177
do_trap_no_signal arch/x86/kernel/traps.c:211 [inline]
do_trap+0x260/0x390 arch/x86/kernel/traps.c:260
do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:297
do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:310
invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
RIP: 0010:refcount_add_not_zero+0x198/0x200 lib/refcount.c:77
RSP: 0018:ffff8801c606e3a0 EFLAGS: 00010282
RAX: 0000000000000026 RBX: 0000000000001401 RCX: 0000000000000000
RDX: 0000000000000026 RSI: ffffc900036fc000 RDI: ffffed0038c0dc68
RBP: ffff8801c606e430 R08: 0000000000000001 R09: 0000000000000000
R10: ffff8801d97f5eba R11: 0000000000000000 R12: ffff8801d5acf73c
R13: 1ffff10038c0dc75 R14: 00000000ffffffff R15: 00000000fffff72f
refcount_add+0x1b/0x60 lib/refcount.c:101
tcp_gso_segment+0x10d0/0x16b0 net/ipv4/tcp_offload.c:155
tcp4_gso_segment+0xd4/0x310 net/ipv4/tcp_offload.c:51
inet_gso_segment+0x60c/0x11c0 net/ipv4/af_inet.c:1271
skb_mac_gso_segment+0x33f/0x660 net/core/dev.c:2749
__skb_gso_segment+0x35f/0x7f0 net/core/dev.c:2821
skb_gso_segment include/linux/netdevice.h:3971 [inline]
validate_xmit_skb+0x4ba/0xb20 net/core/dev.c:3074
__dev_queue_xmit+0xe49/0x2070 net/core/dev.c:3497
dev_queue_xmit+0x17/0x20 net/core/dev.c:3538
neigh_hh_output include/net/neighbour.h:471 [inline]
neigh_output include/net/neighbour.h:479 [inline]
ip_finish_output2+0xece/0x1460 net/ipv4/ip_output.c:229
ip_finish_output+0x85e/0xd10 net/ipv4/ip_output.c:317
NF_HOOK_COND include/linux/netfilter.h:238 [inline]
ip_output+0x1cc/0x860 net/ipv4/ip_output.c:405
dst_output include/net/dst.h:459 [inline]
ip_local_out+0x95/0x160 net/ipv4/ip_output.c:124
ip_queue_xmit+0x8c6/0x18e0 net/ipv4/ip_output.c:504
tcp_transmit_skb+0x1ab7/0x3840 net/ipv4/tcp_output.c:1137
tcp_write_xmit+0x663/0x4de0 net/ipv4/tcp_output.c:2341
__tcp_push_pending_frames+0xa0/0x250 net/ipv4/tcp_output.c:2513
tcp_push_pending_frames include/net/tcp.h:1722 [inline]
tcp_data_snd_check net/ipv4/tcp_input.c:5050 [inline]
tcp_rcv_established+0x8c7/0x18a0 net/ipv4/tcp_input.c:5497
tcp_v4_do_rcv+0x2ab/0x7d0 net/ipv4/tcp_ipv4.c:1460
sk_backlog_rcv include/net/sock.h:909 [inline]
__release_sock+0x124/0x360 net/core/sock.c:2264
release_sock+0xa4/0x2a0 net/core/sock.c:2776
tcp_sendmsg+0x3a/0x50 net/ipv4/tcp.c:1462
inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763
sock_sendmsg_nosec net/socket.c:632 [inline]
sock_sendmsg+0xca/0x110 net/socket.c:642
___sys_sendmsg+0x31c/0x890 net/socket.c:2048
__sys_sendmmsg+0x1e6/0x5f0 net/socket.c:2138
Fixes: 14afee4b6092 ("net: convert sock.sk_wmem_alloc from atomic_t to refcount_t")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-07 15:15:04 -08:00
|
|
|
delta = sum_truesize - gso_skb->truesize;
|
|
|
|
/* In some pathological cases, delta can be negative.
|
|
|
|
* We need to either use refcount_add() or refcount_sub_and_test()
|
|
|
|
*/
|
|
|
|
if (likely(delta >= 0))
|
|
|
|
refcount_add(delta, &skb->sk->sk_wmem_alloc);
|
|
|
|
else
|
|
|
|
WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc));
|
2013-06-07 05:11:46 +00:00
|
|
|
}
|
|
|
|
|
2023-06-05 16:16:47 +00:00
|
|
|
delta = (__force __wsum)htonl(oldlen +
|
|
|
|
(skb_tail_pointer(skb) -
|
|
|
|
skb_transport_header(skb)) +
|
|
|
|
skb->data_len);
|
|
|
|
th->check = ~csum_fold(csum_add(csum_unfold(th->check), delta));
|
2016-02-05 15:27:49 -08:00
|
|
|
if (skb->ip_summed == CHECKSUM_PARTIAL)
|
|
|
|
gso_reset_checksum(skb, ~th->check);
|
|
|
|
else
|
2014-06-04 17:20:09 -07:00
|
|
|
th->check = gso_make_checksum(skb, ~th->check);
|
2013-06-07 05:11:46 +00:00
|
|
|
out:
|
|
|
|
return segs;
|
|
|
|
}
|
|
|
|
|
2024-05-02 10:44:45 +02:00
|
|
|
struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
|
|
|
|
{
|
|
|
|
struct tcphdr *th2;
|
|
|
|
struct sk_buff *p;
|
|
|
|
|
|
|
|
list_for_each_entry(p, head, list) {
|
|
|
|
if (!NAPI_GRO_CB(p)->same_flow)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
th2 = tcp_hdr(p);
|
|
|
|
if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
|
|
|
|
NAPI_GRO_CB(p)->same_flow = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2024-05-02 10:44:46 +02:00
|
|
|
struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
|
2013-06-07 05:11:46 +00:00
|
|
|
{
|
2024-05-02 10:44:46 +02:00
|
|
|
unsigned int thlen, hlen, off;
|
2013-06-07 05:11:46 +00:00
|
|
|
struct tcphdr *th;
|
|
|
|
|
|
|
|
off = skb_gro_offset(skb);
|
|
|
|
hlen = off + sizeof(*th);
|
2022-08-23 09:10:49 +02:00
|
|
|
th = skb_gro_header(skb, hlen, off);
|
|
|
|
if (unlikely(!th))
|
2024-05-02 10:44:46 +02:00
|
|
|
return NULL;
|
2013-06-07 05:11:46 +00:00
|
|
|
|
|
|
|
thlen = th->doff * 4;
|
|
|
|
if (thlen < sizeof(*th))
|
2024-05-02 10:44:46 +02:00
|
|
|
return NULL;
|
2013-06-07 05:11:46 +00:00
|
|
|
|
|
|
|
hlen = off + thlen;
|
2024-03-01 19:37:37 +00:00
|
|
|
if (!skb_gro_may_pull(skb, hlen)) {
|
2013-06-07 05:11:46 +00:00
|
|
|
th = skb_gro_header_slow(skb, hlen, off);
|
|
|
|
if (unlikely(!th))
|
2024-05-02 10:44:46 +02:00
|
|
|
return NULL;
|
2013-06-07 05:11:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
skb_gro_pull(skb, thlen);
|
|
|
|
|
2024-05-02 10:44:46 +02:00
|
|
|
return th;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
|
|
|
|
struct tcphdr *th)
|
|
|
|
{
|
|
|
|
unsigned int thlen = th->doff * 4;
|
|
|
|
struct sk_buff *pp = NULL;
|
|
|
|
struct sk_buff *p;
|
|
|
|
struct tcphdr *th2;
|
|
|
|
unsigned int len;
|
|
|
|
__be32 flags;
|
|
|
|
unsigned int mss = 1;
|
|
|
|
int flush = 1;
|
|
|
|
int i;
|
|
|
|
|
2013-06-07 05:11:46 +00:00
|
|
|
len = skb_gro_len(skb);
|
|
|
|
flags = tcp_flag_word(th);
|
|
|
|
|
2024-05-02 10:44:45 +02:00
|
|
|
p = tcp_gro_lookup(head, th);
|
|
|
|
if (!p)
|
|
|
|
goto out_check_final;
|
2013-06-07 05:11:46 +00:00
|
|
|
|
2024-05-02 10:44:45 +02:00
|
|
|
th2 = tcp_hdr(p);
|
net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment
{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags,
iph->id, ...) against all packets in a loop. These flush checks are used in
all merging UDP and TCP flows.
These checks need to be done only once and only against the found p skb,
since they only affect flush and not same_flow.
This patch leverages correct network header offsets from the cb for both
outer and inner network headers - allowing these checks to be done only
once, in tcp_gro_receive and udp_gro_receive_segment. As a result,
NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are
more declarative and contained in inet_gro_flush, thus removing the need
for flush_id in napi_gro_cb.
This results in less parsing code for non-loop flush tests for TCP and UDP
flows.
To make sure results are not within noise range - I've made netfilter drop
all TCP packets, and measured CPU performance in GRO (in this case GRO is
responsible for about 50% of the CPU utilization).
perf top while replaying 64 parallel IP/TCP streams merging in GRO:
(gro_receive_network_flush is compiled inline to tcp_gro_receive)
net-next:
6.94% [kernel] [k] inet_gro_receive
3.02% [kernel] [k] tcp_gro_receive
patch applied:
4.27% [kernel] [k] tcp_gro_receive
4.22% [kernel] [k] inet_gro_receive
perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same
results for any encapsulation, in this case inet_gro_receive is top
offender in net-next)
net-next:
10.09% [kernel] [k] inet_gro_receive
2.08% [kernel] [k] tcp_gro_receive
patch applied:
6.97% [kernel] [k] inet_gro_receive
3.68% [kernel] [k] tcp_gro_receive
Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20240509190819.2985-3-richardbgobert@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-05-09 21:08:18 +02:00
|
|
|
flush = (__force int)(flags & TCP_FLAG_CWR);
|
2013-06-07 05:11:46 +00:00
|
|
|
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
|
|
|
|
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
|
|
|
|
flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
|
|
|
|
for (i = sizeof(*th); i < thlen; i += 4)
|
|
|
|
flush |= *(u32 *)((u8 *)th + i) ^
|
|
|
|
*(u32 *)((u8 *)th2 + i);
|
|
|
|
|
net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment
{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags,
iph->id, ...) against all packets in a loop. These flush checks are used in
all merging UDP and TCP flows.
These checks need to be done only once and only against the found p skb,
since they only affect flush and not same_flow.
This patch leverages correct network header offsets from the cb for both
outer and inner network headers - allowing these checks to be done only
once, in tcp_gro_receive and udp_gro_receive_segment. As a result,
NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are
more declarative and contained in inet_gro_flush, thus removing the need
for flush_id in napi_gro_cb.
This results in less parsing code for non-loop flush tests for TCP and UDP
flows.
To make sure results are not within noise range - I've made netfilter drop
all TCP packets, and measured CPU performance in GRO (in this case GRO is
responsible for about 50% of the CPU utilization).
perf top while replaying 64 parallel IP/TCP streams merging in GRO:
(gro_receive_network_flush is compiled inline to tcp_gro_receive)
net-next:
6.94% [kernel] [k] inet_gro_receive
3.02% [kernel] [k] tcp_gro_receive
patch applied:
4.27% [kernel] [k] tcp_gro_receive
4.22% [kernel] [k] inet_gro_receive
perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same
results for any encapsulation, in this case inet_gro_receive is top
offender in net-next)
net-next:
10.09% [kernel] [k] inet_gro_receive
2.08% [kernel] [k] tcp_gro_receive
patch applied:
6.97% [kernel] [k] inet_gro_receive
3.68% [kernel] [k] tcp_gro_receive
Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20240509190819.2985-3-richardbgobert@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-05-09 21:08:18 +02:00
|
|
|
flush |= gro_receive_network_flush(th, th2, p);
|
2016-04-10 21:44:57 -04:00
|
|
|
|
2015-06-11 09:15:15 -07:00
|
|
|
mss = skb_shinfo(p)->gso_size;
|
2013-06-07 05:11:46 +00:00
|
|
|
|
2022-09-30 15:09:05 -07:00
|
|
|
/* If skb is a GRO packet, make sure its gso_size matches prior packet mss.
|
|
|
|
* If it is a single frame, do not aggregate it if its length
|
|
|
|
* is bigger than our mss.
|
|
|
|
*/
|
|
|
|
if (unlikely(skb_is_gso(skb)))
|
|
|
|
flush |= (mss != skb_shinfo(skb)->gso_size);
|
|
|
|
else
|
|
|
|
flush |= (len - 1) >= mss;
|
|
|
|
|
2013-06-07 05:11:46 +00:00
|
|
|
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
|
2024-04-03 13:21:39 -07:00
|
|
|
flush |= skb_cmp_decrypted(p, skb);
|
2013-06-07 05:11:46 +00:00
|
|
|
|
2024-05-02 10:44:44 +02:00
|
|
|
if (unlikely(NAPI_GRO_CB(p)->is_flist)) {
|
|
|
|
flush |= (__force int)(flags ^ tcp_flag_word(th2));
|
|
|
|
flush |= skb->ip_summed != p->ip_summed;
|
|
|
|
flush |= skb->csum_level != p->csum_level;
|
|
|
|
flush |= NAPI_GRO_CB(p)->count >= 64;
|
|
|
|
|
|
|
|
if (flush || skb_gro_receive_list(p, skb))
|
|
|
|
mss = 1;
|
|
|
|
|
|
|
|
goto out_check_final;
|
|
|
|
}
|
|
|
|
|
2018-06-24 14:13:49 +09:00
|
|
|
if (flush || skb_gro_receive(p, skb)) {
|
2013-06-07 05:11:46 +00:00
|
|
|
mss = 1;
|
|
|
|
goto out_check_final;
|
|
|
|
}
|
|
|
|
|
|
|
|
tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
|
|
|
|
|
|
|
|
out_check_final:
|
2022-09-30 15:09:05 -07:00
|
|
|
/* Force a flush if last segment is smaller than mss. */
|
|
|
|
if (unlikely(skb_is_gso(skb)))
|
|
|
|
flush = len != NAPI_GRO_CB(skb)->count * skb_shinfo(skb)->gso_size;
|
|
|
|
else
|
|
|
|
flush = len < mss;
|
|
|
|
|
2013-06-07 05:11:46 +00:00
|
|
|
flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
|
|
|
|
TCP_FLAG_RST | TCP_FLAG_SYN |
|
|
|
|
TCP_FLAG_FIN));
|
|
|
|
|
|
|
|
if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
|
2018-06-24 14:13:49 +09:00
|
|
|
pp = p;
|
2013-06-07 05:11:46 +00:00
|
|
|
|
net-gre-gro: Add GRE support to the GRO stack
This patch built on top of Commit 299603e8370a93dd5d8e8d800f0dff1ce2c53d36
("net-gro: Prepare GRO stack for the upcoming tunneling support") to add
the support of the standard GRE (RFC1701/RFC2784/RFC2890) to the GRO
stack. It also serves as an example for supporting other encapsulation
protocols in the GRO stack in the future.
The patch supports version 0 and all the flags (key, csum, seq#) but
will flush any pkt with the S (seq#) flag. This is because the S flag
is not supported by GSO, and a GRO pkt may end up in the forwarding path,
thus requiring GSO support to break it up correctly.
Currently the "packet_offload" structure only contains L3 (ETH_P_IP/
ETH_P_IPV6) GRO offload support so the encapped pkts are limited to
IP pkts (i.e., w/o L2 hdr). But support for other protocol type can
be easily added, so is the support for GRE variations like NVGRE.
The patch also supports csum offload. Specifically, if the csum flag is on
and the h/w is capable of checksumming the payload (CHECKSUM_COMPLETE),
the code will take advantage of the csum computed by the h/w when
validating the GRE csum.
Note that commit 60769a5dcd8755715c7143b4571d5c44f01796f1 "ipv4: gre:
add GRO capability" already introduces GRO capability to IPv4 GRE
tunnels, using the gro_cells infrastructure. But GRO is done after
GRE hdr has been removed (i.e., decapped). The following patch applies
GRO when pkts first come in (before hitting the GRE tunnel code). There
is some performance advantage for applying GRO as early as possible.
Also this approach is transparent to other subsystem like Open vSwitch
where GRE decap is handled outside of the IP stack hence making it
harder for the gro_cells stuff to apply. On the other hand, some NICs
are still not capable of hashing on the inner hdr of a GRE pkt (RSS).
In that case the GRO processing of pkts from the same remote host will
all happen on the same CPU and the performance may be suboptimal.
I'm including some rough preliminary performance numbers below. Note
that the performance will be highly dependent on traffic load, mix as
usual. Moreover it also depends on NIC offload features hence the
following is by no means a comprehensive study. Local testing and tuning
will be needed to decide the best setting.
All tests spawned 50 copies of netperf TCP_STREAM and ran for 30 secs.
(super_netperf 50 -H 192.168.1.18 -l 30)
An IP GRE tunnel with only the key flag on (e.g., ip tunnel add gre1
mode gre local 10.246.17.18 remote 10.246.17.17 ttl 255 key 123)
is configured.
The GRO support for pkts AFTER decap is controlled through the device
feature of the GRE device (e.g., ethtool -K gre1 gro on/off).
1.1 ethtool -K gre1 gro off; ethtool -K eth0 gro off
thruput: 9.16Gbps
CPU utilization: 19%
1.2 ethtool -K gre1 gro on; ethtool -K eth0 gro off
thruput: 5.9Gbps
CPU utilization: 15%
1.3 ethtool -K gre1 gro off; ethtool -K eth0 gro on
thruput: 9.26Gbps
CPU utilization: 12-13%
1.4 ethtool -K gre1 gro on; ethtool -K eth0 gro on
thruput: 9.26Gbps
CPU utilization: 10%
The following tests were performed on a different NIC that is capable of
csum offload. I.e., the h/w is capable of computing IP payload csum
(CHECKSUM_COMPLETE).
2.1 ethtool -K gre1 gro on (hence will use gro_cells)
2.1.1 ethtool -K eth0 gro off; csum offload disabled
thruput: 8.53Gbps
CPU utilization: 9%
2.1.2 ethtool -K eth0 gro off; csum offload enabled
thruput: 8.97Gbps
CPU utilization: 7-8%
2.1.3 ethtool -K eth0 gro on; csum offload disabled
thruput: 8.83Gbps
CPU utilization: 5-6%
2.1.4 ethtool -K eth0 gro on; csum offload enabled
thruput: 8.98Gbps
CPU utilization: 5%
2.2 ethtool -K gre1 gro off
2.2.1 ethtool -K eth0 gro off; csum offload disabled
thruput: 5.93Gbps
CPU utilization: 9%
2.2.2 ethtool -K eth0 gro off; csum offload enabled
thruput: 5.62Gbps
CPU utilization: 8%
2.2.3 ethtool -K eth0 gro on; csum offload disabled
thruput: 7.69Gbps
CPU utilization: 8%
2.2.4 ethtool -K eth0 gro on; csum offload enabled
thruput: 8.96Gbps
CPU utilization: 5-6%
Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-07 10:23:19 -08:00
|
|
|
NAPI_GRO_CB(skb)->flush |= (flush != 0);
|
2013-06-07 05:11:46 +00:00
|
|
|
|
|
|
|
return pp;
|
|
|
|
}
|
|
|
|
|
2023-05-29 16:44:30 +03:00
|
|
|
void tcp_gro_complete(struct sk_buff *skb)
|
2013-06-07 05:11:46 +00:00
|
|
|
{
|
|
|
|
struct tcphdr *th = tcp_hdr(skb);
|
2024-03-01 19:37:40 +00:00
|
|
|
struct skb_shared_info *shinfo;
|
|
|
|
|
|
|
|
if (skb->encapsulation)
|
|
|
|
skb->inner_transport_header = skb->transport_header;
|
2013-06-07 05:11:46 +00:00
|
|
|
|
net-gro: Prepare GRO stack for the upcoming tunneling support
This patch modifies the GRO stack to avoid the use of "network_header"
and associated macros like ip_hdr() and ipv6_hdr() in order to allow
an arbitary number of IP hdrs (v4 or v6) to be used in the
encapsulation chain. This lays the foundation for various IP
tunneling support (IP-in-IP, GRE, VXLAN, SIT,...) to be added later.
With this patch, the GRO stack traversing now is mostly based on
skb_gro_offset rather than special hdr offsets saved in skb (e.g.,
skb->network_header). As a result all but the top layer (i.e., the
the transport layer) must have hdrs of the same length in order for
a pkt to be considered for aggregation. Therefore when adding a new
encap layer (e.g., for tunneling), one must check and skip flows
(e.g., by setting NAPI_GRO_CB(p)->same_flow to 0) that have a
different hdr length.
Note that unlike the network header, the transport header can and
will continue to be set by the GRO code since there will be at
most one "transport layer" in the encap chain.
Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-12-11 20:53:45 -08:00
|
|
|
skb->csum_start = (unsigned char *)th - skb->head;
|
2013-06-07 05:11:46 +00:00
|
|
|
skb->csum_offset = offsetof(struct tcphdr, check);
|
|
|
|
skb->ip_summed = CHECKSUM_PARTIAL;
|
|
|
|
|
2024-03-01 19:37:40 +00:00
|
|
|
shinfo = skb_shinfo(skb);
|
|
|
|
shinfo->gso_segs = NAPI_GRO_CB(skb)->count;
|
2013-06-07 05:11:46 +00:00
|
|
|
|
|
|
|
if (th->cwr)
|
2024-03-01 19:37:40 +00:00
|
|
|
shinfo->gso_type |= SKB_GSO_TCP_ECN;
|
2013-06-07 05:11:46 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(tcp_gro_complete);
|
|
|
|
|
2024-05-02 10:44:47 +02:00
|
|
|
static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
|
|
|
|
struct tcphdr *th)
|
|
|
|
{
|
|
|
|
const struct iphdr *iph;
|
|
|
|
struct sk_buff *p;
|
|
|
|
struct sock *sk;
|
|
|
|
struct net *net;
|
|
|
|
int iif, sdif;
|
|
|
|
|
|
|
|
if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST)))
|
|
|
|
return;
|
|
|
|
|
|
|
|
p = tcp_gro_lookup(head, th);
|
|
|
|
if (p) {
|
|
|
|
NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
inet_get_iif_sdif(skb, &iif, &sdif);
|
|
|
|
iph = skb_gro_network_header(skb);
|
|
|
|
net = dev_net(skb->dev);
|
|
|
|
sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
|
|
|
|
iph->saddr, th->source,
|
|
|
|
iph->daddr, ntohs(th->dest),
|
|
|
|
iif, sdif);
|
|
|
|
NAPI_GRO_CB(skb)->is_flist = !sk;
|
|
|
|
if (sk)
|
|
|
|
sock_put(sk);
|
|
|
|
}
|
|
|
|
|
2018-12-14 11:51:59 +01:00
|
|
|
INDIRECT_CALLABLE_SCOPE
|
|
|
|
struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
|
2013-06-07 05:11:46 +00:00
|
|
|
{
|
2024-05-02 10:44:46 +02:00
|
|
|
struct tcphdr *th;
|
|
|
|
|
2013-11-22 10:31:29 +08:00
|
|
|
/* Don't bother verifying checksum if we're going to flush anyway. */
|
2014-08-22 13:34:30 -07:00
|
|
|
if (!NAPI_GRO_CB(skb)->flush &&
|
|
|
|
skb_gro_checksum_validate(skb, IPPROTO_TCP,
|
2024-05-02 10:44:46 +02:00
|
|
|
inet_gro_compute_pseudo))
|
|
|
|
goto flush;
|
|
|
|
|
|
|
|
th = tcp_gro_pull_header(skb);
|
|
|
|
if (!th)
|
|
|
|
goto flush;
|
2013-06-07 05:11:46 +00:00
|
|
|
|
2024-05-02 10:44:47 +02:00
|
|
|
tcp4_check_fraglist_gro(head, skb, th);
|
|
|
|
|
2024-05-02 10:44:46 +02:00
|
|
|
return tcp_gro_receive(head, skb, th);
|
|
|
|
|
|
|
|
flush:
|
|
|
|
NAPI_GRO_CB(skb)->flush = 1;
|
|
|
|
return NULL;
|
2013-06-07 05:11:46 +00:00
|
|
|
}
|
|
|
|
|
2018-12-14 11:51:59 +01:00
|
|
|
INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
|
2013-06-07 05:11:46 +00:00
|
|
|
{
|
2024-05-09 21:08:17 +02:00
|
|
|
const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
|
|
|
|
const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
|
2013-06-07 05:11:46 +00:00
|
|
|
struct tcphdr *th = tcp_hdr(skb);
|
|
|
|
|
2024-05-02 10:44:44 +02:00
|
|
|
if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
|
|
|
|
skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
|
|
|
|
skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
|
|
|
|
|
|
|
|
__skb_incr_checksum_unnecessary(skb);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
net-gro: Prepare GRO stack for the upcoming tunneling support
This patch modifies the GRO stack to avoid the use of "network_header"
and associated macros like ip_hdr() and ipv6_hdr() in order to allow
an arbitary number of IP hdrs (v4 or v6) to be used in the
encapsulation chain. This lays the foundation for various IP
tunneling support (IP-in-IP, GRE, VXLAN, SIT,...) to be added later.
With this patch, the GRO stack traversing now is mostly based on
skb_gro_offset rather than special hdr offsets saved in skb (e.g.,
skb->network_header). As a result all but the top layer (i.e., the
the transport layer) must have hdrs of the same length in order for
a pkt to be considered for aggregation. Therefore when adding a new
encap layer (e.g., for tunneling), one must check and skip flows
(e.g., by setting NAPI_GRO_CB(p)->same_flow to 0) that have a
different hdr length.
Note that unlike the network header, the transport header can and
will continue to be set by the GRO code since there will be at
most one "transport layer" in the encap chain.
Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-12-11 20:53:45 -08:00
|
|
|
th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
|
|
|
|
iph->daddr, 0);
|
2013-06-07 05:11:46 +00:00
|
|
|
|
2024-03-01 19:37:40 +00:00
|
|
|
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 |
|
net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment
{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags,
iph->id, ...) against all packets in a loop. These flush checks are used in
all merging UDP and TCP flows.
These checks need to be done only once and only against the found p skb,
since they only affect flush and not same_flow.
This patch leverages correct network header offsets from the cb for both
outer and inner network headers - allowing these checks to be done only
once, in tcp_gro_receive and udp_gro_receive_segment. As a result,
NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are
more declarative and contained in inet_gro_flush, thus removing the need
for flush_id in napi_gro_cb.
This results in less parsing code for non-loop flush tests for TCP and UDP
flows.
To make sure results are not within noise range - I've made netfilter drop
all TCP packets, and measured CPU performance in GRO (in this case GRO is
responsible for about 50% of the CPU utilization).
perf top while replaying 64 parallel IP/TCP streams merging in GRO:
(gro_receive_network_flush is compiled inline to tcp_gro_receive)
net-next:
6.94% [kernel] [k] inet_gro_receive
3.02% [kernel] [k] tcp_gro_receive
patch applied:
4.27% [kernel] [k] tcp_gro_receive
4.22% [kernel] [k] inet_gro_receive
perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same
results for any encapsulation, in this case inet_gro_receive is top
offender in net-next)
net-next:
10.09% [kernel] [k] inet_gro_receive
2.08% [kernel] [k] tcp_gro_receive
patch applied:
6.97% [kernel] [k] inet_gro_receive
3.68% [kernel] [k] tcp_gro_receive
Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20240509190819.2985-3-richardbgobert@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-05-09 21:08:18 +02:00
|
|
|
(NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID);
|
2016-04-10 21:44:57 -04:00
|
|
|
|
2023-05-29 16:44:30 +03:00
|
|
|
tcp_gro_complete(skb);
|
|
|
|
return 0;
|
2013-06-07 05:11:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Fill in the TCPv4 offload descriptor kept in net_hotdata with the
 * GSO segment and GRO receive/complete callbacks, then register it for
 * IPPROTO_TCP.  Returns whatever inet_add_offload() returns.
 */
int __init tcpv4_offload_init(void)
{
	struct net_offload *ops = &net_hotdata.tcpv4_offload;

	/* Whole-struct assignment: members not named here are zeroed. */
	*ops = (struct net_offload) {
		.callbacks.gro_complete	= tcp4_gro_complete,
		.callbacks.gro_receive	= tcp4_gro_receive,
		.callbacks.gso_segment	= tcp4_gso_segment,
	};

	return inet_add_offload(ops, IPPROTO_TCP);
}
|