bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
|
|
|
|
|
|
|
|
#include <linux/skmsg.h>
|
|
|
|
#include <linux/skbuff.h>
|
|
|
|
#include <linux/scatterlist.h>
|
|
|
|
|
|
|
|
#include <net/sock.h>
|
|
|
|
#include <net/tcp.h>
|
2020-05-29 23:06:59 +00:00
|
|
|
#include <net/tls.h>
|
2023-01-20 00:45:16 +00:00
|
|
|
#include <trace/events/sock.h>
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
|
|
|
|
static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
|
|
|
|
{
|
|
|
|
if (msg->sg.end > msg->sg.start &&
|
|
|
|
elem_first_coalesce < msg->sg.end)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if (msg->sg.end < msg->sg.start &&
|
|
|
|
(elem_first_coalesce > msg->sg.start ||
|
|
|
|
elem_first_coalesce < msg->sg.end))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
|
|
|
|
int elem_first_coalesce)
|
|
|
|
{
|
|
|
|
struct page_frag *pfrag = sk_page_frag(sk);
|
bpf, sockmap: Fix memleak in tcp_bpf_sendmsg while sk msg is full
If tcp_bpf_sendmsg() is running while sk msg is full. When sk_msg_alloc()
returns -ENOMEM error, tcp_bpf_sendmsg() goes to wait_for_memory. If partial
memory has been alloced by sk_msg_alloc(), that is, msg_tx->sg.size is
greater than osize after sk_msg_alloc(), memleak occurs. To fix we use
sk_msg_trim() to release the allocated memory, then goto wait for memory.
Other call paths of sk_msg_alloc() have the similar issue, such as
tls_sw_sendmsg(), so handle sk_msg_trim logic inside sk_msg_alloc(),
as Cong Wang suggested.
This issue can cause the following info:
WARNING: CPU: 3 PID: 7950 at net/core/stream.c:208 sk_stream_kill_queues+0xd4/0x1a0
Call Trace:
<TASK>
inet_csk_destroy_sock+0x55/0x110
__tcp_close+0x279/0x470
tcp_close+0x1f/0x60
inet_release+0x3f/0x80
__sock_release+0x3d/0xb0
sock_close+0x11/0x20
__fput+0x92/0x250
task_work_run+0x6a/0xa0
do_exit+0x33b/0xb60
do_group_exit+0x2f/0xa0
get_signal+0xb6/0x950
arch_do_signal_or_restart+0xac/0x2a0
exit_to_user_mode_prepare+0xa9/0x200
syscall_exit_to_user_mode+0x12/0x30
do_syscall_64+0x46/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xae
</TASK>
WARNING: CPU: 3 PID: 2094 at net/ipv4/af_inet.c:155 inet_sock_destruct+0x13c/0x260
Call Trace:
<TASK>
__sk_destruct+0x24/0x1f0
sk_psock_destroy+0x19b/0x1c0
process_one_work+0x1b3/0x3c0
kthread+0xe6/0x110
ret_from_fork+0x22/0x30
</TASK>
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220304081145.2037182-3-wangyufen@huawei.com
2022-03-04 08:11:43 +00:00
|
|
|
u32 osize = msg->sg.size;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
len -= msg->sg.size;
|
|
|
|
while (len > 0) {
|
|
|
|
struct scatterlist *sge;
|
|
|
|
u32 orig_offset;
|
|
|
|
int use, i;
|
|
|
|
|
bpf, sockmap: Fix memleak in tcp_bpf_sendmsg while sk msg is full
If tcp_bpf_sendmsg() is running while sk msg is full. When sk_msg_alloc()
returns -ENOMEM error, tcp_bpf_sendmsg() goes to wait_for_memory. If partial
memory has been alloced by sk_msg_alloc(), that is, msg_tx->sg.size is
greater than osize after sk_msg_alloc(), memleak occurs. To fix we use
sk_msg_trim() to release the allocated memory, then goto wait for memory.
Other call paths of sk_msg_alloc() have the similar issue, such as
tls_sw_sendmsg(), so handle sk_msg_trim logic inside sk_msg_alloc(),
as Cong Wang suggested.
This issue can cause the following info:
WARNING: CPU: 3 PID: 7950 at net/core/stream.c:208 sk_stream_kill_queues+0xd4/0x1a0
Call Trace:
<TASK>
inet_csk_destroy_sock+0x55/0x110
__tcp_close+0x279/0x470
tcp_close+0x1f/0x60
inet_release+0x3f/0x80
__sock_release+0x3d/0xb0
sock_close+0x11/0x20
__fput+0x92/0x250
task_work_run+0x6a/0xa0
do_exit+0x33b/0xb60
do_group_exit+0x2f/0xa0
get_signal+0xb6/0x950
arch_do_signal_or_restart+0xac/0x2a0
exit_to_user_mode_prepare+0xa9/0x200
syscall_exit_to_user_mode+0x12/0x30
do_syscall_64+0x46/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xae
</TASK>
WARNING: CPU: 3 PID: 2094 at net/ipv4/af_inet.c:155 inet_sock_destruct+0x13c/0x260
Call Trace:
<TASK>
__sk_destruct+0x24/0x1f0
sk_psock_destroy+0x19b/0x1c0
process_one_work+0x1b3/0x3c0
kthread+0xe6/0x110
ret_from_fork+0x22/0x30
</TASK>
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220304081145.2037182-3-wangyufen@huawei.com
2022-03-04 08:11:43 +00:00
|
|
|
if (!sk_page_frag_refill(sk, pfrag)) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto msg_trim;
|
|
|
|
}
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
|
|
|
|
orig_offset = pfrag->offset;
|
|
|
|
use = min_t(int, len, pfrag->size - orig_offset);
|
bpf, sockmap: Fix memleak in tcp_bpf_sendmsg while sk msg is full
If tcp_bpf_sendmsg() is running while sk msg is full. When sk_msg_alloc()
returns -ENOMEM error, tcp_bpf_sendmsg() goes to wait_for_memory. If partial
memory has been alloced by sk_msg_alloc(), that is, msg_tx->sg.size is
greater than osize after sk_msg_alloc(), memleak occurs. To fix we use
sk_msg_trim() to release the allocated memory, then goto wait for memory.
Other call paths of sk_msg_alloc() have the similar issue, such as
tls_sw_sendmsg(), so handle sk_msg_trim logic inside sk_msg_alloc(),
as Cong Wang suggested.
This issue can cause the following info:
WARNING: CPU: 3 PID: 7950 at net/core/stream.c:208 sk_stream_kill_queues+0xd4/0x1a0
Call Trace:
<TASK>
inet_csk_destroy_sock+0x55/0x110
__tcp_close+0x279/0x470
tcp_close+0x1f/0x60
inet_release+0x3f/0x80
__sock_release+0x3d/0xb0
sock_close+0x11/0x20
__fput+0x92/0x250
task_work_run+0x6a/0xa0
do_exit+0x33b/0xb60
do_group_exit+0x2f/0xa0
get_signal+0xb6/0x950
arch_do_signal_or_restart+0xac/0x2a0
exit_to_user_mode_prepare+0xa9/0x200
syscall_exit_to_user_mode+0x12/0x30
do_syscall_64+0x46/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xae
</TASK>
WARNING: CPU: 3 PID: 2094 at net/ipv4/af_inet.c:155 inet_sock_destruct+0x13c/0x260
Call Trace:
<TASK>
__sk_destruct+0x24/0x1f0
sk_psock_destroy+0x19b/0x1c0
process_one_work+0x1b3/0x3c0
kthread+0xe6/0x110
ret_from_fork+0x22/0x30
</TASK>
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220304081145.2037182-3-wangyufen@huawei.com
2022-03-04 08:11:43 +00:00
|
|
|
if (!sk_wmem_schedule(sk, use)) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto msg_trim;
|
|
|
|
}
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
|
|
|
|
i = msg->sg.end;
|
|
|
|
sk_msg_iter_var_prev(i);
|
|
|
|
sge = &msg->sg.data[i];
|
|
|
|
|
|
|
|
if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
|
|
|
|
sg_page(sge) == pfrag->page &&
|
|
|
|
sge->offset + sge->length == orig_offset) {
|
|
|
|
sge->length += use;
|
|
|
|
} else {
|
|
|
|
if (sk_msg_full(msg)) {
|
|
|
|
ret = -ENOSPC;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
sge = &msg->sg.data[msg->sg.end];
|
|
|
|
sg_unmark_end(sge);
|
|
|
|
sg_set_page(sge, pfrag->page, use, orig_offset);
|
|
|
|
get_page(pfrag->page);
|
|
|
|
sk_msg_iter_next(msg, end);
|
|
|
|
}
|
|
|
|
|
|
|
|
sk_mem_charge(sk, use);
|
|
|
|
msg->sg.size += use;
|
|
|
|
pfrag->offset += use;
|
|
|
|
len -= use;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
bpf, sockmap: Fix memleak in tcp_bpf_sendmsg while sk msg is full
If tcp_bpf_sendmsg() is running while sk msg is full. When sk_msg_alloc()
returns -ENOMEM error, tcp_bpf_sendmsg() goes to wait_for_memory. If partial
memory has been alloced by sk_msg_alloc(), that is, msg_tx->sg.size is
greater than osize after sk_msg_alloc(), memleak occurs. To fix we use
sk_msg_trim() to release the allocated memory, then goto wait for memory.
Other call paths of sk_msg_alloc() have the similar issue, such as
tls_sw_sendmsg(), so handle sk_msg_trim logic inside sk_msg_alloc(),
as Cong Wang suggested.
This issue can cause the following info:
WARNING: CPU: 3 PID: 7950 at net/core/stream.c:208 sk_stream_kill_queues+0xd4/0x1a0
Call Trace:
<TASK>
inet_csk_destroy_sock+0x55/0x110
__tcp_close+0x279/0x470
tcp_close+0x1f/0x60
inet_release+0x3f/0x80
__sock_release+0x3d/0xb0
sock_close+0x11/0x20
__fput+0x92/0x250
task_work_run+0x6a/0xa0
do_exit+0x33b/0xb60
do_group_exit+0x2f/0xa0
get_signal+0xb6/0x950
arch_do_signal_or_restart+0xac/0x2a0
exit_to_user_mode_prepare+0xa9/0x200
syscall_exit_to_user_mode+0x12/0x30
do_syscall_64+0x46/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xae
</TASK>
WARNING: CPU: 3 PID: 2094 at net/ipv4/af_inet.c:155 inet_sock_destruct+0x13c/0x260
Call Trace:
<TASK>
__sk_destruct+0x24/0x1f0
sk_psock_destroy+0x19b/0x1c0
process_one_work+0x1b3/0x3c0
kthread+0xe6/0x110
ret_from_fork+0x22/0x30
</TASK>
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220304081145.2037182-3-wangyufen@huawei.com
2022-03-04 08:11:43 +00:00
|
|
|
|
|
|
|
msg_trim:
|
|
|
|
sk_msg_trim(sk, msg, osize);
|
|
|
|
return ret;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_alloc);
|
|
|
|
|
2018-10-13 00:45:59 +00:00
|
|
|
int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
|
|
|
|
u32 off, u32 len)
|
|
|
|
{
|
|
|
|
int i = src->sg.start;
|
|
|
|
struct scatterlist *sge = sk_msg_elem(src, i);
|
2019-01-16 01:42:44 +00:00
|
|
|
struct scatterlist *sgd = NULL;
|
2018-10-13 00:45:59 +00:00
|
|
|
u32 sge_len, sge_off;
|
|
|
|
|
|
|
|
while (off) {
|
|
|
|
if (sge->length > off)
|
|
|
|
break;
|
|
|
|
off -= sge->length;
|
|
|
|
sk_msg_iter_var_next(i);
|
|
|
|
if (i == src->sg.end && off)
|
|
|
|
return -ENOSPC;
|
|
|
|
sge = sk_msg_elem(src, i);
|
|
|
|
}
|
|
|
|
|
|
|
|
while (len) {
|
|
|
|
sge_len = sge->length - off;
|
|
|
|
if (sge_len > len)
|
|
|
|
sge_len = len;
|
2019-01-16 01:42:44 +00:00
|
|
|
|
|
|
|
if (dst->sg.end)
|
|
|
|
sgd = sk_msg_elem(dst, dst->sg.end - 1);
|
|
|
|
|
|
|
|
if (sgd &&
|
|
|
|
(sg_page(sge) == sg_page(sgd)) &&
|
|
|
|
(sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) {
|
|
|
|
sgd->length += sge_len;
|
|
|
|
dst->sg.size += sge_len;
|
|
|
|
} else if (!sk_msg_full(dst)) {
|
|
|
|
sge_off = sge->offset + off;
|
|
|
|
sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off);
|
|
|
|
} else {
|
|
|
|
return -ENOSPC;
|
|
|
|
}
|
|
|
|
|
2018-10-13 00:45:59 +00:00
|
|
|
off = 0;
|
|
|
|
len -= sge_len;
|
|
|
|
sk_mem_charge(sk, sge_len);
|
|
|
|
sk_msg_iter_var_next(i);
|
|
|
|
if (i == src->sg.end && len)
|
|
|
|
return -ENOSPC;
|
|
|
|
sge = sk_msg_elem(src, i);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_clone);
|
|
|
|
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
|
|
|
|
{
|
|
|
|
int i = msg->sg.start;
|
|
|
|
|
|
|
|
do {
|
|
|
|
struct scatterlist *sge = sk_msg_elem(msg, i);
|
|
|
|
|
|
|
|
if (bytes < sge->length) {
|
|
|
|
sge->length -= bytes;
|
|
|
|
sge->offset += bytes;
|
|
|
|
sk_mem_uncharge(sk, bytes);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
sk_mem_uncharge(sk, sge->length);
|
|
|
|
bytes -= sge->length;
|
|
|
|
sge->length = 0;
|
|
|
|
sge->offset = 0;
|
|
|
|
sk_msg_iter_var_next(i);
|
|
|
|
} while (bytes && i != msg->sg.end);
|
|
|
|
msg->sg.start = i;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_return_zero);
|
|
|
|
|
|
|
|
void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
|
|
|
|
{
|
|
|
|
int i = msg->sg.start;
|
|
|
|
|
|
|
|
do {
|
|
|
|
struct scatterlist *sge = &msg->sg.data[i];
|
|
|
|
int uncharge = (bytes < sge->length) ? bytes : sge->length;
|
|
|
|
|
|
|
|
sk_mem_uncharge(sk, uncharge);
|
|
|
|
bytes -= uncharge;
|
|
|
|
sk_msg_iter_var_next(i);
|
|
|
|
} while (i != msg->sg.end);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_return);
|
|
|
|
|
|
|
|
static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
|
|
|
|
bool charge)
|
|
|
|
{
|
|
|
|
struct scatterlist *sge = sk_msg_elem(msg, i);
|
|
|
|
u32 len = sge->length;
|
|
|
|
|
2020-11-16 22:28:06 +00:00
|
|
|
/* When the skb owns the memory we free it from consume_skb path. */
|
|
|
|
if (!msg->skb) {
|
|
|
|
if (charge)
|
|
|
|
sk_mem_uncharge(sk, len);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
put_page(sg_page(sge));
|
2020-11-16 22:28:06 +00:00
|
|
|
}
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
memset(sge, 0, sizeof(*sge));
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
|
|
|
|
bool charge)
|
|
|
|
{
|
|
|
|
struct scatterlist *sge = sk_msg_elem(msg, i);
|
|
|
|
int freed = 0;
|
|
|
|
|
|
|
|
while (msg->sg.size) {
|
|
|
|
msg->sg.size -= sge->length;
|
|
|
|
freed += sk_msg_free_elem(sk, msg, i, charge);
|
|
|
|
sk_msg_iter_var_next(i);
|
|
|
|
sk_msg_check_to_free(msg, i, msg->sg.size);
|
|
|
|
sge = sk_msg_elem(msg, i);
|
|
|
|
}
|
2019-08-22 16:00:40 +00:00
|
|
|
consume_skb(msg->skb);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
sk_msg_init(msg);
|
|
|
|
return freed;
|
|
|
|
}
|
|
|
|
|
|
|
|
int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
|
|
|
|
{
|
|
|
|
return __sk_msg_free(sk, msg, msg->sg.start, false);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
|
|
|
|
|
|
|
|
int sk_msg_free(struct sock *sk, struct sk_msg *msg)
|
|
|
|
{
|
|
|
|
return __sk_msg_free(sk, msg, msg->sg.start, true);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_free);
|
|
|
|
|
|
|
|
static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
|
|
|
|
u32 bytes, bool charge)
|
|
|
|
{
|
|
|
|
struct scatterlist *sge;
|
|
|
|
u32 i = msg->sg.start;
|
|
|
|
|
|
|
|
while (bytes) {
|
|
|
|
sge = sk_msg_elem(msg, i);
|
|
|
|
if (!sge->length)
|
|
|
|
break;
|
|
|
|
if (bytes < sge->length) {
|
|
|
|
if (charge)
|
|
|
|
sk_mem_uncharge(sk, bytes);
|
|
|
|
sge->length -= bytes;
|
|
|
|
sge->offset += bytes;
|
|
|
|
msg->sg.size -= bytes;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
msg->sg.size -= sge->length;
|
|
|
|
bytes -= sge->length;
|
|
|
|
sk_msg_free_elem(sk, msg, i, charge);
|
|
|
|
sk_msg_iter_var_next(i);
|
|
|
|
sk_msg_check_to_free(msg, i, bytes);
|
|
|
|
}
|
|
|
|
msg->sg.start = i;
|
|
|
|
}
|
|
|
|
|
|
|
|
void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
|
|
|
|
{
|
|
|
|
__sk_msg_free_partial(sk, msg, bytes, true);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_free_partial);
|
|
|
|
|
|
|
|
void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
|
|
|
|
u32 bytes)
|
|
|
|
{
|
|
|
|
__sk_msg_free_partial(sk, msg, bytes, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
|
|
|
|
{
|
|
|
|
int trim = msg->sg.size - len;
|
|
|
|
u32 i = msg->sg.end;
|
|
|
|
|
|
|
|
if (trim <= 0) {
|
|
|
|
WARN_ON(trim < 0);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
sk_msg_iter_var_prev(i);
|
|
|
|
msg->sg.size = len;
|
|
|
|
while (msg->sg.data[i].length &&
|
|
|
|
trim >= msg->sg.data[i].length) {
|
|
|
|
trim -= msg->sg.data[i].length;
|
|
|
|
sk_msg_free_elem(sk, msg, i, true);
|
|
|
|
sk_msg_iter_var_prev(i);
|
|
|
|
if (!trim)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
msg->sg.data[i].length -= trim;
|
|
|
|
sk_mem_uncharge(sk, trim);
|
2019-11-04 23:36:57 +00:00
|
|
|
/* Adjust copybreak if it falls into the trimmed part of last buf */
|
|
|
|
if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length)
|
|
|
|
msg->sg.copybreak = msg->sg.data[i].length;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
out:
|
2019-11-04 23:36:57 +00:00
|
|
|
sk_msg_iter_var_next(i);
|
|
|
|
msg->sg.end = i;
|
|
|
|
|
|
|
|
/* If we trim data a full sg elem before curr pointer update
|
|
|
|
* copybreak and current so that any future copy operations
|
|
|
|
* start at new copy location.
|
2024-08-29 15:45:51 +00:00
|
|
|
* However trimmed data that has not yet been used in a copy op
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
* does not require an update.
|
|
|
|
*/
|
2019-11-04 23:36:57 +00:00
|
|
|
if (!msg->sg.size) {
|
|
|
|
msg->sg.curr = msg->sg.start;
|
|
|
|
msg->sg.copybreak = 0;
|
|
|
|
} else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >=
|
|
|
|
sk_msg_iter_dist(msg->sg.start, msg->sg.end)) {
|
|
|
|
sk_msg_iter_var_prev(i);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
msg->sg.curr = i;
|
|
|
|
msg->sg.copybreak = msg->sg.data[i].length;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_trim);
|
|
|
|
|
|
|
|
int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
|
|
|
|
struct sk_msg *msg, u32 bytes)
|
|
|
|
{
|
|
|
|
int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
|
|
|
|
const int to_max_pages = MAX_MSG_FRAGS;
|
|
|
|
struct page *pages[MAX_MSG_FRAGS];
|
|
|
|
ssize_t orig, copied, use, offset;
|
|
|
|
|
|
|
|
orig = msg->sg.size;
|
|
|
|
while (bytes > 0) {
|
|
|
|
i = 0;
|
|
|
|
maxpages = to_max_pages - num_elems;
|
|
|
|
if (maxpages == 0) {
|
|
|
|
ret = -EFAULT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2022-06-09 14:28:36 +00:00
|
|
|
copied = iov_iter_get_pages2(from, pages, bytes, maxpages,
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
&offset);
|
|
|
|
if (copied <= 0) {
|
|
|
|
ret = -EFAULT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
bytes -= copied;
|
|
|
|
msg->sg.size += copied;
|
|
|
|
|
|
|
|
while (copied) {
|
|
|
|
use = min_t(int, copied, PAGE_SIZE - offset);
|
|
|
|
sg_set_page(&msg->sg.data[msg->sg.end],
|
|
|
|
pages[i], use, offset);
|
|
|
|
sg_unmark_end(&msg->sg.data[msg->sg.end]);
|
|
|
|
sk_mem_charge(sk, use);
|
|
|
|
|
|
|
|
offset = 0;
|
|
|
|
copied -= use;
|
|
|
|
sk_msg_iter_next(msg, end);
|
|
|
|
num_elems++;
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
/* When zerocopy is mixed with sk_msg_*copy* operations we
|
|
|
|
* may have a copybreak set in this case clear and prefer
|
|
|
|
* zerocopy remainder when possible.
|
|
|
|
*/
|
|
|
|
msg->sg.copybreak = 0;
|
|
|
|
msg->sg.curr = msg->sg.end;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
/* Revert iov_iter updates, msg will need to use 'trim' later if it
|
|
|
|
* also needs to be cleared.
|
|
|
|
*/
|
|
|
|
if (ret)
|
|
|
|
iov_iter_revert(from, msg->sg.size - orig);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
|
|
|
|
|
|
|
|
int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
|
|
|
|
struct sk_msg *msg, u32 bytes)
|
|
|
|
{
|
|
|
|
int ret = -ENOSPC, i = msg->sg.curr;
|
2024-11-30 13:38:22 +00:00
|
|
|
u32 copy, buf_size, copied = 0;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
struct scatterlist *sge;
|
|
|
|
void *to;
|
|
|
|
|
|
|
|
do {
|
|
|
|
sge = sk_msg_elem(msg, i);
|
|
|
|
/* This is possible if a trim operation shrunk the buffer */
|
|
|
|
if (msg->sg.copybreak >= sge->length) {
|
|
|
|
msg->sg.copybreak = 0;
|
|
|
|
sk_msg_iter_var_next(i);
|
|
|
|
if (i == msg->sg.end)
|
|
|
|
break;
|
|
|
|
sge = sk_msg_elem(msg, i);
|
|
|
|
}
|
|
|
|
|
|
|
|
buf_size = sge->length - msg->sg.copybreak;
|
|
|
|
copy = (buf_size > bytes) ? bytes : buf_size;
|
|
|
|
to = sg_virt(sge) + msg->sg.copybreak;
|
|
|
|
msg->sg.copybreak += copy;
|
|
|
|
if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
|
|
|
|
ret = copy_from_iter_nocache(to, copy, from);
|
|
|
|
else
|
|
|
|
ret = copy_from_iter(to, copy, from);
|
|
|
|
if (ret != copy) {
|
|
|
|
ret = -EFAULT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
bytes -= copy;
|
2024-11-30 13:38:22 +00:00
|
|
|
copied += copy;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
if (!bytes)
|
|
|
|
break;
|
|
|
|
msg->sg.copybreak = 0;
|
|
|
|
sk_msg_iter_var_next(i);
|
|
|
|
} while (i != msg->sg.end);
|
|
|
|
out:
|
|
|
|
msg->sg.curr = i;
|
2024-11-30 13:38:22 +00:00
|
|
|
return (ret < 0) ? ret : copied;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
|
|
|
|
|
2021-03-31 02:32:33 +00:00
|
|
|
/* Receive sk_msg from psock->ingress_msg to @msg. */
|
|
|
|
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
|
|
|
int len, int flags)
|
|
|
|
{
|
|
|
|
struct iov_iter *iter = &msg->msg_iter;
|
|
|
|
int peek = flags & MSG_PEEK;
|
|
|
|
struct sk_msg *msg_rx;
|
|
|
|
int i, copied = 0;
|
|
|
|
|
|
|
|
msg_rx = sk_psock_peek_msg(psock);
|
|
|
|
while (copied != len) {
|
|
|
|
struct scatterlist *sge;
|
|
|
|
|
|
|
|
if (unlikely(!msg_rx))
|
|
|
|
break;
|
|
|
|
|
|
|
|
i = msg_rx->sg.start;
|
|
|
|
do {
|
|
|
|
struct page *page;
|
|
|
|
int copy;
|
|
|
|
|
|
|
|
sge = sk_msg_elem(msg_rx, i);
|
|
|
|
copy = sge->length;
|
|
|
|
page = sg_page(sge);
|
|
|
|
if (copied + copy > len)
|
|
|
|
copy = len - copied;
|
skmsg: Skip zero length skb in sk_msg_recvmsg
When running BPF selftests (./test_progs -t sockmap_basic) on a Loongarch
platform, the following kernel panic occurs:
[...]
Oops[#1]:
CPU: 22 PID: 2824 Comm: test_progs Tainted: G OE 6.10.0-rc2+ #18
Hardware name: LOONGSON Dabieshan/Loongson-TC542F0, BIOS Loongson-UDK2018
... ...
ra: 90000000048bf6c0 sk_msg_recvmsg+0x120/0x560
ERA: 9000000004162774 copy_page_to_iter+0x74/0x1c0
CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE)
PRMD: 0000000c (PPLV0 +PIE +PWE)
EUEN: 00000007 (+FPE +SXE +ASXE -BTE)
ECFG: 00071c1d (LIE=0,2-4,10-12 VS=7)
ESTAT: 00010000 [PIL] (IS= ECode=1 EsubCode=0)
BADV: 0000000000000040
PRID: 0014c011 (Loongson-64bit, Loongson-3C5000)
Modules linked in: bpf_testmod(OE) xt_CHECKSUM xt_MASQUERADE xt_conntrack
Process test_progs (pid: 2824, threadinfo=0000000000863a31, task=...)
Stack : ...
Call Trace:
[<9000000004162774>] copy_page_to_iter+0x74/0x1c0
[<90000000048bf6c0>] sk_msg_recvmsg+0x120/0x560
[<90000000049f2b90>] tcp_bpf_recvmsg_parser+0x170/0x4e0
[<90000000049aae34>] inet_recvmsg+0x54/0x100
[<900000000481ad5c>] sock_recvmsg+0x7c/0xe0
[<900000000481e1a8>] __sys_recvfrom+0x108/0x1c0
[<900000000481e27c>] sys_recvfrom+0x1c/0x40
[<9000000004c076ec>] do_syscall+0x8c/0xc0
[<9000000003731da4>] handle_syscall+0xc4/0x160
Code: ...
---[ end trace 0000000000000000 ]---
Kernel panic - not syncing: Fatal exception
Kernel relocated by 0x3510000
.text @ 0x9000000003710000
.data @ 0x9000000004d70000
.bss @ 0x9000000006469400
---[ end Kernel panic - not syncing: Fatal exception ]---
[...]
This crash happens every time when running sockmap_skb_verdict_shutdown
subtest in sockmap_basic.
This crash is because a NULL pointer is passed to page_address() in the
sk_msg_recvmsg(). Due to the different implementations depending on the
architecture, page_address(NULL) will trigger a panic on Loongarch
platform but not on x86 platform. So this bug was hidden on x86 platform
for a while, but now it is exposed on Loongarch platform. The root cause
is that a zero length skb (skb->len == 0) was put on the queue.
This zero length skb is a TCP FIN packet, which was sent by shutdown(),
invoked in test_sockmap_skb_verdict_shutdown():
shutdown(p1, SHUT_WR);
In this case, in sk_psock_skb_ingress_enqueue(), num_sge is zero, and no
page is put to this sge (see sg_set_page in sg_set_page), but this empty
sge is queued into ingress_msg list.
And in sk_msg_recvmsg(), this empty sge is used, and a NULL page is got by
sg_page(sge). Pass this NULL page to copy_page_to_iter(), which passes it
to kmap_local_page() and to page_address(), then kernel panics.
To solve this, we should skip this zero length skb. So in sk_msg_recvmsg(),
if copy is zero, that means it's a zero length skb, skip invoking
copy_page_to_iter(). We are using the EFAULT return triggered by
copy_page_to_iter to check for is_fin in tcp_bpf.c.
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Suggested-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/e3a16eacdc6740658ee02a33489b1b9d4912f378.1719992715.git.tanggeliang@kylinos.cn
2024-07-03 08:39:31 +00:00
|
|
|
if (copy)
|
|
|
|
copy = copy_page_to_iter(page, sge->offset, copy, iter);
|
2022-09-07 07:13:11 +00:00
|
|
|
if (!copy) {
|
|
|
|
copied = copied ? copied : -EFAULT;
|
|
|
|
goto out;
|
|
|
|
}
|
2021-03-31 02:32:33 +00:00
|
|
|
|
|
|
|
copied += copy;
|
|
|
|
if (likely(!peek)) {
|
|
|
|
sge->offset += copy;
|
|
|
|
sge->length -= copy;
|
tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection
When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will
be created out of the skb, and the rmem accounting of the sk_msg will be
handled by the skb.
For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting
to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to
the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result,
except for the global memory limit, the rmem of sk_redir is nearly
unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer.
Since the function sk_msg_recvmsg and __sk_psock_purge_ingress_msg are
used in these two paths. We use "msg->skb" to test whether the sk_msg is
skb backed up. If it's not, we shall do the memory accounting explicitly.
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com
2024-12-10 01:20:39 +00:00
|
|
|
if (!msg_rx->skb) {
|
2021-03-31 02:32:33 +00:00
|
|
|
sk_mem_uncharge(sk, copy);
|
tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection
When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will
be created out of the skb, and the rmem accounting of the sk_msg will be
handled by the skb.
For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting
to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to
the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result,
except for the global memory limit, the rmem of sk_redir is nearly
unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer.
Since the function sk_msg_recvmsg and __sk_psock_purge_ingress_msg are
used in these two paths. We use "msg->skb" to test whether the sk_msg is
skb backed up. If it's not, we shall do the memory accounting explicitly.
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com
2024-12-10 01:20:39 +00:00
|
|
|
atomic_sub(copy, &sk->sk_rmem_alloc);
|
|
|
|
}
|
2021-03-31 02:32:33 +00:00
|
|
|
msg_rx->sg.size -= copy;
|
|
|
|
|
|
|
|
if (!sge->length) {
|
|
|
|
sk_msg_iter_var_next(i);
|
|
|
|
if (!msg_rx->skb)
|
|
|
|
put_page(page);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* Lets not optimize peek case if copy_page_to_iter
|
|
|
|
* didn't copy the entire length lets just break.
|
|
|
|
*/
|
|
|
|
if (copy != sge->length)
|
2022-09-07 07:13:11 +00:00
|
|
|
goto out;
|
2021-03-31 02:32:33 +00:00
|
|
|
sk_msg_iter_var_next(i);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (copied == len)
|
|
|
|
break;
|
2022-08-09 09:49:15 +00:00
|
|
|
} while ((i != msg_rx->sg.end) && !sg_is_last(sge));
|
2021-03-31 02:32:33 +00:00
|
|
|
|
|
|
|
if (unlikely(peek)) {
|
|
|
|
msg_rx = sk_psock_next_msg(psock, msg_rx);
|
|
|
|
if (!msg_rx)
|
|
|
|
break;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
msg_rx->sg.start = i;
|
2022-08-09 09:49:15 +00:00
|
|
|
if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) {
|
2021-03-31 02:32:33 +00:00
|
|
|
msg_rx = sk_psock_dequeue_msg(psock);
|
|
|
|
kfree_sk_msg(msg_rx);
|
|
|
|
}
|
|
|
|
msg_rx = sk_psock_peek_msg(psock);
|
|
|
|
}
|
2022-09-07 07:13:11 +00:00
|
|
|
out:
|
2021-03-31 02:32:33 +00:00
|
|
|
return copied;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
|
|
|
|
|
2021-10-08 20:33:04 +00:00
|
|
|
bool sk_msg_is_readable(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct sk_psock *psock;
|
|
|
|
bool empty = true;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
psock = sk_psock(sk);
|
|
|
|
if (likely(psock))
|
|
|
|
empty = list_empty(&psock->ingress_msg);
|
|
|
|
rcu_read_unlock();
|
|
|
|
return !empty;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_msg_is_readable);
|
|
|
|
|
2022-10-15 21:24:41 +00:00
|
|
|
static struct sk_msg *alloc_sk_msg(gfp_t gfp)
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
{
|
|
|
|
struct sk_msg *msg;
|
|
|
|
|
2022-10-15 21:24:41 +00:00
|
|
|
msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN);
|
2022-06-15 16:20:14 +00:00
|
|
|
if (unlikely(!msg))
|
2020-11-16 22:28:46 +00:00
|
|
|
return NULL;
|
2022-06-15 16:20:14 +00:00
|
|
|
sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS);
|
|
|
|
return msg;
|
|
|
|
}
|
2020-11-16 22:28:46 +00:00
|
|
|
|
2022-06-15 16:20:14 +00:00
|
|
|
static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
|
2020-11-16 22:28:46 +00:00
|
|
|
return NULL;
|
2020-11-16 22:28:06 +00:00
|
|
|
|
2022-06-15 16:20:14 +00:00
|
|
|
if (!sk_rmem_schedule(sk, skb, skb->truesize))
|
2020-11-16 22:28:46 +00:00
|
|
|
return NULL;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
|
2022-10-15 21:24:41 +00:00
|
|
|
return alloc_sk_msg(GFP_KERNEL);
|
2020-11-16 22:28:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
|
2021-10-29 14:12:14 +00:00
|
|
|
u32 off, u32 len,
|
2020-11-16 22:28:46 +00:00
|
|
|
struct sk_psock *psock,
|
|
|
|
struct sock *sk,
|
|
|
|
struct sk_msg *msg)
|
|
|
|
{
|
2020-11-16 22:29:28 +00:00
|
|
|
int num_sge, copied;
|
2020-11-16 22:28:46 +00:00
|
|
|
|
2021-10-29 14:12:14 +00:00
|
|
|
num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
|
2022-04-27 11:51:50 +00:00
|
|
|
if (num_sge < 0) {
|
|
|
|
/* skb linearize may fail with ENOMEM, but lets simply try again
|
|
|
|
* later if this happens. Under memory pressure we don't want to
|
|
|
|
* drop the skb. We need to linearize the skb so that the mapping
|
|
|
|
* in skb_to_sgvec can not error.
|
|
|
|
*/
|
|
|
|
if (skb_linearize(skb))
|
|
|
|
return -EAGAIN;
|
|
|
|
|
|
|
|
num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
|
|
|
|
if (unlikely(num_sge < 0))
|
|
|
|
return num_sge;
|
|
|
|
}
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
|
2021-10-29 14:12:14 +00:00
|
|
|
copied = len;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
msg->sg.start = 0;
|
2019-05-13 14:19:55 +00:00
|
|
|
msg->sg.size = copied;
|
2019-11-27 20:16:41 +00:00
|
|
|
msg->sg.end = num_sge;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
msg->skb = skb;
|
|
|
|
|
|
|
|
sk_psock_queue_msg(psock, msg);
|
2018-12-20 19:35:33 +00:00
|
|
|
sk_psock_data_ready(sk, psock);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
return copied;
|
|
|
|
}
|
|
|
|
|
2021-10-29 14:12:14 +00:00
|
|
|
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
|
|
|
|
u32 off, u32 len);
|
2020-11-16 22:29:08 +00:00
|
|
|
|
2021-10-29 14:12:14 +00:00
|
|
|
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
|
|
|
|
u32 off, u32 len)
|
2020-11-16 22:28:46 +00:00
|
|
|
{
|
|
|
|
struct sock *sk = psock->sk;
|
|
|
|
struct sk_msg *msg;
|
2021-07-12 19:55:45 +00:00
|
|
|
int err;
|
2020-11-16 22:28:46 +00:00
|
|
|
|
2020-11-16 22:29:08 +00:00
|
|
|
/* If we are receiving on the same sock skb->sk is already assigned,
|
|
|
|
* skip memory accounting and owner transition seeing it already set
|
|
|
|
* correctly.
|
|
|
|
*/
|
|
|
|
if (unlikely(skb->sk == sk))
|
2021-10-29 14:12:14 +00:00
|
|
|
return sk_psock_skb_ingress_self(psock, skb, off, len);
|
2020-11-16 22:28:46 +00:00
|
|
|
msg = sk_psock_create_ingress_msg(sk, skb);
|
|
|
|
if (!msg)
|
|
|
|
return -EAGAIN;
|
|
|
|
|
|
|
|
/* This will transition ownership of the data from the socket where
|
|
|
|
* the BPF program was run initiating the redirect to the socket
|
|
|
|
* we will eventually receive this data on. The data will be released
|
|
|
|
* from skb_consume found in __tcp_bpf_recvmsg() after its been copied
|
|
|
|
* into user buffers.
|
|
|
|
*/
|
|
|
|
skb_set_owner_r(skb, sk);
|
2021-10-29 14:12:14 +00:00
|
|
|
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
|
2021-07-12 19:55:45 +00:00
|
|
|
if (err < 0)
|
|
|
|
kfree(msg);
|
|
|
|
return err;
|
2020-11-16 22:28:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Puts an skb on the ingress queue of the socket already assigned to the
|
|
|
|
* skb. In this case we do not need to check memory limits or skb_set_owner_r
|
|
|
|
* because the skb is already accounted for here.
|
|
|
|
*/
|
2021-10-29 14:12:14 +00:00
|
|
|
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
|
|
|
|
u32 off, u32 len)
|
2020-11-16 22:28:46 +00:00
|
|
|
{
|
2022-10-15 21:24:41 +00:00
|
|
|
struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
|
2020-11-16 22:28:46 +00:00
|
|
|
struct sock *sk = psock->sk;
|
2021-07-12 19:55:45 +00:00
|
|
|
int err;
|
2020-11-16 22:28:46 +00:00
|
|
|
|
|
|
|
if (unlikely(!msg))
|
|
|
|
return -EAGAIN;
|
2021-04-01 22:00:40 +00:00
|
|
|
skb_set_owner_r(skb, sk);
|
2021-10-29 14:12:14 +00:00
|
|
|
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
|
2021-07-12 19:55:45 +00:00
|
|
|
if (err < 0)
|
|
|
|
kfree(msg);
|
|
|
|
return err;
|
2020-11-16 22:28:46 +00:00
|
|
|
}
|
|
|
|
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
|
|
|
|
u32 off, u32 len, bool ingress)
|
|
|
|
{
|
2023-09-01 20:21:37 +00:00
|
|
|
int err = 0;
|
|
|
|
|
2020-10-09 18:37:17 +00:00
|
|
|
if (!ingress) {
|
|
|
|
if (!sock_writeable(psock->sk))
|
|
|
|
return -EAGAIN;
|
2021-03-31 02:32:25 +00:00
|
|
|
return skb_send_sock(psock->sk, skb, off, len);
|
2020-10-09 18:37:17 +00:00
|
|
|
}
|
2023-09-01 20:21:37 +00:00
|
|
|
skb_get(skb);
|
|
|
|
err = sk_psock_skb_ingress(psock, skb, off, len);
|
|
|
|
if (err < 0)
|
|
|
|
kfree_skb(skb);
|
|
|
|
return err;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
}
|
|
|
|
|
2021-07-27 16:04:59 +00:00
|
|
|
static void sk_psock_skb_state(struct sk_psock *psock,
|
|
|
|
struct sk_psock_work_state *state,
|
|
|
|
int len, int off)
|
|
|
|
{
|
|
|
|
spin_lock_bh(&psock->ingress_lock);
|
|
|
|
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
|
|
|
|
state->len = len;
|
|
|
|
state->off = off;
|
|
|
|
}
|
|
|
|
spin_unlock_bh(&psock->ingress_lock);
|
|
|
|
}
|
|
|
|
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
static void sk_psock_backlog(struct work_struct *work)
|
|
|
|
{
|
bpf, sockmap: Convert schedule_work into delayed_work
Sk_buffs are fed into sockmap verdict programs either from a strparser
(when the user might want to decide how framing of skb is done by attaching
another parser program) or directly through tcp_read_sock. The
tcp_read_sock is the preferred method for performance when the BPF logic is
a stream parser.
The flow for Cilium's common use case with a stream parser is,
tcp_read_sock()
sk_psock_verdict_recv
ret = bpf_prog_run_pin_on_cpu()
sk_psock_verdict_apply(sock, skb, ret)
// if system is under memory pressure or app is slow we may
// need to queue skb. Do this queuing through ingress_skb and
// then kick timer to wake up handler
skb_queue_tail(ingress_skb, skb)
schedule_work(work);
The work queue is wired up to sk_psock_backlog(). This will then walk the
ingress_skb skb list that holds our sk_buffs that could not be handled,
but should be OK to run at some later point. However, its possible that
the workqueue doing this work still hits an error when sending the skb.
When this happens the skbuff is requeued on a temporary 'state' struct
kept with the workqueue. This is necessary because its possible to
partially send an skbuff before hitting an error and we need to know how
and where to restart when the workqueue runs next.
Now for the trouble, we don't rekick the workqueue. This can cause a
stall where the skbuff we just cached on the state variable might never
be sent. This happens when its the last packet in a flow and no further
packets come along that would cause the system to kick the workqueue from
that side.
To fix we could do simple schedule_work(), but while under memory pressure
it makes sense to back off some instead of continue to retry repeatedly. So
instead to fix convert schedule_work to schedule_delayed_work and add
backoff logic to reschedule from backlog queue on errors. Its not obvious
though what a good backoff is so use '1'.
To test we observed some flakes whil running NGINX compliance test with
sockmap we attributed these failed test to this bug and subsequent issue.
>From on list discussion. This commit
bec217197b41("skmsg: Schedule psock work if the cached skb exists on the psock")
was intended to address similar race, but had a couple cases it missed.
Most obvious it only accounted for receiving traffic on the local socket
so if redirecting into another socket we could still get an sk_buff stuck
here. Next it missed the case where copied=0 in the recv() handler and
then we wouldn't kick the scheduler. Also its sub-optimal to require
userspace to kick the internal mechanisms of sockmap to wake it up and
copy data to user. It results in an extra syscall and requires the app
to actual handle the EAGAIN correctly.
Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: William Findlay <will@isovalent.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20230523025618.113937-3-john.fastabend@gmail.com
2023-05-23 02:56:06 +00:00
|
|
|
struct delayed_work *dwork = to_delayed_work(work);
|
|
|
|
struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
struct sk_psock_work_state *state = &psock->work_state;
|
2021-07-27 16:04:59 +00:00
|
|
|
struct sk_buff *skb = NULL;
|
2023-05-23 02:56:08 +00:00
|
|
|
u32 len = 0, off = 0;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
bool ingress;
|
|
|
|
int ret;
|
|
|
|
|
2021-03-31 02:32:25 +00:00
|
|
|
mutex_lock(&psock->work_mutex);
|
2023-05-23 02:56:08 +00:00
|
|
|
if (unlikely(state->len)) {
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
len = state->len;
|
|
|
|
off = state->off;
|
|
|
|
}
|
|
|
|
|
2023-05-23 02:56:08 +00:00
|
|
|
while ((skb = skb_peek(&psock->ingress_skb))) {
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
len = skb->len;
|
|
|
|
off = 0;
|
2021-10-29 14:12:14 +00:00
|
|
|
if (skb_bpf_strparser(skb)) {
|
|
|
|
struct strp_msg *stm = strp_msg(skb);
|
|
|
|
|
|
|
|
off = stm->offset;
|
|
|
|
len = stm->full_len;
|
|
|
|
}
|
2021-02-23 18:49:29 +00:00
|
|
|
ingress = skb_bpf_ingress(skb);
|
|
|
|
skb_bpf_redirect_clear(skb);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
do {
|
|
|
|
ret = -EIO;
|
2021-03-31 02:32:25 +00:00
|
|
|
if (!sock_flag(psock->sk, SOCK_DEAD))
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
ret = sk_psock_handle_skb(psock, skb, off,
|
|
|
|
len, ingress);
|
|
|
|
if (ret <= 0) {
|
|
|
|
if (ret == -EAGAIN) {
|
2023-05-23 02:56:08 +00:00
|
|
|
sk_psock_skb_state(psock, state, len, off);
|
bpf, sockmap: Convert schedule_work into delayed_work
Sk_buffs are fed into sockmap verdict programs either from a strparser
(when the user might want to decide how framing of skb is done by attaching
another parser program) or directly through tcp_read_sock. The
tcp_read_sock is the preferred method for performance when the BPF logic is
a stream parser.
The flow for Cilium's common use case with a stream parser is,
tcp_read_sock()
sk_psock_verdict_recv
ret = bpf_prog_run_pin_on_cpu()
sk_psock_verdict_apply(sock, skb, ret)
// if system is under memory pressure or app is slow we may
// need to queue skb. Do this queuing through ingress_skb and
// then kick timer to wake up handler
skb_queue_tail(ingress_skb, skb)
schedule_work(work);
The work queue is wired up to sk_psock_backlog(). This will then walk the
ingress_skb skb list that holds our sk_buffs that could not be handled,
but should be OK to run at some later point. However, its possible that
the workqueue doing this work still hits an error when sending the skb.
When this happens the skbuff is requeued on a temporary 'state' struct
kept with the workqueue. This is necessary because its possible to
partially send an skbuff before hitting an error and we need to know how
and where to restart when the workqueue runs next.
Now for the trouble, we don't rekick the workqueue. This can cause a
stall where the skbuff we just cached on the state variable might never
be sent. This happens when its the last packet in a flow and no further
packets come along that would cause the system to kick the workqueue from
that side.
To fix we could do simple schedule_work(), but while under memory pressure
it makes sense to back off some instead of continue to retry repeatedly. So
instead to fix convert schedule_work to schedule_delayed_work and add
backoff logic to reschedule from backlog queue on errors. Its not obvious
though what a good backoff is so use '1'.
To test we observed some flakes whil running NGINX compliance test with
sockmap we attributed these failed test to this bug and subsequent issue.
>From on list discussion. This commit
bec217197b41("skmsg: Schedule psock work if the cached skb exists on the psock")
was intended to address similar race, but had a couple cases it missed.
Most obvious it only accounted for receiving traffic on the local socket
so if redirecting into another socket we could still get an sk_buff stuck
here. Next it missed the case where copied=0 in the recv() handler and
then we wouldn't kick the scheduler. Also its sub-optimal to require
userspace to kick the internal mechanisms of sockmap to wake it up and
copy data to user. It results in an extra syscall and requires the app
to actual handle the EAGAIN correctly.
Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: William Findlay <will@isovalent.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20230523025618.113937-3-john.fastabend@gmail.com
2023-05-23 02:56:06 +00:00
|
|
|
|
|
|
|
/* Delay slightly to prioritize any
|
|
|
|
* other work that might be here.
|
|
|
|
*/
|
|
|
|
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
|
|
|
|
schedule_delayed_work(&psock->work, 1);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
goto end;
|
|
|
|
}
|
|
|
|
/* Hard errors break pipe and stop xmit. */
|
|
|
|
sk_psock_report_error(psock, ret ? -ret : EPIPE);
|
|
|
|
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
|
|
|
|
goto end;
|
|
|
|
}
|
|
|
|
off += ret;
|
|
|
|
len -= ret;
|
|
|
|
} while (len);
|
|
|
|
|
2023-05-23 02:56:08 +00:00
|
|
|
skb = skb_dequeue(&psock->ingress_skb);
|
2023-09-01 20:21:37 +00:00
|
|
|
kfree_skb(skb);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
}
|
|
|
|
end:
|
2021-03-31 02:32:25 +00:00
|
|
|
mutex_unlock(&psock->work_mutex);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
struct sk_psock *sk_psock_init(struct sock *sk, int node)
|
|
|
|
{
|
2020-08-21 10:29:43 +00:00
|
|
|
struct sk_psock *psock;
|
|
|
|
struct proto *prot;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
|
2020-08-21 10:29:43 +00:00
|
|
|
write_lock_bh(&sk->sk_callback_lock);
|
|
|
|
|
2022-06-20 19:13:53 +00:00
|
|
|
if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) {
|
|
|
|
psock = ERR_PTR(-EINVAL);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2020-08-21 10:29:43 +00:00
|
|
|
if (sk->sk_user_data) {
|
|
|
|
psock = ERR_PTR(-EBUSY);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node);
|
|
|
|
if (!psock) {
|
|
|
|
psock = ERR_PTR(-ENOMEM);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
prot = READ_ONCE(sk->sk_prot);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
psock->sk = sk;
|
2020-08-21 10:29:43 +00:00
|
|
|
psock->eval = __SK_NONE;
|
|
|
|
psock->sk_proto = prot;
|
|
|
|
psock->saved_unhash = prot->unhash;
|
2022-05-24 07:53:11 +00:00
|
|
|
psock->saved_destroy = prot->destroy;
|
2020-08-21 10:29:43 +00:00
|
|
|
psock->saved_close = prot->close;
|
|
|
|
psock->saved_write_space = sk->sk_write_space;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
|
|
|
|
INIT_LIST_HEAD(&psock->link);
|
|
|
|
spin_lock_init(&psock->link_lock);
|
|
|
|
|
bpf, sockmap: Convert schedule_work into delayed_work
Sk_buffs are fed into sockmap verdict programs either from a strparser
(when the user might want to decide how framing of skb is done by attaching
another parser program) or directly through tcp_read_sock. The
tcp_read_sock is the preferred method for performance when the BPF logic is
a stream parser.
The flow for Cilium's common use case with a stream parser is,
tcp_read_sock()
sk_psock_verdict_recv
ret = bpf_prog_run_pin_on_cpu()
sk_psock_verdict_apply(sock, skb, ret)
// if system is under memory pressure or app is slow we may
// need to queue skb. Do this queuing through ingress_skb and
// then kick timer to wake up handler
skb_queue_tail(ingress_skb, skb)
schedule_work(work);
The work queue is wired up to sk_psock_backlog(). This will then walk the
ingress_skb skb list that holds our sk_buffs that could not be handled,
but should be OK to run at some later point. However, its possible that
the workqueue doing this work still hits an error when sending the skb.
When this happens the skbuff is requeued on a temporary 'state' struct
kept with the workqueue. This is necessary because its possible to
partially send an skbuff before hitting an error and we need to know how
and where to restart when the workqueue runs next.
Now for the trouble, we don't rekick the workqueue. This can cause a
stall where the skbuff we just cached on the state variable might never
be sent. This happens when its the last packet in a flow and no further
packets come along that would cause the system to kick the workqueue from
that side.
To fix we could do simple schedule_work(), but while under memory pressure
it makes sense to back off some instead of continue to retry repeatedly. So
instead to fix convert schedule_work to schedule_delayed_work and add
backoff logic to reschedule from backlog queue on errors. Its not obvious
though what a good backoff is so use '1'.
To test we observed some flakes whil running NGINX compliance test with
sockmap we attributed these failed test to this bug and subsequent issue.
>From on list discussion. This commit
bec217197b41("skmsg: Schedule psock work if the cached skb exists on the psock")
was intended to address similar race, but had a couple cases it missed.
Most obvious it only accounted for receiving traffic on the local socket
so if redirecting into another socket we could still get an sk_buff stuck
here. Next it missed the case where copied=0 in the recv() handler and
then we wouldn't kick the scheduler. Also its sub-optimal to require
userspace to kick the internal mechanisms of sockmap to wake it up and
copy data to user. It results in an extra syscall and requires the app
to actual handle the EAGAIN correctly.
Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: William Findlay <will@isovalent.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20230523025618.113937-3-john.fastabend@gmail.com
2023-05-23 02:56:06 +00:00
|
|
|
INIT_DELAYED_WORK(&psock->work, sk_psock_backlog);
|
2021-03-31 02:32:25 +00:00
|
|
|
mutex_init(&psock->work_mutex);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
INIT_LIST_HEAD(&psock->ingress_msg);
|
2021-03-31 02:32:23 +00:00
|
|
|
spin_lock_init(&psock->ingress_lock);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
skb_queue_head_init(&psock->ingress_skb);
|
|
|
|
|
|
|
|
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
|
|
|
|
refcount_set(&psock->refcnt, 1);
|
|
|
|
|
net: fix refcount bug in sk_psock_get (2)
Syzkaller reports refcount bug as follows:
------------[ cut here ]------------
refcount_t: saturated; leaking memory.
WARNING: CPU: 1 PID: 3605 at lib/refcount.c:19 refcount_warn_saturate+0xf4/0x1e0 lib/refcount.c:19
Modules linked in:
CPU: 1 PID: 3605 Comm: syz-executor208 Not tainted 5.18.0-syzkaller-03023-g7e062cda7d90 #0
<TASK>
__refcount_add_not_zero include/linux/refcount.h:163 [inline]
__refcount_inc_not_zero include/linux/refcount.h:227 [inline]
refcount_inc_not_zero include/linux/refcount.h:245 [inline]
sk_psock_get+0x3bc/0x410 include/linux/skmsg.h:439
tls_data_ready+0x6d/0x1b0 net/tls/tls_sw.c:2091
tcp_data_ready+0x106/0x520 net/ipv4/tcp_input.c:4983
tcp_data_queue+0x25f2/0x4c90 net/ipv4/tcp_input.c:5057
tcp_rcv_state_process+0x1774/0x4e80 net/ipv4/tcp_input.c:6659
tcp_v4_do_rcv+0x339/0x980 net/ipv4/tcp_ipv4.c:1682
sk_backlog_rcv include/net/sock.h:1061 [inline]
__release_sock+0x134/0x3b0 net/core/sock.c:2849
release_sock+0x54/0x1b0 net/core/sock.c:3404
inet_shutdown+0x1e0/0x430 net/ipv4/af_inet.c:909
__sys_shutdown_sock net/socket.c:2331 [inline]
__sys_shutdown_sock net/socket.c:2325 [inline]
__sys_shutdown+0xf1/0x1b0 net/socket.c:2343
__do_sys_shutdown net/socket.c:2351 [inline]
__se_sys_shutdown net/socket.c:2349 [inline]
__x64_sys_shutdown+0x50/0x70 net/socket.c:2349
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x46/0xb0
</TASK>
During SMC fallback process in connect syscall, kernel will
replaces TCP with SMC. In order to forward wakeup
smc socket waitqueue after fallback, kernel will sets
clcsk->sk_user_data to origin smc socket in
smc_fback_replace_callbacks().
Later, in shutdown syscall, kernel will calls
sk_psock_get(), which treats the clcsk->sk_user_data
as psock type, triggering the refcnt warning.
So, the root cause is that smc and psock, both will use
sk_user_data field. So they will mismatch this field
easily.
This patch solves it by using another bit(defined as
SK_USER_DATA_PSOCK) in PTRMASK, to mark whether
sk_user_data points to a psock object or not.
This patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e
("net, sk_msg: Clear sk_user_data pointer on clone if tagged").
For there will possibly be more flags in the sk_user_data field,
this patch also refactor sk_user_data flags code to be more generic
to improve its maintainability.
Reported-and-tested-by: syzbot+5f26f85569bd179c18ce@syzkaller.appspotmail.com
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Hawkins Jiawei <yin31149@gmail.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-08-05 07:48:34 +00:00
|
|
|
__rcu_assign_sk_user_data_with_flags(sk, psock,
|
|
|
|
SK_USER_DATA_NOCOPY |
|
|
|
|
SK_USER_DATA_PSOCK);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
sock_hold(sk);
|
|
|
|
|
2020-08-21 10:29:43 +00:00
|
|
|
out:
|
|
|
|
write_unlock_bh(&sk->sk_callback_lock);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
return psock;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_psock_init);
|
|
|
|
|
|
|
|
struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
|
|
|
|
{
|
|
|
|
struct sk_psock_link *link;
|
|
|
|
|
|
|
|
spin_lock_bh(&psock->link_lock);
|
|
|
|
link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
|
|
|
|
list);
|
|
|
|
if (link)
|
|
|
|
list_del(&link->list);
|
|
|
|
spin_unlock_bh(&psock->link_lock);
|
|
|
|
return link;
|
|
|
|
}
|
|
|
|
|
2021-02-23 18:49:32 +00:00
|
|
|
static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
{
|
|
|
|
struct sk_msg *msg, *tmp;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
|
|
|
|
list_del(&msg->list);
|
tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection
When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will
be created out of the skb, and the rmem accounting of the sk_msg will be
handled by the skb.
For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting
to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to
the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result,
except for the global memory limit, the rmem of sk_redir is nearly
unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer.
Since the function sk_msg_recvmsg and __sk_psock_purge_ingress_msg are
used in these two paths. We use "msg->skb" to test whether the sk_msg is
skb backed up. If it's not, we shall do the memory accounting explicitly.
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com
2024-12-10 01:20:39 +00:00
|
|
|
if (!msg->skb)
|
|
|
|
atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
sk_msg_free(psock->sk, msg);
|
|
|
|
kfree(msg);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-31 02:32:25 +00:00
|
|
|
static void __sk_psock_zap_ingress(struct sk_psock *psock)
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
{
|
2021-02-23 18:49:29 +00:00
|
|
|
struct sk_buff *skb;
|
|
|
|
|
2021-03-31 02:32:22 +00:00
|
|
|
while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
|
2021-02-23 18:49:29 +00:00
|
|
|
skb_bpf_redirect_clear(skb);
|
2021-06-15 02:13:42 +00:00
|
|
|
sock_drop(psock->sk, skb);
|
2021-02-23 18:49:29 +00:00
|
|
|
}
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
__sk_psock_purge_ingress_msg(psock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void sk_psock_link_destroy(struct sk_psock *psock)
|
|
|
|
{
|
|
|
|
struct sk_psock_link *link, *tmp;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(link, tmp, &psock->link, list) {
|
|
|
|
list_del(&link->list);
|
|
|
|
sk_psock_free_link(link);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-11-02 04:34:17 +00:00
|
|
|
void sk_psock_stop(struct sk_psock *psock)
|
2021-03-31 02:32:25 +00:00
|
|
|
{
|
|
|
|
spin_lock_bh(&psock->ingress_lock);
|
|
|
|
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
|
|
|
|
sk_psock_cork_free(psock);
|
|
|
|
spin_unlock_bh(&psock->ingress_lock);
|
|
|
|
}
|
|
|
|
|
2021-02-23 18:49:26 +00:00
|
|
|
static void sk_psock_done_strp(struct sk_psock *psock);
|
|
|
|
|
2021-03-31 02:32:26 +00:00
|
|
|
static void sk_psock_destroy(struct work_struct *work)
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
{
|
2021-03-31 02:32:26 +00:00
|
|
|
struct sk_psock *psock = container_of(to_rcu_work(work),
|
|
|
|
struct sk_psock, rwork);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
/* No sk_callback_lock since already detached. */
|
2019-05-13 14:19:19 +00:00
|
|
|
|
2021-02-23 18:49:26 +00:00
|
|
|
sk_psock_done_strp(psock);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
|
bpf, sockmap: Convert schedule_work into delayed_work
Sk_buffs are fed into sockmap verdict programs either from a strparser
(when the user might want to decide how framing of skb is done by attaching
another parser program) or directly through tcp_read_sock. The
tcp_read_sock is the preferred method for performance when the BPF logic is
a stream parser.
The flow for Cilium's common use case with a stream parser is,
tcp_read_sock()
sk_psock_verdict_recv
ret = bpf_prog_run_pin_on_cpu()
sk_psock_verdict_apply(sock, skb, ret)
// if system is under memory pressure or app is slow we may
// need to queue skb. Do this queuing through ingress_skb and
// then kick timer to wake up handler
skb_queue_tail(ingress_skb, skb)
schedule_work(work);
The work queue is wired up to sk_psock_backlog(). This will then walk the
ingress_skb skb list that holds our sk_buffs that could not be handled,
but should be OK to run at some later point. However, its possible that
the workqueue doing this work still hits an error when sending the skb.
When this happens the skbuff is requeued on a temporary 'state' struct
kept with the workqueue. This is necessary because its possible to
partially send an skbuff before hitting an error and we need to know how
and where to restart when the workqueue runs next.
Now for the trouble, we don't rekick the workqueue. This can cause a
stall where the skbuff we just cached on the state variable might never
be sent. This happens when its the last packet in a flow and no further
packets come along that would cause the system to kick the workqueue from
that side.
To fix we could do simple schedule_work(), but while under memory pressure
it makes sense to back off some instead of continue to retry repeatedly. So
instead to fix convert schedule_work to schedule_delayed_work and add
backoff logic to reschedule from backlog queue on errors. Its not obvious
though what a good backoff is so use '1'.
To test we observed some flakes whil running NGINX compliance test with
sockmap we attributed these failed test to this bug and subsequent issue.
>From on list discussion. This commit
bec217197b41("skmsg: Schedule psock work if the cached skb exists on the psock")
was intended to address similar race, but had a couple cases it missed.
Most obvious it only accounted for receiving traffic on the local socket
so if redirecting into another socket we could still get an sk_buff stuck
here. Next it missed the case where copied=0 in the recv() handler and
then we wouldn't kick the scheduler. Also its sub-optimal to require
userspace to kick the internal mechanisms of sockmap to wake it up and
copy data to user. It results in an extra syscall and requires the app
to actual handle the EAGAIN correctly.
Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: William Findlay <will@isovalent.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20230523025618.113937-3-john.fastabend@gmail.com
2023-05-23 02:56:06 +00:00
|
|
|
cancel_delayed_work_sync(&psock->work);
|
2023-05-23 02:56:08 +00:00
|
|
|
__sk_psock_zap_ingress(psock);
|
2021-03-31 02:32:25 +00:00
|
|
|
mutex_destroy(&psock->work_mutex);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
|
|
|
|
psock_progs_drop(&psock->progs);
|
|
|
|
|
|
|
|
sk_psock_link_destroy(psock);
|
|
|
|
sk_psock_cork_free(psock);
|
|
|
|
|
|
|
|
if (psock->sk_redir)
|
|
|
|
sock_put(psock->sk_redir);
|
2023-11-29 01:25:56 +00:00
|
|
|
if (psock->sk_pair)
|
|
|
|
sock_put(psock->sk_pair);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
sock_put(psock->sk);
|
|
|
|
kfree(psock);
|
|
|
|
}
|
|
|
|
|
|
|
|
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
|
|
|
|
{
|
|
|
|
write_lock_bh(&sk->sk_callback_lock);
|
2019-07-19 17:29:22 +00:00
|
|
|
sk_psock_restore_proto(sk, psock);
|
|
|
|
rcu_assign_sk_user_data(sk, NULL);
|
2021-02-23 18:49:30 +00:00
|
|
|
if (psock->progs.stream_parser)
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
sk_psock_stop_strp(sk, psock);
|
2021-03-31 02:32:30 +00:00
|
|
|
else if (psock->progs.stream_verdict || psock->progs.skb_verdict)
|
2020-10-11 05:09:38 +00:00
|
|
|
sk_psock_stop_verdict(sk, psock);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
write_unlock_bh(&sk->sk_callback_lock);
|
|
|
|
|
2022-11-02 04:34:17 +00:00
|
|
|
sk_psock_stop(psock);
|
2021-07-27 16:04:58 +00:00
|
|
|
|
2021-03-31 02:32:26 +00:00
|
|
|
INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
|
|
|
|
queue_rcu_work(system_wq, &psock->rwork);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_psock_drop);
|
|
|
|
|
|
|
|
static int sk_psock_map_verd(int verdict, bool redir)
|
|
|
|
{
|
|
|
|
switch (verdict) {
|
|
|
|
case SK_PASS:
|
|
|
|
return redir ? __SK_REDIRECT : __SK_PASS;
|
|
|
|
case SK_DROP:
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return __SK_DROP;
|
|
|
|
}
|
|
|
|
|
|
|
|
int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
|
|
|
|
struct sk_msg *msg)
|
|
|
|
{
|
|
|
|
struct bpf_prog *prog;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
prog = READ_ONCE(psock->progs.msg_parser);
|
|
|
|
if (unlikely(!prog)) {
|
|
|
|
ret = __SK_PASS;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
sk_msg_compute_data_pointers(msg);
|
|
|
|
msg->sk = sk;
|
2020-02-24 14:01:43 +00:00
|
|
|
ret = bpf_prog_run_pin_on_cpu(prog, msg);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
ret = sk_psock_map_verd(ret, msg->sk_redir);
|
|
|
|
psock->apply_bytes = msg->apply_bytes;
|
|
|
|
if (ret == __SK_REDIRECT) {
|
2022-11-29 10:40:39 +00:00
|
|
|
if (psock->sk_redir) {
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
sock_put(psock->sk_redir);
|
2022-11-29 10:40:39 +00:00
|
|
|
psock->sk_redir = NULL;
|
|
|
|
}
|
|
|
|
if (!msg->sk_redir) {
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
ret = __SK_DROP;
|
|
|
|
goto out;
|
|
|
|
}
|
2022-11-29 10:40:39 +00:00
|
|
|
psock->redir_ingress = sk_msg_to_ingress(msg);
|
|
|
|
psock->sk_redir = msg->sk_redir;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
sock_hold(psock->sk_redir);
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
|
|
|
|
|
2021-06-15 02:13:41 +00:00
|
|
|
static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
{
|
|
|
|
struct sk_psock *psock_other;
|
|
|
|
struct sock *sk_other;
|
|
|
|
|
2021-02-23 18:49:29 +00:00
|
|
|
sk_other = skb_bpf_redirect_fetch(skb);
|
2020-10-09 18:37:17 +00:00
|
|
|
/* This error is a buggy BPF program, it returned a redirect
|
|
|
|
* return code, but then didn't set a redirect interface.
|
|
|
|
*/
|
2020-05-29 23:06:41 +00:00
|
|
|
if (unlikely(!sk_other)) {
|
2021-10-29 14:12:14 +00:00
|
|
|
skb_bpf_redirect_clear(skb);
|
2021-06-15 02:13:42 +00:00
|
|
|
sock_drop(from->sk, skb);
|
2021-06-15 02:13:40 +00:00
|
|
|
return -EIO;
|
2020-05-29 23:06:41 +00:00
|
|
|
}
|
|
|
|
psock_other = sk_psock(sk_other);
|
2020-10-09 18:37:17 +00:00
|
|
|
/* This error indicates the socket is being torn down or had another
|
|
|
|
* error that caused the pipe to break. We can't send a packet on
|
|
|
|
* a socket that is in this state so we drop the skb.
|
|
|
|
*/
|
2021-03-31 02:32:25 +00:00
|
|
|
if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
|
2021-06-15 02:13:38 +00:00
|
|
|
skb_bpf_redirect_clear(skb);
|
2021-06-15 02:13:42 +00:00
|
|
|
sock_drop(from->sk, skb);
|
2021-06-15 02:13:40 +00:00
|
|
|
return -EIO;
|
2021-03-31 02:32:25 +00:00
|
|
|
}
|
|
|
|
spin_lock_bh(&psock_other->ingress_lock);
|
|
|
|
if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
|
|
|
|
spin_unlock_bh(&psock_other->ingress_lock);
|
2021-06-15 02:13:38 +00:00
|
|
|
skb_bpf_redirect_clear(skb);
|
2021-06-15 02:13:42 +00:00
|
|
|
sock_drop(from->sk, skb);
|
2021-06-15 02:13:40 +00:00
|
|
|
return -EIO;
|
2020-05-29 23:06:41 +00:00
|
|
|
}
|
|
|
|
|
2020-10-09 18:37:17 +00:00
|
|
|
skb_queue_tail(&psock_other->ingress_skb, skb);
|
bpf, sockmap: Convert schedule_work into delayed_work
Sk_buffs are fed into sockmap verdict programs either from a strparser
(when the user might want to decide how framing of skb is done by attaching
another parser program) or directly through tcp_read_sock. The
tcp_read_sock is the preferred method for performance when the BPF logic is
a stream parser.
The flow for Cilium's common use case with a stream parser is,
tcp_read_sock()
sk_psock_verdict_recv
ret = bpf_prog_run_pin_on_cpu()
sk_psock_verdict_apply(sock, skb, ret)
// if system is under memory pressure or app is slow we may
// need to queue skb. Do this queuing through ingress_skb and
// then kick timer to wake up handler
skb_queue_tail(ingress_skb, skb)
schedule_work(work);
The work queue is wired up to sk_psock_backlog(). This will then walk the
ingress_skb skb list that holds our sk_buffs that could not be handled,
but should be OK to run at some later point. However, its possible that
the workqueue doing this work still hits an error when sending the skb.
When this happens the skbuff is requeued on a temporary 'state' struct
kept with the workqueue. This is necessary because its possible to
partially send an skbuff before hitting an error and we need to know how
and where to restart when the workqueue runs next.
Now for the trouble, we don't rekick the workqueue. This can cause a
stall where the skbuff we just cached on the state variable might never
be sent. This happens when its the last packet in a flow and no further
packets come along that would cause the system to kick the workqueue from
that side.
To fix we could do simple schedule_work(), but while under memory pressure
it makes sense to back off some instead of continue to retry repeatedly. So
instead to fix convert schedule_work to schedule_delayed_work and add
backoff logic to reschedule from backlog queue on errors. Its not obvious
though what a good backoff is so use '1'.
To test we observed some flakes whil running NGINX compliance test with
sockmap we attributed these failed test to this bug and subsequent issue.
>From on list discussion. This commit
bec217197b41("skmsg: Schedule psock work if the cached skb exists on the psock")
was intended to address similar race, but had a couple cases it missed.
Most obvious it only accounted for receiving traffic on the local socket
so if redirecting into another socket we could still get an sk_buff stuck
here. Next it missed the case where copied=0 in the recv() handler and
then we wouldn't kick the scheduler. Also its sub-optimal to require
userspace to kick the internal mechanisms of sockmap to wake it up and
copy data to user. It results in an extra syscall and requires the app
to actual handle the EAGAIN correctly.
Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: William Findlay <will@isovalent.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20230523025618.113937-3-john.fastabend@gmail.com
2023-05-23 02:56:06 +00:00
|
|
|
schedule_delayed_work(&psock_other->work, 0);
|
2021-03-31 02:32:25 +00:00
|
|
|
spin_unlock_bh(&psock_other->ingress_lock);
|
2021-06-15 02:13:40 +00:00
|
|
|
return 0;
|
2020-05-29 23:06:41 +00:00
|
|
|
}
|
|
|
|
|
2021-06-15 02:13:41 +00:00
|
|
|
static void sk_psock_tls_verdict_apply(struct sk_buff *skb,
|
|
|
|
struct sk_psock *from, int verdict)
|
2020-05-29 23:06:59 +00:00
|
|
|
{
|
|
|
|
switch (verdict) {
|
|
|
|
case __SK_REDIRECT:
|
2021-06-15 02:13:41 +00:00
|
|
|
sk_psock_skb_redirect(from, skb);
|
2020-05-29 23:06:59 +00:00
|
|
|
break;
|
|
|
|
case __SK_PASS:
|
|
|
|
case __SK_DROP:
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct bpf_prog *prog;
|
|
|
|
int ret = __SK_PASS;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2021-02-23 18:49:30 +00:00
|
|
|
prog = READ_ONCE(psock->progs.stream_verdict);
|
2020-05-29 23:06:59 +00:00
|
|
|
if (likely(prog)) {
|
2020-10-09 18:37:55 +00:00
|
|
|
skb->sk = psock->sk;
|
2021-02-23 18:49:29 +00:00
|
|
|
skb_dst_drop(skb);
|
|
|
|
skb_bpf_redirect_clear(skb);
|
2021-02-23 18:49:33 +00:00
|
|
|
ret = bpf_prog_run_pin_on_cpu(prog, skb);
|
2021-02-23 18:49:29 +00:00
|
|
|
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
|
2020-10-09 18:37:55 +00:00
|
|
|
skb->sk = NULL;
|
2020-05-29 23:06:59 +00:00
|
|
|
}
|
2021-06-15 02:13:41 +00:00
|
|
|
sk_psock_tls_verdict_apply(skb, psock, ret);
|
2020-05-29 23:06:59 +00:00
|
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
|
|
|
|
|
2021-06-15 02:13:40 +00:00
|
|
|
static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
|
|
|
|
int verdict)
|
2020-05-29 23:06:41 +00:00
|
|
|
{
|
|
|
|
struct sock *sk_other;
|
2021-06-15 02:13:40 +00:00
|
|
|
int err = 0;
|
2021-10-29 14:12:14 +00:00
|
|
|
u32 len, off;
|
2020-05-29 23:06:41 +00:00
|
|
|
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
switch (verdict) {
|
2018-12-20 19:35:32 +00:00
|
|
|
case __SK_PASS:
|
2021-06-15 02:13:40 +00:00
|
|
|
err = -EIO;
|
2018-12-20 19:35:32 +00:00
|
|
|
sk_other = psock->sk;
|
|
|
|
if (sock_flag(sk_other, SOCK_DEAD) ||
|
2023-05-23 02:56:12 +00:00
|
|
|
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
|
2018-12-20 19:35:32 +00:00
|
|
|
goto out_free;
|
|
|
|
|
2021-02-23 18:49:29 +00:00
|
|
|
skb_bpf_set_ingress(skb);
|
2020-10-09 18:36:37 +00:00
|
|
|
|
|
|
|
/* If the queue is empty then we can submit directly
|
|
|
|
* into the msg queue. If its not empty we have to
|
|
|
|
* queue work otherwise we may get OOO data. Otherwise,
|
|
|
|
* if sk_psock_skb_ingress errors will be handled by
|
|
|
|
* retrying later from workqueue.
|
|
|
|
*/
|
|
|
|
if (skb_queue_empty(&psock->ingress_skb)) {
|
2021-10-29 14:12:14 +00:00
|
|
|
len = skb->len;
|
|
|
|
off = 0;
|
|
|
|
if (skb_bpf_strparser(skb)) {
|
|
|
|
struct strp_msg *stm = strp_msg(skb);
|
|
|
|
|
|
|
|
off = stm->offset;
|
|
|
|
len = stm->full_len;
|
|
|
|
}
|
|
|
|
err = sk_psock_skb_ingress_self(psock, skb, off, len);
|
2020-10-09 18:36:37 +00:00
|
|
|
}
|
|
|
|
if (err < 0) {
|
2021-03-31 02:32:25 +00:00
|
|
|
spin_lock_bh(&psock->ingress_lock);
|
|
|
|
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
|
|
|
|
skb_queue_tail(&psock->ingress_skb, skb);
|
bpf, sockmap: Convert schedule_work into delayed_work
Sk_buffs are fed into sockmap verdict programs either from a strparser
(when the user might want to decide how framing of skb is done by attaching
another parser program) or directly through tcp_read_sock. The
tcp_read_sock is the preferred method for performance when the BPF logic is
a stream parser.
The flow for Cilium's common use case with a stream parser is,
tcp_read_sock()
sk_psock_verdict_recv
ret = bpf_prog_run_pin_on_cpu()
sk_psock_verdict_apply(sock, skb, ret)
// if system is under memory pressure or app is slow we may
// need to queue skb. Do this queuing through ingress_skb and
// then kick timer to wake up handler
skb_queue_tail(ingress_skb, skb)
schedule_work(work);
The work queue is wired up to sk_psock_backlog(). This will then walk the
ingress_skb skb list that holds our sk_buffs that could not be handled,
but should be OK to run at some later point. However, its possible that
the workqueue doing this work still hits an error when sending the skb.
When this happens the skbuff is requeued on a temporary 'state' struct
kept with the workqueue. This is necessary because its possible to
partially send an skbuff before hitting an error and we need to know how
and where to restart when the workqueue runs next.
Now for the trouble, we don't rekick the workqueue. This can cause a
stall where the skbuff we just cached on the state variable might never
be sent. This happens when its the last packet in a flow and no further
packets come along that would cause the system to kick the workqueue from
that side.
To fix we could do simple schedule_work(), but while under memory pressure
it makes sense to back off some instead of continue to retry repeatedly. So
instead to fix convert schedule_work to schedule_delayed_work and add
backoff logic to reschedule from backlog queue on errors. Its not obvious
though what a good backoff is so use '1'.
To test we observed some flakes whil running NGINX compliance test with
sockmap we attributed these failed test to this bug and subsequent issue.
>From on list discussion. This commit
bec217197b41("skmsg: Schedule psock work if the cached skb exists on the psock")
was intended to address similar race, but had a couple cases it missed.
Most obvious it only accounted for receiving traffic on the local socket
so if redirecting into another socket we could still get an sk_buff stuck
here. Next it missed the case where copied=0 in the recv() handler and
then we wouldn't kick the scheduler. Also its sub-optimal to require
userspace to kick the internal mechanisms of sockmap to wake it up and
copy data to user. It results in an extra syscall and requires the app
to actual handle the EAGAIN correctly.
Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: William Findlay <will@isovalent.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20230523025618.113937-3-john.fastabend@gmail.com
2023-05-23 02:56:06 +00:00
|
|
|
schedule_delayed_work(&psock->work, 0);
|
2021-06-15 02:13:39 +00:00
|
|
|
err = 0;
|
2021-03-31 02:32:25 +00:00
|
|
|
}
|
|
|
|
spin_unlock_bh(&psock->ingress_lock);
|
2023-05-23 02:56:12 +00:00
|
|
|
if (err < 0)
|
2021-06-15 02:13:39 +00:00
|
|
|
goto out_free;
|
2020-10-09 18:36:37 +00:00
|
|
|
}
|
2020-10-09 18:36:16 +00:00
|
|
|
break;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
case __SK_REDIRECT:
|
2023-05-23 02:56:12 +00:00
|
|
|
tcp_eat_skb(psock->sk, skb);
|
2021-06-15 02:13:41 +00:00
|
|
|
err = sk_psock_skb_redirect(psock, skb);
|
2020-05-29 23:06:41 +00:00
|
|
|
break;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
case __SK_DROP:
|
|
|
|
default:
|
|
|
|
out_free:
|
2023-05-23 02:56:12 +00:00
|
|
|
skb_bpf_redirect_clear(skb);
|
|
|
|
tcp_eat_skb(psock->sk, skb);
|
2021-06-15 02:13:42 +00:00
|
|
|
sock_drop(psock->sk, skb);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
}
|
2021-06-15 02:13:40 +00:00
|
|
|
|
|
|
|
return err;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
}
|
|
|
|
|
2021-02-23 18:49:26 +00:00
|
|
|
static void sk_psock_write_space(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct sk_psock *psock;
|
|
|
|
void (*write_space)(struct sock *sk) = NULL;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
psock = sk_psock(sk);
|
|
|
|
if (likely(psock)) {
|
|
|
|
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
|
bpf, sockmap: Convert schedule_work into delayed_work
Sk_buffs are fed into sockmap verdict programs either from a strparser
(when the user might want to decide how framing of skb is done by attaching
another parser program) or directly through tcp_read_sock. The
tcp_read_sock is the preferred method for performance when the BPF logic is
a stream parser.
The flow for Cilium's common use case with a stream parser is,
tcp_read_sock()
sk_psock_verdict_recv
ret = bpf_prog_run_pin_on_cpu()
sk_psock_verdict_apply(sock, skb, ret)
// if system is under memory pressure or app is slow we may
// need to queue skb. Do this queuing through ingress_skb and
// then kick timer to wake up handler
skb_queue_tail(ingress_skb, skb)
schedule_work(work);
The work queue is wired up to sk_psock_backlog(). This will then walk the
ingress_skb skb list that holds our sk_buffs that could not be handled,
but should be OK to run at some later point. However, its possible that
the workqueue doing this work still hits an error when sending the skb.
When this happens the skbuff is requeued on a temporary 'state' struct
kept with the workqueue. This is necessary because its possible to
partially send an skbuff before hitting an error and we need to know how
and where to restart when the workqueue runs next.
Now for the trouble, we don't rekick the workqueue. This can cause a
stall where the skbuff we just cached on the state variable might never
be sent. This happens when its the last packet in a flow and no further
packets come along that would cause the system to kick the workqueue from
that side.
To fix we could do simple schedule_work(), but while under memory pressure
it makes sense to back off some instead of continue to retry repeatedly. So
instead to fix convert schedule_work to schedule_delayed_work and add
backoff logic to reschedule from backlog queue on errors. Its not obvious
though what a good backoff is so use '1'.
To test we observed some flakes whil running NGINX compliance test with
sockmap we attributed these failed test to this bug and subsequent issue.
>From on list discussion. This commit
bec217197b41("skmsg: Schedule psock work if the cached skb exists on the psock")
was intended to address similar race, but had a couple cases it missed.
Most obvious it only accounted for receiving traffic on the local socket
so if redirecting into another socket we could still get an sk_buff stuck
here. Next it missed the case where copied=0 in the recv() handler and
then we wouldn't kick the scheduler. Also its sub-optimal to require
userspace to kick the internal mechanisms of sockmap to wake it up and
copy data to user. It results in an extra syscall and requires the app
to actual handle the EAGAIN correctly.
Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: William Findlay <will@isovalent.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20230523025618.113937-3-john.fastabend@gmail.com
2023-05-23 02:56:06 +00:00
|
|
|
schedule_delayed_work(&psock->work, 0);
|
2021-02-23 18:49:26 +00:00
|
|
|
write_space = psock->saved_write_space;
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
if (write_space)
|
|
|
|
write_space(sk);
|
|
|
|
}
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
|
|
|
|
{
|
2020-06-25 23:13:18 +00:00
|
|
|
struct sk_psock *psock;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
struct bpf_prog *prog;
|
|
|
|
int ret = __SK_DROP;
|
2020-06-25 23:13:18 +00:00
|
|
|
struct sock *sk;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
|
|
|
|
rcu_read_lock();
|
2020-06-25 23:13:18 +00:00
|
|
|
sk = strp->sk;
|
|
|
|
psock = sk_psock(sk);
|
|
|
|
if (unlikely(!psock)) {
|
2021-06-15 02:13:42 +00:00
|
|
|
sock_drop(sk, skb);
|
2020-06-25 23:13:18 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2021-02-23 18:49:30 +00:00
|
|
|
prog = READ_ONCE(psock->progs.stream_verdict);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
if (likely(prog)) {
|
2021-04-01 22:00:40 +00:00
|
|
|
skb->sk = sk;
|
2021-02-23 18:49:29 +00:00
|
|
|
skb_dst_drop(skb);
|
|
|
|
skb_bpf_redirect_clear(skb);
|
2021-02-23 18:49:33 +00:00
|
|
|
ret = bpf_prog_run_pin_on_cpu(prog, skb);
|
2023-05-23 02:56:12 +00:00
|
|
|
skb_bpf_set_strparser(skb);
|
2021-02-23 18:49:29 +00:00
|
|
|
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
|
2021-04-01 22:00:40 +00:00
|
|
|
skb->sk = NULL;
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
}
|
|
|
|
sk_psock_verdict_apply(psock, skb, ret);
|
2020-06-25 23:13:18 +00:00
|
|
|
out:
|
bpf, sockmap: RCU splat with redirect and strparser error or TLS
There are two paths to generate the below RCU splat the first and
most obvious is the result of the BPF verdict program issuing a
redirect on a TLS socket (This is the splat shown below). Unlike
the non-TLS case the caller of the *strp_read() hooks does not
wrap the call in a rcu_read_lock/unlock. Then if the BPF program
issues a redirect action we hit the RCU splat.
However, in the non-TLS socket case the splat appears to be
relatively rare, because the skmsg caller into the strp_data_ready()
is wrapped in a rcu_read_lock/unlock. Shown here,
static void sk_psock_strp_data_ready(struct sock *sk)
{
struct sk_psock *psock;
rcu_read_lock();
psock = sk_psock(sk);
if (likely(psock)) {
if (tls_sw_has_ctx_rx(sk)) {
psock->parser.saved_data_ready(sk);
} else {
write_lock_bh(&sk->sk_callback_lock);
strp_data_ready(&psock->parser.strp);
write_unlock_bh(&sk->sk_callback_lock);
}
}
rcu_read_unlock();
}
If the above was the only way to run the verdict program we
would be safe. But, there is a case where the strparser may throw an
ENOMEM error while parsing the skb. This is a result of a failed
skb_clone, or alloc_skb_for_msg while building a new merged skb when
the msg length needed spans multiple skbs. This will in turn put the
skb on the strp_wrk workqueue in the strparser code. The skb will
later be dequeued and verdict programs run, but now from a
different context without the rcu_read_lock()/unlock() critical
section in sk_psock_strp_data_ready() shown above. In practice
I have not seen this yet, because as far as I know most users of the
verdict programs are also only working on single skbs. In this case no
merge happens which could trigger the above ENOMEM errors. In addition
the system would need to be under memory pressure. For example, we
can't hit the above case in selftests because we missed having tests
to merge skbs. (Added in later patch)
To fix the below splat extend the rcu_read_lock/unnlock block to
include the call to sk_psock_tls_verdict_apply(). This will fix both
TLS redirect case and non-TLS redirect+error case. Also remove
psock from the sk_psock_tls_verdict_apply() function signature its
not used there.
[ 1095.937597] WARNING: suspicious RCU usage
[ 1095.940964] 5.7.0-rc7-02911-g463bac5f1ca79 #1 Tainted: G W
[ 1095.944363] -----------------------------
[ 1095.947384] include/linux/skmsg.h:284 suspicious rcu_dereference_check() usage!
[ 1095.950866]
[ 1095.950866] other info that might help us debug this:
[ 1095.950866]
[ 1095.957146]
[ 1095.957146] rcu_scheduler_active = 2, debug_locks = 1
[ 1095.961482] 1 lock held by test_sockmap/15970:
[ 1095.964501] #0: ffff9ea6b25de660 (sk_lock-AF_INET){+.+.}-{0:0}, at: tls_sw_recvmsg+0x13a/0x840 [tls]
[ 1095.968568]
[ 1095.968568] stack backtrace:
[ 1095.975001] CPU: 1 PID: 15970 Comm: test_sockmap Tainted: G W 5.7.0-rc7-02911-g463bac5f1ca79 #1
[ 1095.977883] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
[ 1095.980519] Call Trace:
[ 1095.982191] dump_stack+0x8f/0xd0
[ 1095.984040] sk_psock_skb_redirect+0xa6/0xf0
[ 1095.986073] sk_psock_tls_strp_read+0x1d8/0x250
[ 1095.988095] tls_sw_recvmsg+0x714/0x840 [tls]
v2: Improve commit message to identify non-TLS redirect plus error case
condition as well as more common TLS case. In the process I decided
doing the rcu_read_unlock followed by the lock/unlock inside branches
was unnecessarily complex. We can just extend the current rcu block
and get the same effeective without the shuffling and branching.
Thanks Martin!
Fixes: e91de6afa81c1 ("bpf: Fix running sk_skb program types with ktls")
Reported-by: Jakub Sitnicki <jakub@cloudflare.com>
Reported-by: kernel test robot <rong.a.chen@intel.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/159312677907.18340.11064813152758406626.stgit@john-XPS-13-9370
2020-06-25 23:12:59 +00:00
|
|
|
rcu_read_unlock();
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int sk_psock_strp_read_done(struct strparser *strp, int err)
|
|
|
|
{
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
|
|
|
|
{
|
2021-02-23 18:49:27 +00:00
|
|
|
struct sk_psock *psock = container_of(strp, struct sk_psock, strp);
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
struct bpf_prog *prog;
|
|
|
|
int ret = skb->len;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2021-02-23 18:49:30 +00:00
|
|
|
prog = READ_ONCE(psock->progs.stream_parser);
|
2020-10-09 18:37:55 +00:00
|
|
|
if (likely(prog)) {
|
|
|
|
skb->sk = psock->sk;
|
2021-02-23 18:49:33 +00:00
|
|
|
ret = bpf_prog_run_pin_on_cpu(prog, skb);
|
2020-10-09 18:37:55 +00:00
|
|
|
skb->sk = NULL;
|
|
|
|
}
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Called with socket lock held. */
|
2018-12-20 19:35:33 +00:00
|
|
|
static void sk_psock_strp_data_ready(struct sock *sk)
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
{
|
|
|
|
struct sk_psock *psock;
|
|
|
|
|
2023-01-20 00:45:16 +00:00
|
|
|
trace_sk_data_ready(sk);
|
|
|
|
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
rcu_read_lock();
|
|
|
|
psock = sk_psock(sk);
|
|
|
|
if (likely(psock)) {
|
2020-05-29 23:06:59 +00:00
|
|
|
if (tls_sw_has_ctx_rx(sk)) {
|
2021-02-23 18:49:27 +00:00
|
|
|
psock->saved_data_ready(sk);
|
2020-05-29 23:06:59 +00:00
|
|
|
} else {
|
2024-11-18 03:09:09 +00:00
|
|
|
read_lock_bh(&sk->sk_callback_lock);
|
2021-02-23 18:49:27 +00:00
|
|
|
strp_data_ready(&psock->strp);
|
2024-11-18 03:09:09 +00:00
|
|
|
read_unlock_bh(&sk->sk_callback_lock);
|
2020-05-29 23:06:59 +00:00
|
|
|
}
|
bpf, sockmap: convert to generic sk_msg interface
Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.
This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.
Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.
The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.
Joint work with John.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-10-13 00:45:58 +00:00
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2021-02-23 18:49:26 +00:00
|
|
|
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
|
|
|
|
{
|
2023-08-04 07:37:38 +00:00
|
|
|
int ret;
|
|
|
|
|
2021-02-23 18:49:26 +00:00
|
|
|
static const struct strp_callbacks cb = {
|
|
|
|
.rcv_msg = sk_psock_strp_read,
|
|
|
|
.read_sock_done = sk_psock_strp_read_done,
|
|
|
|
.parse_msg = sk_psock_strp_parse,
|
|
|
|
};
|
|
|
|
|
2023-08-04 07:37:38 +00:00
|
|
|
ret = strp_init(&psock->strp, sk, &cb);
|
|
|
|
if (!ret)
|
|
|
|
sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED);
|
|
|
|
|
|
|
|
return ret;
|
2021-02-23 18:49:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
|
|
|
|
{
|
2021-02-23 18:49:27 +00:00
|
|
|
if (psock->saved_data_ready)
|
2021-02-23 18:49:26 +00:00
|
|
|
return;
|
|
|
|
|
2021-02-23 18:49:27 +00:00
|
|
|
psock->saved_data_ready = sk->sk_data_ready;
|
2021-02-23 18:49:26 +00:00
|
|
|
sk->sk_data_ready = sk_psock_strp_data_ready;
|
|
|
|
sk->sk_write_space = sk_psock_write_space;
|
|
|
|
}
|
|
|
|
|
|
|
|
void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
|
|
|
|
{
|
2021-11-19 18:14:18 +00:00
|
|
|
psock_set_prog(&psock->progs.stream_parser, NULL);
|
|
|
|
|
2021-02-23 18:49:27 +00:00
|
|
|
if (!psock->saved_data_ready)
|
2021-02-23 18:49:26 +00:00
|
|
|
return;
|
|
|
|
|
2021-02-23 18:49:27 +00:00
|
|
|
sk->sk_data_ready = psock->saved_data_ready;
|
|
|
|
psock->saved_data_ready = NULL;
|
|
|
|
strp_stop(&psock->strp);
|
2021-02-23 18:49:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void sk_psock_done_strp(struct sk_psock *psock)
|
|
|
|
{
|
|
|
|
/* Parser has been stopped */
|
2023-08-04 07:37:38 +00:00
|
|
|
if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED))
|
2021-02-23 18:49:27 +00:00
|
|
|
strp_done(&psock->strp);
|
2021-02-23 18:49:26 +00:00
|
|
|
}
|
|
|
|
#else
|
|
|
|
static void sk_psock_done_strp(struct sk_psock *psock)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_BPF_STREAM_PARSER */
|
|
|
|
|
2022-06-15 16:20:12 +00:00
|
|
|
static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
|
2020-10-11 05:09:38 +00:00
|
|
|
{
|
|
|
|
struct sk_psock *psock;
|
|
|
|
struct bpf_prog *prog;
|
|
|
|
int ret = __SK_DROP;
|
2022-06-15 16:20:12 +00:00
|
|
|
int len = skb->len;
|
2020-10-11 05:09:38 +00:00
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
psock = sk_psock(sk);
|
|
|
|
if (unlikely(!psock)) {
|
|
|
|
len = 0;
|
2023-05-23 02:56:12 +00:00
|
|
|
tcp_eat_skb(sk, skb);
|
2021-06-15 02:13:42 +00:00
|
|
|
sock_drop(sk, skb);
|
2020-10-11 05:09:38 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2021-02-23 18:49:30 +00:00
|
|
|
prog = READ_ONCE(psock->progs.stream_verdict);
|
2021-03-31 02:32:30 +00:00
|
|
|
if (!prog)
|
|
|
|
prog = READ_ONCE(psock->progs.skb_verdict);
|
2020-10-11 05:09:38 +00:00
|
|
|
if (likely(prog)) {
|
2021-02-23 18:49:29 +00:00
|
|
|
skb_dst_drop(skb);
|
|
|
|
skb_bpf_redirect_clear(skb);
|
2021-02-23 18:49:33 +00:00
|
|
|
ret = bpf_prog_run_pin_on_cpu(prog, skb);
|
2021-02-23 18:49:29 +00:00
|
|
|
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
|
2020-10-11 05:09:38 +00:00
|
|
|
}
|
2022-08-17 19:54:45 +00:00
|
|
|
ret = sk_psock_verdict_apply(psock, skb, ret);
|
|
|
|
if (ret < 0)
|
|
|
|
len = ret;
|
2020-10-11 05:09:38 +00:00
|
|
|
out:
|
|
|
|
rcu_read_unlock();
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void sk_psock_verdict_data_ready(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct socket *sock = sk->sk_socket;
|
2023-08-08 13:58:09 +00:00
|
|
|
const struct proto_ops *ops;
|
2023-05-23 02:56:11 +00:00
|
|
|
int copied;
|
2020-10-11 05:09:38 +00:00
|
|
|
|
2023-01-20 00:45:16 +00:00
|
|
|
trace_sk_data_ready(sk);
|
|
|
|
|
2023-08-08 13:58:09 +00:00
|
|
|
if (unlikely(!sock))
|
2020-10-11 05:09:38 +00:00
|
|
|
return;
|
2023-08-08 13:58:09 +00:00
|
|
|
ops = READ_ONCE(sock->ops);
|
|
|
|
if (!ops || !ops->read_skb)
|
|
|
|
return;
|
|
|
|
copied = ops->read_skb(sk, sk_psock_verdict_recv);
|
2023-05-23 02:56:11 +00:00
|
|
|
if (copied >= 0) {
|
|
|
|
struct sk_psock *psock;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
psock = sk_psock(sk);
|
2024-04-04 02:10:01 +00:00
|
|
|
if (psock)
|
2024-02-18 15:09:33 +00:00
|
|
|
sk_psock_data_ready(sk, psock);
|
2023-05-23 02:56:11 +00:00
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
2020-10-11 05:09:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
|
|
|
|
{
|
2021-02-23 18:49:27 +00:00
|
|
|
if (psock->saved_data_ready)
|
2020-10-11 05:09:38 +00:00
|
|
|
return;
|
|
|
|
|
2021-02-23 18:49:27 +00:00
|
|
|
psock->saved_data_ready = sk->sk_data_ready;
|
2020-10-11 05:09:38 +00:00
|
|
|
sk->sk_data_ready = sk_psock_verdict_data_ready;
|
|
|
|
sk->sk_write_space = sk_psock_write_space;
|
|
|
|
}
|
|
|
|
|
|
|
|
void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
|
|
|
|
{
|
2021-11-19 18:14:18 +00:00
|
|
|
psock_set_prog(&psock->progs.stream_verdict, NULL);
|
|
|
|
psock_set_prog(&psock->progs.skb_verdict, NULL);
|
|
|
|
|
2021-02-23 18:49:27 +00:00
|
|
|
if (!psock->saved_data_ready)
|
2020-10-11 05:09:38 +00:00
|
|
|
return;
|
|
|
|
|
2021-02-23 18:49:27 +00:00
|
|
|
sk->sk_data_ready = psock->saved_data_ready;
|
|
|
|
psock->saved_data_ready = NULL;
|
2020-10-11 05:09:38 +00:00
|
|
|
}
|