bpf-for-netdev

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQTFp0I1jqZrAX+hPRXbK58LschIgwUCZRqk1wAKCRDbK58LschI
 g8GRAQC4E0bw6BTFRl0b3MxvpZES6lU0BUtX2gKVK4tLZdXw/wEAmTlBXQqNzF3b
 BkCQknVbFTSw/8l8pzUW123Fb46wUAQ=
 =E3hd
 -----END PGP SIGNATURE-----

Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2023-10-02

We've added 11 non-merge commits during the last 12 day(s) which contain
a total of 12 files changed, 176 insertions(+), 41 deletions(-).

The main changes are:

1) Fix BPF verifier to reset backtrack_state masks on global function
   exit as otherwise subsequent precision tracking would reuse them,
   from Andrii Nakryiko.

2) Several sockmap fixes for available bytes accounting,
   from John Fastabend.

3) Reject sk_msg egress redirects to non-TCP sockets given this
   is only supported for TCP sockets today, from Jakub Sitnicki.

4) Fix a syzkaller splat in bpf_mprog when hitting maximum program
   limits with BPF_F_BEFORE directive, from Daniel Borkmann
   and Nikolay Aleksandrov.

5) Fix BPF memory allocator to use kmalloc_size_roundup() to adjust
   size_index for selecting a bpf_mem_cache, from Hou Tao.

6) Fix arch_prepare_bpf_trampoline return code for s390 JIT,
   from Song Liu.

7) Fix bpf_trampoline_get when CONFIG_BPF_JIT is turned off,
   from Leon Hwang.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  bpf: Use kmalloc_size_roundup() to adjust size_index
  selftest/bpf: Add various selftests for program limits
  bpf, mprog: Fix maximum program check on mprog attachment
  bpf, sockmap: Reject sk_msg egress redirects to non-TCP sockets
  bpf, sockmap: Add tests for MSG_F_PEEK
  bpf, sockmap: Do not inc copied_seq when PEEK flag set
  bpf: tcp_read_skb needs to pop skb regardless of seq
  bpf: unconditionally reset backtrack_state masks on global func exit
  bpf: Fix tr dereferencing
  selftests/bpf: Check bpf_cubic_acked() is called via struct_ops
  s390/bpf: Let arch_prepare_bpf_trampoline return program size
====================

Link: https://lore.kernel.org/r/20231002113417.2309-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

commit 1eb3dee16a (Jakub Kicinski <kuba@kernel.org>, 2023-10-04 08:28:07 -07:00)
12 changed files with 176 additions and 41 deletions

@@ -2513,7 +2513,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
return -E2BIG;
}
return ret;
return tjit.common.prg;
}
bool bpf_jit_supports_subprog_tailcalls(void)
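
The generic code treats the return value of arch_prepare_bpf_trampoline() as the size of the generated code and, for struct_ops, advances through the shared image by that amount, which is presumably why returning ret (0 on success) broke things on s390. Below is a small userspace sketch of that calling convention only; the names and payloads are made up and it is not the kernel's code.

#include <stdio.h>
#include <string.h>

/*
 * Userspace illustration only; names and payloads are made up and this is
 * not the kernel's code.  An emitter that returns the number of bytes it
 * produced lets the caller pack several blobs into one image and advance
 * correctly; returning 0 on success would leave the caller stuck at
 * offset 0, overwriting earlier blobs.
 */
static int emit_trampoline(char *image, const char *payload)
{
	int len = (int)strlen(payload) + 1;	/* include the terminator */

	memcpy(image, payload, len);
	return len;				/* "program size", not 0 */
}

int main(void)
{
	char image[128];
	int off = 0, first;

	first = emit_trampoline(image + off, "trampoline-for-first-member");
	off += first;
	off += emit_trampoline(image + off, "trampoline-for-second-member");

	printf("at offset 0:   %s\n", image);
	printf("at offset %2d: %s\n", first, image + first);
	printf("total bytes used: %d\n", off);
	return 0;
}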

@@ -1307,7 +1307,7 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
static inline struct bpf_trampoline *bpf_trampoline_get(u64 key,
struct bpf_attach_target_info *tgt_info)
{
return ERR_PTR(-EOPNOTSUPP);
return NULL;
}
static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
#define DEFINE_BPF_DISPATCHER(name)
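
The hunk above changes the !CONFIG_BPF_JIT stub of bpf_trampoline_get() to return NULL instead of an error pointer. A minimal userspace sketch of why that matters, assuming (as the "bpf: Fix tr dereferencing" subject suggests) that callers only NULL-check the result before dereferencing it; ERR_PTR()/IS_ERR() are re-implemented here purely for the demo.

#include <stdio.h>

/* Simplified userspace stand-ins for the kernel's ERR_PTR()/IS_ERR(). */
#define EOPNOTSUPP	95
#define ERR_PTR(err)	((void *)(long)(err))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-4095)

struct bpf_trampoline { int dummy; };

/* Stub as it looked before the fix: returns an error pointer. */
static struct bpf_trampoline *get_stub_old(void)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* Stub after the fix: returns NULL, which is what callers test for. */
static struct bpf_trampoline *get_stub_new(void)
{
	return NULL;
}

int main(void)
{
	struct bpf_trampoline *tr;

	tr = get_stub_old();
	printf("old stub: !tr = %d, IS_ERR(tr) = %d -> a plain NULL check misses the error\n",
	       !tr, (int)IS_ERR(tr));

	tr = get_stub_new();
	printf("new stub: !tr = %d -> the caller bails out before dereferencing\n", !tr);
	return 0;
}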

@@ -965,37 +965,31 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
/* Most of the logic is taken from setup_kmalloc_cache_index_table() */
static __init int bpf_mem_cache_adjust_size(void)
{
unsigned int size, index;
unsigned int size;
/* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be
* up-to 256-bytes.
/* Adjusting the indexes in size_index() according to the object_size
* of underlying slab cache, so bpf_mem_alloc() will select a
* bpf_mem_cache with unit_size equal to the object_size of
* the underlying slab cache.
*
* The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is
* 256-bytes, so only do adjustment for [8-bytes, 192-bytes].
*/
size = KMALLOC_MIN_SIZE;
if (size <= 192)
index = size_index[(size - 1) / 8];
else
index = fls(size - 1) - 1;
for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8)
size_index[(size - 1) / 8] = index;
for (size = 192; size >= 8; size -= 8) {
unsigned int kmalloc_size, index;
/* The minimal alignment is 64-bytes, so disable 96-bytes cache and
* use 128-bytes cache instead.
*/
if (KMALLOC_MIN_SIZE >= 64) {
index = size_index[(128 - 1) / 8];
for (size = 64 + 8; size <= 96; size += 8)
size_index[(size - 1) / 8] = index;
}
kmalloc_size = kmalloc_size_roundup(size);
if (kmalloc_size == size)
continue;
/* The minimal alignment is 128-bytes, so disable 192-bytes cache and
* use 256-bytes cache instead.
*/
if (KMALLOC_MIN_SIZE >= 128) {
index = fls(256 - 1) - 1;
for (size = 128 + 8; size <= 192; size += 8)
if (kmalloc_size <= 192)
index = size_index[(kmalloc_size - 1) / 8];
else
index = fls(kmalloc_size - 1) - 1;
/* Only overwrite if necessary */
if (size_index[(size - 1) / 8] != index)
size_index[(size - 1) / 8] = index;
}
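
A simplified userspace sketch of the idea behind the rewritten loop above: for every request size, ask what object size the slab allocator would really back it with (kmalloc_size_roundup() in the kernel; a made-up power-of-two stand-in with a 64-byte minimum here) and select the cache for that object size, so the chosen bpf_mem_cache's unit_size matches the underlying slab's object_size. The sketch does not reproduce the kernel's size_index table, only the selection idea.

#include <stdio.h>

/*
 * Hypothetical stand-in for kmalloc_size_roundup(): assume power-of-two
 * slab classes with a 64-byte minimum, purely for illustration.  The real
 * result depends on the running kernel's kmalloc configuration.
 */
static unsigned int demo_kmalloc_size_roundup(unsigned int size)
{
	unsigned int object_size = 64;

	while (object_size < size)
		object_size <<= 1;
	return object_size;
}

int main(void)
{
	unsigned int size;

	/*
	 * For each request size that is really backed by a bigger slab
	 * object, select the cache of that bigger size, mirroring the
	 * kmalloc_size_roundup()-based adjustment in the hunk above.
	 */
	for (size = 8; size <= 192; size += 8) {
		unsigned int object_size = demo_kmalloc_size_roundup(size);

		if (object_size == size)
			continue;
		printf("request %3u bytes -> %3u-byte slab objects -> use the %3u-byte cache\n",
		       size, object_size, object_size);
	}
	return 0;
}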

@@ -253,6 +253,9 @@ int bpf_mprog_attach(struct bpf_mprog_entry *entry,
goto out;
}
idx = tidx;
} else if (bpf_mprog_total(entry) == bpf_mprog_max()) {
ret = -ERANGE;
goto out;
}
if (flags & BPF_F_BEFORE) {
tidx = bpf_mprog_pos_before(entry, &rtuple);
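
A toy model of the check added above: when the entry is already at the maximum number of programs, the attach has to fail with -ERANGE up front rather than fall through to the BPF_F_BEFORE/BPF_F_AFTER position lookup, which is the path the syzkaller splat hit. The limit below is a stand-in value and the surrounding conditions are simplified; it is not the kernel's bpf_mprog code.

#include <errno.h>
#include <stdio.h>

#define DEMO_MPROG_MAX 64	/* stand-in limit, not the kernel's constant */

/* total = programs already attached; returns 0 or a negative errno. */
static int demo_mprog_attach(int total)
{
	if (total >= DEMO_MPROG_MAX)
		return -ERANGE;	/* full: reject before resolving BPF_F_BEFORE/AFTER */
	return 0;		/* room left: proceed with the normal insertion path */
}

int main(void)
{
	printf("attach with %d/%d slots used -> %d\n",
	       DEMO_MPROG_MAX - 1, DEMO_MPROG_MAX, demo_mprog_attach(DEMO_MPROG_MAX - 1));
	printf("attach with %d/%d slots used -> %d (-ERANGE is %d)\n",
	       DEMO_MPROG_MAX, DEMO_MPROG_MAX, demo_mprog_attach(DEMO_MPROG_MAX), -ERANGE);
	return 0;
}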

@@ -4047,11 +4047,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
bitmap_from_u64(mask, bt_reg_mask(bt));
for_each_set_bit(i, mask, 32) {
reg = &st->frame[0]->regs[i];
if (reg->type != SCALAR_VALUE) {
bt_clear_reg(bt, i);
continue;
}
reg->precise = true;
bt_clear_reg(bt, i);
if (reg->type == SCALAR_VALUE)
reg->precise = true;
}
return 0;
}
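
A toy model of the bookkeeping change above: every register that gets processed must have its bit cleared from the requested-precision mask, not only the non-scalar ones, otherwise the bits for scalar registers stay set and are reused by later precision tracking (the stale-mask problem described in the summary). The mask handling below is invented for the demo and far simpler than the verifier's backtrack_state.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Toy model only, invented for the demo.  Bit i of the mask means
 * "register i still needs precision backtracking"; processing a register
 * must always drop its bit, whatever the register type is.
 */
static uint32_t process_frame0_regs(uint32_t mask, const bool *is_scalar, bool fixed)
{
	for (int i = 0; i < 32; i++) {
		if (!(mask & (1u << i)))
			continue;
		if (fixed) {
			/* fixed behaviour: always clear the bit, then mark
			 * precision only for scalar registers (not modelled) */
			mask &= ~(1u << i);
		} else if (!is_scalar[i]) {
			/* old behaviour: only non-scalar registers had their
			 * bit cleared; bits for scalar registers stayed set */
			mask &= ~(1u << i);
		}
	}
	return mask;
}

int main(void)
{
	bool is_scalar[32] = { true, false, true };	/* r0 and r2 are scalars */
	uint32_t mask = 0x7;				/* precision asked for r0-r2 */

	printf("leftover mask, old behaviour: 0x%x (stale bits for r0/r2)\n",
	       (unsigned int)process_frame0_regs(mask, is_scalar, false));
	printf("leftover mask, fixed:         0x%x\n",
	       (unsigned int)process_frame0_regs(mask, is_scalar, true));
	return 0;
}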

@@ -668,6 +668,8 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
sk = __sock_map_lookup_elem(map, key);
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
return SK_DROP;
msg->flags = flags;
msg->sk_redir = sk;
@@ -1267,6 +1269,8 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
sk = __sock_hash_lookup_elem(map, key);
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
return SK_DROP;
msg->flags = flags;
msg->sk_redir = sk;
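
The two hunks above add the same guard to bpf_msg_redirect_map() and bpf_msg_redirect_hash(). A stand-alone restatement of just that decision, with the socket lookup and the other sockmap checks stripped away, so SK_PASS below only means "not rejected by this particular guard".

#include <stdbool.h>
#include <stdio.h>

#define BPF_F_INGRESS	(1ULL << 0)	/* redirect flag from the BPF UAPI */
#define SK_DROP		0
#define SK_PASS		1

/* Egress (i.e. non-ingress) sk_msg redirects are only supported for TCP
 * sockets, so anything else is dropped; ingress redirects are unaffected
 * by this particular check. */
static int redirect_verdict(unsigned long long flags, bool target_is_tcp)
{
	if (!(flags & BPF_F_INGRESS) && !target_is_tcp)
		return SK_DROP;
	return SK_PASS;
}

int main(void)
{
	printf("egress  -> UDP socket: %s\n",
	       redirect_verdict(0, false) == SK_PASS ? "SK_PASS" : "SK_DROP");
	printf("egress  -> TCP socket: %s\n",
	       redirect_verdict(0, true) == SK_PASS ? "SK_PASS" : "SK_DROP");
	printf("ingress -> UDP socket: %s\n",
	       redirect_verdict(BPF_F_INGRESS, false) == SK_PASS ? "SK_PASS" : "SK_DROP");
	return 0;
}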

@@ -1621,16 +1621,13 @@ EXPORT_SYMBOL(tcp_read_sock);
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 seq = tp->copied_seq;
struct sk_buff *skb;
int copied = 0;
u32 offset;
if (sk->sk_state == TCP_LISTEN)
return -ENOTCONN;
while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
u8 tcp_flags;
int used;
@@ -1643,13 +1640,10 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
copied = used;
break;
}
seq += used;
copied += used;
if (tcp_flags & TCPHDR_FIN) {
++seq;
if (tcp_flags & TCPHDR_FIN)
break;
}
}
return copied;
}

@@ -222,6 +222,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
int *addr_len)
{
struct tcp_sock *tcp = tcp_sk(sk);
int peek = flags & MSG_PEEK;
u32 seq = tcp->copied_seq;
struct sk_psock *psock;
int copied = 0;
@@ -311,7 +312,8 @@ msg_bytes_ready:
copied = -EAGAIN;
}
out:
WRITE_ONCE(tcp->copied_seq, seq);
if (!peek)
WRITE_ONCE(tcp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
if (copied > 0)
__tcp_cleanup_rbuf(sk, copied);
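
The sockmap selftest added further below exercises this through a verdict program; the same expectation can be demonstrated with plain stream sockets and no BPF at all, which is roughly what the fix restores for sockmap'ed TCP sockets: a MSG_PEEK receive must not consume data (i.e. must not advance copied_seq), so FIONREAD still reports it as unread.

#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int sv[2], avail;
	char buf[64];

	/* An AF_UNIX stream pair keeps the demo self-contained; the expected
	 * MSG_PEEK/FIONREAD behaviour is the same as for a TCP connection. */
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv))
		return 1;
	send(sv[0], "0123456789", 10, 0);

	recv(sv[1], buf, sizeof(buf), MSG_PEEK);
	ioctl(sv[1], FIONREAD, &avail);
	printf("after MSG_PEEK:  %d bytes still unread\n", avail);	/* expect 10 */

	recv(sv[1], buf, sizeof(buf), 0);
	ioctl(sv[1], FIONREAD, &avail);
	printf("after real read: %d bytes still unread\n", avail);	/* expect 0 */

	close(sv[0]);
	close(sv[1]);
	return 0;
}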

@@ -185,6 +185,8 @@ static void test_cubic(void)
do_test("bpf_cubic", NULL);
ASSERT_EQ(cubic_skel->bss->bpf_cubic_acked_called, 1, "pkts_acked called");
bpf_link__destroy(link);
bpf_cubic__destroy(cubic_skel);
}

@@ -475,6 +475,55 @@ out:
test_sockmap_drop_prog__destroy(drop);
}
static void test_sockmap_skb_verdict_peek(void)
{
int err, map, verdict, s, c1, p1, zero = 0, sent, recvd, avail;
struct test_sockmap_pass_prog *pass;
char snd[256] = "0123456789";
char rcv[256] = "0";
pass = test_sockmap_pass_prog__open_and_load();
if (!ASSERT_OK_PTR(pass, "open_and_load"))
return;
verdict = bpf_program__fd(pass->progs.prog_skb_verdict);
map = bpf_map__fd(pass->maps.sock_map_rx);
err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0);
if (!ASSERT_OK(err, "bpf_prog_attach"))
goto out;
s = socket_loopback(AF_INET, SOCK_STREAM);
if (!ASSERT_GT(s, -1, "socket_loopback(s)"))
goto out;
err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1);
if (!ASSERT_OK(err, "create_pairs(s)"))
goto out;
err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST);
if (!ASSERT_OK(err, "bpf_map_update_elem(c1)"))
goto out_close;
sent = xsend(p1, snd, sizeof(snd), 0);
ASSERT_EQ(sent, sizeof(snd), "xsend(p1)");
recvd = recv(c1, rcv, sizeof(rcv), MSG_PEEK);
ASSERT_EQ(recvd, sizeof(rcv), "recv(c1)");
err = ioctl(c1, FIONREAD, &avail);
ASSERT_OK(err, "ioctl(FIONREAD) error");
ASSERT_EQ(avail, sizeof(snd), "after peek ioctl(FIONREAD)");
recvd = recv(c1, rcv, sizeof(rcv), 0);
ASSERT_EQ(recvd, sizeof(rcv), "recv(p0)");
err = ioctl(c1, FIONREAD, &avail);
ASSERT_OK(err, "ioctl(FIONREAD) error");
ASSERT_EQ(avail, 0, "after read ioctl(FIONREAD)");
out_close:
close(c1);
close(p1);
out:
test_sockmap_pass_prog__destroy(pass);
}
void test_sockmap_basic(void)
{
if (test__start_subtest("sockmap create_update_free"))
@@ -515,4 +564,6 @@ void test_sockmap_basic(void)
test_sockmap_skb_verdict_fionread(true);
if (test__start_subtest("sockmap skb_verdict fionread on drop"))
test_sockmap_skb_verdict_fionread(false);
if (test__start_subtest("sockmap skb_verdict msg_f_peek"))
test_sockmap_skb_verdict_peek();
}

@@ -2378,3 +2378,87 @@ void serial_test_tc_opts_chain_mixed(void)
test_tc_chain_mixed(BPF_TCX_INGRESS);
test_tc_chain_mixed(BPF_TCX_EGRESS);
}
static int generate_dummy_prog(void)
{
const struct bpf_insn prog_insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
const size_t prog_insn_cnt = sizeof(prog_insns) / sizeof(struct bpf_insn);
LIBBPF_OPTS(bpf_prog_load_opts, opts);
const size_t log_buf_sz = 256;
char *log_buf;
int fd = -1;
log_buf = malloc(log_buf_sz);
if (!ASSERT_OK_PTR(log_buf, "log_buf_alloc"))
return fd;
opts.log_buf = log_buf;
opts.log_size = log_buf_sz;
log_buf[0] = '\0';
opts.log_level = 0;
fd = bpf_prog_load(BPF_PROG_TYPE_SCHED_CLS, "tcx_prog", "GPL",
prog_insns, prog_insn_cnt, &opts);
ASSERT_STREQ(log_buf, "", "log_0");
ASSERT_GE(fd, 0, "prog_fd");
free(log_buf);
return fd;
}
static void test_tc_opts_max_target(int target, int flags, bool relative)
{
int err, ifindex, i, prog_fd, last_fd = -1;
LIBBPF_OPTS(bpf_prog_attach_opts, opta);
const int max_progs = 63;
ASSERT_OK(system("ip link add dev tcx_opts1 type veth peer name tcx_opts2"), "add veth");
ifindex = if_nametoindex("tcx_opts1");
ASSERT_NEQ(ifindex, 0, "non_zero_ifindex");
assert_mprog_count_ifindex(ifindex, target, 0);
for (i = 0; i < max_progs; i++) {
prog_fd = generate_dummy_prog();
if (!ASSERT_GE(prog_fd, 0, "dummy_prog"))
goto cleanup;
err = bpf_prog_attach_opts(prog_fd, ifindex, target, &opta);
if (!ASSERT_EQ(err, 0, "prog_attach"))
goto cleanup;
assert_mprog_count_ifindex(ifindex, target, i + 1);
if (i == max_progs - 1 && relative)
last_fd = prog_fd;
else
close(prog_fd);
}
prog_fd = generate_dummy_prog();
if (!ASSERT_GE(prog_fd, 0, "dummy_prog"))
goto cleanup;
opta.flags = flags;
if (last_fd > 0)
opta.relative_fd = last_fd;
err = bpf_prog_attach_opts(prog_fd, ifindex, target, &opta);
ASSERT_EQ(err, -ERANGE, "prog_64_attach");
assert_mprog_count_ifindex(ifindex, target, max_progs);
close(prog_fd);
cleanup:
if (last_fd > 0)
close(last_fd);
ASSERT_OK(system("ip link del dev tcx_opts1"), "del veth");
ASSERT_EQ(if_nametoindex("tcx_opts1"), 0, "dev1_removed");
ASSERT_EQ(if_nametoindex("tcx_opts2"), 0, "dev2_removed");
}
void serial_test_tc_opts_max(void)
{
test_tc_opts_max_target(BPF_TCX_INGRESS, 0, false);
test_tc_opts_max_target(BPF_TCX_EGRESS, 0, false);
test_tc_opts_max_target(BPF_TCX_INGRESS, BPF_F_BEFORE, false);
test_tc_opts_max_target(BPF_TCX_EGRESS, BPF_F_BEFORE, true);
test_tc_opts_max_target(BPF_TCX_INGRESS, BPF_F_AFTER, true);
test_tc_opts_max_target(BPF_TCX_EGRESS, BPF_F_AFTER, false);
}

@@ -490,6 +490,8 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay)
}
}
int bpf_cubic_acked_called = 0;
void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
const struct ack_sample *sample)
{
@@ -497,6 +499,7 @@ void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
struct bictcp *ca = inet_csk_ca(sk);
__u32 delay;
bpf_cubic_acked_called = 1;
/* Some calls are for duplicates without timestamps */
if (sample->rtt_us < 0)
return;