mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-11 15:40:50 +00:00
40304b2a15
Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding struct that allows BPF programs of this type to access some of the socket's fields (such as IP addresses, ports, etc.). It uses the existing bpf cgroups infrastructure so the programs can be attached per cgroup with full inheritance support. The program will be called at appropriate times to set relevant connections parameters such as buffer sizes, SYN and SYN-ACK RTOs, etc., based on connection information such as IP addresses, port numbers, etc. Alghough there are already 3 mechanisms to set parameters (sysctls, route metrics and setsockopts), this new mechanism provides some distinct advantages. Unlike sysctls, it can set parameters per connection. In contrast to route metrics, it can also use port numbers and information provided by a user level program. In addition, it could set parameters probabilistically for evaluation purposes (i.e. do something different on 10% of the flows and compare results with the other 90% of the flows). Also, in cases where IPv6 addresses contain geographic information, the rules to make changes based on the distance (or RTT) between the hosts are much easier than route metric rules and can be global. Finally, unlike setsockopt, it oes not require application changes and it can be updated easily at any time. Although the bpf cgroup framework already contains a sock related program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type (BPF_PROG_TYPE_SOCK_OPS) beccause the existing type expects to be called only once during the connections's lifetime. In contrast, the new program type will be called multiple times from different places in the network stack code. For example, before sending SYN and SYN-ACKs to set an appropriate timeout, when the connection is established to set congestion control, etc. As a result it has "op" field to specify the type of operation requested. The purpose of this new program type is to simplify setting connection parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is easy to use facebook's internal IPv6 addresses to determine if both hosts of a connection are in the same datacenter. Therefore, it is easy to write a BPF program to choose a small SYN RTO value when both hosts are in the same datacenter. This patch only contains the framework to support the new BPF program type, following patches add the functionality to set various connection parameters. This patch defines a new BPF program type: BPF_PROG_TYPE_SOCKET_OPS and a new bpf syscall command to load a new program of this type: BPF_PROG_LOAD_SOCKET_OPS. Two new corresponding structs (one for the kernel one for the user/BPF program): /* kernel version */ struct bpf_sock_ops_kern { struct sock *sk; __u32 op; union { __u32 reply; __u32 replylong[4]; }; }; /* user version * Some fields are in network byte order reflecting the sock struct * Use the bpf_ntohl helper macro in samples/bpf/bpf_endian.h to * convert them to host byte order. */ struct bpf_sock_ops { __u32 op; union { __u32 reply; __u32 replylong[4]; }; __u32 family; __u32 remote_ip4; /* In network byte order */ __u32 local_ip4; /* In network byte order */ __u32 remote_ip6[4]; /* In network byte order */ __u32 local_ip6[4]; /* In network byte order */ __u32 remote_port; /* In network byte order */ __u32 local_port; /* In host byte horder */ }; Currently there are two types of ops. The first type expects the BPF program to return a value which is then used by the caller (or a negative value to indicate the operation is not supported). The second type expects state changes to be done by the BPF program, for example through a setsockopt BPF helper function, and they ignore the return value. The reply fields of the bpf_sockt_ops struct are there in case a bpf program needs to return a value larger than an integer. Signed-off-by: Lawrence Brakmo <brakmo@fb.com> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
110 lines
3.3 KiB
C
110 lines
3.3 KiB
C
#ifndef _BPF_CGROUP_H
|
|
#define _BPF_CGROUP_H
|
|
|
|
#include <linux/jump_label.h>
|
|
#include <uapi/linux/bpf.h>
|
|
|
|
struct sock;
|
|
struct cgroup;
|
|
struct sk_buff;
|
|
struct bpf_sock_ops_kern;
|
|
|
|
#ifdef CONFIG_CGROUP_BPF
|
|
|
|
extern struct static_key_false cgroup_bpf_enabled_key;
|
|
#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
|
|
|
|
struct cgroup_bpf {
|
|
/*
|
|
* Store two sets of bpf_prog pointers, one for programs that are
|
|
* pinned directly to this cgroup, and one for those that are effective
|
|
* when this cgroup is accessed.
|
|
*/
|
|
struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
|
|
struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE];
|
|
bool disallow_override[MAX_BPF_ATTACH_TYPE];
|
|
};
|
|
|
|
void cgroup_bpf_put(struct cgroup *cgrp);
|
|
void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
|
|
|
|
int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
|
|
struct bpf_prog *prog, enum bpf_attach_type type,
|
|
bool overridable);
|
|
|
|
/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
|
|
int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
|
|
enum bpf_attach_type type, bool overridable);
|
|
|
|
int __cgroup_bpf_run_filter_skb(struct sock *sk,
|
|
struct sk_buff *skb,
|
|
enum bpf_attach_type type);
|
|
|
|
int __cgroup_bpf_run_filter_sk(struct sock *sk,
|
|
enum bpf_attach_type type);
|
|
|
|
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
|
|
struct bpf_sock_ops_kern *sock_ops,
|
|
enum bpf_attach_type type);
|
|
|
|
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
|
|
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
|
|
({ \
|
|
int __ret = 0; \
|
|
if (cgroup_bpf_enabled) \
|
|
__ret = __cgroup_bpf_run_filter_skb(sk, skb, \
|
|
BPF_CGROUP_INET_INGRESS); \
|
|
\
|
|
__ret; \
|
|
})
|
|
|
|
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \
|
|
({ \
|
|
int __ret = 0; \
|
|
if (cgroup_bpf_enabled && sk && sk == skb->sk) { \
|
|
typeof(sk) __sk = sk_to_full_sk(sk); \
|
|
if (sk_fullsock(__sk)) \
|
|
__ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
|
|
BPF_CGROUP_INET_EGRESS); \
|
|
} \
|
|
__ret; \
|
|
})
|
|
|
|
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \
|
|
({ \
|
|
int __ret = 0; \
|
|
if (cgroup_bpf_enabled && sk) { \
|
|
__ret = __cgroup_bpf_run_filter_sk(sk, \
|
|
BPF_CGROUP_INET_SOCK_CREATE); \
|
|
} \
|
|
__ret; \
|
|
})
|
|
|
|
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \
|
|
({ \
|
|
int __ret = 0; \
|
|
if (cgroup_bpf_enabled && (sock_ops)->sk) { \
|
|
typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \
|
|
if (sk_fullsock(__sk)) \
|
|
__ret = __cgroup_bpf_run_filter_sock_ops(__sk, \
|
|
sock_ops, \
|
|
BPF_CGROUP_SOCK_OPS); \
|
|
} \
|
|
__ret; \
|
|
})
|
|
#else
|
|
|
|
struct cgroup_bpf {};
|
|
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
|
|
static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
|
|
struct cgroup *parent) {}
|
|
|
|
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
|
|
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
|
|
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
|
|
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
|
|
|
|
#endif /* CONFIG_CGROUP_BPF */
|
|
|
|
#endif /* _BPF_CGROUP_H */
|