Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-02-28

The following pull-request contains BPF updates for your *net-next* tree.

We've added 41 non-merge commits during the last 7 day(s) which contain
a total of 49 files changed, 1383 insertions(+), 499 deletions(-).

The main changes are:

1) BPF and Real-Time nicely co-exist.

2) bpftool feature improvements.

3) retrieve bpf_sk_storage via INET_DIAG.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2020-02-29 15:53:35 -08:00
commit 9f0ca0c1a5
49 changed files with 1385 additions and 501 deletions

View File

@ -371,7 +371,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
struct receive_queue *rq, struct receive_queue *rq,
struct page *page, unsigned int offset, struct page *page, unsigned int offset,
unsigned int len, unsigned int truesize, unsigned int len, unsigned int truesize,
bool hdr_valid) bool hdr_valid, unsigned int metasize)
{ {
struct sk_buff *skb; struct sk_buff *skb;
struct virtio_net_hdr_mrg_rxbuf *hdr; struct virtio_net_hdr_mrg_rxbuf *hdr;
@ -393,6 +393,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
else else
hdr_padded_len = sizeof(struct padded_vnet_hdr); hdr_padded_len = sizeof(struct padded_vnet_hdr);
/* hdr_valid means no XDP, so we can copy the vnet header */
if (hdr_valid) if (hdr_valid)
memcpy(hdr, p, hdr_len); memcpy(hdr, p, hdr_len);
@ -405,6 +406,11 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
copy = skb_tailroom(skb); copy = skb_tailroom(skb);
skb_put_data(skb, p, copy); skb_put_data(skb, p, copy);
if (metasize) {
__skb_pull(skb, metasize);
skb_metadata_set(skb, metasize);
}
len -= copy; len -= copy;
offset += copy; offset += copy;
@ -450,10 +456,6 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
struct virtio_net_hdr_mrg_rxbuf *hdr; struct virtio_net_hdr_mrg_rxbuf *hdr;
int err; int err;
/* virtqueue want to use data area in-front of packet */
if (unlikely(xdpf->metasize > 0))
return -EOPNOTSUPP;
if (unlikely(xdpf->headroom < vi->hdr_len)) if (unlikely(xdpf->headroom < vi->hdr_len))
return -EOVERFLOW; return -EOVERFLOW;
@ -644,6 +646,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
unsigned int delta = 0; unsigned int delta = 0;
struct page *xdp_page; struct page *xdp_page;
int err; int err;
unsigned int metasize = 0;
len -= vi->hdr_len; len -= vi->hdr_len;
stats->bytes += len; stats->bytes += len;
@ -683,8 +686,8 @@ static struct sk_buff *receive_small(struct net_device *dev,
xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
xdp.data = xdp.data_hard_start + xdp_headroom; xdp.data = xdp.data_hard_start + xdp_headroom;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len; xdp.data_end = xdp.data + len;
xdp.data_meta = xdp.data;
xdp.rxq = &rq->xdp_rxq; xdp.rxq = &rq->xdp_rxq;
orig_data = xdp.data; orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp); act = bpf_prog_run_xdp(xdp_prog, &xdp);
@ -695,6 +698,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
/* Recalculate length in case bpf program changed it */ /* Recalculate length in case bpf program changed it */
delta = orig_data - xdp.data; delta = orig_data - xdp.data;
len = xdp.data_end - xdp.data; len = xdp.data_end - xdp.data;
metasize = xdp.data - xdp.data_meta;
break; break;
case XDP_TX: case XDP_TX:
stats->xdp_tx++; stats->xdp_tx++;
@ -735,10 +739,13 @@ static struct sk_buff *receive_small(struct net_device *dev,
} }
skb_reserve(skb, headroom - delta); skb_reserve(skb, headroom - delta);
skb_put(skb, len); skb_put(skb, len);
if (!delta) { if (!xdp_prog) {
buf += header_offset; buf += header_offset;
memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len); memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
} /* keep zeroed vnet hdr since packet was changed by bpf */ } /* keep zeroed vnet hdr since XDP is loaded */
if (metasize)
skb_metadata_set(skb, metasize);
err: err:
return skb; return skb;
@ -760,8 +767,8 @@ static struct sk_buff *receive_big(struct net_device *dev,
struct virtnet_rq_stats *stats) struct virtnet_rq_stats *stats)
{ {
struct page *page = buf; struct page *page = buf;
struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, struct sk_buff *skb =
PAGE_SIZE, true); page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0);
stats->bytes += len - vi->hdr_len; stats->bytes += len - vi->hdr_len;
if (unlikely(!skb)) if (unlikely(!skb))
@ -793,6 +800,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
unsigned int truesize; unsigned int truesize;
unsigned int headroom = mergeable_ctx_to_headroom(ctx); unsigned int headroom = mergeable_ctx_to_headroom(ctx);
int err; int err;
unsigned int metasize = 0;
head_skb = NULL; head_skb = NULL;
stats->bytes += len - vi->hdr_len; stats->bytes += len - vi->hdr_len;
@ -839,8 +847,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
data = page_address(xdp_page) + offset; data = page_address(xdp_page) + offset;
xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
xdp.data = data + vi->hdr_len; xdp.data = data + vi->hdr_len;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + (len - vi->hdr_len); xdp.data_end = xdp.data + (len - vi->hdr_len);
xdp.data_meta = xdp.data;
xdp.rxq = &rq->xdp_rxq; xdp.rxq = &rq->xdp_rxq;
act = bpf_prog_run_xdp(xdp_prog, &xdp); act = bpf_prog_run_xdp(xdp_prog, &xdp);
@ -848,24 +856,27 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
switch (act) { switch (act) {
case XDP_PASS: case XDP_PASS:
/* recalculate offset to account for any header metasize = xdp.data - xdp.data_meta;
* adjustments. Note other cases do not build an
* skb and avoid using offset
*/
offset = xdp.data -
page_address(xdp_page) - vi->hdr_len;
/* recalculate len if xdp.data or xdp.data_end were /* recalculate offset to account for any header
* adjusted * adjustments and minus the metasize to copy the
* metadata in page_to_skb(). Note other cases do not
* build an skb and avoid using offset
*/ */
len = xdp.data_end - xdp.data + vi->hdr_len; offset = xdp.data - page_address(xdp_page) -
vi->hdr_len - metasize;
/* recalculate len if xdp.data, xdp.data_end or
* xdp.data_meta were adjusted
*/
len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
/* We can only create skb based on xdp_page. */ /* We can only create skb based on xdp_page. */
if (unlikely(xdp_page != page)) { if (unlikely(xdp_page != page)) {
rcu_read_unlock(); rcu_read_unlock();
put_page(page); put_page(page);
head_skb = page_to_skb(vi, rq, xdp_page, head_skb = page_to_skb(vi, rq, xdp_page, offset,
offset, len, len, PAGE_SIZE, false,
PAGE_SIZE, false); metasize);
return head_skb; return head_skb;
} }
break; break;
@ -921,7 +932,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
goto err_skb; goto err_skb;
} }
head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog); head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
metasize);
curr_skb = head_skb; curr_skb = head_skb;
if (unlikely(!curr_skb)) if (unlikely(!curr_skb))

View File

@ -36,7 +36,7 @@ struct bpf_cgroup_storage_map;
struct bpf_storage_buffer { struct bpf_storage_buffer {
struct rcu_head rcu; struct rcu_head rcu;
char data[0]; char data[];
}; };
struct bpf_cgroup_storage { struct bpf_cgroup_storage {

View File

@ -859,7 +859,7 @@ struct bpf_prog_array_item {
struct bpf_prog_array { struct bpf_prog_array {
struct rcu_head rcu; struct rcu_head rcu;
struct bpf_prog_array_item items[0]; struct bpf_prog_array_item items[];
}; };
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
@ -885,7 +885,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *_prog; \ struct bpf_prog *_prog; \
struct bpf_prog_array *_array; \ struct bpf_prog_array *_array; \
u32 _ret = 1; \ u32 _ret = 1; \
preempt_disable(); \ migrate_disable(); \
rcu_read_lock(); \ rcu_read_lock(); \
_array = rcu_dereference(array); \ _array = rcu_dereference(array); \
if (unlikely(check_non_null && !_array))\ if (unlikely(check_non_null && !_array))\
@ -898,7 +898,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
} \ } \
_out: \ _out: \
rcu_read_unlock(); \ rcu_read_unlock(); \
preempt_enable(); \ migrate_enable(); \
_ret; \ _ret; \
}) })
@ -932,7 +932,7 @@ _out: \
u32 ret; \ u32 ret; \
u32 _ret = 1; \ u32 _ret = 1; \
u32 _cn = 0; \ u32 _cn = 0; \
preempt_disable(); \ migrate_disable(); \
rcu_read_lock(); \ rcu_read_lock(); \
_array = rcu_dereference(array); \ _array = rcu_dereference(array); \
_item = &_array->items[0]; \ _item = &_array->items[0]; \
@ -944,7 +944,7 @@ _out: \
_item++; \ _item++; \
} \ } \
rcu_read_unlock(); \ rcu_read_unlock(); \
preempt_enable(); \ migrate_enable(); \
if (_ret) \ if (_ret) \
_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \
else \ else \
@ -961,6 +961,36 @@ _out: \
#ifdef CONFIG_BPF_SYSCALL #ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active); DECLARE_PER_CPU(int, bpf_prog_active);
/*
* Block execution of BPF programs attached to instrumentation (perf,
* kprobes, tracepoints) to prevent deadlocks on map operations as any of
* these events can happen inside a region which holds a map bucket lock
* and can deadlock on it.
*
* Use the preemption safe inc/dec variants on RT because migrate disable
* is preemptible on RT and preemption in the middle of the RMW operation
* might lead to inconsistent state. Use the raw variants for non RT
* kernels as migrate_disable() maps to preempt_disable() so the slightly
* more expensive save operation can be avoided.
*/
static inline void bpf_disable_instrumentation(void)
{
migrate_disable();
if (IS_ENABLED(CONFIG_PREEMPT_RT))
this_cpu_inc(bpf_prog_active);
else
__this_cpu_inc(bpf_prog_active);
}
static inline void bpf_enable_instrumentation(void)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
this_cpu_dec(bpf_prog_active);
else
__this_cpu_dec(bpf_prog_active);
migrate_enable();
}
extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_map_fops;
extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_prog_fops;
@ -993,6 +1023,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock);
void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
struct bpf_map *bpf_map_get(u32 ufd);
struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f); struct bpf_map *__bpf_map_get(struct fd f);
void bpf_map_inc(struct bpf_map *map); void bpf_map_inc(struct bpf_map *map);

View File

@ -561,7 +561,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
#define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ #define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \
u32 ret; \ u32 ret; \
cant_sleep(); \ cant_migrate(); \
if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ if (static_branch_unlikely(&bpf_stats_enabled_key)) { \
struct bpf_prog_stats *stats; \ struct bpf_prog_stats *stats; \
u64 start = sched_clock(); \ u64 start = sched_clock(); \
@ -576,8 +576,30 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
} \ } \
ret; }) ret; })
#define BPF_PROG_RUN(prog, ctx) __BPF_PROG_RUN(prog, ctx, \ #define BPF_PROG_RUN(prog, ctx) \
bpf_dispatcher_nopfunc) __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc)
/*
* Use in preemptible and therefore migratable context to make sure that
* the execution of the BPF program runs on one CPU.
*
* This uses migrate_disable/enable() explicitly to document that the
* invocation of a BPF program does not require reentrancy protection
* against a BPF program which is invoked from a preempting task.
*
* For non RT enabled kernels migrate_disable/enable() maps to
* preempt_disable/enable(), i.e. it disables also preemption.
*/
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
const void *ctx)
{
u32 ret;
migrate_disable();
ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc);
migrate_enable();
return ret;
}
#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
@ -655,6 +677,7 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
return qdisc_skb_cb(skb)->data; return qdisc_skb_cb(skb)->data;
} }
/* Must be invoked with migration disabled */
static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
struct sk_buff *skb) struct sk_buff *skb)
{ {
@ -680,9 +703,9 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
{ {
u32 res; u32 res;
preempt_disable(); migrate_disable();
res = __bpf_prog_run_save_cb(prog, skb); res = __bpf_prog_run_save_cb(prog, skb);
preempt_enable(); migrate_enable();
return res; return res;
} }
@ -695,9 +718,7 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
if (unlikely(prog->cb_access)) if (unlikely(prog->cb_access))
memset(cb_data, 0, BPF_SKB_CB_LEN); memset(cb_data, 0, BPF_SKB_CB_LEN);
preempt_disable(); res = bpf_prog_run_pin_on_cpu(prog, skb);
res = BPF_PROG_RUN(prog, skb);
preempt_enable();
return res; return res;
} }

View File

@ -15,11 +15,9 @@ struct netlink_callback;
struct inet_diag_handler { struct inet_diag_handler {
void (*dump)(struct sk_buff *skb, void (*dump)(struct sk_buff *skb,
struct netlink_callback *cb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, const struct inet_diag_req_v2 *r);
struct nlattr *bc);
int (*dump_one)(struct sk_buff *in_skb, int (*dump_one)(struct netlink_callback *cb,
const struct nlmsghdr *nlh,
const struct inet_diag_req_v2 *req); const struct inet_diag_req_v2 *req);
void (*idiag_get_info)(struct sock *sk, void (*idiag_get_info)(struct sock *sk,
@ -40,18 +38,25 @@ struct inet_diag_handler {
__u16 idiag_info_size; __u16 idiag_info_size;
}; };
struct bpf_sk_storage_diag;
struct inet_diag_dump_data {
struct nlattr *req_nlas[__INET_DIAG_REQ_MAX];
#define inet_diag_nla_bc req_nlas[INET_DIAG_REQ_BYTECODE]
#define inet_diag_nla_bpf_stgs req_nlas[INET_DIAG_REQ_SK_BPF_STORAGES]
struct bpf_sk_storage_diag *bpf_stg_diag;
};
struct inet_connection_sock; struct inet_connection_sock;
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct sk_buff *skb, struct netlink_callback *cb,
struct user_namespace *user_ns, const struct inet_diag_req_v2 *req,
u32 pid, u32 seq, u16 nlmsg_flags, u16 nlmsg_flags, bool net_admin);
const struct nlmsghdr *unlh, bool net_admin);
void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb,
struct netlink_callback *cb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, const struct inet_diag_req_v2 *r);
struct nlattr *bc);
int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
struct sk_buff *in_skb, const struct nlmsghdr *nlh, struct netlink_callback *cb,
const struct inet_diag_req_v2 *req); const struct inet_diag_req_v2 *req);
struct sock *inet_diag_find_one_icsk(struct net *net, struct sock *inet_diag_find_one_icsk(struct net *net,

View File

@ -257,6 +257,13 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
#ifndef CONFIG_PREEMPT_RT
# define cant_migrate() cant_sleep()
#else
/* Placeholder for now */
# define cant_migrate() do { } while (0)
#endif
/** /**
* abs - return absolute value of an argument * abs - return absolute value of an argument
* @x: the value. If it is unsigned type, it is converted to signed type first. * @x: the value. If it is unsigned type, it is converted to signed type first.

View File

@ -188,10 +188,10 @@ struct netlink_callback {
struct module *module; struct module *module;
struct netlink_ext_ack *extack; struct netlink_ext_ack *extack;
u16 family; u16 family;
u16 min_dump_alloc;
bool strict_check;
u16 answer_flags; u16 answer_flags;
u32 min_dump_alloc;
unsigned int prev_seq, seq; unsigned int prev_seq, seq;
bool strict_check;
union { union {
u8 ctx[48]; u8 ctx[48];

View File

@ -322,4 +322,34 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
#endif #endif
/**
* migrate_disable - Prevent migration of the current task
*
* Maps to preempt_disable() which also disables preemption. Use
* migrate_disable() to annotate that the intent is to prevent migration,
* but not necessarily preemption.
*
* Can be invoked nested like preempt_disable() and needs the corresponding
* number of migrate_enable() invocations.
*/
static __always_inline void migrate_disable(void)
{
preempt_disable();
}
/**
* migrate_enable - Allow migration of the current task
*
* Counterpart to migrate_disable().
*
* As migrate_disable() can be invoked nested, only the outermost invocation
* reenables migration.
*
* Currently mapped to preempt_enable().
*/
static __always_inline void migrate_enable(void)
{
preempt_enable();
}
#endif /* __LINUX_PREEMPT_H */ #endif /* __LINUX_PREEMPT_H */

View File

@ -10,14 +10,41 @@ void bpf_sk_storage_free(struct sock *sk);
extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_get_proto;
extern const struct bpf_func_proto bpf_sk_storage_delete_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
struct bpf_sk_storage_diag;
struct sk_buff;
struct nlattr;
struct sock;
#ifdef CONFIG_BPF_SYSCALL #ifdef CONFIG_BPF_SYSCALL
int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
struct bpf_sk_storage_diag *
bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs);
void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag);
int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
struct sock *sk, struct sk_buff *skb,
int stg_array_type,
unsigned int *res_diag_size);
#else #else
static inline int bpf_sk_storage_clone(const struct sock *sk, static inline int bpf_sk_storage_clone(const struct sock *sk,
struct sock *newsk) struct sock *newsk)
{ {
return 0; return 0;
} }
static inline struct bpf_sk_storage_diag *
bpf_sk_storage_diag_alloc(const struct nlattr *nla)
{
return NULL;
}
static inline void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
{
}
static inline int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
struct sock *sk, struct sk_buff *skb,
int stg_array_type,
unsigned int *res_diag_size)
{
return 0;
}
#endif #endif
#endif /* _BPF_SK_STORAGE_H */ #endif /* _BPF_SK_STORAGE_H */

View File

@ -73,7 +73,7 @@ struct bpf_insn {
/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
struct bpf_lpm_trie_key { struct bpf_lpm_trie_key {
__u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */
__u8 data[0]; /* Arbitrary size */ __u8 data[]; /* Arbitrary size */
}; };
struct bpf_cgroup_storage_key { struct bpf_cgroup_storage_key {

View File

@ -64,9 +64,11 @@ struct inet_diag_req_raw {
enum { enum {
INET_DIAG_REQ_NONE, INET_DIAG_REQ_NONE,
INET_DIAG_REQ_BYTECODE, INET_DIAG_REQ_BYTECODE,
INET_DIAG_REQ_SK_BPF_STORAGES,
__INET_DIAG_REQ_MAX,
}; };
#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE #define INET_DIAG_REQ_MAX (__INET_DIAG_REQ_MAX - 1)
/* Bytecode is sequence of 4 byte commands followed by variable arguments. /* Bytecode is sequence of 4 byte commands followed by variable arguments.
* All the commands identified by "code" are conditional jumps forward: * All the commands identified by "code" are conditional jumps forward:
@ -154,6 +156,7 @@ enum {
INET_DIAG_CLASS_ID, /* request as INET_DIAG_TCLASS */ INET_DIAG_CLASS_ID, /* request as INET_DIAG_TCLASS */
INET_DIAG_MD5SIG, INET_DIAG_MD5SIG,
INET_DIAG_ULP_INFO, INET_DIAG_ULP_INFO,
INET_DIAG_SK_BPF_STORAGES,
__INET_DIAG_MAX, __INET_DIAG_MAX,
}; };

View File

@ -36,4 +36,30 @@ enum sknetlink_groups {
}; };
#define SKNLGRP_MAX (__SKNLGRP_MAX - 1) #define SKNLGRP_MAX (__SKNLGRP_MAX - 1)
enum {
SK_DIAG_BPF_STORAGE_REQ_NONE,
SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
__SK_DIAG_BPF_STORAGE_REQ_MAX,
};
#define SK_DIAG_BPF_STORAGE_REQ_MAX (__SK_DIAG_BPF_STORAGE_REQ_MAX - 1)
enum {
SK_DIAG_BPF_STORAGE_REP_NONE,
SK_DIAG_BPF_STORAGE,
__SK_DIAG_BPF_STORAGE_REP_MAX,
};
#define SK_DIAB_BPF_STORAGE_REP_MAX (__SK_DIAG_BPF_STORAGE_REP_MAX - 1)
enum {
SK_DIAG_BPF_STORAGE_NONE,
SK_DIAG_BPF_STORAGE_PAD,
SK_DIAG_BPF_STORAGE_MAP_ID,
SK_DIAG_BPF_STORAGE_MAP_VALUE,
__SK_DIAG_BPF_STORAGE_MAX,
};
#define SK_DIAG_BPF_STORAGE_MAX (__SK_DIAG_BPF_STORAGE_MAX - 1)
#endif /* _UAPI__SOCK_DIAG_H__ */ #endif /* _UAPI__SOCK_DIAG_H__ */

View File

@ -23,7 +23,7 @@ enum bpf_struct_ops_state {
struct bpf_struct_ops_value { struct bpf_struct_ops_value {
BPF_STRUCT_OPS_COMMON_VALUE; BPF_STRUCT_OPS_COMMON_VALUE;
char data[0] ____cacheline_aligned_in_smp; char data[] ____cacheline_aligned_in_smp;
}; };
struct bpf_struct_ops_map { struct bpf_struct_ops_map {

View File

@ -27,9 +27,62 @@
.map_delete_batch = \ .map_delete_batch = \
generic_map_delete_batch generic_map_delete_batch
/*
* The bucket lock has two protection scopes:
*
* 1) Serializing concurrent operations from BPF programs on differrent
* CPUs
*
* 2) Serializing concurrent operations from BPF programs and sys_bpf()
*
* BPF programs can execute in any context including perf, kprobes and
* tracing. As there are almost no limits where perf, kprobes and tracing
* can be invoked from the lock operations need to be protected against
* deadlocks. Deadlocks can be caused by recursion and by an invocation in
* the lock held section when functions which acquire this lock are invoked
* from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
* variable bpf_prog_active, which prevents BPF programs attached to perf
* events, kprobes and tracing to be invoked before the prior invocation
* from one of these contexts completed. sys_bpf() uses the same mechanism
* by pinning the task to the current CPU and incrementing the recursion
* protection accross the map operation.
*
* This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
* operations like memory allocations (even with GFP_ATOMIC) from atomic
* contexts. This is required because even with GFP_ATOMIC the memory
* allocator calls into code pathes which acquire locks with long held lock
* sections. To ensure the deterministic behaviour these locks are regular
* spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
* true atomic contexts on an RT kernel are the low level hardware
* handling, scheduling, low level interrupt handling, NMIs etc. None of
* these contexts should ever do memory allocations.
*
* As regular device interrupt handlers and soft interrupts are forced into
* thread context, the existing code which does
* spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*();
* just works.
*
* In theory the BPF locks could be converted to regular spinlocks as well,
* but the bucket locks and percpu_freelist locks can be taken from
* arbitrary contexts (perf, kprobes, tracepoints) which are required to be
* atomic contexts even on RT. These mechanisms require preallocated maps,
* so there is no need to invoke memory allocations within the lock held
* sections.
*
* BPF maps which need dynamic allocation are only used from (forced)
* thread context on RT and can therefore use regular spinlocks which in
* turn allows to invoke memory allocations from the lock held section.
*
* On a non RT kernel this distinction is neither possible nor required.
* spinlock maps to raw_spinlock and the extra code is optimized out by the
* compiler.
*/
struct bucket { struct bucket {
struct hlist_nulls_head head; struct hlist_nulls_head head;
raw_spinlock_t lock; union {
raw_spinlock_t raw_lock;
spinlock_t lock;
};
}; };
struct bpf_htab { struct bpf_htab {
@ -65,9 +118,54 @@ struct htab_elem {
struct bpf_lru_node lru_node; struct bpf_lru_node lru_node;
}; };
u32 hash; u32 hash;
char key[0] __aligned(8); char key[] __aligned(8);
}; };
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}
static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
{
return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
}
static void htab_init_buckets(struct bpf_htab *htab)
{
unsigned i;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
if (htab_use_raw_lock(htab))
raw_spin_lock_init(&htab->buckets[i].raw_lock);
else
spin_lock_init(&htab->buckets[i].lock);
}
}
static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab,
struct bucket *b)
{
unsigned long flags;
if (htab_use_raw_lock(htab))
raw_spin_lock_irqsave(&b->raw_lock, flags);
else
spin_lock_irqsave(&b->lock, flags);
return flags;
}
static inline void htab_unlock_bucket(const struct bpf_htab *htab,
struct bucket *b,
unsigned long flags)
{
if (htab_use_raw_lock(htab))
raw_spin_unlock_irqrestore(&b->raw_lock, flags);
else
spin_unlock_irqrestore(&b->lock, flags);
}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
static bool htab_is_lru(const struct bpf_htab *htab) static bool htab_is_lru(const struct bpf_htab *htab)
@ -82,11 +180,6 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
} }
static bool htab_is_prealloc(const struct bpf_htab *htab)
{
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr) void __percpu *pptr)
{ {
@ -328,8 +421,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab; struct bpf_htab *htab;
int err, i;
u64 cost; u64 cost;
int err;
htab = kzalloc(sizeof(*htab), GFP_USER); htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab) if (!htab)
@ -391,10 +484,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
else else
htab->hashrnd = get_random_int(); htab->hashrnd = get_random_int();
for (i = 0; i < htab->n_buckets; i++) { htab_init_buckets(htab);
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
raw_spin_lock_init(&htab->buckets[i].lock);
}
if (prealloc) { if (prealloc) {
err = prealloc_init(htab); err = prealloc_init(htab);
@ -602,7 +692,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
b = __select_bucket(htab, tgt_l->hash); b = __select_bucket(htab, tgt_l->hash);
head = &b->head; head = &b->head;
raw_spin_lock_irqsave(&b->lock, flags); flags = htab_lock_bucket(htab, b);
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) { if (l == tgt_l) {
@ -610,7 +700,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
break; break;
} }
raw_spin_unlock_irqrestore(&b->lock, flags); htab_unlock_bucket(htab, b, flags);
return l == tgt_l; return l == tgt_l;
} }
@ -686,15 +776,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
struct htab_elem *l = container_of(head, struct htab_elem, rcu); struct htab_elem *l = container_of(head, struct htab_elem, rcu);
struct bpf_htab *htab = l->htab; struct bpf_htab *htab = l->htab;
/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
* we're calling kfree, otherwise deadlock is possible if kprobes
* are placed somewhere inside of slub
*/
preempt_disable();
__this_cpu_inc(bpf_prog_active);
htab_elem_free(htab, l); htab_elem_free(htab, l);
__this_cpu_dec(bpf_prog_active);
preempt_enable();
} }
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
@ -884,8 +966,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
*/ */
} }
/* bpf_map_update_elem() can be called in_irq() */ flags = htab_lock_bucket(htab, b);
raw_spin_lock_irqsave(&b->lock, flags);
l_old = lookup_elem_raw(head, hash, key, key_size); l_old = lookup_elem_raw(head, hash, key, key_size);
@ -926,7 +1007,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
} }
ret = 0; ret = 0;
err: err:
raw_spin_unlock_irqrestore(&b->lock, flags); htab_unlock_bucket(htab, b, flags);
return ret; return ret;
} }
@ -964,8 +1045,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
return -ENOMEM; return -ENOMEM;
memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
/* bpf_map_update_elem() can be called in_irq() */ flags = htab_lock_bucket(htab, b);
raw_spin_lock_irqsave(&b->lock, flags);
l_old = lookup_elem_raw(head, hash, key, key_size); l_old = lookup_elem_raw(head, hash, key, key_size);
@ -984,7 +1064,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
ret = 0; ret = 0;
err: err:
raw_spin_unlock_irqrestore(&b->lock, flags); htab_unlock_bucket(htab, b, flags);
if (ret) if (ret)
bpf_lru_push_free(&htab->lru, &l_new->lru_node); bpf_lru_push_free(&htab->lru, &l_new->lru_node);
@ -1019,8 +1099,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
b = __select_bucket(htab, hash); b = __select_bucket(htab, hash);
head = &b->head; head = &b->head;
/* bpf_map_update_elem() can be called in_irq() */ flags = htab_lock_bucket(htab, b);
raw_spin_lock_irqsave(&b->lock, flags);
l_old = lookup_elem_raw(head, hash, key, key_size); l_old = lookup_elem_raw(head, hash, key, key_size);
@ -1043,7 +1122,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
} }
ret = 0; ret = 0;
err: err:
raw_spin_unlock_irqrestore(&b->lock, flags); htab_unlock_bucket(htab, b, flags);
return ret; return ret;
} }
@ -1083,8 +1162,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
return -ENOMEM; return -ENOMEM;
} }
/* bpf_map_update_elem() can be called in_irq() */ flags = htab_lock_bucket(htab, b);
raw_spin_lock_irqsave(&b->lock, flags);
l_old = lookup_elem_raw(head, hash, key, key_size); l_old = lookup_elem_raw(head, hash, key, key_size);
@ -1106,7 +1184,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
} }
ret = 0; ret = 0;
err: err:
raw_spin_unlock_irqrestore(&b->lock, flags); htab_unlock_bucket(htab, b, flags);
if (l_new) if (l_new)
bpf_lru_push_free(&htab->lru, &l_new->lru_node); bpf_lru_push_free(&htab->lru, &l_new->lru_node);
return ret; return ret;
@ -1144,7 +1222,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
b = __select_bucket(htab, hash); b = __select_bucket(htab, hash);
head = &b->head; head = &b->head;
raw_spin_lock_irqsave(&b->lock, flags); flags = htab_lock_bucket(htab, b);
l = lookup_elem_raw(head, hash, key, key_size); l = lookup_elem_raw(head, hash, key, key_size);
@ -1154,7 +1232,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
ret = 0; ret = 0;
} }
raw_spin_unlock_irqrestore(&b->lock, flags); htab_unlock_bucket(htab, b, flags);
return ret; return ret;
} }
@ -1176,7 +1254,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
b = __select_bucket(htab, hash); b = __select_bucket(htab, hash);
head = &b->head; head = &b->head;
raw_spin_lock_irqsave(&b->lock, flags); flags = htab_lock_bucket(htab, b);
l = lookup_elem_raw(head, hash, key, key_size); l = lookup_elem_raw(head, hash, key, key_size);
@ -1185,7 +1263,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
ret = 0; ret = 0;
} }
raw_spin_unlock_irqrestore(&b->lock, flags); htab_unlock_bucket(htab, b, flags);
if (l) if (l)
bpf_lru_push_free(&htab->lru, &l->lru_node); bpf_lru_push_free(&htab->lru, &l->lru_node);
return ret; return ret;
@ -1325,8 +1403,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
} }
again: again:
preempt_disable(); bpf_disable_instrumentation();
this_cpu_inc(bpf_prog_active);
rcu_read_lock(); rcu_read_lock();
again_nocopy: again_nocopy:
dst_key = keys; dst_key = keys;
@ -1335,7 +1412,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
head = &b->head; head = &b->head;
/* do not grab the lock unless need it (bucket_cnt > 0). */ /* do not grab the lock unless need it (bucket_cnt > 0). */
if (locked) if (locked)
raw_spin_lock_irqsave(&b->lock, flags); flags = htab_lock_bucket(htab, b);
bucket_cnt = 0; bucket_cnt = 0;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
@ -1352,10 +1429,9 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
/* Note that since bucket_cnt > 0 here, it is implicit /* Note that since bucket_cnt > 0 here, it is implicit
* that the locked was grabbed, so release it. * that the locked was grabbed, so release it.
*/ */
raw_spin_unlock_irqrestore(&b->lock, flags); htab_unlock_bucket(htab, b, flags);
rcu_read_unlock(); rcu_read_unlock();
this_cpu_dec(bpf_prog_active); bpf_enable_instrumentation();
preempt_enable();
goto after_loop; goto after_loop;
} }
@ -1364,10 +1440,9 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
/* Note that since bucket_cnt > 0 here, it is implicit /* Note that since bucket_cnt > 0 here, it is implicit
* that the locked was grabbed, so release it. * that the locked was grabbed, so release it.
*/ */
raw_spin_unlock_irqrestore(&b->lock, flags); htab_unlock_bucket(htab, b, flags);
rcu_read_unlock(); rcu_read_unlock();
this_cpu_dec(bpf_prog_active); bpf_enable_instrumentation();
preempt_enable();
kvfree(keys); kvfree(keys);
kvfree(values); kvfree(values);
goto alloc; goto alloc;
@ -1418,7 +1493,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
dst_val += value_size; dst_val += value_size;
} }
raw_spin_unlock_irqrestore(&b->lock, flags); htab_unlock_bucket(htab, b, flags);
locked = false; locked = false;
while (node_to_free) { while (node_to_free) {
@ -1437,8 +1512,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
} }
rcu_read_unlock(); rcu_read_unlock();
this_cpu_dec(bpf_prog_active); bpf_enable_instrumentation();
preempt_enable();
if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
key_size * bucket_cnt) || key_size * bucket_cnt) ||
copy_to_user(uvalues + total * value_size, values, copy_to_user(uvalues + total * value_size, values,

View File

@ -25,7 +25,7 @@ struct lpm_trie_node {
struct lpm_trie_node __rcu *child[2]; struct lpm_trie_node __rcu *child[2];
u32 prefixlen; u32 prefixlen;
u32 flags; u32 flags;
u8 data[0]; u8 data[];
}; };
struct lpm_trie { struct lpm_trie {
@ -34,7 +34,7 @@ struct lpm_trie {
size_t n_entries; size_t n_entries;
size_t max_prefixlen; size_t max_prefixlen;
size_t data_size; size_t data_size;
raw_spinlock_t lock; spinlock_t lock;
}; };
/* This trie implements a longest prefix match algorithm that can be used to /* This trie implements a longest prefix match algorithm that can be used to
@ -315,7 +315,7 @@ static int trie_update_elem(struct bpf_map *map,
if (key->prefixlen > trie->max_prefixlen) if (key->prefixlen > trie->max_prefixlen)
return -EINVAL; return -EINVAL;
raw_spin_lock_irqsave(&trie->lock, irq_flags); spin_lock_irqsave(&trie->lock, irq_flags);
/* Allocate and fill a new node */ /* Allocate and fill a new node */
@ -422,7 +422,7 @@ static int trie_update_elem(struct bpf_map *map,
kfree(im_node); kfree(im_node);
} }
raw_spin_unlock_irqrestore(&trie->lock, irq_flags); spin_unlock_irqrestore(&trie->lock, irq_flags);
return ret; return ret;
} }
@ -442,7 +442,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
if (key->prefixlen > trie->max_prefixlen) if (key->prefixlen > trie->max_prefixlen)
return -EINVAL; return -EINVAL;
raw_spin_lock_irqsave(&trie->lock, irq_flags); spin_lock_irqsave(&trie->lock, irq_flags);
/* Walk the tree looking for an exact key/length match and keeping /* Walk the tree looking for an exact key/length match and keeping
* track of the path we traverse. We will need to know the node * track of the path we traverse. We will need to know the node
@ -518,7 +518,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
kfree_rcu(node, rcu); kfree_rcu(node, rcu);
out: out:
raw_spin_unlock_irqrestore(&trie->lock, irq_flags); spin_unlock_irqrestore(&trie->lock, irq_flags);
return ret; return ret;
} }
@ -575,7 +575,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
if (ret) if (ret)
goto out_err; goto out_err;
raw_spin_lock_init(&trie->lock); spin_lock_init(&trie->lock);
return &trie->map; return &trie->map;
out_err: out_err:

View File

@ -25,12 +25,18 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
free_percpu(s->freelist); free_percpu(s->freelist);
} }
static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
node->next = head->first;
head->first = node;
}
static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node) struct pcpu_freelist_node *node)
{ {
raw_spin_lock(&head->lock); raw_spin_lock(&head->lock);
node->next = head->first; pcpu_freelist_push_node(head, node);
head->first = node;
raw_spin_unlock(&head->lock); raw_spin_unlock(&head->lock);
} }
@ -56,21 +62,16 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems) u32 nr_elems)
{ {
struct pcpu_freelist_head *head; struct pcpu_freelist_head *head;
unsigned long flags;
int i, cpu, pcpu_entries; int i, cpu, pcpu_entries;
pcpu_entries = nr_elems / num_possible_cpus() + 1; pcpu_entries = nr_elems / num_possible_cpus() + 1;
i = 0; i = 0;
/* disable irq to workaround lockdep false positive
* in bpf usage pcpu_freelist_populate() will never race
* with pcpu_freelist_push()
*/
local_irq_save(flags);
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
again: again:
head = per_cpu_ptr(s->freelist, cpu); head = per_cpu_ptr(s->freelist, cpu);
___pcpu_freelist_push(head, buf); /* No locking required as this is not visible yet. */
pcpu_freelist_push_node(head, buf);
i++; i++;
buf += elem_size; buf += elem_size;
if (i == nr_elems) if (i == nr_elems)
@ -78,7 +79,6 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
if (i % pcpu_entries) if (i % pcpu_entries)
goto again; goto again;
} }
local_irq_restore(flags);
} }
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)

View File

@ -40,6 +40,9 @@ static void do_up_read(struct irq_work *entry)
{ {
struct stack_map_irq_work *work; struct stack_map_irq_work *work;
if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
return;
work = container_of(entry, struct stack_map_irq_work, irq_work); work = container_of(entry, struct stack_map_irq_work, irq_work);
up_read_non_owner(work->sem); up_read_non_owner(work->sem);
work->sem = NULL; work->sem = NULL;
@ -288,10 +291,19 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
struct stack_map_irq_work *work = NULL; struct stack_map_irq_work *work = NULL;
if (irqs_disabled()) { if (irqs_disabled()) {
work = this_cpu_ptr(&up_read_work); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) work = this_cpu_ptr(&up_read_work);
/* cannot queue more up_read, fallback */ if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
/* cannot queue more up_read, fallback */
irq_work_busy = true;
}
} else {
/*
* PREEMPT_RT does not allow to trylock mmap sem in
* interrupt disabled context. Force the fallback code.
*/
irq_work_busy = true; irq_work_busy = true;
}
} }
/* /*

View File

@ -171,11 +171,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
flags); flags);
} }
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from bpf_disable_instrumentation();
* inside bpf map update or delete otherwise deadlocks are possible
*/
preempt_disable();
__this_cpu_inc(bpf_prog_active);
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_update(map, key, value, flags); err = bpf_percpu_hash_update(map, key, value, flags);
@ -206,8 +202,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
err = map->ops->map_update_elem(map, key, value, flags); err = map->ops->map_update_elem(map, key, value, flags);
rcu_read_unlock(); rcu_read_unlock();
} }
__this_cpu_dec(bpf_prog_active); bpf_enable_instrumentation();
preempt_enable();
maybe_wait_bpf_programs(map); maybe_wait_bpf_programs(map);
return err; return err;
@ -222,8 +217,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
if (bpf_map_is_dev_bound(map)) if (bpf_map_is_dev_bound(map))
return bpf_map_offload_lookup_elem(map, key, value); return bpf_map_offload_lookup_elem(map, key, value);
preempt_disable(); bpf_disable_instrumentation();
this_cpu_inc(bpf_prog_active);
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value); err = bpf_percpu_hash_copy(map, key, value);
@ -268,8 +262,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
rcu_read_unlock(); rcu_read_unlock();
} }
this_cpu_dec(bpf_prog_active); bpf_enable_instrumentation();
preempt_enable();
maybe_wait_bpf_programs(map); maybe_wait_bpf_programs(map);
return err; return err;
@ -909,6 +902,21 @@ void bpf_map_inc_with_uref(struct bpf_map *map)
} }
EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
struct bpf_map *bpf_map_get(u32 ufd)
{
struct fd f = fdget(ufd);
struct bpf_map *map;
map = __bpf_map_get(f);
if (IS_ERR(map))
return map;
bpf_map_inc(map);
fdput(f);
return map;
}
struct bpf_map *bpf_map_get_with_uref(u32 ufd) struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{ {
struct fd f = fdget(ufd); struct fd f = fdget(ufd);
@ -1136,13 +1144,11 @@ static int map_delete_elem(union bpf_attr *attr)
goto out; goto out;
} }
preempt_disable(); bpf_disable_instrumentation();
__this_cpu_inc(bpf_prog_active);
rcu_read_lock(); rcu_read_lock();
err = map->ops->map_delete_elem(map, key); err = map->ops->map_delete_elem(map, key);
rcu_read_unlock(); rcu_read_unlock();
__this_cpu_dec(bpf_prog_active); bpf_enable_instrumentation();
preempt_enable();
maybe_wait_bpf_programs(map); maybe_wait_bpf_programs(map);
out: out:
kfree(key); kfree(key);
@ -1254,13 +1260,11 @@ int generic_map_delete_batch(struct bpf_map *map,
break; break;
} }
preempt_disable(); bpf_disable_instrumentation();
__this_cpu_inc(bpf_prog_active);
rcu_read_lock(); rcu_read_lock();
err = map->ops->map_delete_elem(map, key); err = map->ops->map_delete_elem(map, key);
rcu_read_unlock(); rcu_read_unlock();
__this_cpu_dec(bpf_prog_active); bpf_enable_instrumentation();
preempt_enable();
maybe_wait_bpf_programs(map); maybe_wait_bpf_programs(map);
if (err) if (err)
break; break;

View File

@ -367,8 +367,9 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
mutex_unlock(&trampoline_mutex); mutex_unlock(&trampoline_mutex);
} }
/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that /* The logic is similar to BPF_PROG_RUN, but with an explicit
* are needed for trampoline. The macro is split into * rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
* call _bpf_prog_enter * call _bpf_prog_enter
* call prog->bpf_func * call prog->bpf_func
* call __bpf_prog_exit * call __bpf_prog_exit
@ -378,7 +379,7 @@ u64 notrace __bpf_prog_enter(void)
u64 start = 0; u64 start = 0;
rcu_read_lock(); rcu_read_lock();
preempt_disable(); migrate_disable();
if (static_branch_unlikely(&bpf_stats_enabled_key)) if (static_branch_unlikely(&bpf_stats_enabled_key))
start = sched_clock(); start = sched_clock();
return start; return start;
@ -401,7 +402,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
stats->nsecs += sched_clock() - start; stats->nsecs += sched_clock() - start;
u64_stats_update_end(&stats->syncp); u64_stats_update_end(&stats->syncp);
} }
preempt_enable(); migrate_enable();
rcu_read_unlock(); rcu_read_unlock();
} }

View File

@ -8143,26 +8143,48 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
} }
} }
static bool is_preallocated_map(struct bpf_map *map)
{
if (!check_map_prealloc(map))
return false;
if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
return false;
return true;
}
static int check_map_prog_compatibility(struct bpf_verifier_env *env, static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map, struct bpf_map *map,
struct bpf_prog *prog) struct bpf_prog *prog)
{ {
/* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use /*
* preallocated hash maps, since doing memory allocation * Validate that trace type programs use preallocated hash maps.
* in overflow_handler can crash depending on where nmi got *
* triggered. * For programs attached to PERF events this is mandatory as the
* perf NMI can hit any arbitrary code sequence.
*
* All other trace types using preallocated hash maps are unsafe as
* well because tracepoint or kprobes can be inside locked regions
* of the memory allocator or at a place where a recursion into the
* memory allocator would see inconsistent state.
*
* On RT enabled kernels run-time allocation of all trace type
* programs is strictly prohibited due to lock type constraints. On
* !RT kernels it is allowed for backwards compatibility reasons for
* now, but warnings are emitted so developers are made aware of
* the unsafety and can fix their programs before this is enforced.
*/ */
if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { if (is_tracing_prog_type(prog->type) && !is_preallocated_map(map)) {
if (!check_map_prealloc(map)) { if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
verbose(env, "perf_event programs can only use preallocated hash map\n"); verbose(env, "perf_event programs can only use preallocated hash map\n");
return -EINVAL; return -EINVAL;
} }
if (map->inner_map_meta && if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
!check_map_prealloc(map->inner_map_meta)) { verbose(env, "trace type programs can only use preallocated hash map\n");
verbose(env, "perf_event programs can only use preallocated inner hash map\n");
return -EINVAL; return -EINVAL;
} }
WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
} }
if ((is_tracing_prog_type(prog->type) || if ((is_tracing_prog_type(prog->type) ||

View File

@ -9206,7 +9206,6 @@ static void bpf_overflow_handler(struct perf_event *event,
int ret = 0; int ret = 0;
ctx.regs = perf_arch_bpf_user_pt_regs(regs); ctx.regs = perf_arch_bpf_user_pt_regs(regs);
preempt_disable();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out; goto out;
rcu_read_lock(); rcu_read_lock();
@ -9214,7 +9213,6 @@ static void bpf_overflow_handler(struct perf_event *event,
rcu_read_unlock(); rcu_read_unlock();
out: out:
__this_cpu_dec(bpf_prog_active); __this_cpu_dec(bpf_prog_active);
preempt_enable();
if (!ret) if (!ret)
return; return;

View File

@ -268,16 +268,14 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
* All filters in the list are evaluated and the lowest BPF return * All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA). * value always takes priority (ignoring the DATA).
*/ */
preempt_disable();
for (; f; f = f->prev) { for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, sd); u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd);
if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
ret = cur_ret; ret = cur_ret;
*match = f; *match = f;
} }
} }
preempt_enable();
return ret; return ret;
} }
#endif /* CONFIG_SECCOMP_FILTER */ #endif /* CONFIG_SECCOMP_FILTER */

View File

@ -83,7 +83,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
if (in_nmi()) /* not supported yet */ if (in_nmi()) /* not supported yet */
return 1; return 1;
preempt_disable(); cant_sleep();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
/* /*
@ -115,11 +115,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
out: out:
__this_cpu_dec(bpf_prog_active); __this_cpu_dec(bpf_prog_active);
preempt_enable();
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(trace_call_bpf);
#ifdef CONFIG_BPF_KPROBE_OVERRIDE #ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
@ -1516,10 +1514,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
static __always_inline static __always_inline
void __bpf_trace_run(struct bpf_prog *prog, u64 *args) void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{ {
cant_sleep();
rcu_read_lock(); rcu_read_lock();
preempt_disable();
(void) BPF_PROG_RUN(prog, args); (void) BPF_PROG_RUN(prog, args);
preempt_enable();
rcu_read_unlock(); rcu_read_unlock();
} }

View File

@ -1333,8 +1333,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
int size, esize; int size, esize;
int rctx; int rctx;
if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) if (bpf_prog_array_valid(call)) {
return; u32 ret;
preempt_disable();
ret = trace_call_bpf(call, regs);
preempt_enable();
if (!ret)
return;
}
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));

View File

@ -6660,14 +6660,14 @@ static int __run_one(const struct bpf_prog *fp, const void *data,
u64 start, finish; u64 start, finish;
int ret = 0, i; int ret = 0, i;
preempt_disable(); migrate_disable();
start = ktime_get_ns(); start = ktime_get_ns();
for (i = 0; i < runs; i++) for (i = 0; i < runs; i++)
ret = BPF_PROG_RUN(fp, data); ret = BPF_PROG_RUN(fp, data);
finish = ktime_get_ns(); finish = ktime_get_ns();
preempt_enable(); migrate_enable();
*duration = finish - start; *duration = finish - start;
do_div(*duration, runs); do_div(*duration, runs);

View File

@ -37,7 +37,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
repeat = 1; repeat = 1;
rcu_read_lock(); rcu_read_lock();
preempt_disable(); migrate_disable();
time_start = ktime_get_ns(); time_start = ktime_get_ns();
for (i = 0; i < repeat; i++) { for (i = 0; i < repeat; i++) {
bpf_cgroup_storage_set(storage); bpf_cgroup_storage_set(storage);
@ -54,18 +54,18 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
if (need_resched()) { if (need_resched()) {
time_spent += ktime_get_ns() - time_start; time_spent += ktime_get_ns() - time_start;
preempt_enable(); migrate_enable();
rcu_read_unlock(); rcu_read_unlock();
cond_resched(); cond_resched();
rcu_read_lock(); rcu_read_lock();
preempt_disable(); migrate_disable();
time_start = ktime_get_ns(); time_start = ktime_get_ns();
} }
} }
time_spent += ktime_get_ns() - time_start; time_spent += ktime_get_ns() - time_start;
preempt_enable(); migrate_enable();
rcu_read_unlock(); rcu_read_unlock();
do_div(time_spent, repeat); do_div(time_spent, repeat);

View File

@ -8,6 +8,7 @@
#include <linux/bpf.h> #include <linux/bpf.h>
#include <net/bpf_sk_storage.h> #include <net/bpf_sk_storage.h>
#include <net/sock.h> #include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h> #include <uapi/linux/btf.h>
static atomic_t cache_idx; static atomic_t cache_idx;
@ -606,6 +607,14 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
kfree(map); kfree(map);
} }
/* U16_MAX is much more than enough for sk local storage
* considering a tcp_sock is ~2k.
*/
#define MAX_VALUE_SIZE \
min_t(u32, \
(KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \
(U16_MAX - sizeof(struct bpf_sk_storage_elem)))
static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
{ {
if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
@ -619,12 +628,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN)) if (!capable(CAP_SYS_ADMIN))
return -EPERM; return -EPERM;
if (attr->value_size >= KMALLOC_MAX_SIZE - if (attr->value_size > MAX_VALUE_SIZE)
MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) ||
/* U16_MAX is much more than enough for sk local storage
* considering a tcp_sock is ~2k.
*/
attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem))
return -E2BIG; return -E2BIG;
return 0; return 0;
@ -910,3 +914,270 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
.arg1_type = ARG_CONST_MAP_PTR, .arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_SOCKET, .arg2_type = ARG_PTR_TO_SOCKET,
}; };
struct bpf_sk_storage_diag {
u32 nr_maps;
struct bpf_map *maps[];
};
/* The reply will be like:
* INET_DIAG_BPF_SK_STORAGES (nla_nest)
* SK_DIAG_BPF_STORAGE (nla_nest)
* SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
* SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
* SK_DIAG_BPF_STORAGE (nla_nest)
* SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
* SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
* ....
*/
static int nla_value_size(u32 value_size)
{
/* SK_DIAG_BPF_STORAGE (nla_nest)
* SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
* SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
*/
return nla_total_size(0) + nla_total_size(sizeof(u32)) +
nla_total_size_64bit(value_size);
}
void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
{
u32 i;
if (!diag)
return;
for (i = 0; i < diag->nr_maps; i++)
bpf_map_put(diag->maps[i]);
kfree(diag);
}
EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free);
static bool diag_check_dup(const struct bpf_sk_storage_diag *diag,
const struct bpf_map *map)
{
u32 i;
for (i = 0; i < diag->nr_maps; i++) {
if (diag->maps[i] == map)
return true;
}
return false;
}
struct bpf_sk_storage_diag *
bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
{
struct bpf_sk_storage_diag *diag;
struct nlattr *nla;
u32 nr_maps = 0;
int rem, err;
/* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as
* the map_alloc_check() side also does.
*/
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
nla_for_each_nested(nla, nla_stgs, rem) {
if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
nr_maps++;
}
diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps,
GFP_KERNEL);
if (!diag)
return ERR_PTR(-ENOMEM);
nla_for_each_nested(nla, nla_stgs, rem) {
struct bpf_map *map;
int map_fd;
if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
continue;
map_fd = nla_get_u32(nla);
map = bpf_map_get(map_fd);
if (IS_ERR(map)) {
err = PTR_ERR(map);
goto err_free;
}
if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) {
bpf_map_put(map);
err = -EINVAL;
goto err_free;
}
if (diag_check_dup(diag, map)) {
bpf_map_put(map);
err = -EEXIST;
goto err_free;
}
diag->maps[diag->nr_maps++] = map;
}
return diag;
err_free:
bpf_sk_storage_diag_free(diag);
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb)
{
struct nlattr *nla_stg, *nla_value;
struct bpf_sk_storage_map *smap;
/* It cannot exceed max nlattr's payload */
BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE);
nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
if (!nla_stg)
return -EMSGSIZE;
smap = rcu_dereference(sdata->smap);
if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
goto errout;
nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE,
smap->map.value_size,
SK_DIAG_BPF_STORAGE_PAD);
if (!nla_value)
goto errout;
if (map_value_has_spin_lock(&smap->map))
copy_map_value_locked(&smap->map, nla_data(nla_value),
sdata->data, true);
else
copy_map_value(&smap->map, nla_data(nla_value), sdata->data);
nla_nest_end(skb, nla_stg);
return 0;
errout:
nla_nest_cancel(skb, nla_stg);
return -EMSGSIZE;
}
static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
int stg_array_type,
unsigned int *res_diag_size)
{
/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
unsigned int diag_size = nla_total_size(0);
struct bpf_sk_storage *sk_storage;
struct bpf_sk_storage_elem *selem;
struct bpf_sk_storage_map *smap;
struct nlattr *nla_stgs;
unsigned int saved_len;
int err = 0;
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage || hlist_empty(&sk_storage->list)) {
rcu_read_unlock();
return 0;
}
nla_stgs = nla_nest_start(skb, stg_array_type);
if (!nla_stgs)
/* Continue to learn diag_size */
err = -EMSGSIZE;
saved_len = skb->len;
hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
smap = rcu_dereference(SDATA(selem)->smap);
diag_size += nla_value_size(smap->map.value_size);
if (nla_stgs && diag_get(SDATA(selem), skb))
/* Continue to learn diag_size */
err = -EMSGSIZE;
}
rcu_read_unlock();
if (nla_stgs) {
if (saved_len == skb->len)
nla_nest_cancel(skb, nla_stgs);
else
nla_nest_end(skb, nla_stgs);
}
if (diag_size == nla_total_size(0)) {
*res_diag_size = 0;
return 0;
}
*res_diag_size = diag_size;
return err;
}
int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
struct sock *sk, struct sk_buff *skb,
int stg_array_type,
unsigned int *res_diag_size)
{
/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
unsigned int diag_size = nla_total_size(0);
struct bpf_sk_storage *sk_storage;
struct bpf_sk_storage_data *sdata;
struct nlattr *nla_stgs;
unsigned int saved_len;
int err = 0;
u32 i;
*res_diag_size = 0;
/* No map has been specified. Dump all. */
if (!diag->nr_maps)
return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type,
res_diag_size);
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage || hlist_empty(&sk_storage->list)) {
rcu_read_unlock();
return 0;
}
nla_stgs = nla_nest_start(skb, stg_array_type);
if (!nla_stgs)
/* Continue to learn diag_size */
err = -EMSGSIZE;
saved_len = skb->len;
for (i = 0; i < diag->nr_maps; i++) {
sdata = __sk_storage_lookup(sk_storage,
(struct bpf_sk_storage_map *)diag->maps[i],
false);
if (!sdata)
continue;
diag_size += nla_value_size(diag->maps[i]->value_size);
if (nla_stgs && diag_get(sdata, skb))
/* Continue to learn diag_size */
err = -EMSGSIZE;
}
rcu_read_unlock();
if (nla_stgs) {
if (saved_len == skb->len)
nla_nest_cancel(skb, nla_stgs);
else
nla_nest_end(skb, nla_stgs);
}
if (diag_size == nla_total_size(0)) {
*res_diag_size = 0;
return 0;
}
*res_diag_size = diag_size;
return err;
}
EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put);

View File

@ -920,9 +920,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
(int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
flow_keys->flags = flags; flow_keys->flags = flags;
preempt_disable(); result = bpf_prog_run_pin_on_cpu(prog, ctx);
result = BPF_PROG_RUN(prog, ctx);
preempt_enable();
flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen);
flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->thoff = clamp_t(u16, flow_keys->thoff,

View File

@ -628,7 +628,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
struct bpf_prog *prog; struct bpf_prog *prog;
int ret; int ret;
preempt_disable();
rcu_read_lock(); rcu_read_lock();
prog = READ_ONCE(psock->progs.msg_parser); prog = READ_ONCE(psock->progs.msg_parser);
if (unlikely(!prog)) { if (unlikely(!prog)) {
@ -638,7 +637,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
sk_msg_compute_data_pointers(msg); sk_msg_compute_data_pointers(msg);
msg->sk = sk; msg->sk = sk;
ret = BPF_PROG_RUN(prog, msg); ret = bpf_prog_run_pin_on_cpu(prog, msg);
ret = sk_psock_map_verd(ret, msg->sk_redir); ret = sk_psock_map_verd(ret, msg->sk_redir);
psock->apply_bytes = msg->apply_bytes; psock->apply_bytes = msg->apply_bytes;
if (ret == __SK_REDIRECT) { if (ret == __SK_REDIRECT) {
@ -653,7 +652,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
} }
out: out:
rcu_read_unlock(); rcu_read_unlock();
preempt_enable();
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
@ -665,9 +663,7 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
skb->sk = psock->sk; skb->sk = psock->sk;
bpf_compute_data_end_sk_skb(skb); bpf_compute_data_end_sk_skb(skb);
preempt_disable(); ret = bpf_prog_run_pin_on_cpu(prog, skb);
ret = BPF_PROG_RUN(prog, skb);
preempt_enable();
/* strparser clones the skb before handing it to an upper layer, /* strparser clones the skb before handing it to an upper layer,
* meaning skb_orphan has been called. We NULL sk on the way out * meaning skb_orphan has been called. We NULL sk on the way out
* to ensure we don't trigger a BUG_ON() in skb/sk operations * to ensure we don't trigger a BUG_ON() in skb/sk operations

View File

@ -46,16 +46,15 @@ static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
} }
static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc) const struct inet_diag_req_v2 *r)
{ {
inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc); inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r);
} }
static int dccp_diag_dump_one(struct sk_buff *in_skb, static int dccp_diag_dump_one(struct netlink_callback *cb,
const struct nlmsghdr *nlh,
const struct inet_diag_req_v2 *req) const struct inet_diag_req_v2 *req)
{ {
return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req); return inet_diag_dump_one_icsk(&dccp_hashinfo, cb, req);
} }
static const struct inet_diag_handler dccp_diag_handler = { static const struct inet_diag_handler dccp_diag_handler = {

View File

@ -23,6 +23,7 @@
#include <net/inet_hashtables.h> #include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h> #include <net/inet_timewait_sock.h>
#include <net/inet6_hashtables.h> #include <net/inet6_hashtables.h>
#include <net/bpf_sk_storage.h>
#include <net/netlink.h> #include <net/netlink.h>
#include <linux/inet.h> #include <linux/inet.h>
@ -156,26 +157,28 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
} }
EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill);
#define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct sk_buff *skb, struct netlink_callback *cb,
struct user_namespace *user_ns, const struct inet_diag_req_v2 *req,
u32 portid, u32 seq, u16 nlmsg_flags, u16 nlmsg_flags, bool net_admin)
const struct nlmsghdr *unlh,
bool net_admin)
{ {
const struct tcp_congestion_ops *ca_ops; const struct tcp_congestion_ops *ca_ops;
const struct inet_diag_handler *handler; const struct inet_diag_handler *handler;
struct inet_diag_dump_data *cb_data;
int ext = req->idiag_ext; int ext = req->idiag_ext;
struct inet_diag_msg *r; struct inet_diag_msg *r;
struct nlmsghdr *nlh; struct nlmsghdr *nlh;
struct nlattr *attr; struct nlattr *attr;
void *info = NULL; void *info = NULL;
cb_data = cb->data;
handler = inet_diag_table[req->sdiag_protocol]; handler = inet_diag_table[req->sdiag_protocol];
BUG_ON(!handler); BUG_ON(!handler);
nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
nlmsg_flags); cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
if (!nlh) if (!nlh)
return -EMSGSIZE; return -EMSGSIZE;
@ -187,7 +190,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 0; r->idiag_timer = 0;
r->idiag_retrans = 0; r->idiag_retrans = 0;
if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) if (inet_diag_msg_attrs_fill(sk, skb, r, ext,
sk_user_ns(NETLINK_CB(cb->skb).sk),
net_admin))
goto errout; goto errout;
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@ -302,6 +307,48 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
goto errout; goto errout;
} }
/* Keep this at the end so that a retry with a larger skb is
* possible; otherwise do best-effort fitting, which is only
* done for the first_nlmsg.
*/
if (cb_data->bpf_stg_diag) {
bool first_nlmsg = ((unsigned char *)nlh == skb->data);
unsigned int prev_min_dump_alloc;
unsigned int total_nla_size = 0;
unsigned int msg_len;
int err;
msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh;
err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb,
INET_DIAG_SK_BPF_STORAGES,
&total_nla_size);
if (!err)
goto out;
total_nla_size += msg_len;
prev_min_dump_alloc = cb->min_dump_alloc;
if (total_nla_size > prev_min_dump_alloc)
cb->min_dump_alloc = min_t(u32, total_nla_size,
MAX_DUMP_ALLOC_SIZE);
if (!first_nlmsg)
goto errout;
if (cb->min_dump_alloc > prev_min_dump_alloc)
/* Retry with pskb_expand_head() with
* __GFP_DIRECT_RECLAIM
*/
goto errout;
WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc);
/* Send what we have for this sk
* and move on to the next sk in the following
* dump()
*/
}
out: out:
nlmsg_end(skb, nlh); nlmsg_end(skb, nlh);
return 0; return 0;
@ -312,30 +359,19 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
} }
EXPORT_SYMBOL_GPL(inet_sk_diag_fill); EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
static int inet_csk_diag_fill(struct sock *sk,
struct sk_buff *skb,
const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh,
bool net_admin)
{
return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
portid, seq, nlmsg_flags, unlh, net_admin);
}
static int inet_twsk_diag_fill(struct sock *sk, static int inet_twsk_diag_fill(struct sock *sk,
struct sk_buff *skb, struct sk_buff *skb,
u32 portid, u32 seq, u16 nlmsg_flags, struct netlink_callback *cb,
const struct nlmsghdr *unlh) u16 nlmsg_flags)
{ {
struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_timewait_sock *tw = inet_twsk(sk);
struct inet_diag_msg *r; struct inet_diag_msg *r;
struct nlmsghdr *nlh; struct nlmsghdr *nlh;
long tmo; long tmo;
nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
nlmsg_flags); cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
sizeof(*r), nlmsg_flags);
if (!nlh) if (!nlh)
return -EMSGSIZE; return -EMSGSIZE;
@ -359,16 +395,16 @@ static int inet_twsk_diag_fill(struct sock *sk,
} }
static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
u32 portid, u32 seq, u16 nlmsg_flags, struct netlink_callback *cb,
const struct nlmsghdr *unlh, bool net_admin) u16 nlmsg_flags, bool net_admin)
{ {
struct request_sock *reqsk = inet_reqsk(sk); struct request_sock *reqsk = inet_reqsk(sk);
struct inet_diag_msg *r; struct inet_diag_msg *r;
struct nlmsghdr *nlh; struct nlmsghdr *nlh;
long tmo; long tmo;
nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
nlmsg_flags); cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
if (!nlh) if (!nlh)
return -EMSGSIZE; return -EMSGSIZE;
@ -397,21 +433,18 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
} }
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, const struct inet_diag_req_v2 *r,
struct user_namespace *user_ns, u16 nlmsg_flags, bool net_admin)
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh, bool net_admin)
{ {
if (sk->sk_state == TCP_TIME_WAIT) if (sk->sk_state == TCP_TIME_WAIT)
return inet_twsk_diag_fill(sk, skb, portid, seq, return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags);
nlmsg_flags, unlh);
if (sk->sk_state == TCP_NEW_SYN_RECV) if (sk->sk_state == TCP_NEW_SYN_RECV)
return inet_req_diag_fill(sk, skb, portid, seq, return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
nlmsg_flags, unlh, net_admin);
return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags,
nlmsg_flags, unlh, net_admin); net_admin);
} }
struct sock *inet_diag_find_one_icsk(struct net *net, struct sock *inet_diag_find_one_icsk(struct net *net,
@ -459,10 +492,10 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
struct sk_buff *in_skb, struct netlink_callback *cb,
const struct nlmsghdr *nlh,
const struct inet_diag_req_v2 *req) const struct inet_diag_req_v2 *req)
{ {
struct sk_buff *in_skb = cb->skb;
bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
struct net *net = sock_net(in_skb->sk); struct net *net = sock_net(in_skb->sk);
struct sk_buff *rep; struct sk_buff *rep;
@ -479,10 +512,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
goto out; goto out;
} }
err = sk_diag_fill(sk, rep, req, err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, 0, nlh, net_admin);
if (err < 0) { if (err < 0) {
WARN_ON(err == -EMSGSIZE); WARN_ON(err == -EMSGSIZE);
nlmsg_free(rep); nlmsg_free(rep);
@ -509,14 +539,21 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
int err; int err;
handler = inet_diag_lock_handler(req->sdiag_protocol); handler = inet_diag_lock_handler(req->sdiag_protocol);
if (IS_ERR(handler)) if (IS_ERR(handler)) {
err = PTR_ERR(handler); err = PTR_ERR(handler);
else if (cmd == SOCK_DIAG_BY_FAMILY) } else if (cmd == SOCK_DIAG_BY_FAMILY) {
err = handler->dump_one(in_skb, nlh, req); struct inet_diag_dump_data empty_dump_data = {};
else if (cmd == SOCK_DESTROY && handler->destroy) struct netlink_callback cb = {
.nlh = nlh,
.skb = in_skb,
.data = &empty_dump_data,
};
err = handler->dump_one(&cb, req);
} else if (cmd == SOCK_DESTROY && handler->destroy) {
err = handler->destroy(in_skb, req); err = handler->destroy(in_skb, req);
else } else {
err = -EOPNOTSUPP; err = -EOPNOTSUPP;
}
inet_diag_unlock_handler(handler); inet_diag_unlock_handler(handler);
return err; return err;
@ -847,23 +884,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
return len == 0 ? 0 : -EINVAL; return len == 0 ? 0 : -EINVAL;
} }
static int inet_csk_diag_dump(struct sock *sk,
struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
const struct nlattr *bc,
bool net_admin)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
return inet_csk_diag_fill(sk, skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
net_admin);
}
static void twsk_build_assert(void) static void twsk_build_assert(void)
{ {
BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
@ -892,14 +912,17 @@ static void twsk_build_assert(void)
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc) const struct inet_diag_req_v2 *r)
{ {
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct inet_diag_dump_data *cb_data = cb->data;
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);
u32 idiag_states = r->idiag_states; u32 idiag_states = r->idiag_states;
int i, num, s_i, s_num; int i, num, s_i, s_num;
struct nlattr *bc;
struct sock *sk; struct sock *sk;
bc = cb_data->inet_diag_nla_bc;
if (idiag_states & TCPF_SYN_RECV) if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV; idiag_states |= TCPF_NEW_SYN_RECV;
s_i = cb->args[1]; s_i = cb->args[1];
@ -935,8 +958,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport) r->id.idiag_sport)
goto next_listen; goto next_listen;
if (inet_csk_diag_dump(sk, skb, cb, r, if (!inet_diag_bc_sk(bc, sk))
bc, net_admin) < 0) { goto next_listen;
if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
cb, r, NLM_F_MULTI,
net_admin) < 0) {
spin_unlock(&ilb->lock); spin_unlock(&ilb->lock);
goto done; goto done;
} }
@ -1014,11 +1041,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
res = 0; res = 0;
for (idx = 0; idx < accum; idx++) { for (idx = 0; idx < accum; idx++) {
if (res >= 0) { if (res >= 0) {
res = sk_diag_fill(sk_arr[idx], skb, r, res = sk_diag_fill(sk_arr[idx], skb, cb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk), NLM_F_MULTI, net_admin);
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
cb->nlh, net_admin);
if (res < 0) if (res < 0)
num = num_arr[idx]; num = num_arr[idx];
} }
@ -1042,31 +1066,101 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, const struct inet_diag_req_v2 *r)
struct nlattr *bc)
{ {
const struct inet_diag_handler *handler; const struct inet_diag_handler *handler;
u32 prev_min_dump_alloc;
int err = 0; int err = 0;
again:
prev_min_dump_alloc = cb->min_dump_alloc;
handler = inet_diag_lock_handler(r->sdiag_protocol); handler = inet_diag_lock_handler(r->sdiag_protocol);
if (!IS_ERR(handler)) if (!IS_ERR(handler))
handler->dump(skb, cb, r, bc); handler->dump(skb, cb, r);
else else
err = PTR_ERR(handler); err = PTR_ERR(handler);
inet_diag_unlock_handler(handler); inet_diag_unlock_handler(handler);
/* The skb is not large enough to fit one sk info and
* inet_sk_diag_fill() has requested for a larger skb.
*/
if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) {
err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL);
if (!err)
goto again;
}
return err ? : skb->len; return err ? : skb->len;
} }
static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{ {
int hdrlen = sizeof(struct inet_diag_req_v2); return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh));
struct nlattr *bc = NULL; }
if (nlmsg_attrlen(cb->nlh, hdrlen)) static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen)
bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); {
const struct nlmsghdr *nlh = cb->nlh;
struct inet_diag_dump_data *cb_data;
struct sk_buff *skb = cb->skb;
struct nlattr *nla;
int rem, err;
return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL);
if (!cb_data)
return -ENOMEM;
nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen),
nlmsg_attrlen(nlh, hdrlen), rem) {
int type = nla_type(nla);
if (type < __INET_DIAG_REQ_MAX)
cb_data->req_nlas[type] = nla;
}
nla = cb_data->inet_diag_nla_bc;
if (nla) {
err = inet_diag_bc_audit(nla, skb);
if (err) {
kfree(cb_data);
return err;
}
}
nla = cb_data->inet_diag_nla_bpf_stgs;
if (nla) {
struct bpf_sk_storage_diag *bpf_stg_diag;
bpf_stg_diag = bpf_sk_storage_diag_alloc(nla);
if (IS_ERR(bpf_stg_diag)) {
kfree(cb_data);
return PTR_ERR(bpf_stg_diag);
}
cb_data->bpf_stg_diag = bpf_stg_diag;
}
cb->data = cb_data;
return 0;
}
static int inet_diag_dump_start(struct netlink_callback *cb)
{
return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2));
}
static int inet_diag_dump_start_compat(struct netlink_callback *cb)
{
return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req));
}
static int inet_diag_dump_done(struct netlink_callback *cb)
{
struct inet_diag_dump_data *cb_data = cb->data;
bpf_sk_storage_diag_free(cb_data->bpf_stg_diag);
kfree(cb->data);
return 0;
} }
static int inet_diag_type2proto(int type) static int inet_diag_type2proto(int type)
@ -1085,9 +1179,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb,
struct netlink_callback *cb) struct netlink_callback *cb)
{ {
struct inet_diag_req *rc = nlmsg_data(cb->nlh); struct inet_diag_req *rc = nlmsg_data(cb->nlh);
int hdrlen = sizeof(struct inet_diag_req);
struct inet_diag_req_v2 req; struct inet_diag_req_v2 req;
struct nlattr *bc = NULL;
req.sdiag_family = AF_UNSPEC; /* compatibility */ req.sdiag_family = AF_UNSPEC; /* compatibility */
req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
@ -1095,10 +1187,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb,
req.idiag_states = rc->idiag_states; req.idiag_states = rc->idiag_states;
req.id = rc->id; req.id = rc->id;
if (nlmsg_attrlen(cb->nlh, hdrlen)) return __inet_diag_dump(skb, cb, &req);
bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
return __inet_diag_dump(skb, cb, &req, bc);
} }
static int inet_diag_get_exact_compat(struct sk_buff *in_skb, static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
@ -1126,22 +1215,12 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL; return -EINVAL;
if (nlh->nlmsg_flags & NLM_F_DUMP) { if (nlh->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(nlh, hdrlen)) { struct netlink_dump_control c = {
struct nlattr *attr; .start = inet_diag_dump_start_compat,
int err; .done = inet_diag_dump_done,
.dump = inet_diag_dump_compat,
attr = nlmsg_find_attr(nlh, hdrlen, };
INET_DIAG_REQ_BYTECODE); return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
err = inet_diag_bc_audit(attr, skb);
if (err)
return err;
}
{
struct netlink_dump_control c = {
.dump = inet_diag_dump_compat,
};
return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
}
} }
return inet_diag_get_exact_compat(skb, nlh); return inet_diag_get_exact_compat(skb, nlh);
@ -1157,22 +1236,12 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
h->nlmsg_flags & NLM_F_DUMP) { h->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(h, hdrlen)) { struct netlink_dump_control c = {
struct nlattr *attr; .start = inet_diag_dump_start,
int err; .done = inet_diag_dump_done,
.dump = inet_diag_dump,
attr = nlmsg_find_attr(h, hdrlen, };
INET_DIAG_REQ_BYTECODE); return netlink_dump_start(net->diag_nlsk, skb, h, &c);
err = inet_diag_bc_audit(attr, skb);
if (err)
return err;
}
{
struct netlink_dump_control c = {
.dump = inet_diag_dump,
};
return netlink_dump_start(net->diag_nlsk, skb, h, &c);
}
} }
return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h)); return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h));

View File

@ -87,15 +87,16 @@ static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2
return sk ? sk : ERR_PTR(-ENOENT); return sk ? sk : ERR_PTR(-ENOENT);
} }
static int raw_diag_dump_one(struct sk_buff *in_skb, static int raw_diag_dump_one(struct netlink_callback *cb,
const struct nlmsghdr *nlh,
const struct inet_diag_req_v2 *r) const struct inet_diag_req_v2 *r)
{ {
struct net *net = sock_net(in_skb->sk); struct sk_buff *in_skb = cb->skb;
struct sk_buff *rep; struct sk_buff *rep;
struct sock *sk; struct sock *sk;
struct net *net;
int err; int err;
net = sock_net(in_skb->sk);
sk = raw_sock_get(net, r); sk = raw_sock_get(net, r);
if (IS_ERR(sk)) if (IS_ERR(sk))
return PTR_ERR(sk); return PTR_ERR(sk);
@ -108,10 +109,7 @@ static int raw_diag_dump_one(struct sk_buff *in_skb,
return -ENOMEM; return -ENOMEM;
} }
err = inet_sk_diag_fill(sk, NULL, rep, r, err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, 0, nlh,
netlink_net_capable(in_skb, CAP_NET_ADMIN)); netlink_net_capable(in_skb, CAP_NET_ADMIN));
sock_put(sk); sock_put(sk);
@ -136,25 +134,25 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
if (!inet_diag_bc_sk(bc, sk)) if (!inet_diag_bc_sk(bc, sk))
return 0; return 0;
return inet_sk_diag_fill(sk, NULL, skb, r, return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin);
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
cb->nlh, net_admin);
} }
static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc) const struct inet_diag_req_v2 *r)
{ {
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);
struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot; int num, s_num, slot, s_slot;
struct sock *sk = NULL; struct sock *sk = NULL;
struct nlattr *bc;
if (IS_ERR(hashinfo)) if (IS_ERR(hashinfo))
return; return;
cb_data = cb->data;
bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0]; s_slot = cb->args[0];
num = s_num = cb->args[1]; num = s_num = cb->args[1];

View File

@ -179,15 +179,15 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
} }
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc) const struct inet_diag_req_v2 *r)
{ {
inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r);
} }
static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, static int tcp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req) const struct inet_diag_req_v2 *req)
{ {
return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); return inet_diag_dump_one_icsk(&tcp_hashinfo, cb, req);
} }
#ifdef CONFIG_INET_DIAG_DESTROY #ifdef CONFIG_INET_DIAG_DESTROY

View File

@ -21,16 +21,15 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
if (!inet_diag_bc_sk(bc, sk)) if (!inet_diag_bc_sk(bc, sk))
return 0; return 0;
return inet_sk_diag_fill(sk, NULL, skb, req, return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI,
sk_user_ns(NETLINK_CB(cb->skb).sk), net_admin);
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin);
} }
static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, static int udp_dump_one(struct udp_table *tbl,
const struct nlmsghdr *nlh, struct netlink_callback *cb,
const struct inet_diag_req_v2 *req) const struct inet_diag_req_v2 *req)
{ {
struct sk_buff *in_skb = cb->skb;
int err = -EINVAL; int err = -EINVAL;
struct sock *sk = NULL; struct sock *sk = NULL;
struct sk_buff *rep; struct sk_buff *rep;
@ -70,11 +69,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
if (!rep) if (!rep)
goto out; goto out;
err = inet_sk_diag_fill(sk, NULL, rep, req, err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0,
sk_user_ns(NETLINK_CB(in_skb).sk), netlink_net_capable(in_skb, CAP_NET_ADMIN));
NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, 0, nlh,
netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) { if (err < 0) {
WARN_ON(err == -EMSGSIZE); WARN_ON(err == -EMSGSIZE);
kfree_skb(rep); kfree_skb(rep);
@ -93,12 +89,16 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
static void udp_dump(struct udp_table *table, struct sk_buff *skb, static void udp_dump(struct udp_table *table, struct sk_buff *skb,
struct netlink_callback *cb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc) const struct inet_diag_req_v2 *r)
{ {
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);
struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot; int num, s_num, slot, s_slot;
struct nlattr *bc;
cb_data = cb->data;
bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0]; s_slot = cb->args[0];
num = s_num = cb->args[1]; num = s_num = cb->args[1];
@ -146,15 +146,15 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
} }
static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc) const struct inet_diag_req_v2 *r)
{ {
udp_dump(&udp_table, skb, cb, r, bc); udp_dump(&udp_table, skb, cb, r);
} }
static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, static int udp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req) const struct inet_diag_req_v2 *req)
{ {
return udp_dump_one(&udp_table, in_skb, nlh, req); return udp_dump_one(&udp_table, cb, req);
} }
static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
@ -249,16 +249,15 @@ static const struct inet_diag_handler udp_diag_handler = {
}; };
static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, const struct inet_diag_req_v2 *r)
struct nlattr *bc)
{ {
udp_dump(&udplite_table, skb, cb, r, bc); udp_dump(&udplite_table, skb, cb, r);
} }
static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, static int udplite_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req) const struct inet_diag_req_v2 *req)
{ {
return udp_dump_one(&udplite_table, in_skb, nlh, req); return udp_dump_one(&udplite_table, cb, req);
} }
static const struct inet_diag_handler udplite_diag_handler = { static const struct inet_diag_handler udplite_diag_handler = {

View File

@ -380,9 +380,7 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
struct bpf_prog *prog = psock->bpf_prog; struct bpf_prog *prog = psock->bpf_prog;
int res; int res;
preempt_disable(); res = bpf_prog_run_pin_on_cpu(prog, skb);
res = BPF_PROG_RUN(prog, skb);
preempt_enable();
return res; return res;
} }

View File

@ -432,11 +432,12 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo);
} }
static int sctp_diag_dump_one(struct sk_buff *in_skb, static int sctp_diag_dump_one(struct netlink_callback *cb,
const struct nlmsghdr *nlh,
const struct inet_diag_req_v2 *req) const struct inet_diag_req_v2 *req)
{ {
struct sk_buff *in_skb = cb->skb;
struct net *net = sock_net(in_skb->sk); struct net *net = sock_net(in_skb->sk);
const struct nlmsghdr *nlh = cb->nlh;
union sctp_addr laddr, paddr; union sctp_addr laddr, paddr;
struct sctp_comm_param commp = { struct sctp_comm_param commp = {
.skb = in_skb, .skb = in_skb,
@ -470,7 +471,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb,
} }
static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc) const struct inet_diag_req_v2 *r)
{ {
u32 idiag_states = r->idiag_states; u32 idiag_states = r->idiag_states;
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);

View File

@ -1,4 +1,4 @@
#!/usr/bin/python3 #!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
# #
# Copyright (C) 2018-2019 Netronome Systems, Inc. # Copyright (C) 2018-2019 Netronome Systems, Inc.

View File

@ -19,19 +19,24 @@ SYNOPSIS
FEATURE COMMANDS FEATURE COMMANDS
================ ================
| **bpftool** **feature probe** [*COMPONENT*] [**macros** [**prefix** *PREFIX*]] | **bpftool** **feature probe** [*COMPONENT*] [**full**] [**macros** [**prefix** *PREFIX*]]
| **bpftool** **feature help** | **bpftool** **feature help**
| |
| *COMPONENT* := { **kernel** | **dev** *NAME* } | *COMPONENT* := { **kernel** | **dev** *NAME* }
DESCRIPTION DESCRIPTION
=========== ===========
**bpftool feature probe** [**kernel**] [**macros** [**prefix** *PREFIX*]] **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]]
Probe the running kernel and dump a number of eBPF-related Probe the running kernel and dump a number of eBPF-related
parameters, such as availability of the **bpf()** system call, parameters, such as availability of the **bpf()** system call,
JIT status, eBPF program types availability, eBPF helper JIT status, eBPF program types availability, eBPF helper
functions availability, and more. functions availability, and more.
By default, bpftool **does not run probes** for the
**bpf_probe_write_user**\ () and **bpf_trace_printk**\ ()
helpers, which print warnings to kernel logs. To enable them
and run all probes, the **full** keyword should be used (an
example invocation is shown after this section).
If the **macros** keyword (but not the **-j** option) is If the **macros** keyword (but not the **-j** option) is
passed, a subset of the output is dumped as a list of passed, a subset of the output is dumped as a list of
**#define** macros that are ready to be included in a C **#define** macros that are ready to be included in a C
@ -44,16 +49,12 @@ DESCRIPTION
Keyword **kernel** can be omitted. If no probe target is Keyword **kernel** can be omitted. If no probe target is
specified, probing the kernel is the default behaviour. specified, probing the kernel is the default behaviour.
Note that when probed, some eBPF helpers (e.g. **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]]
**bpf_trace_printk**\ () or **bpf_probe_write_user**\ ()) may
print warnings to kernel logs.
**bpftool feature probe dev** *NAME* [**macros** [**prefix** *PREFIX*]]
Probe network device for supported eBPF features and dump Probe network device for supported eBPF features and dump
results to the console. results to the console.
The two keywords **macros** and **prefix** have the same The keywords **full**, **macros** and **prefix** have the
role as when probing the kernel. same role as when probing the kernel.
**bpftool feature help** **bpftool feature help**
Print short help message. Print short help message.

View File

@ -42,7 +42,8 @@ PROG COMMANDS
| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | | **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
| **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | | **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** | | **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** |
| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/getsockopt** | **cgroup/setsockopt** |
| **struct_ops** | **fentry** | **fexit** | **freplace**
| } | }
| *ATTACH_TYPE* := { | *ATTACH_TYPE* := {
| **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** | **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector**

View File

@ -469,7 +469,8 @@ _bpftool()
cgroup/recvmsg4 cgroup/recvmsg6 \ cgroup/recvmsg4 cgroup/recvmsg6 \
cgroup/post_bind4 cgroup/post_bind6 \ cgroup/post_bind4 cgroup/post_bind6 \
cgroup/sysctl cgroup/getsockopt \ cgroup/sysctl cgroup/getsockopt \
cgroup/setsockopt" -- \ cgroup/setsockopt struct_ops \
fentry fexit freplace" -- \
"$cur" ) ) "$cur" ) )
return 0 return 0
;; ;;
@ -983,11 +984,12 @@ _bpftool()
probe) probe)
[[ $prev == "prefix" ]] && return 0 [[ $prev == "prefix" ]] && return 0
if _bpftool_search_list 'macros'; then if _bpftool_search_list 'macros'; then
COMPREPLY+=( $( compgen -W 'prefix' -- "$cur" ) ) _bpftool_once_attr 'prefix'
else else
COMPREPLY+=( $( compgen -W 'macros' -- "$cur" ) ) COMPREPLY+=( $( compgen -W 'macros' -- "$cur" ) )
fi fi
_bpftool_one_of_list 'kernel dev' _bpftool_one_of_list 'kernel dev'
_bpftool_once_attr 'full'
return 0 return 0
;; ;;
*) *)

View File

@ -112,18 +112,12 @@ print_start_section(const char *json_title, const char *plain_title,
} }
} }
static void static void print_end_section(void)
print_end_then_start_section(const char *json_title, const char *plain_title,
const char *define_comment,
const char *define_prefix)
{ {
if (json_output) if (json_output)
jsonw_end_object(json_wtr); jsonw_end_object(json_wtr);
else else
printf("\n"); printf("\n");
print_start_section(json_title, plain_title, define_comment,
define_prefix);
} }
/* Probing functions */ /* Probing functions */
@ -519,14 +513,39 @@ probe_map_type(enum bpf_map_type map_type, const char *define_prefix,
define_prefix); define_prefix);
} }
static void
probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type,
const char *define_prefix, unsigned int id,
const char *ptype_name, __u32 ifindex)
{
bool res;
if (!supported_type)
res = false;
else
res = bpf_probe_helper(id, prog_type, ifindex);
if (json_output) {
if (res)
jsonw_string(json_wtr, helper_name[id]);
} else if (define_prefix) {
printf("#define %sBPF__PROG_TYPE_%s__HELPER_%s %s\n",
define_prefix, ptype_name, helper_name[id],
res ? "1" : "0");
} else {
if (res)
printf("\n\t- %s", helper_name[id]);
}
}
static void static void
probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type,
const char *define_prefix, __u32 ifindex) const char *define_prefix, bool full_mode,
__u32 ifindex)
{ {
const char *ptype_name = prog_type_name[prog_type]; const char *ptype_name = prog_type_name[prog_type];
char feat_name[128]; char feat_name[128];
unsigned int id; unsigned int id;
bool res;
if (ifindex) if (ifindex)
/* Only test helpers for offload-able program types */ /* Only test helpers for offload-able program types */
@ -548,21 +567,19 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type,
} }
for (id = 1; id < ARRAY_SIZE(helper_name); id++) { for (id = 1; id < ARRAY_SIZE(helper_name); id++) {
if (!supported_type) /* Skip helper functions which emit dmesg messages when not in
res = false; * the full mode.
else */
res = bpf_probe_helper(id, prog_type, ifindex); switch (id) {
case BPF_FUNC_trace_printk:
if (json_output) { case BPF_FUNC_probe_write_user:
if (res) if (!full_mode)
jsonw_string(json_wtr, helper_name[id]); continue;
} else if (define_prefix) { /* fallthrough */
printf("#define %sBPF__PROG_TYPE_%s__HELPER_%s %s\n", default:
define_prefix, ptype_name, helper_name[id], probe_helper_for_progtype(prog_type, supported_type,
res ? "1" : "0"); define_prefix, id, ptype_name,
} else { ifindex);
if (res)
printf("\n\t- %s", helper_name[id]);
} }
} }
@ -584,13 +601,132 @@ probe_large_insn_limit(const char *define_prefix, __u32 ifindex)
res, define_prefix); res, define_prefix);
} }
static void
section_system_config(enum probe_component target, const char *define_prefix)
{
switch (target) {
case COMPONENT_KERNEL:
case COMPONENT_UNSPEC:
if (define_prefix)
break;
print_start_section("system_config",
"Scanning system configuration...",
NULL, /* define_comment never used here */
NULL); /* define_prefix always NULL here */
if (check_procfs()) {
probe_unprivileged_disabled();
probe_jit_enable();
probe_jit_harden();
probe_jit_kallsyms();
probe_jit_limit();
} else {
p_info("/* procfs not mounted, skipping related probes */");
}
probe_kernel_image_config();
print_end_section();
break;
default:
break;
}
}
static bool section_syscall_config(const char *define_prefix)
{
bool res;
print_start_section("syscall_config",
"Scanning system call availability...",
"/*** System call availability ***/",
define_prefix);
res = probe_bpf_syscall(define_prefix);
print_end_section();
return res;
}
static void
section_program_types(bool *supported_types, const char *define_prefix,
__u32 ifindex)
{
unsigned int i;
print_start_section("program_types",
"Scanning eBPF program types...",
"/*** eBPF program types ***/",
define_prefix);
for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++)
probe_prog_type(i, supported_types, define_prefix, ifindex);
print_end_section();
}
static void section_map_types(const char *define_prefix, __u32 ifindex)
{
unsigned int i;
print_start_section("map_types",
"Scanning eBPF map types...",
"/*** eBPF map types ***/",
define_prefix);
for (i = BPF_MAP_TYPE_UNSPEC + 1; i < map_type_name_size; i++)
probe_map_type(i, define_prefix, ifindex);
print_end_section();
}
static void
section_helpers(bool *supported_types, const char *define_prefix,
bool full_mode, __u32 ifindex)
{
unsigned int i;
print_start_section("helpers",
"Scanning eBPF helper functions...",
"/*** eBPF helper functions ***/",
define_prefix);
if (define_prefix)
printf("/*\n"
" * Use %sHAVE_PROG_TYPE_HELPER(prog_type_name, helper_name)\n"
" * to determine if <helper_name> is available for <prog_type_name>,\n"
" * e.g.\n"
" * #if %sHAVE_PROG_TYPE_HELPER(xdp, bpf_redirect)\n"
" * // do stuff with this helper\n"
" * #elif\n"
" * // use a workaround\n"
" * #endif\n"
" */\n"
"#define %sHAVE_PROG_TYPE_HELPER(prog_type, helper) \\\n"
" %sBPF__PROG_TYPE_ ## prog_type ## __HELPER_ ## helper\n",
define_prefix, define_prefix, define_prefix,
define_prefix);
for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++)
probe_helpers_for_progtype(i, supported_types[i],
define_prefix, full_mode, ifindex);
print_end_section();
}
static void section_misc(const char *define_prefix, __u32 ifindex)
{
print_start_section("misc",
"Scanning miscellaneous eBPF features...",
"/*** eBPF misc features ***/",
define_prefix);
probe_large_insn_limit(define_prefix, ifindex);
print_end_section();
}
static int do_probe(int argc, char **argv) static int do_probe(int argc, char **argv)
{ {
enum probe_component target = COMPONENT_UNSPEC; enum probe_component target = COMPONENT_UNSPEC;
const char *define_prefix = NULL; const char *define_prefix = NULL;
bool supported_types[128] = {}; bool supported_types[128] = {};
bool full_mode = false;
__u32 ifindex = 0; __u32 ifindex = 0;
unsigned int i;
char *ifname; char *ifname;
/* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN). /* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN).
@ -629,6 +765,9 @@ static int do_probe(int argc, char **argv)
strerror(errno)); strerror(errno));
return -1; return -1;
} }
} else if (is_prefix(*argv, "full")) {
full_mode = true;
NEXT_ARG();
} else if (is_prefix(*argv, "macros") && !define_prefix) { } else if (is_prefix(*argv, "macros") && !define_prefix) {
define_prefix = ""; define_prefix = "";
NEXT_ARG(); NEXT_ARG();
@ -658,97 +797,19 @@ static int do_probe(int argc, char **argv)
jsonw_start_object(json_wtr); jsonw_start_object(json_wtr);
} }
switch (target) { section_system_config(target, define_prefix);
case COMPONENT_KERNEL: if (!section_syscall_config(define_prefix))
case COMPONENT_UNSPEC:
if (define_prefix)
break;
print_start_section("system_config",
"Scanning system configuration...",
NULL, /* define_comment never used here */
NULL); /* define_prefix always NULL here */
if (check_procfs()) {
probe_unprivileged_disabled();
probe_jit_enable();
probe_jit_harden();
probe_jit_kallsyms();
probe_jit_limit();
} else {
p_info("/* procfs not mounted, skipping related probes */");
}
probe_kernel_image_config();
if (json_output)
jsonw_end_object(json_wtr);
else
printf("\n");
break;
default:
break;
}
print_start_section("syscall_config",
"Scanning system call availability...",
"/*** System call availability ***/",
define_prefix);
if (!probe_bpf_syscall(define_prefix))
/* bpf() syscall unavailable, don't probe other BPF features */ /* bpf() syscall unavailable, don't probe other BPF features */
goto exit_close_json; goto exit_close_json;
section_program_types(supported_types, define_prefix, ifindex);
print_end_then_start_section("program_types", section_map_types(define_prefix, ifindex);
"Scanning eBPF program types...", section_helpers(supported_types, define_prefix, full_mode, ifindex);
"/*** eBPF program types ***/", section_misc(define_prefix, ifindex);
define_prefix);
for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++)
probe_prog_type(i, supported_types, define_prefix, ifindex);
print_end_then_start_section("map_types",
"Scanning eBPF map types...",
"/*** eBPF map types ***/",
define_prefix);
for (i = BPF_MAP_TYPE_UNSPEC + 1; i < map_type_name_size; i++)
probe_map_type(i, define_prefix, ifindex);
print_end_then_start_section("helpers",
"Scanning eBPF helper functions...",
"/*** eBPF helper functions ***/",
define_prefix);
if (define_prefix)
printf("/*\n"
" * Use %sHAVE_PROG_TYPE_HELPER(prog_type_name, helper_name)\n"
" * to determine if <helper_name> is available for <prog_type_name>,\n"
" * e.g.\n"
" * #if %sHAVE_PROG_TYPE_HELPER(xdp, bpf_redirect)\n"
" * // do stuff with this helper\n"
" * #elif\n"
" * // use a workaround\n"
" * #endif\n"
" */\n"
"#define %sHAVE_PROG_TYPE_HELPER(prog_type, helper) \\\n"
" %sBPF__PROG_TYPE_ ## prog_type ## __HELPER_ ## helper\n",
define_prefix, define_prefix, define_prefix,
define_prefix);
for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++)
probe_helpers_for_progtype(i, supported_types[i],
define_prefix, ifindex);
print_end_then_start_section("misc",
"Scanning miscellaneous eBPF features...",
"/*** eBPF misc features ***/",
define_prefix);
probe_large_insn_limit(define_prefix, ifindex);
exit_close_json: exit_close_json:
if (json_output) { if (json_output)
/* End current "section" of probes */
jsonw_end_object(json_wtr);
/* End root object */ /* End root object */
jsonw_end_object(json_wtr); jsonw_end_object(json_wtr);
}
return 0; return 0;
} }
@ -761,7 +822,7 @@ static int do_help(int argc, char **argv)
} }
fprintf(stderr, fprintf(stderr,
"Usage: %s %s probe [COMPONENT] [macros [prefix PREFIX]]\n" "Usage: %s %s probe [COMPONENT] [full] [macros [prefix PREFIX]]\n"
" %s %s help\n" " %s %s help\n"
"\n" "\n"
" COMPONENT := { kernel | dev NAME }\n" " COMPONENT := { kernel | dev NAME }\n"

View File

@ -76,6 +76,9 @@ static const char * const prog_type_name[] = {
[BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl",
[BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable",
[BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt", [BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt",
[BPF_PROG_TYPE_TRACING] = "tracing",
[BPF_PROG_TYPE_STRUCT_OPS] = "struct_ops",
[BPF_PROG_TYPE_EXT] = "ext",
}; };
extern const char * const map_type_name[]; extern const char * const map_type_name[];

View File

@ -1573,8 +1573,8 @@ static int do_help(int argc, char **argv)
" cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n"
" cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n"
" cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n" " cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n"
" cgroup/recvmsg6 | cgroup/getsockopt |\n" " cgroup/recvmsg6 | cgroup/getsockopt | cgroup/setsockopt |\n"
" cgroup/setsockopt }\n" " struct_ops | fentry | fexit | freplace }\n"
" ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
" flow_dissector }\n" " flow_dissector }\n"
" " HELP_SPEC_OPTIONS "\n" " " HELP_SPEC_OPTIONS "\n"

View File

@ -3,4 +3,7 @@ gpiogpio-hammer
gpioinclude/ gpioinclude/
gpiolsgpio gpiolsgpio
tpm2/SpaceTest.log tpm2/SpaceTest.log
tpm2/*.pyc
# Python bytecode and cache
__pycache__/
*.py[cod]

View File

@ -20,7 +20,7 @@ CLANG ?= clang
LLC ?= llc LLC ?= llc
LLVM_OBJCOPY ?= llvm-objcopy LLVM_OBJCOPY ?= llvm-objcopy
BPF_GCC ?= $(shell command -v bpf-gcc;) BPF_GCC ?= $(shell command -v bpf-gcc;)
CFLAGS += -g -Wall -O2 $(GENFLAGS) -I$(CURDIR) -I$(APIDIR) \ CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) -I$(CURDIR) -I$(APIDIR) \
-I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) -I$(TOOLSINCDIR) \ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) -I$(TOOLSINCDIR) \
-Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_prog_load=bpf_prog_test_load \
-Dbpf_load_program=bpf_test_load_program -Dbpf_load_program=bpf_test_load_program
@ -62,7 +62,8 @@ TEST_PROGS := test_kmod.sh \
test_tc_tunnel.sh \ test_tc_tunnel.sh \
test_tc_edt.sh \ test_tc_edt.sh \
test_xdping.sh \ test_xdping.sh \
test_bpftool_build.sh test_bpftool_build.sh \
test_bpftool.sh
TEST_PROGS_EXTENDED := with_addr.sh \ TEST_PROGS_EXTENDED := with_addr.sh \
with_tunnels.sh \ with_tunnels.sh \

View File

@ -509,11 +509,6 @@ static void test_syncookie(int type, sa_family_t family)
.pass_on_failure = 0, .pass_on_failure = 0,
}; };
if (type != SOCK_STREAM) {
test__skip();
return;
}
/* /*
* +1 for TCP-SYN and * +1 for TCP-SYN and
* +1 for the TCP-ACK (ack the syncookie) * +1 for the TCP-ACK (ack the syncookie)
@ -787,7 +782,7 @@ static const char *sotype_str(int sotype)
} }
} }
#define TEST_INIT(fn, ...) { fn, #fn, __VA_ARGS__ } #define TEST_INIT(fn_, ...) { .fn = fn_, .name = #fn_, __VA_ARGS__ }
static void test_config(int sotype, sa_family_t family, bool inany) static void test_config(int sotype, sa_family_t family, bool inany)
{ {
@ -795,19 +790,31 @@ static void test_config(int sotype, sa_family_t family, bool inany)
void (*fn)(int sotype, sa_family_t family); void (*fn)(int sotype, sa_family_t family);
const char *name; const char *name;
bool no_inner_map; bool no_inner_map;
int need_sotype;
} tests[] = { } tests[] = {
TEST_INIT(test_err_inner_map, true /* no_inner_map */), TEST_INIT(test_err_inner_map,
.no_inner_map = true),
TEST_INIT(test_err_skb_data), TEST_INIT(test_err_skb_data),
TEST_INIT(test_err_sk_select_port), TEST_INIT(test_err_sk_select_port),
TEST_INIT(test_pass), TEST_INIT(test_pass),
TEST_INIT(test_syncookie), TEST_INIT(test_syncookie,
.need_sotype = SOCK_STREAM),
TEST_INIT(test_pass_on_err), TEST_INIT(test_pass_on_err),
TEST_INIT(test_detach_bpf), TEST_INIT(test_detach_bpf),
}; };
char s[MAX_TEST_NAME]; char s[MAX_TEST_NAME];
const struct test *t; const struct test *t;
/* SOCKMAP/SOCKHASH don't support UDP yet */
if (sotype == SOCK_DGRAM &&
(inner_map_type == BPF_MAP_TYPE_SOCKMAP ||
inner_map_type == BPF_MAP_TYPE_SOCKHASH))
return;
for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
if (t->need_sotype && t->need_sotype != sotype)
continue; /* test not compatible with socket type */
snprintf(s, sizeof(s), "%s %s/%s %s %s", snprintf(s, sizeof(s), "%s %s/%s %s %s",
maptype_str(inner_map_type), maptype_str(inner_map_type),
family_str(family), sotype_str(sotype), family_str(family), sotype_str(sotype),
@ -816,13 +823,6 @@ static void test_config(int sotype, sa_family_t family, bool inany)
if (!test__start_subtest(s)) if (!test__start_subtest(s))
continue; continue;
if (sotype == SOCK_DGRAM &&
inner_map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
/* SOCKMAP/SOCKHASH don't support UDP yet */
test__skip();
continue;
}
setup_per_test(sotype, family, inany, t->no_inner_map); setup_per_test(sotype, family, inany, t->no_inner_map);
t->fn(sotype, family); t->fn(sotype, family);
cleanup_per_test(t->no_inner_map); cleanup_per_test(t->no_inner_map);

View File

@ -0,0 +1,178 @@
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2020 SUSE LLC.
import collections
import functools
import json
import os
import socket
import subprocess
import unittest
# Add the source tree of bpftool and /usr/local/sbin to PATH
cur_dir = os.path.dirname(os.path.realpath(__file__))
bpftool_dir = os.path.abspath(os.path.join(cur_dir, "..", "..", "..", "..",
"tools", "bpf", "bpftool"))
os.environ["PATH"] = bpftool_dir + ":/usr/local/sbin:" + os.environ["PATH"]
class IfaceNotFoundError(Exception):
pass
class UnprivilegedUserError(Exception):
pass
def _bpftool(args, json=True):
_args = ["bpftool"]
if json:
_args.append("-j")
_args.extend(args)
return subprocess.check_output(_args)
def bpftool(args):
return _bpftool(args, json=False).decode("utf-8")
def bpftool_json(args):
res = _bpftool(args)
return json.loads(res)
def get_default_iface():
for iface in socket.if_nameindex():
if iface[1] != "lo":
return iface[1]
raise IfaceNotFoundError("Could not find any network interface to probe")
def default_iface(f):
@functools.wraps(f)
def wrapper(*args, **kwargs):
iface = get_default_iface()
return f(*args, iface, **kwargs)
return wrapper
class TestBpftool(unittest.TestCase):
@classmethod
def setUpClass(cls):
if os.getuid() != 0:
raise UnprivilegedUserError(
"This test suite needs root privileges")
@default_iface
def test_feature_dev_json(self, iface):
unexpected_helpers = [
"bpf_probe_write_user",
"bpf_trace_printk",
]
expected_keys = [
"syscall_config",
"program_types",
"map_types",
"helpers",
"misc",
]
res = bpftool_json(["feature", "probe", "dev", iface])
# Check if the result has all expected keys.
self.assertCountEqual(res.keys(), expected_keys)
# Check if unexpected helpers are not included in helpers probes
# result.
for helpers in res["helpers"].values():
for unexpected_helper in unexpected_helpers:
self.assertNotIn(unexpected_helper, helpers)
def test_feature_kernel(self):
test_cases = [
bpftool_json(["feature", "probe", "kernel"]),
bpftool_json(["feature", "probe"]),
bpftool_json(["feature"]),
]
unexpected_helpers = [
"bpf_probe_write_user",
"bpf_trace_printk",
]
expected_keys = [
"syscall_config",
"system_config",
"program_types",
"map_types",
"helpers",
"misc",
]
for tc in test_cases:
# Check if the result has all expected keys.
self.assertCountEqual(tc.keys(), expected_keys)
# Check if unexpected helpers are not included in helpers probes
# result.
for helpers in tc["helpers"].values():
for unexpected_helper in unexpected_helpers:
self.assertNotIn(unexpected_helper, helpers)
def test_feature_kernel_full(self):
test_cases = [
bpftool_json(["feature", "probe", "kernel", "full"]),
bpftool_json(["feature", "probe", "full"]),
]
expected_helpers = [
"bpf_probe_write_user",
"bpf_trace_printk",
]
for tc in test_cases:
# Check if expected helpers are included at least once in any
# helpers list for any program type. Unfortunately we cannot assume
# that they will be included in all program types or a specific
# subset of programs. It depends on the kernel version and
# configuration.
found_helpers = False
for helpers in tc["helpers"].values():
if all(expected_helper in helpers
for expected_helper in expected_helpers):
found_helpers = True
break
self.assertTrue(found_helpers)
def test_feature_kernel_full_vs_not_full(self):
full_res = bpftool_json(["feature", "probe", "full"])
not_full_res = bpftool_json(["feature", "probe"])
not_full_set = set()
full_set = set()
for helpers in full_res["helpers"].values():
for helper in helpers:
full_set.add(helper)
for helpers in not_full_res["helpers"].values():
for helper in helpers:
not_full_set.add(helper)
self.assertCountEqual(full_set - not_full_set,
{"bpf_probe_write_user", "bpf_trace_printk"})
self.assertCountEqual(not_full_set - full_set, set())
def test_feature_macros(self):
expected_patterns = [
r"/\*\*\* System call availability \*\*\*/",
r"#define HAVE_BPF_SYSCALL",
r"/\*\*\* eBPF program types \*\*\*/",
r"#define HAVE.*PROG_TYPE",
r"/\*\*\* eBPF map types \*\*\*/",
r"#define HAVE.*MAP_TYPE",
r"/\*\*\* eBPF helper functions \*\*\*/",
r"#define HAVE.*HELPER",
r"/\*\*\* eBPF misc features \*\*\*/",
]
res = bpftool(["feature", "probe", "macros"])
for pattern in expected_patterns:
self.assertRegex(res, pattern)

View File

@ -0,0 +1,5 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2020 SUSE LLC.
python3 -m unittest -v test_bpftool.TestBpftool
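For illustration only (not part of the patch): assuming the usual selftests layout under tools/testing/selftests/bpf, the new suite could be run either through the wrapper above or by invoking the unittest module directly; both need root, since setUpClass() refuses to run for unprivileged users:

cd tools/testing/selftests/bpf
sudo ./test_bpftool.sh                                   # wrapper added by this patch
sudo python3 -m unittest -v test_bpftool.TestBpftool     # equivalent direct invocation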

View File

@ -6,6 +6,8 @@
#include "bpf_rlimit.h" #include "bpf_rlimit.h"
#include <argp.h> #include <argp.h>
#include <string.h> #include <string.h>
#include <signal.h>
#include <execinfo.h> /* backtrace */
/* defined in test_progs.h */ /* defined in test_progs.h */
struct test_env env = {}; struct test_env env = {};
@ -617,6 +619,23 @@ int cd_flavor_subdir(const char *exec_name)
return chdir(flavor); return chdir(flavor);
} }
#define MAX_BACKTRACE_SZ 128
void crash_handler(int signum)
{
void *bt[MAX_BACKTRACE_SZ];
size_t sz;
sz = backtrace(bt, ARRAY_SIZE(bt));
if (env.test)
dump_test_log(env.test, true);
if (env.stdout)
stdio_restore();
fprintf(stderr, "Caught signal #%d!\nStack trace:\n", signum);
backtrace_symbols_fd(bt, sz, STDERR_FILENO);
}
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
static const struct argp argp = { static const struct argp argp = {
@ -624,8 +643,14 @@ int main(int argc, char **argv)
.parser = parse_arg, .parser = parse_arg,
.doc = argp_program_doc, .doc = argp_program_doc,
}; };
struct sigaction sigact = {
.sa_handler = crash_handler,
.sa_flags = SA_RESETHAND,
};
int err, i; int err, i;
sigaction(SIGSEGV, &sigact, NULL);
err = argp_parse(&argp, argc, argv, 0, NULL, &env); err = argp_parse(&argp, argc, argv, 0, NULL, &env);
if (err) if (err)
return err; return err;