2022-09-02 14:10:43 -07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/llist.h>
|
|
|
|
#include <linux/bpf.h>
|
|
|
|
#include <linux/irq_work.h>
|
|
|
|
#include <linux/bpf_mem_alloc.h>
|
|
|
|
#include <linux/memcontrol.h>
|
|
|
|
#include <asm/local.h>
|
|
|
|
|
|
|
|
/* Any context (including NMI) BPF specific memory allocator.
|
|
|
|
*
|
|
|
|
* Tracing BPF programs can attach to kprobe and fentry. Hence they
|
|
|
|
* run in unknown context where calling plain kmalloc() might not be safe.
|
|
|
|
*
|
|
|
|
* Front-end kmalloc() with per-cpu per-bucket cache of free elements.
|
|
|
|
* Refill this cache asynchronously from irq_work.
|
|
|
|
*
|
|
|
|
* CPU_0 buckets
|
|
|
|
* 16 32 64 96 128 192 256 512 1024 2048 4096
|
|
|
|
* ...
|
|
|
|
* CPU_N buckets
|
|
|
|
* 16 32 64 96 128 192 256 512 1024 2048 4096
|
|
|
|
*
|
|
|
|
* The buckets are prefilled at the start.
|
|
|
|
* BPF programs always run with migration disabled.
|
|
|
|
* It's safe to allocate from cache of the current cpu with irqs disabled.
|
|
|
|
* Free-ing is always done into bucket of the current cpu as well.
|
|
|
|
* irq_work trims extra free elements from buckets with kfree
|
|
|
|
* and refills them with kmalloc, so global kmalloc logic takes care
|
|
|
|
* of freeing objects allocated by one cpu and freed on another.
|
|
|
|
*
|
|
|
|
* Every allocated object is padded with extra 8 bytes that contain
|
|
|
|
* struct llist_node.
|
|
|
|
*/
|
|
|
|
/* Size of the struct llist_node header that precedes the payload of every
 * object; the alloc/free helpers in this file offset pointers by this amount.
 */
#define LLIST_NODE_SZ sizeof(struct llist_node)
|
|
|
|
|
|
|
|
/* similar to kmalloc, but sizeof == 8 bucket is gone */
|
|
|
|
static u8 size_index[24] __ro_after_init = {
|
|
|
|
3, /* 8 */
|
|
|
|
3, /* 16 */
|
|
|
|
4, /* 24 */
|
|
|
|
4, /* 32 */
|
|
|
|
5, /* 40 */
|
|
|
|
5, /* 48 */
|
|
|
|
5, /* 56 */
|
|
|
|
5, /* 64 */
|
|
|
|
1, /* 72 */
|
|
|
|
1, /* 80 */
|
|
|
|
1, /* 88 */
|
|
|
|
1, /* 96 */
|
|
|
|
6, /* 104 */
|
|
|
|
6, /* 112 */
|
|
|
|
6, /* 120 */
|
|
|
|
6, /* 128 */
|
|
|
|
2, /* 136 */
|
|
|
|
2, /* 144 */
|
|
|
|
2, /* 152 */
|
|
|
|
2, /* 160 */
|
|
|
|
2, /* 168 */
|
|
|
|
2, /* 176 */
|
|
|
|
2, /* 184 */
|
|
|
|
2 /* 192 */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Map an allocation size (in bytes) to an index into the per-cpu cache array.
 *
 * Returns -1 when size is 0 or larger than 4096, otherwise the index of
 * the bucket that serves requests of that size.
 */
static int bpf_mem_cache_idx(size_t size)
{
	if (size == 0 || size > 4096)
		return -1;

	/* Above 192 bytes the index is derived from the highest set bit. */
	if (size > 192)
		return fls(size - 1) - 2;

	/* Small sizes use the lookup table, which stores index + 1. */
	return size_index[(size - 1) / 8] - 1;
}
|
|
|
|
|
|
|
|
/* Number of bpf_mem_cache buckets in struct bpf_mem_caches; also the length
 * of the sizes[] table in bpf_mem_alloc_init().
 */
#define NUM_CACHES 11
|
|
|
|
|
|
|
|
struct bpf_mem_cache {
|
|
|
|
/* per-cpu list of free objects of size 'unit_size'.
|
|
|
|
* All accesses are done with interrupts disabled and 'active' counter
|
|
|
|
* protection with __llist_add() and __llist_del_first().
|
|
|
|
*/
|
|
|
|
struct llist_head free_llist;
|
|
|
|
local_t active;
|
|
|
|
|
|
|
|
/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
|
|
|
|
* are sequenced by per-cpu 'active' counter. But unit_free() cannot
|
|
|
|
* fail. When 'active' is busy the unit_free() will add an object to
|
|
|
|
* free_llist_extra.
|
|
|
|
*/
|
|
|
|
struct llist_head free_llist_extra;
|
|
|
|
|
|
|
|
struct irq_work refill_work;
|
|
|
|
struct obj_cgroup *objcg;
|
|
|
|
int unit_size;
|
|
|
|
/* count of objects in free_llist */
|
|
|
|
int free_cnt;
|
2022-09-02 14:10:50 -07:00
|
|
|
int low_watermark, high_watermark, batch;
|
2022-09-02 14:10:57 -07:00
|
|
|
int percpu_size;
|
2023-07-05 20:34:40 -07:00
|
|
|
bool draining;
|
2022-09-02 14:10:51 -07:00
|
|
|
|
2023-07-05 20:34:34 -07:00
|
|
|
/* list of objects to be freed after RCU tasks trace GP */
|
|
|
|
struct llist_head free_by_rcu_ttrace;
|
|
|
|
struct llist_head waiting_for_gp_ttrace;
|
|
|
|
struct rcu_head rcu_ttrace;
|
|
|
|
atomic_t call_rcu_ttrace_in_progress;
|
2022-09-02 14:10:43 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
struct bpf_mem_caches {
|
|
|
|
struct bpf_mem_cache cache[NUM_CACHES];
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Pop the first node of 'head' using plain (non-atomic) loads and stores.
 * Returns the removed node, or NULL when the list is empty.
 */
static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *first = head->first;

	if (first)
		head->first = first->next;
	return first;
}
|
|
|
|
|
2023-03-22 14:52:42 -07:00
|
|
|
/* Allocate one backing object for cache 'c' on NUMA node 'node'.
 *
 * For a per-cpu cache (c->percpu_size != 0) this is a small header of
 * c->percpu_size bytes whose second pointer slot holds a per-cpu
 * allocation of c->unit_size bytes. Otherwise it is a single zeroed
 * allocation of c->unit_size bytes.
 *
 * Returns the object, or NULL if any allocation failed.
 */
static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	void **hdr;
	void *pptr;

	if (!c->percpu_size)
		/* A freed element may be reused immediately. For htab maps
		 * a reuse that reinitialized special fields in the map value
		 * (e.g. bpf_spin_lock) could race with a lookup that is still
		 * accessing them. Zero the memory once at allocation time
		 * instead, like preallocated maps do. The per-cpu path needs
		 * no __GFP_ZERO since __alloc_percpu_gfp() zeroes implicitly.
		 */
		return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);

	hdr = kmalloc_node(c->percpu_size, flags, node);
	pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
	if (hdr && pptr) {
		hdr[1] = pptr;
		return hdr;
	}

	/* At least one of the two allocations failed; undo the other. */
	free_percpu(pptr);
	kfree(hdr);
	return NULL;
}
|
|
|
|
|
|
|
|
static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
|
|
|
if (c->objcg)
|
|
|
|
return get_mem_cgroup_from_objcg(c->objcg);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_MEMCG
|
|
|
|
return root_mem_cgroup;
|
|
|
|
#else
|
|
|
|
return NULL;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2023-07-05 20:34:38 -07:00
|
|
|
static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
|
2023-07-05 20:34:37 -07:00
|
|
|
{
|
|
|
|
if (IS_ENABLED(CONFIG_PREEMPT_RT))
|
|
|
|
/* In RT irq_work runs in per-cpu kthread, so disable
|
|
|
|
* interrupts to avoid preemption and interrupts and
|
|
|
|
* reduce the chance of bpf prog executing on this cpu
|
|
|
|
* when active counter is busy.
|
|
|
|
*/
|
2023-07-05 20:34:38 -07:00
|
|
|
local_irq_save(*flags);
|
2023-07-05 20:34:37 -07:00
|
|
|
/* alloc_bulk runs from irq_work which will not preempt a bpf
|
|
|
|
* program that does unit_alloc/unit_free since IRQs are
|
|
|
|
* disabled there. There is no race to increment 'active'
|
|
|
|
* counter. It protects free_llist from corruption in case NMI
|
|
|
|
* bpf prog preempted this loop.
|
|
|
|
*/
|
|
|
|
WARN_ON_ONCE(local_inc_return(&c->active) != 1);
|
2023-07-05 20:34:38 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void dec_active(struct bpf_mem_cache *c, unsigned long flags)
|
|
|
|
{
|
2023-07-05 20:34:37 -07:00
|
|
|
local_dec(&c->active);
|
|
|
|
if (IS_ENABLED(CONFIG_PREEMPT_RT))
|
|
|
|
local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
|
2023-07-05 20:34:38 -07:00
|
|
|
static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
inc_active(c, &flags);
|
|
|
|
__llist_add(obj, &c->free_llist);
|
|
|
|
c->free_cnt++;
|
|
|
|
dec_active(c, flags);
|
|
|
|
}
|
|
|
|
|
2022-09-02 14:10:43 -07:00
|
|
|
/* Mostly runs from irq_work except __init phase. */
|
|
|
|
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = NULL, *old_memcg;
|
|
|
|
void *obj;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < cnt; i++) {
|
2022-12-09 09:09:46 +08:00
|
|
|
/*
|
2023-07-05 20:34:34 -07:00
|
|
|
* free_by_rcu_ttrace is only manipulated by irq work refill_work().
|
2022-12-09 09:09:46 +08:00
|
|
|
* IRQ works on the same CPU are called sequentially, so it is
|
|
|
|
* safe to use __llist_del_first() here. If alloc_bulk() is
|
|
|
|
* invoked by the initial prefill, there will be no running
|
|
|
|
* refill_work(), so __llist_del_first() is fine as well.
|
|
|
|
*
|
2023-07-05 20:34:34 -07:00
|
|
|
* In most cases, objects on free_by_rcu_ttrace are from the same CPU.
|
2022-12-09 09:09:46 +08:00
|
|
|
* If some objects come from other CPUs, it doesn't incur any
|
|
|
|
* harm because NUMA_NO_NODE means the preference for current
|
|
|
|
* numa node and it is not a guarantee.
|
|
|
|
*/
|
2023-07-05 20:34:34 -07:00
|
|
|
obj = __llist_del_first(&c->free_by_rcu_ttrace);
|
2023-07-05 20:34:39 -07:00
|
|
|
if (!obj)
|
|
|
|
break;
|
|
|
|
add_obj_to_free_list(c, obj);
|
|
|
|
}
|
|
|
|
if (i >= cnt)
|
|
|
|
return;
|
|
|
|
|
|
|
|
memcg = get_memcg(c);
|
|
|
|
old_memcg = set_active_memcg(memcg);
|
|
|
|
for (; i < cnt; i++) {
|
|
|
|
/* Allocate, but don't deplete atomic reserves that typical
|
|
|
|
* GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
|
|
|
|
* will allocate from the current numa node which is what we
|
|
|
|
* want here.
|
|
|
|
*/
|
|
|
|
obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
|
|
|
|
if (!obj)
|
|
|
|
break;
|
2023-07-05 20:34:37 -07:00
|
|
|
add_obj_to_free_list(c, obj);
|
2022-09-02 14:10:43 -07:00
|
|
|
}
|
|
|
|
set_active_memcg(old_memcg);
|
|
|
|
mem_cgroup_put(memcg);
|
|
|
|
}
|
|
|
|
|
2023-06-06 11:53:08 +08:00
|
|
|
/* Free a single object. For per-cpu objects the per-cpu area referenced
 * by the second pointer slot of 'obj' is released before 'obj' itself.
 */
static void free_one(void *obj, bool percpu)
{
	if (percpu)
		free_percpu(((void **)obj)[1]);

	kfree(obj);
}
|
|
|
|
|
2023-07-05 20:34:36 -07:00
|
|
|
/* Free every object on the chain starting at 'llnode'.
 * Returns the number of objects that were freed.
 */
static int free_all(struct llist_node *llnode, bool percpu)
{
	struct llist_node *next;
	int freed = 0;

	while (llnode) {
		/* Fetch the successor first: free_one() releases the node. */
		next = llnode->next;
		free_one(llnode, percpu);
		freed++;
		llnode = next;
	}
	return freed;
}
|
|
|
|
|
|
|
|
static void __free_rcu(struct rcu_head *head)
|
|
|
|
{
|
2023-07-05 20:34:34 -07:00
|
|
|
struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
|
2023-06-06 11:53:08 +08:00
|
|
|
|
2023-07-05 20:34:34 -07:00
|
|
|
free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
|
|
|
|
atomic_set(&c->call_rcu_ttrace_in_progress, 0);
|
2022-09-02 14:10:51 -07:00
|
|
|
}
|
|
|
|
|
2022-09-02 14:10:55 -07:00
|
|
|
static void __free_rcu_tasks_trace(struct rcu_head *head)
|
|
|
|
{
|
2022-10-14 19:39:44 +08:00
|
|
|
/* If RCU Tasks Trace grace period implies RCU grace period,
|
|
|
|
* there is no need to invoke call_rcu().
|
|
|
|
*/
|
|
|
|
if (rcu_trace_implies_rcu_gp())
|
|
|
|
__free_rcu(head);
|
|
|
|
else
|
|
|
|
call_rcu(head, __free_rcu);
|
2022-09-02 14:10:55 -07:00
|
|
|
}
|
|
|
|
|
2022-09-02 14:10:51 -07:00
|
|
|
static void enque_to_free(struct bpf_mem_cache *c, void *obj)
|
|
|
|
{
|
|
|
|
struct llist_node *llnode = obj;
|
|
|
|
|
|
|
|
/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
|
2023-07-05 20:34:34 -07:00
|
|
|
* Nothing races to add to free_by_rcu_ttrace list.
|
2022-09-02 14:10:51 -07:00
|
|
|
*/
|
2023-07-05 20:34:34 -07:00
|
|
|
__llist_add(llnode, &c->free_by_rcu_ttrace);
|
2022-09-02 14:10:51 -07:00
|
|
|
}
|
|
|
|
|
2023-07-05 20:34:34 -07:00
|
|
|
static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
|
2022-09-02 14:10:51 -07:00
|
|
|
{
|
|
|
|
struct llist_node *llnode, *t;
|
|
|
|
|
2023-07-05 20:34:34 -07:00
|
|
|
if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1))
|
2022-09-02 14:10:51 -07:00
|
|
|
return;
|
|
|
|
|
2023-07-05 20:34:34 -07:00
|
|
|
WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
|
|
|
|
llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu_ttrace))
|
|
|
|
/* There is no concurrent __llist_add(waiting_for_gp_ttrace) access.
|
2022-09-02 14:10:51 -07:00
|
|
|
* It doesn't race with llist_del_all either.
|
2023-07-05 20:34:34 -07:00
|
|
|
* But there could be two concurrent llist_del_all(waiting_for_gp_ttrace):
|
2022-09-02 14:10:51 -07:00
|
|
|
* from __free_rcu() and from drain_mem_cache().
|
|
|
|
*/
|
2023-07-05 20:34:34 -07:00
|
|
|
__llist_add(llnode, &c->waiting_for_gp_ttrace);
|
2023-07-05 20:34:40 -07:00
|
|
|
|
|
|
|
if (unlikely(READ_ONCE(c->draining))) {
|
|
|
|
__free_rcu(&c->rcu_ttrace);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2022-09-02 14:10:55 -07:00
|
|
|
/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
|
2022-10-14 19:39:44 +08:00
|
|
|
* If RCU Tasks Trace grace period implies RCU grace period, free
|
|
|
|
* these elements directly, else use call_rcu() to wait for normal
|
|
|
|
* progs to finish and finally do free_one() on each element.
|
2022-09-02 14:10:55 -07:00
|
|
|
*/
|
2023-07-05 20:34:34 -07:00
|
|
|
call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
|
2022-09-02 14:10:51 -07:00
|
|
|
}
|
|
|
|
|
2022-09-02 14:10:43 -07:00
|
|
|
static void free_bulk(struct bpf_mem_cache *c)
|
|
|
|
{
|
|
|
|
struct llist_node *llnode, *t;
|
|
|
|
unsigned long flags;
|
|
|
|
int cnt;
|
|
|
|
|
|
|
|
do {
|
2023-07-05 20:34:38 -07:00
|
|
|
inc_active(c, &flags);
|
2022-09-02 14:10:43 -07:00
|
|
|
llnode = __llist_del_first(&c->free_llist);
|
|
|
|
if (llnode)
|
|
|
|
cnt = --c->free_cnt;
|
|
|
|
else
|
|
|
|
cnt = 0;
|
2023-07-05 20:34:38 -07:00
|
|
|
dec_active(c, flags);
|
2022-09-19 22:48:11 +08:00
|
|
|
if (llnode)
|
|
|
|
enque_to_free(c, llnode);
|
2022-09-02 14:10:50 -07:00
|
|
|
} while (cnt > (c->high_watermark + c->low_watermark) / 2);
|
2022-09-02 14:10:43 -07:00
|
|
|
|
|
|
|
/* and drain free_llist_extra */
|
|
|
|
llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
|
2022-09-02 14:10:51 -07:00
|
|
|
enque_to_free(c, llnode);
|
2023-07-05 20:34:34 -07:00
|
|
|
do_call_rcu_ttrace(c);
|
2022-09-02 14:10:43 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void bpf_mem_refill(struct irq_work *work)
|
|
|
|
{
|
|
|
|
struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
|
|
|
|
int cnt;
|
|
|
|
|
|
|
|
/* Racy access to free_cnt. It doesn't need to be 100% accurate */
|
|
|
|
cnt = c->free_cnt;
|
2022-09-02 14:10:50 -07:00
|
|
|
if (cnt < c->low_watermark)
|
2022-09-02 14:10:43 -07:00
|
|
|
/* irq_work runs on this cpu and kmalloc will allocate
|
|
|
|
* from the current numa node which is what we want here.
|
|
|
|
*/
|
2022-09-02 14:10:50 -07:00
|
|
|
alloc_bulk(c, c->batch, NUMA_NO_NODE);
|
|
|
|
else if (cnt > c->high_watermark)
|
2022-09-02 14:10:43 -07:00
|
|
|
free_bulk(c);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Queue the refill irq_work of cache 'c'. */
static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	struct irq_work *work = &c->refill_work;

	irq_work_queue(work);
}
|
|
|
|
|
2022-09-02 14:10:50 -07:00
|
|
|
/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
|
|
|
|
* the freelist cache will be elem_size * 64 (or less) on each cpu.
|
|
|
|
*
|
|
|
|
* For bpf programs that don't have statically known allocation sizes and
|
|
|
|
* assuming (low_mark + high_mark) / 2 as an average number of elements per
|
|
|
|
* bucket and all buckets are used the total amount of memory in freelists
|
|
|
|
* on each cpu will be:
|
|
|
|
* 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*192 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
|
|
|
|
* == ~ 116 Kbyte using below heuristic.
|
|
|
|
* Initialized, but unused bpf allocator (not bpf map specific one) will
|
|
|
|
* consume ~ 11 Kbyte per cpu.
|
|
|
|
* Typical case will be between 11K and 116K closer to 11K.
|
|
|
|
* bpf progs can and should share bpf_mem_cache when possible.
|
|
|
|
*/
|
|
|
|
|
2022-09-02 14:10:43 -07:00
|
|
|
static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
|
|
|
|
{
|
|
|
|
init_irq_work(&c->refill_work, bpf_mem_refill);
|
2022-09-02 14:10:50 -07:00
|
|
|
if (c->unit_size <= 256) {
|
|
|
|
c->low_watermark = 32;
|
|
|
|
c->high_watermark = 96;
|
|
|
|
} else {
|
|
|
|
/* When page_size == 4k, order-0 cache will have low_mark == 2
|
|
|
|
* and high_mark == 6 with batch alloc of 3 individual pages at
|
|
|
|
* a time.
|
|
|
|
* 8k allocs and above low == 1, high == 3, batch == 1.
|
|
|
|
*/
|
|
|
|
c->low_watermark = max(32 * 256 / c->unit_size, 1);
|
|
|
|
c->high_watermark = max(96 * 256 / c->unit_size, 3);
|
|
|
|
}
|
|
|
|
c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
|
|
|
|
|
2022-09-02 14:10:43 -07:00
|
|
|
/* To avoid consuming memory assume that 1st run of bpf
|
|
|
|
* prog won't be doing more than 4 map_update_elem from
|
|
|
|
* irq disabled region
|
|
|
|
*/
|
|
|
|
alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
|
|
|
|
}
|
|
|
|
|
2022-09-02 14:10:57 -07:00
|
|
|
/* When size != 0 bpf_mem_cache for each cpu.
|
2022-09-02 14:10:43 -07:00
|
|
|
* This is typical bpf hash map use case when all elements have equal size.
|
|
|
|
*
|
|
|
|
* When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
|
|
|
|
* kmalloc/kfree. Max allocation size is 4096 in this case.
|
|
|
|
* This is bpf_dynptr and bpf_kptr use case.
|
|
|
|
*/
|
2022-09-02 14:10:52 -07:00
|
|
|
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
|
2022-09-02 14:10:43 -07:00
|
|
|
{
|
|
|
|
static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
|
|
|
|
struct bpf_mem_caches *cc, __percpu *pcc;
|
|
|
|
struct bpf_mem_cache *c, __percpu *pc;
|
|
|
|
struct obj_cgroup *objcg = NULL;
|
2022-09-02 14:10:57 -07:00
|
|
|
int cpu, i, unit_size, percpu_size = 0;
|
2022-09-02 14:10:43 -07:00
|
|
|
|
|
|
|
if (size) {
|
|
|
|
pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
|
|
|
|
if (!pc)
|
|
|
|
return -ENOMEM;
|
2022-09-02 14:10:52 -07:00
|
|
|
|
2022-09-02 14:10:57 -07:00
|
|
|
if (percpu)
|
2022-09-02 14:10:52 -07:00
|
|
|
/* room for llist_node and per-cpu pointer */
|
2022-09-02 14:10:57 -07:00
|
|
|
percpu_size = LLIST_NODE_SZ + sizeof(void *);
|
|
|
|
else
|
2022-09-02 14:10:52 -07:00
|
|
|
size += LLIST_NODE_SZ; /* room for llist_node */
|
2022-09-02 14:10:57 -07:00
|
|
|
unit_size = size;
|
2022-09-02 14:10:52 -07:00
|
|
|
|
2022-09-02 14:10:43 -07:00
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
2023-02-10 15:47:33 +00:00
|
|
|
if (memcg_bpf_enabled())
|
|
|
|
objcg = get_obj_cgroup_from_current();
|
2022-09-02 14:10:43 -07:00
|
|
|
#endif
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
c = per_cpu_ptr(pc, cpu);
|
2022-09-02 14:10:52 -07:00
|
|
|
c->unit_size = unit_size;
|
2022-09-02 14:10:43 -07:00
|
|
|
c->objcg = objcg;
|
2022-09-02 14:10:57 -07:00
|
|
|
c->percpu_size = percpu_size;
|
2022-09-02 14:10:43 -07:00
|
|
|
prefill_mem_cache(c, cpu);
|
|
|
|
}
|
|
|
|
ma->cache = pc;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-09-02 14:10:52 -07:00
|
|
|
/* size == 0 && percpu is an invalid combination */
|
|
|
|
if (WARN_ON_ONCE(percpu))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2022-09-02 14:10:43 -07:00
|
|
|
pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
|
|
|
|
if (!pcc)
|
|
|
|
return -ENOMEM;
|
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
|
|
|
objcg = get_obj_cgroup_from_current();
|
|
|
|
#endif
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
cc = per_cpu_ptr(pcc, cpu);
|
|
|
|
for (i = 0; i < NUM_CACHES; i++) {
|
|
|
|
c = &cc->cache[i];
|
|
|
|
c->unit_size = sizes[i];
|
|
|
|
c->objcg = objcg;
|
|
|
|
prefill_mem_cache(c, cpu);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ma->caches = pcc;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void drain_mem_cache(struct bpf_mem_cache *c)
|
|
|
|
{
|
2023-06-06 11:53:08 +08:00
|
|
|
bool percpu = !!c->percpu_size;
|
2022-09-02 14:10:43 -07:00
|
|
|
|
2022-09-02 14:10:58 -07:00
|
|
|
/* No progs are using this bpf_mem_cache, but htab_map_free() called
|
|
|
|
* bpf_mem_cache_free() for all remaining elements and they can be in
|
2023-07-05 20:34:34 -07:00
|
|
|
* free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
|
2022-10-21 19:49:13 +08:00
|
|
|
*
|
2023-07-05 20:34:34 -07:00
|
|
|
* Except for waiting_for_gp_ttrace list, there are no concurrent operations
|
2022-10-21 19:49:13 +08:00
|
|
|
* on these lists, so it is safe to use __llist_del_all().
|
2022-09-02 14:10:51 -07:00
|
|
|
*/
|
2023-07-05 20:34:34 -07:00
|
|
|
free_all(__llist_del_all(&c->free_by_rcu_ttrace), percpu);
|
|
|
|
free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
|
2023-06-06 11:53:08 +08:00
|
|
|
free_all(__llist_del_all(&c->free_llist), percpu);
|
|
|
|
free_all(__llist_del_all(&c->free_llist_extra), percpu);
|
2022-09-02 14:10:43 -07:00
|
|
|
}
|
|
|
|
|
2022-09-02 14:10:58 -07:00
|
|
|
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
|
|
|
|
{
|
|
|
|
free_percpu(ma->cache);
|
|
|
|
free_percpu(ma->caches);
|
|
|
|
ma->cache = NULL;
|
|
|
|
ma->caches = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Wait for pending RCU callbacks, then release the per-cpu areas of 'ma'. */
static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* The waiting_for_gp_ttrace lists were drained, but __free_rcu might
	 * still execute. Wait for it now, before freeing the percpu caches.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu().
	 * So when call_rcu(head, __free_rcu) is skipped because of
	 * rcu_trace_implies_rcu_gp(), it is OK to skip rcu_barrier() based
	 * on rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier_tasks_trace();
	if (!rcu_trace_implies_rcu_gp()) {
		rcu_barrier();
	}

	free_mem_alloc_no_barrier(ma);
}
|
|
|
|
|
|
|
|
static void free_mem_alloc_deferred(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
|
|
|
|
|
|
|
|
free_mem_alloc(ma);
|
|
|
|
kfree(ma);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
|
|
|
|
{
|
|
|
|
struct bpf_mem_alloc *copy;
|
|
|
|
|
|
|
|
if (!rcu_in_progress) {
|
|
|
|
/* Fast path. No callbacks are pending, hence no need to do
|
|
|
|
* rcu_barrier-s.
|
|
|
|
*/
|
|
|
|
free_mem_alloc_no_barrier(ma);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-07-05 20:34:35 -07:00
|
|
|
copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
|
2022-09-02 14:10:58 -07:00
|
|
|
if (!copy) {
|
|
|
|
/* Slow path with inline barrier-s */
|
|
|
|
free_mem_alloc(ma);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Defer barriers into worker to let the rest of map memory to be freed */
|
2023-07-05 20:34:35 -07:00
|
|
|
memset(ma, 0, sizeof(*ma));
|
2022-09-02 14:10:58 -07:00
|
|
|
INIT_WORK(©->work, free_mem_alloc_deferred);
|
|
|
|
queue_work(system_unbound_wq, ©->work);
|
|
|
|
}
|
|
|
|
|
2022-09-02 14:10:43 -07:00
|
|
|
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
|
|
|
|
{
|
|
|
|
struct bpf_mem_caches *cc;
|
|
|
|
struct bpf_mem_cache *c;
|
2022-09-02 14:10:58 -07:00
|
|
|
int cpu, i, rcu_in_progress;
|
2022-09-02 14:10:43 -07:00
|
|
|
|
|
|
|
if (ma->cache) {
|
2022-09-02 14:10:58 -07:00
|
|
|
rcu_in_progress = 0;
|
2022-09-02 14:10:43 -07:00
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
c = per_cpu_ptr(ma->cache, cpu);
|
2023-07-05 20:34:40 -07:00
|
|
|
WRITE_ONCE(c->draining, true);
|
2022-10-21 19:49:12 +08:00
|
|
|
irq_work_sync(&c->refill_work);
|
2022-09-02 14:10:43 -07:00
|
|
|
drain_mem_cache(c);
|
2023-07-05 20:34:34 -07:00
|
|
|
rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
|
2022-09-02 14:10:43 -07:00
|
|
|
}
|
2022-09-02 14:10:57 -07:00
|
|
|
/* objcg is the same across cpus */
|
2022-09-02 14:10:43 -07:00
|
|
|
if (c->objcg)
|
|
|
|
obj_cgroup_put(c->objcg);
|
2022-09-02 14:10:58 -07:00
|
|
|
destroy_mem_alloc(ma, rcu_in_progress);
|
2022-09-02 14:10:43 -07:00
|
|
|
}
|
|
|
|
if (ma->caches) {
|
2022-09-02 14:10:58 -07:00
|
|
|
rcu_in_progress = 0;
|
2022-09-02 14:10:43 -07:00
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
cc = per_cpu_ptr(ma->caches, cpu);
|
|
|
|
for (i = 0; i < NUM_CACHES; i++) {
|
|
|
|
c = &cc->cache[i];
|
2023-07-05 20:34:40 -07:00
|
|
|
WRITE_ONCE(c->draining, true);
|
2022-10-21 19:49:12 +08:00
|
|
|
irq_work_sync(&c->refill_work);
|
2022-09-02 14:10:43 -07:00
|
|
|
drain_mem_cache(c);
|
2023-07-05 20:34:34 -07:00
|
|
|
rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
|
2022-09-02 14:10:43 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (c->objcg)
|
|
|
|
obj_cgroup_put(c->objcg);
|
2022-09-02 14:10:58 -07:00
|
|
|
destroy_mem_alloc(ma, rcu_in_progress);
|
2022-09-02 14:10:43 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* notrace is necessary here and in other functions to make sure
|
|
|
|
* bpf programs cannot attach to them and cause llist corruptions.
|
|
|
|
*/
|
|
|
|
static void notrace *unit_alloc(struct bpf_mem_cache *c)
|
|
|
|
{
|
|
|
|
struct llist_node *llnode = NULL;
|
|
|
|
unsigned long flags;
|
|
|
|
int cnt = 0;
|
|
|
|
|
|
|
|
/* Disable irqs to prevent the following race for majority of prog types:
|
|
|
|
* prog_A
|
|
|
|
* bpf_mem_alloc
|
|
|
|
* preemption or irq -> prog_B
|
|
|
|
* bpf_mem_alloc
|
|
|
|
*
|
|
|
|
* but prog_B could be a perf_event NMI prog.
|
|
|
|
* Use per-cpu 'active' counter to order free_list access between
|
|
|
|
* unit_alloc/unit_free/bpf_mem_refill.
|
|
|
|
*/
|
|
|
|
local_irq_save(flags);
|
|
|
|
if (local_inc_return(&c->active) == 1) {
|
|
|
|
llnode = __llist_del_first(&c->free_llist);
|
|
|
|
if (llnode)
|
|
|
|
cnt = --c->free_cnt;
|
|
|
|
}
|
|
|
|
local_dec(&c->active);
|
|
|
|
local_irq_restore(flags);
|
|
|
|
|
|
|
|
WARN_ON(cnt < 0);
|
|
|
|
|
2022-09-02 14:10:50 -07:00
|
|
|
if (cnt < c->low_watermark)
|
2022-09-02 14:10:43 -07:00
|
|
|
irq_work_raise(c);
|
|
|
|
return llnode;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Though 'ptr' object could have been allocated on a different cpu
|
|
|
|
* add it to the free_llist of the current cpu.
|
|
|
|
* Let kfree() logic deal with it when it's later called from irq_work.
|
|
|
|
*/
|
|
|
|
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
|
|
|
|
{
|
|
|
|
struct llist_node *llnode = ptr - LLIST_NODE_SZ;
|
|
|
|
unsigned long flags;
|
|
|
|
int cnt = 0;
|
|
|
|
|
|
|
|
BUILD_BUG_ON(LLIST_NODE_SZ > 8);
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
if (local_inc_return(&c->active) == 1) {
|
|
|
|
__llist_add(llnode, &c->free_llist);
|
|
|
|
cnt = ++c->free_cnt;
|
|
|
|
} else {
|
|
|
|
/* unit_free() cannot fail. Therefore add an object to atomic
|
|
|
|
* llist. free_bulk() will drain it. Though free_llist_extra is
|
|
|
|
* a per-cpu list we have to use atomic llist_add here, since
|
|
|
|
* it also can be interrupted by bpf nmi prog that does another
|
|
|
|
* unit_free() into the same free_llist_extra.
|
|
|
|
*/
|
|
|
|
llist_add(llnode, &c->free_llist_extra);
|
|
|
|
}
|
|
|
|
local_dec(&c->active);
|
|
|
|
local_irq_restore(flags);
|
|
|
|
|
2022-09-02 14:10:50 -07:00
|
|
|
if (cnt > c->high_watermark)
|
2022-09-02 14:10:43 -07:00
|
|
|
/* free few objects from current cpu into global kmalloc pool */
|
|
|
|
irq_work_raise(c);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Called from BPF program or from sys_bpf syscall.
|
|
|
|
* In both cases migration is disabled.
|
|
|
|
*/
|
|
|
|
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
|
|
|
|
{
|
|
|
|
int idx;
|
|
|
|
void *ret;
|
|
|
|
|
|
|
|
if (!size)
|
|
|
|
return ZERO_SIZE_PTR;
|
|
|
|
|
|
|
|
idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
|
|
|
|
if (idx < 0)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
|
|
|
|
return !ret ? NULL : ret + LLIST_NODE_SZ;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return 'ptr' to the size-bucketed caches of 'ma'. The bucket is chosen
 * from ksize() of the underlying object. A NULL 'ptr', or a size that
 * maps to no bucket, is silently ignored.
 */
void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	int bucket;

	if (!ptr)
		return;

	bucket = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
	if (bucket < 0)
		return;

	unit_free(&this_cpu_ptr(ma->caches)->cache[bucket], ptr);
}
|
|
|
|
|
|
|
|
/* Allocate one object from the single per-cpu cache of 'ma'.
 * Returns a pointer just past the llist_node header, or NULL when
 * unit_alloc() returns no object.
 */
void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *obj = unit_alloc(this_cpu_ptr(ma->cache));

	if (!obj)
		return NULL;
	return obj + LLIST_NODE_SZ;
}
|
|
|
|
|
|
|
|
/* Return 'ptr' to the single per-cpu cache of 'ma'; NULL is ignored. */
void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (ptr)
		unit_free(this_cpu_ptr(ma->cache), ptr);
}
|
2023-03-22 14:52:42 -07:00
|
|
|
|
|
|
|
/* Directly does a kfree() without putting 'ptr' back to the free_llist
|
|
|
|
* for reuse and without waiting for a rcu_tasks_trace gp.
|
|
|
|
* The caller must first go through the rcu_tasks_trace gp for 'ptr'
|
|
|
|
* before calling bpf_mem_cache_raw_free().
|
|
|
|
* It could be used when the rcu_tasks_trace callback does not have
|
|
|
|
* a hold on the original bpf_mem_alloc object that allocated the
|
|
|
|
* 'ptr'. This should only be used in the uncommon code path.
|
|
|
|
* Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
|
|
|
|
* and may affect performance.
|
|
|
|
*/
|
|
|
|
void bpf_mem_cache_raw_free(void *ptr)
|
|
|
|
{
|
|
|
|
if (!ptr)
|
|
|
|
return;
|
|
|
|
|
|
|
|
kfree(ptr - LLIST_NODE_SZ);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* When flags == GFP_KERNEL, it signals that the caller will not cause
|
|
|
|
* deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
|
|
|
|
* kmalloc if the free_llist is empty.
|
|
|
|
*/
|
|
|
|
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
|
|
|
|
{
|
|
|
|
struct bpf_mem_cache *c;
|
|
|
|
void *ret;
|
|
|
|
|
|
|
|
c = this_cpu_ptr(ma->cache);
|
|
|
|
|
|
|
|
ret = unit_alloc(c);
|
|
|
|
if (!ret && flags == GFP_KERNEL) {
|
|
|
|
struct mem_cgroup *memcg, *old_memcg;
|
|
|
|
|
|
|
|
memcg = get_memcg(c);
|
|
|
|
old_memcg = set_active_memcg(memcg);
|
|
|
|
ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
|
|
|
|
set_active_memcg(old_memcg);
|
|
|
|
mem_cgroup_put(memcg);
|
|
|
|
}
|
|
|
|
|
|
|
|
return !ret ? NULL : ret + LLIST_NODE_SZ;
|
|
|
|
}
|