Merge branch 'bpf-reduce-the-use-of-migrate_-disable-enable'

Hou Tao says:

====================
The use of the migrate_{disable|enable} pair in BPF is mainly due to the
introduction of the bpf memory allocator and the use of per-CPU data
structures in its internal implementation. The caller needs to disable
migration before invoking the alloc or free APIs of the bpf memory
allocator, and enable migration after the invocation.
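
As a rough illustration of that contract, here is a minimal sketch; the
foo_map structure and the foo_* helpers are invented for this cover
letter, while migrate_{disable|enable}() and bpf_mem_cache_{alloc|free}()
are the real kernel APIs:

#include <linux/bpf_mem_alloc.h>
#include <linux/preempt.h>

struct foo_map {
        struct bpf_mem_alloc ma;        /* backed by per-CPU caches */
};

static void *foo_alloc(struct foo_map *map)
{
        void *obj;

        /* keep the task on this CPU so the per-CPU cache picked inside
         * the allocator stays valid
         */
        migrate_disable();
        obj = bpf_mem_cache_alloc(&map->ma);
        migrate_enable();
        return obj;
}

static void foo_free(struct foo_map *map, void *obj)
{
        migrate_disable();
        bpf_mem_cache_free(&map->ma, obj);
        migrate_enable();
}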

The main users of the bpf memory allocator are the various kinds of bpf
maps in which the map values, or the special fields in the map values,
are allocated through the bpf memory allocator.

At present, the running context of a bpf program has already disabled
migration, either explicitly or implicitly, so when these maps are
manipulated from a bpf program it is OK not to invoke the
migrate_disable() and migrate_enable() pair. However, that is not always
the case when these maps are manipulated through the bpf syscall, so
many migrate_{disable|enable} pairs were added for maps that can be
manipulated either by a BPF program or through the BPF syscall.
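
For example, sticking with the invented foo_map above, a map operation
reachable from both contexts had to carry the pair solely for the sake
of the syscall path (foo_lookup() is an invented helper; the includes
and struct foo_map from the previous sketch are reused):

void *foo_lookup(struct foo_map *map, void *key);

/* Reachable both from a bpf program, where migration is already
 * disabled, and from the bpf(2) syscall, which runs in a plain,
 * preemptible task context.
 */
static long foo_map_delete_elem(struct foo_map *map, void *key)
{
        void *elem = foo_lookup(map, key);

        if (!elem)
                return -ENOENT;

        /* redundant on the bpf program path, required on the syscall path */
        migrate_disable();
        bpf_mem_cache_free(&map->ma, elem);
        migrate_enable();
        return 0;
}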

The initial idea of reducing the use of migrate_{disable|enable} comes
from Alexei [1]. I turned it into a patch set that achieves the goal
through the following three methods:

1. remove unnecessary migrate_{disable|enable} pairs
When the BPF syscall path also disables migration, it is OK to remove
the pair. Patches #1~#3 fall into this category, while patches #4~#5 are
partially included.

2. move the migrate_{disable|enable} pair from the inner callee to the
   outer caller
Instead of invoking migrate_disable() in the inner callee, invoke it in
the outer caller to simplify reasoning about when migrate_disable() is
needed (see the sketch after this list). Patches #4~#5 and #6~#19 belong
to this category.

3. add a cant_migrate() check in the inner callee
Add a cant_migrate() check in the inner callee to ensure that the
guarantee that migration is disabled is not broken. Patches #1~#5, #13,
and #16~#19 also belong to this category.
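
Combining methods 2 and 3 gives roughly the following shape on the
invented foo_map example: the inner callee only asserts that migration
is disabled, while the outer syscall-side caller owns the pair. Only
cant_migrate(), migrate_{disable|enable}() and bpf_mem_cache_free()
below are real kernel APIs; the rest is illustrative:

/* inner callee: no longer toggles migration itself */
static void foo_elem_free(struct foo_map *map, void *elem)
{
        cant_migrate();         /* catch callers that forgot to disable it */
        bpf_mem_cache_free(&map->ma, elem);
}

/* outer caller on the syscall path; a bpf program calls foo_elem_free()
 * directly because its running context already disables migration
 */
static long foo_delete_elem_from_syscall(struct foo_map *map, void *key)
{
        void *elem = foo_lookup(map, key);      /* invented helper */

        if (!elem)
                return -ENOENT;

        migrate_disable();
        foo_elem_free(map, elem);
        migrate_enable();
        return 0;
}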

Please check the individual patches for more details. Comments are
always welcome.

Change Log:
v2:
  * squash the ->map_free related patches (#10~#12, #15) into one patch
  * remove unnecessary cant_migrate() checks.

v1: https://lore.kernel.org/bpf/20250106081900.1665573-1-houtao@huaweicloud.com
====================

Link: https://patch.msgid.link/20250108010728.207536-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
commit e8ec1c9486 (Alexei Starovoitov, 2025-01-08 18:06:37 -08:00)
12 changed files with 55 additions and 88 deletions

@@ -735,13 +735,13 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
u64 ret = 0;
void *val;
cant_migrate();
if (flags != 0)
return -EINVAL;
is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
array = container_of(map, struct bpf_array, map);
if (is_percpu)
migrate_disable();
for (i = 0; i < map->max_entries; i++) {
if (is_percpu)
val = this_cpu_ptr(array->pptrs[i]);
@@ -756,8 +756,6 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
break;
}
if (is_percpu)
migrate_enable();
return num_elems;
}

@@ -15,22 +15,20 @@ static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
static void bpf_cgrp_storage_lock(void)
{
migrate_disable();
cant_migrate();
this_cpu_inc(bpf_cgrp_storage_busy);
}
static void bpf_cgrp_storage_unlock(void)
{
this_cpu_dec(bpf_cgrp_storage_busy);
migrate_enable();
}
static bool bpf_cgrp_storage_trylock(void)
{
migrate_disable();
cant_migrate();
if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
this_cpu_dec(bpf_cgrp_storage_busy);
migrate_enable();
return false;
}
return true;
@@ -47,17 +45,18 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
if (!local_storage) {
rcu_read_unlock();
return;
}
if (!local_storage)
goto out;
bpf_cgrp_storage_lock();
bpf_local_storage_destroy(local_storage);
bpf_cgrp_storage_unlock();
out:
rcu_read_unlock();
migrate_enable();
}
static struct bpf_local_storage_data *

@@ -62,16 +62,17 @@ void bpf_inode_storage_free(struct inode *inode)
if (!bsb)
return;
migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(bsb->storage);
if (!local_storage) {
rcu_read_unlock();
return;
}
if (!local_storage)
goto out;
bpf_local_storage_destroy(local_storage);
out:
rcu_read_unlock();
migrate_enable();
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)

@@ -81,9 +81,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
if (smap->bpf_ma) {
migrate_disable();
selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
migrate_enable();
if (selem)
/* Keep the original bpf_map_kzalloc behavior
* before started using the bpf_mem_cache_alloc.
@@ -174,17 +172,14 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
return;
}
if (smap) {
migrate_disable();
if (smap)
bpf_mem_cache_free(&smap->storage_ma, local_storage);
migrate_enable();
} else {
else
/* smap could be NULL if the selem that triggered
* this 'local_storage' creation had been long gone.
* In this case, directly do call_rcu().
*/
call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
}
}
/* rcu tasks trace callback for bpf_ma == false */
@@ -217,7 +212,10 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
/* The bpf_local_storage_map_free will wait for rcu_barrier */
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
migrate_disable();
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
migrate_enable();
bpf_mem_cache_raw_free(selem);
}
@@ -256,9 +254,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
* bpf_mem_cache_free will be able to reuse selem
* immediately.
*/
migrate_disable();
bpf_mem_cache_free(&smap->selem_ma, selem);
migrate_enable();
return;
}
@@ -497,15 +493,11 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
if (smap->bpf_ma) {
migrate_disable();
if (smap->bpf_ma)
storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
migrate_enable();
} else {
else
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
gfp_flags | __GFP_NOWARN);
}
if (!storage) {
err = -ENOMEM;
goto uncharge;
@@ -902,15 +894,11 @@ void bpf_local_storage_map_free(struct bpf_map *map,
while ((selem = hlist_entry_safe(
rcu_dereference_raw(hlist_first_rcu(&b->list)),
struct bpf_local_storage_elem, map_node))) {
if (busy_counter) {
migrate_disable();
if (busy_counter)
this_cpu_inc(*busy_counter);
}
bpf_selem_unlink(selem, true);
if (busy_counter) {
if (busy_counter)
this_cpu_dec(*busy_counter);
migrate_enable();
}
cond_resched_rcu();
}
rcu_read_unlock();

@@ -24,22 +24,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
migrate_disable();
cant_migrate();
this_cpu_inc(bpf_task_storage_busy);
}
static void bpf_task_storage_unlock(void)
{
this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
}
static bool bpf_task_storage_trylock(void)
{
migrate_disable();
cant_migrate();
if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
return false;
}
return true;
@@ -72,18 +70,19 @@ void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(task->bpf_storage);
if (!local_storage) {
rcu_read_unlock();
return;
}
if (!local_storage)
goto out;
bpf_task_storage_lock();
bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
out:
rcu_read_unlock();
migrate_enable();
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)

@@ -91,9 +91,7 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
if (!refcount_dec_and_test(&cpumask->usage))
return;
migrate_disable();
bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
migrate_enable();
}
__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)

@@ -897,11 +897,9 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
check_and_free_fields(htab, l);
migrate_disable();
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
bpf_mem_cache_free(&htab->ma, l);
migrate_enable();
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -1502,10 +1500,9 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
/* It's called from a worker thread, so disable migration here,
* since bpf_mem_cache_free() relies on that.
/* It's called from a worker thread and migration has been disabled,
* therefore, it is OK to invoke bpf_mem_cache_free() directly.
*/
migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
@@ -1517,7 +1514,6 @@ static void delete_all_elements(struct bpf_htab *htab)
}
cond_resched();
}
migrate_enable();
}
static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
@@ -2208,17 +2204,18 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
bool is_percpu;
u64 ret = 0;
cant_migrate();
if (flags != 0)
return -EINVAL;
is_percpu = htab_is_percpu(htab);
roundup_key_size = round_up(map->key_size, 8);
/* disable migration so percpu value prepared here will be the
* same as the one seen by the bpf program with bpf_map_lookup_elem().
/* migration has been disabled, so percpu value prepared here will be
* the same as the one seen by the bpf program with
* bpf_map_lookup_elem().
*/
if (is_percpu)
migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
b = &htab->buckets[i];
rcu_read_lock();
@@ -2244,8 +2241,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
rcu_read_unlock();
}
out:
if (is_percpu)
migrate_enable();
return num_elems;
}

@@ -2066,9 +2066,7 @@ unlock:
/* The contained type can also have resources, including a
* bpf_list_head which needs to be freed.
*/
migrate_disable();
__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
migrate_enable();
}
}
@@ -2105,9 +2103,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
obj -= field->graph_root.node_offset;
migrate_disable();
__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
migrate_enable();
}
}

@@ -289,16 +289,11 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
}
static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
const void *value,
bool disable_migration)
const void *value)
{
struct lpm_trie_node *node;
if (disable_migration)
migrate_disable();
node = bpf_mem_cache_alloc(&trie->ma);
if (disable_migration)
migrate_enable();
if (!node)
return NULL;
@@ -342,10 +337,8 @@ static long trie_update_elem(struct bpf_map *map,
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
/* Allocate and fill a new node. Need to disable migration before
* invoking bpf_mem_cache_alloc().
*/
new_node = lpm_trie_node_alloc(trie, value, true);
/* Allocate and fill a new node */
new_node = lpm_trie_node_alloc(trie, value);
if (!new_node)
return -ENOMEM;
@@ -425,8 +418,7 @@ static long trie_update_elem(struct bpf_map *map,
goto out;
}
/* migration is disabled within the locked scope */
im_node = lpm_trie_node_alloc(trie, NULL, false);
im_node = lpm_trie_node_alloc(trie, NULL);
if (!im_node) {
trie->n_entries--;
ret = -ENOMEM;
@@ -452,11 +444,9 @@ static long trie_update_elem(struct bpf_map *map,
out:
raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
migrate_disable();
if (ret)
bpf_mem_cache_free(&trie->ma, new_node);
bpf_mem_cache_free_rcu(&trie->ma, free_node);
migrate_enable();
return ret;
}
@@ -555,10 +545,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
out:
raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
migrate_disable();
bpf_mem_cache_free_rcu(&trie->ma, free_parent);
bpf_mem_cache_free_rcu(&trie->ma, free_node);
migrate_enable();
return ret;
}

@@ -259,9 +259,7 @@ void range_tree_destroy(struct range_tree *rt)
while ((rn = range_it_iter_first(rt, 0, -1U))) {
range_it_remove(rn, rt);
migrate_disable();
bpf_mem_free(&bpf_global_ma, rn);
migrate_enable();
}
}

@@ -796,11 +796,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
if (!btf_is_kernel(field->kptr.btf)) {
pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
field->kptr.btf_id);
migrate_disable();
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
pointee_struct_meta->record : NULL,
fields[i].type == BPF_KPTR_PERCPU);
migrate_enable();
} else {
field->kptr.dtor(xchgd_field);
}
@@ -835,8 +833,14 @@ static void bpf_map_free(struct bpf_map *map)
struct btf_record *rec = map->record;
struct btf *btf = map->btf;
/* implementation dependent freeing */
/* implementation dependent freeing. Disabling migration to simplify
* the free of values or special fields allocated from bpf memory
* allocator.
*/
migrate_disable();
map->ops->map_free(map);
migrate_enable();
/* Delay freeing of btf_record for maps, as map_free
* callback usually needs access to them. It is better to do it here
* than require each callback to do the free itself manually.

@@ -50,15 +50,16 @@ void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_local_storage *sk_storage;
migrate_disable();
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage) {
rcu_read_unlock();
return;
}
if (!sk_storage)
goto out;
bpf_local_storage_destroy(sk_storage);
out:
rcu_read_unlock();
migrate_enable();
}
static void bpf_sk_storage_map_free(struct bpf_map *map)
@@ -160,6 +161,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
migrate_disable();
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
@@ -212,6 +214,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
out:
rcu_read_unlock();
migrate_enable();
/* In case of an error, don't free anything explicitly here, the
* caller is responsible to call bpf_sk_storage_free.