linux-stable/fs/bcachefs/btree_locking.c
Kent Overstreet c4accde498 bcachefs: Ensure srcu lock is not held too long
The SRCU read lock that btree_trans takes exists to make it safe for
bch2_trans_relock() to deref pointers to btree nodes/key cache items we
don't have locked, but as a side effect it blocks reclaim from freeing
those items.

Thus, it's important to not hold it for too long: we need to
differentiate between bch2_trans_unlock() calls that will be only for a
short duration, and ones that will be for an unbounded duration.

This introduces bch2_trans_unlock_long(), to be used mainly by the data
move paths.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-11-04 14:17:11 -04:00

818 lines
19 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_locking.h"
#include "btree_types.h"
static struct lock_class_key bch2_btree_node_lock_key;
void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
enum six_lock_init_flags flags)
{
__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
lockdep_set_novalidate_class(&b->lock);
}
#ifdef CONFIG_LOCKDEP
void bch2_assert_btree_nodes_not_locked(void)
{
#if 0
//Re-enable when lock_class_is_held() is merged:
BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
#endif
}
#endif
/* Btree node locking: */
struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
struct btree_path *skip,
struct btree_bkey_cached_common *b,
unsigned level)
{
struct btree_path *path;
struct six_lock_count ret;
memset(&ret, 0, sizeof(ret));
if (IS_ERR_OR_NULL(b))
return ret;
trans_for_each_path(trans, path)
if (path != skip && &path->l[level].b->c == b) {
int t = btree_node_locked_type(path, level);
if (t != BTREE_NODE_UNLOCKED)
ret.n[t]++;
}
return ret;
}
/* unlock */
void bch2_btree_node_unlock_write(struct btree_trans *trans,
struct btree_path *path, struct btree *b)
{
bch2_btree_node_unlock_write_inlined(trans, path, b);
}
/* lock */
/*
* @trans wants to lock @b with type @type
*/
struct trans_waiting_for_lock {
struct btree_trans *trans;
struct btree_bkey_cached_common *node_want;
enum six_lock_type lock_want;
/* for iterating over held locks :*/
u8 path_idx;
u8 level;
u64 lock_start_time;
};
struct lock_graph {
struct trans_waiting_for_lock g[8];
unsigned nr;
};
static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
{
struct trans_waiting_for_lock *i;
prt_printf(out, "Found lock cycle (%u entries):", g->nr);
prt_newline(out);
for (i = g->g; i < g->g + g->nr; i++)
bch2_btree_trans_to_text(out, i->trans);
}
static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
{
struct trans_waiting_for_lock *i;
for (i = g->g; i != g->g + g->nr; i++) {
if (i != g->g)
prt_str(out, "<- ");
prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
}
prt_newline(out);
}
static void lock_graph_up(struct lock_graph *g)
{
closure_put(&g->g[--g->nr].trans->ref);
}
static noinline void lock_graph_pop_all(struct lock_graph *g)
{
while (g->nr)
lock_graph_up(g);
}
static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
{
g->g[g->nr++] = (struct trans_waiting_for_lock) {
.trans = trans,
.node_want = trans->locking,
.lock_want = trans->locking_wait.lock_want,
};
}
static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
{
closure_get(&trans->ref);
__lock_graph_down(g, trans);
}
static bool lock_graph_remove_non_waiters(struct lock_graph *g)
{
struct trans_waiting_for_lock *i;
for (i = g->g + 1; i < g->g + g->nr; i++)
if (i->trans->locking != i->node_want ||
i->trans->locking_wait.start_time != i[-1].lock_start_time) {
while (g->g + g->nr > i)
lock_graph_up(g);
return true;
}
return false;
}
static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
if (i == g->g) {
trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_);
return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
} else {
i->trans->lock_must_abort = true;
wake_up_process(i->trans->locking_wait.task);
return 0;
}
}
static int btree_trans_abort_preference(struct btree_trans *trans)
{
if (trans->lock_may_not_fail)
return 0;
if (trans->locking_wait.lock_want == SIX_LOCK_write)
return 1;
if (!trans->in_traverse_all)
return 2;
return 3;
}
static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
{
struct trans_waiting_for_lock *i, *abort = NULL;
unsigned best = 0, pref;
int ret;
if (lock_graph_remove_non_waiters(g))
return 0;
/* Only checking, for debugfs: */
if (cycle) {
print_cycle(cycle, g);
ret = -1;
goto out;
}
for (i = g->g; i < g->g + g->nr; i++) {
pref = btree_trans_abort_preference(i->trans);
if (pref > best) {
abort = i;
best = pref;
}
}
if (unlikely(!best)) {
struct printbuf buf = PRINTBUF;
prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
for (i = g->g; i < g->g + g->nr; i++) {
struct btree_trans *trans = i->trans;
bch2_btree_trans_to_text(&buf, trans);
prt_printf(&buf, "backtrace:");
prt_newline(&buf);
printbuf_indent_add(&buf, 2);
bch2_prt_task_backtrace(&buf, trans->locking_wait.task);
printbuf_indent_sub(&buf, 2);
prt_newline(&buf);
}
bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
BUG();
}
ret = abort_lock(g, abort);
out:
if (ret)
while (g->nr)
lock_graph_up(g);
return ret;
}
static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
struct printbuf *cycle)
{
struct btree_trans *orig_trans = g->g->trans;
struct trans_waiting_for_lock *i;
for (i = g->g; i < g->g + g->nr; i++)
if (i->trans == trans) {
closure_put(&trans->ref);
return break_cycle(g, cycle);
}
if (g->nr == ARRAY_SIZE(g->g)) {
closure_put(&trans->ref);
if (orig_trans->lock_may_not_fail)
return 0;
while (g->nr)
lock_graph_up(g);
if (cycle)
return 0;
trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
}
__lock_graph_down(g, trans);
return 0;
}
static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
{
return t1 + t2 > 1;
}
int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
{
struct lock_graph g;
struct trans_waiting_for_lock *top;
struct btree_bkey_cached_common *b;
struct btree_path *path;
unsigned path_idx;
int ret;
if (trans->lock_must_abort) {
if (cycle)
return -1;
trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
}
g.nr = 0;
lock_graph_down(&g, trans);
next:
if (!g.nr)
return 0;
top = &g.g[g.nr - 1];
trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) {
if (!path->nodes_locked)
continue;
if (path_idx != top->path_idx) {
top->path_idx = path_idx;
top->level = 0;
top->lock_start_time = 0;
}
for (;
top->level < BTREE_MAX_DEPTH;
top->level++, top->lock_start_time = 0) {
int lock_held = btree_node_locked_type(path, top->level);
if (lock_held == BTREE_NODE_UNLOCKED)
continue;
b = &READ_ONCE(path->l[top->level].b)->c;
if (IS_ERR_OR_NULL(b)) {
/*
* If we get here, it means we raced with the
* other thread updating its btree_path
* structures - which means it can't be blocked
* waiting on a lock:
*/
if (!lock_graph_remove_non_waiters(&g)) {
/*
* If lock_graph_remove_non_waiters()
* didn't do anything, it must be
* because we're being called by debugfs
* checking for lock cycles, which
* invokes us on btree_transactions that
* aren't actually waiting on anything.
* Just bail out:
*/
lock_graph_pop_all(&g);
}
goto next;
}
if (list_empty_careful(&b->lock.wait_list))
continue;
raw_spin_lock(&b->lock.wait_lock);
list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
BUG_ON(b != trans->locking);
if (top->lock_start_time &&
time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
continue;
top->lock_start_time = trans->locking_wait.start_time;
/* Don't check for self deadlock: */
if (trans == top->trans ||
!lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
continue;
closure_get(&trans->ref);
raw_spin_unlock(&b->lock.wait_lock);
ret = lock_graph_descend(&g, trans, cycle);
if (ret)
return ret;
goto next;
}
raw_spin_unlock(&b->lock.wait_lock);
}
}
if (g.nr > 1 && cycle)
print_chain(cycle, &g);
lock_graph_up(&g);
goto next;
}
int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
{
struct btree_trans *trans = p;
return bch2_check_for_deadlock(trans, NULL);
}
int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
struct btree_bkey_cached_common *b,
bool lock_may_not_fail)
{
int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
int ret;
/*
* Must drop our read locks before calling six_lock_write() -
* six_unlock() won't do wakeups until the reader count
* goes to 0, and it's safe because we have the node intent
* locked:
*/
six_lock_readers_add(&b->lock, -readers);
ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
lock_may_not_fail, _RET_IP_);
six_lock_readers_add(&b->lock, readers);
if (ret)
mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);
return ret;
}
void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
struct btree_path *linked;
unsigned i;
int ret;
/*
* XXX BIG FAT NOTICE
*
* Drop all read locks before taking a write lock:
*
* This is a hack, because bch2_btree_node_lock_write_nofail() is a
* hack - but by dropping read locks first, this should never fail, and
* we only use this in code paths where whatever read locks we've
* already taken are no longer needed:
*/
trans_for_each_path(trans, linked) {
if (!linked->nodes_locked)
continue;
for (i = 0; i < BTREE_MAX_DEPTH; i++)
if (btree_node_read_locked(linked, i)) {
btree_node_unlock(trans, linked, i);
btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
}
}
ret = __btree_node_lock_write(trans, path, b, true);
BUG_ON(ret);
}
/* relock */
static inline bool btree_path_get_locks(struct btree_trans *trans,
struct btree_path *path,
bool upgrade,
struct get_locks_fail *f)
{
unsigned l = path->level;
int fail_idx = -1;
do {
if (!btree_path_node(path, l))
break;
if (!(upgrade
? bch2_btree_node_upgrade(trans, path, l)
: bch2_btree_node_relock(trans, path, l))) {
fail_idx = l;
if (f) {
f->l = l;
f->b = path->l[l].b;
}
}
l++;
} while (l < path->locks_want);
/*
* When we fail to get a lock, we have to ensure that any child nodes
* can't be relocked so bch2_btree_path_traverse has to walk back up to
* the node that we failed to relock:
*/
if (fail_idx >= 0) {
__bch2_btree_path_unlock(trans, path);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
do {
path->l[fail_idx].b = upgrade
? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
: ERR_PTR(-BCH_ERR_no_btree_node_relock);
--fail_idx;
} while (fail_idx >= 0);
}
if (path->uptodate == BTREE_ITER_NEED_RELOCK)
path->uptodate = BTREE_ITER_UPTODATE;
bch2_trans_verify_locks(trans);
return path->uptodate < BTREE_ITER_NEED_RELOCK;
}
bool __bch2_btree_node_relock(struct btree_trans *trans,
struct btree_path *path, unsigned level,
bool trace)
{
struct btree *b = btree_path_node(path, level);
int want = __btree_lock_want(path, level);
if (race_fault())
goto fail;
if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
(btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, &b->c, level, want))) {
mark_btree_node_locked(trans, path, level, want);
return true;
}
fail:
if (trace && !trans->notrace_relock_fail)
trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
return false;
}
/* upgrade */
bool bch2_btree_node_upgrade(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
struct btree *b = path->l[level].b;
struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);
if (!is_btree_node(path, level))
return false;
switch (btree_lock_want(path, level)) {
case BTREE_NODE_UNLOCKED:
BUG_ON(btree_node_locked(path, level));
return true;
case BTREE_NODE_READ_LOCKED:
BUG_ON(btree_node_intent_locked(path, level));
return bch2_btree_node_relock(trans, path, level);
case BTREE_NODE_INTENT_LOCKED:
break;
case BTREE_NODE_WRITE_LOCKED:
BUG();
}
if (btree_node_intent_locked(path, level))
return true;
if (race_fault())
return false;
if (btree_node_locked(path, level)) {
bool ret;
six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
ret = six_lock_tryupgrade(&b->c.lock);
six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);
if (ret)
goto success;
} else {
if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
goto success;
}
/*
* Do we already have an intent lock via another path? If so, just bump
* lock count:
*/
if (btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
btree_node_unlock(trans, path, level);
goto success;
}
trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
return false;
success:
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
return true;
}
/* Btree path locking: */
/*
* Only for btree_cache.c - only relocks intent locks
*/
int bch2_btree_path_relock_intent(struct btree_trans *trans,
struct btree_path *path)
{
unsigned l;
for (l = path->level;
l < path->locks_want && btree_path_node(path, l);
l++) {
if (!bch2_btree_node_relock(trans, path, l)) {
__bch2_btree_path_unlock(trans, path);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
}
}
return 0;
}
__flatten
bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
struct get_locks_fail f;
return btree_path_get_locks(trans, path, false, &f);
}
int __bch2_btree_path_relock(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
}
return 0;
}
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want,
struct get_locks_fail *f)
{
EBUG_ON(path->locks_want >= new_locks_want);
path->locks_want = new_locks_want;
return btree_path_get_locks(trans, path, true, f);
}
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want,
struct get_locks_fail *f)
{
struct btree_path *linked;
if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
return true;
/*
* XXX: this is ugly - we'd prefer to not be mucking with other
* iterators in the btree_trans here.
*
* On failure to upgrade the iterator, setting iter->locks_want and
* calling get_locks() is sufficient to make bch2_btree_path_traverse()
* get the locks we want on transaction restart.
*
* But if this iterator was a clone, on transaction restart what we did
* to this iterator isn't going to be preserved.
*
* Possibly we could add an iterator field for the parent iterator when
* an iterator is a copy - for now, we'll just upgrade any other
* iterators with the same btree id.
*
* The code below used to be needed to ensure ancestor nodes get locked
* before interior nodes - now that's handled by
* bch2_btree_path_traverse_all().
*/
if (!path->cached && !trans->in_traverse_all)
trans_for_each_path(trans, linked)
if (linked != path &&
linked->cached == path->cached &&
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
btree_path_get_locks(trans, linked, true, NULL);
}
return false;
}
void __bch2_btree_path_downgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
unsigned l;
if (trans->restarted)
return;
EBUG_ON(path->locks_want < new_locks_want);
path->locks_want = new_locks_want;
while (path->nodes_locked &&
(l = btree_path_highest_level_locked(path)) >= path->locks_want) {
if (l > path->level) {
btree_node_unlock(trans, path, l);
} else {
if (btree_node_intent_locked(path, l)) {
six_lock_downgrade(&path->l[l].b->c.lock);
mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
}
break;
}
}
bch2_btree_path_verify_locks(path);
path->downgrade_seq++;
trace_path_downgrade(trans, _RET_IP_, path);
}
/* Btree transaction locking: */
void bch2_trans_downgrade(struct btree_trans *trans)
{
struct btree_path *path;
if (trans->restarted)
return;
trans_for_each_path(trans, path)
bch2_btree_path_downgrade(trans, path);
}
int bch2_trans_relock(struct btree_trans *trans)
{
struct btree_path *path;
if (unlikely(trans->restarted))
return -((int) trans->restarted);
trans_for_each_path(trans, path)
if (path->should_be_locked &&
!bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
return 0;
}
int bch2_trans_relock_notrace(struct btree_trans *trans)
{
struct btree_path *path;
if (unlikely(trans->restarted))
return -((int) trans->restarted);
trans_for_each_path(trans, path)
if (path->should_be_locked &&
!bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
return 0;
}
void bch2_trans_unlock_noassert(struct btree_trans *trans)
{
struct btree_path *path;
trans_for_each_path(trans, path)
__bch2_btree_path_unlock(trans, path);
}
void bch2_trans_unlock(struct btree_trans *trans)
{
struct btree_path *path;
trans_for_each_path(trans, path)
__bch2_btree_path_unlock(trans, path);
}
void bch2_trans_unlock_long(struct btree_trans *trans)
{
bch2_trans_unlock(trans);
bch2_trans_srcu_unlock(trans);
}
bool bch2_trans_locked(struct btree_trans *trans)
{
struct btree_path *path;
trans_for_each_path(trans, path)
if (path->nodes_locked)
return true;
return false;
}
int __bch2_trans_mutex_lock(struct btree_trans *trans,
struct mutex *lock)
{
int ret = drop_locks_do(trans, (mutex_lock(lock), 0));
if (ret)
mutex_unlock(lock);
return ret;
}
/* Debug */
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_path_verify_locks(struct btree_path *path)
{
unsigned l;
if (!path->nodes_locked) {
BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
btree_path_node(path, path->level));
return;
}
for (l = 0; l < BTREE_MAX_DEPTH; l++) {
int want = btree_lock_want(path, l);
int have = btree_node_locked_type(path, l);
BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
BUG_ON(is_btree_node(path, l) &&
(want == BTREE_NODE_UNLOCKED ||
have != BTREE_NODE_WRITE_LOCKED) &&
want != have);
}
}
void bch2_trans_verify_locks(struct btree_trans *trans)
{
struct btree_path *path;
trans_for_each_path(trans, path)
bch2_btree_path_verify_locks(path);
}
#endif