bcachefs: Deadlock cycle detector

We've outgrown our own deadlock avoidance strategy.

The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.

Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.

That approach had some issues, though.
 - Sometimes we'd issue transaction restarts unnecessarily, when no
   deadlock would have actually occured. Lock ordering restarts have
   become our primary cause of transaction restarts, on some workloads
   totally 20% of actual transaction commits.

 - To avoid deadlock or livelock, we'd often have to take intent locks
   when we only wanted a read lock: with the lock ordering approach, it
   is actually illegal to hold _any_ read lock while blocking on an intent
   lock, and this has been causing us unnecessary lock contention.

 - It was getting fragile - the various lock ordering rules are not
   trivial, and we'd been seeing occasional livelock issues related to
   this machinery.

So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.

When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.

If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.

Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
This commit is contained in:
Kent Overstreet 2022-08-22 13:23:47 -04:00 committed by Kent Overstreet
parent 62448afee7
commit 33bd5d0686
9 changed files with 322 additions and 33 deletions

View File

@ -1400,7 +1400,8 @@ struct bch_sb_field_disk_groups {
x(trans_restart_key_cache_upgrade, 70) \ x(trans_restart_key_cache_upgrade, 70) \
x(trans_traverse_all, 71) \ x(trans_traverse_all, 71) \
x(transaction_commit, 72) \ x(transaction_commit, 72) \
x(write_super, 73) x(write_super, 73) \
x(trans_restart_would_deadlock_recursion_limit, 74)
enum bch_persistent_counters { enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t, #define x(t, n, ...) BCH_COUNTER_##t,

View File

@ -2848,9 +2848,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *
trans->fn = fn; trans->fn = fn;
trans->last_begin_time = ktime_get_ns(); trans->last_begin_time = ktime_get_ns();
trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn); trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn);
trans->task = current; trans->locking_wait.task = current;
trans->journal_replay_not_finished = trans->journal_replay_not_finished =
!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
closure_init_stack(&trans->ref);
bch2_trans_alloc_paths(trans, c); bch2_trans_alloc_paths(trans, c);
@ -2877,7 +2878,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *
mutex_lock(&c->btree_trans_lock); mutex_lock(&c->btree_trans_lock);
list_for_each_entry(pos, &c->btree_trans_list, list) { list_for_each_entry(pos, &c->btree_trans_list, list) {
if (trans->task->pid < pos->task->pid) { if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
list_add_tail(&trans->list, &pos->list); list_add_tail(&trans->list, &pos->list);
goto list_add_done; goto list_add_done;
} }
@ -2919,6 +2920,8 @@ void bch2_trans_exit(struct btree_trans *trans)
bch2_trans_unlock(trans); bch2_trans_unlock(trans);
closure_sync(&trans->ref);
if (s) if (s)
s->max_mem = max(s->max_mem, trans->mem_max); s->max_mem = max(s->max_mem, trans->mem_max);
@ -2997,7 +3000,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
static char lock_types[] = { 'r', 'i', 'w' }; static char lock_types[] = { 'r', 'i', 'w' };
unsigned l; unsigned l;
prt_printf(out, "%i %s\n", trans->task->pid, trans->fn); prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
trans_for_each_path(trans, path) { trans_for_each_path(trans, path) {
if (!path->nodes_locked) if (!path->nodes_locked)
@ -3029,7 +3032,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
trans->locking_path_idx, trans->locking_path_idx,
path->cached ? 'c' : 'b', path->cached ? 'c' : 'b',
trans->locking_level, trans->locking_level,
lock_types[trans->locking_lock_type], lock_types[trans->locking_wait.lock_want],
bch2_btree_ids[trans->locking_btree_id]); bch2_btree_ids[trans->locking_btree_id]);
bch2_bpos_to_text(out, trans->locking_pos); bch2_bpos_to_text(out, trans->locking_pos);

View File

@ -81,11 +81,14 @@ __trans_next_path(struct btree_trans *trans, unsigned idx)
return &trans->paths[idx]; return &trans->paths[idx];
} }
#define trans_for_each_path(_trans, _path) \ #define trans_for_each_path_from(_trans, _path, _start) \
for (_path = __trans_next_path((_trans), 0); \ for (_path = __trans_next_path((_trans), _start); \
(_path); \ (_path); \
_path = __trans_next_path((_trans), (_path)->idx + 1)) _path = __trans_next_path((_trans), (_path)->idx + 1))
#define trans_for_each_path(_trans, _path) \
trans_for_each_path_from(_trans, _path, 0)
static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
{ {
unsigned idx = path ? path->sorted_idx + 1 : 0; unsigned idx = path ? path->sorted_idx + 1 : 0;

View File

@ -52,10 +52,248 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans,
/* lock */ /* lock */
void __bch2_btree_node_lock_write(struct btree_trans *trans, /*
struct btree_bkey_cached_common *b) * @trans wants to lock @b with type @type
*/
struct trans_waiting_for_lock {
struct btree_trans *trans;
struct btree_bkey_cached_common *node_want;
enum six_lock_type lock_want;
/* for iterating over held locks :*/
u8 path_idx;
u8 level;
u64 lock_start_time;
};
struct lock_graph {
struct trans_waiting_for_lock g[8];
unsigned nr;
};
static void lock_graph_pop(struct lock_graph *g)
{
closure_put(&g->g[--g->nr].trans->ref);
}
static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
int ret;
if (i == g->g) {
ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
} else {
i->trans->lock_must_abort = true;
ret = 0;
}
for (i = g->g + 1; i < g->g + g->nr; i++)
wake_up_process(i->trans->locking_wait.task);
return ret;
}
static noinline int break_cycle(struct lock_graph *g)
{
struct trans_waiting_for_lock *i;
for (i = g->g; i < g->g + g->nr; i++) {
if (i->trans->lock_may_not_fail ||
i->trans->locking_wait.lock_want == SIX_LOCK_write)
continue;
return abort_lock(g, i);
}
for (i = g->g; i < g->g + g->nr; i++) {
if (i->trans->lock_may_not_fail ||
!i->trans->in_traverse_all)
continue;
return abort_lock(g, i);
}
for (i = g->g; i < g->g + g->nr; i++) {
if (i->trans->lock_may_not_fail)
continue;
return abort_lock(g, i);
}
BUG();
}
static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans)
{
struct btree_trans *orig_trans = g->g->trans;
struct trans_waiting_for_lock *i;
int ret = 0;
for (i = g->g; i < g->g + g->nr; i++) {
if (i->trans->locking != i->node_want)
while (g->g + g->nr >= i) {
lock_graph_pop(g);
return 0;
}
if (i->trans == trans) {
ret = break_cycle(g);
if (ret)
goto deadlock;
/*
* If we didn't abort (instead telling another
* transaction to abort), keep checking:
*/
}
}
if (g->nr == ARRAY_SIZE(g->g)) {
if (orig_trans->lock_may_not_fail)
return 0;
trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
goto deadlock;
}
closure_get(&trans->ref);
g->g[g->nr++] = (struct trans_waiting_for_lock) {
.trans = trans,
.node_want = trans->locking,
.lock_want = trans->locking_wait.lock_want,
};
return 0;
deadlock:
while (g->nr)
lock_graph_pop(g);
return ret;
}
#if 0
static void print_cycle(struct printbuf *out, struct lock_graph *g)
{
struct trans_waiting_for_lock *i;
prt_str(out, "Found lock cycle:");
prt_newline(out);
for (i = g->g; i < g->g + g->nr; i++)
bch2_btree_trans_to_text(out, i->trans);
}
#endif
static noinline void lock_graph_remove_non_waiters(struct lock_graph *g)
{
struct trans_waiting_for_lock *i;
for (i = g->g + 1; i < g->g + g->nr; i++)
if (i->trans->locking != i->node_want ||
i->trans->locking_wait.start_time != i[-1].lock_start_time) {
while (g->g + g->nr >= i)
lock_graph_pop(g);
return;
}
BUG();
}
static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
{
return t1 + t2 > 1;
}
static int check_for_deadlock(struct btree_trans *trans)
{
struct lock_graph g;
struct trans_waiting_for_lock *top;
struct btree_bkey_cached_common *b;
struct btree_path *path;
int ret;
if (trans->lock_must_abort)
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
g.nr = 0;
ret = lock_graph_descend(&g, trans);
BUG_ON(ret);
next:
if (!g.nr)
return 0;
top = &g.g[g.nr - 1];
trans_for_each_path_from(top->trans, path, top->path_idx) {
if (!path->nodes_locked)
continue;
if (top->path_idx != path->idx) {
top->path_idx = path->idx;
top->level = 0;
top->lock_start_time = 0;
}
for (;
top->level < BTREE_MAX_DEPTH;
top->level++, top->lock_start_time = 0) {
int lock_held = btree_node_locked_type(path, top->level);
if (lock_held == BTREE_NODE_UNLOCKED)
continue;
b = &READ_ONCE(path->l[top->level].b)->c;
if (unlikely(IS_ERR_OR_NULL(b))) {
lock_graph_remove_non_waiters(&g);
goto next;
}
if (list_empty_careful(&b->lock.wait_list))
continue;
raw_spin_lock(&b->lock.wait_lock);
list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
BUG_ON(b != trans->locking);
if (top->lock_start_time &&
time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
continue;
top->lock_start_time = trans->locking_wait.start_time;
/* Don't check for self deadlock: */
if (trans == top->trans ||
!lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
continue;
ret = lock_graph_descend(&g, trans);
raw_spin_unlock(&b->lock.wait_lock);
if (ret)
return ret < 0 ? ret : 0;
goto next;
}
raw_spin_unlock(&b->lock.wait_lock);
}
}
lock_graph_pop(&g);
goto next;
}
int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
{
struct btree_trans *trans = p;
return check_for_deadlock(trans);
}
int __bch2_btree_node_lock_write(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
bool lock_may_not_fail)
{ {
int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
int ret;
/* /*
* Must drop our read locks before calling six_lock_write() - * Must drop our read locks before calling six_lock_write() -
@ -64,8 +302,10 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans,
* locked: * locked:
*/ */
six_lock_readers_add(&b->lock, -readers); six_lock_readers_add(&b->lock, -readers);
btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write); ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail);
six_lock_readers_add(&b->lock, readers); six_lock_readers_add(&b->lock, readers);
return ret;
} }
static inline bool path_has_read_locks(struct btree_path *path) static inline bool path_has_read_locks(struct btree_path *path)

View File

@ -183,22 +183,41 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
void bch2_btree_node_unlock_write(struct btree_trans *, void bch2_btree_node_unlock_write(struct btree_trans *,
struct btree_path *, struct btree *); struct btree_path *, struct btree *);
int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
/* lock: */ /* lock: */
static inline int __btree_node_lock_nopath(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type,
bool lock_may_not_fail)
{
int ret;
trans->lock_may_not_fail = lock_may_not_fail;
trans->locking = b;
trans->lock_must_abort = false;
ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
bch2_six_check_for_deadlock, trans);
WRITE_ONCE(trans->locking, NULL);
WRITE_ONCE(trans->locking_wait.start_time, 0);
return ret;
}
static inline int __must_check static inline int __must_check
btree_node_lock_nopath(struct btree_trans *trans, btree_node_lock_nopath(struct btree_trans *trans,
struct btree_bkey_cached_common *b, struct btree_bkey_cached_common *b,
enum six_lock_type type) enum six_lock_type type)
{ {
six_lock_type(&b->lock, type, NULL, NULL); return __btree_node_lock_nopath(trans, b, type, false);
return 0;
} }
static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
struct btree_bkey_cached_common *b, struct btree_bkey_cached_common *b,
enum six_lock_type type) enum six_lock_type type)
{ {
int ret = btree_node_lock_nopath(trans, b, type); int ret = __btree_node_lock_nopath(trans, b, type, true);
BUG_ON(ret); BUG_ON(ret);
} }
@ -210,8 +229,6 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
enum six_lock_type type, enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p) six_lock_should_sleep_fn should_sleep_fn, void *p)
{ {
int ret;
if (six_trylock_type(&b->lock, type)) if (six_trylock_type(&b->lock, type))
return 0; return 0;
@ -219,11 +236,10 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
trans->locking_pos = pos; trans->locking_pos = pos;
trans->locking_btree_id = path->btree_id; trans->locking_btree_id = path->btree_id;
trans->locking_level = level; trans->locking_level = level;
trans->locking_lock_type = type; trans->lock_may_not_fail = false;
trans->locking = b; trans->locking = b;
ret = six_lock_type(&b->lock, type, should_sleep_fn, p); return six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
trans->locking = NULL; bch2_six_check_for_deadlock, trans);
return ret;
} }
/* /*
@ -279,12 +295,15 @@ static inline int btree_node_lock(struct btree_trans *trans,
return ret; return ret;
} }
void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *); int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *, bool);
static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, static inline int __btree_node_lock_write(struct btree_trans *trans,
struct btree_path *path, struct btree_path *path,
struct btree_bkey_cached_common *b) struct btree_bkey_cached_common *b,
bool lock_may_not_fail)
{ {
int ret;
EBUG_ON(&path->l[b->level].b->c != b); EBUG_ON(&path->l[b->level].b->c != b);
EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq); EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
EBUG_ON(!btree_node_intent_locked(path, b->level)); EBUG_ON(!btree_node_intent_locked(path, b->level));
@ -296,8 +315,21 @@ static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
*/ */
mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
if (unlikely(!six_trylock_write(&b->lock))) ret = likely(six_trylock_write(&b->lock))
__bch2_btree_node_lock_write(trans, b); ? 0
: __bch2_btree_node_lock_write(trans, b, lock_may_not_fail);
if (ret)
mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent);
return ret;
}
static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
int ret = __btree_node_lock_write(trans, path, b, true);
BUG_ON(ret);
} }
static inline int __must_check static inline int __must_check
@ -305,8 +337,7 @@ bch2_btree_node_lock_write(struct btree_trans *trans,
struct btree_path *path, struct btree_path *path,
struct btree_bkey_cached_common *b) struct btree_bkey_cached_common *b)
{ {
bch2_btree_node_lock_write_nofail(trans, path, b); return __btree_node_lock_write(trans, path, b, false);
return 0;
} }
/* relock: */ /* relock: */

View File

@ -387,15 +387,19 @@ struct btree_trans_commit_hook {
struct btree_trans { struct btree_trans {
struct bch_fs *c; struct bch_fs *c;
const char *fn; const char *fn;
struct closure ref;
struct list_head list; struct list_head list;
u64 last_begin_time; u64 last_begin_time;
struct btree_bkey_cached_common *locking;
unsigned locking_path_idx; unsigned locking_path_idx;
struct bpos locking_pos; struct bpos locking_pos;
u8 locking_btree_id; u8 locking_btree_id;
u8 locking_level; u8 locking_level;
u8 locking_lock_type; u8 lock_may_not_fail;
struct task_struct *task; u8 lock_must_abort;
struct btree_bkey_cached_common *locking;
struct six_lock_waiter locking_wait;
int srcu_idx; int srcu_idx;
u8 fn_idx; u8 fn_idx;

View File

@ -534,7 +534,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
mutex_lock(&c->btree_trans_lock); mutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) { list_for_each_entry(trans, &c->btree_trans_list, list) {
if (trans->task->pid <= i->iter) if (trans->locking_wait.task->pid <= i->iter)
continue; continue;
ret = flush_buf(i); ret = flush_buf(i);
@ -546,11 +546,11 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
prt_printf(&i->buf, "backtrace:"); prt_printf(&i->buf, "backtrace:");
prt_newline(&i->buf); prt_newline(&i->buf);
printbuf_indent_add(&i->buf, 2); printbuf_indent_add(&i->buf, 2);
prt_backtrace(&i->buf, trans->task); prt_backtrace(&i->buf, trans->locking_wait.task);
printbuf_indent_sub(&i->buf, 2); printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf); prt_newline(&i->buf);
i->iter = trans->task->pid; i->iter = trans->locking_wait.task->pid;
} }
mutex_unlock(&c->btree_trans_lock); mutex_unlock(&c->btree_trans_lock);

View File

@ -35,6 +35,7 @@
x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \ x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \

View File

@ -1065,6 +1065,12 @@ TRACE_EVENT(trans_restart_would_deadlock,
__entry->want_pos_snapshot) __entry->want_pos_snapshot)
); );
DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip)
);
TRACE_EVENT(trans_restart_would_deadlock_write, TRACE_EVENT(trans_restart_would_deadlock_write,
TP_PROTO(struct btree_trans *trans), TP_PROTO(struct btree_trans *trans),
TP_ARGS(trans), TP_ARGS(trans),