Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
bcachefs: btree write buffer knows how to accumulate bch_accounting keys
Teach the btree write buffer how to accumulate accounting keys: instead of having the newer key overwrite the older key, as we do with other updates, we need to add them together.

Also, add a flag so that write buffer flush knows when journal replay has finished flushing accounting, and teach it to hold accounting keys until that flag is set.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 9dec2a473b
commit 5d9667d1d6
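The semantic change is easiest to see in isolation before reading the diff: for ordinary write buffer updates, a newer key at the same position simply replaces the older one, but accounting keys carry counter deltas, so they must be summed. A minimal sketch of that difference, using simplified stand-in types (struct acct_key, wb_overwrite() and wb_accumulate() are illustrative assumptions, not the real bch_accounting API):

#include <stdint.h>

/* Sketch only: a simplified stand-in for a bch_accounting key. */
struct acct_key {
	unsigned	nr;	/* number of counters */
	uint64_t	d[8];	/* counter deltas */
};

/* Ordinary write buffer updates: the newer key simply wins. */
static void wb_overwrite(struct acct_key *older, const struct acct_key *newer)
{
	*older = *newer;
}

/* Accounting updates: deltas are added together, never dropped. */
static void wb_accumulate(struct acct_key *newer, const struct acct_key *older)
{
	for (unsigned i = 0; i < newer->nr && i < older->nr; i++)
		newer->d[i] += older->d[i];
}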
fs/bcachefs/bcachefs.h
@@ -593,6 +593,7 @@ struct bch_dev {
 	x(new_fs)				\
 	x(started)				\
 	x(btree_running)			\
+	x(accounting_replay_done)		\
 	x(may_go_rw)				\
 	x(rw)					\
 	x(was_rw)				\
fs/bcachefs/btree_write_buffer.c
@@ -6,6 +6,7 @@
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_write_buffer.h"
+#include "disk_accounting.h"
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
@@ -134,7 +135,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
 
 static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
 			       struct btree_write_buffered_key *wb,
-			       bool *write_locked, size_t *fast)
+			       bool *write_locked,
+			       bool *accounting_accumulated,
+			       size_t *fast)
 {
 	struct btree_path *path;
 	int ret;
@@ -147,6 +150,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
 	if (ret)
 		return ret;
 
+	if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
+		struct bkey u;
+		struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
+
+		if (k.k->type == KEY_TYPE_accounting)
+			bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
+						   bkey_s_c_to_accounting(k));
+	}
+	*accounting_accumulated = true;
+
 	/*
 	 * We can't clone a path that has write locks: unshare it now, before
 	 * set_pos and traverse():
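The wb_flush_one() change above is a read-modify-write: before the first insert attempt of an accounting key, the value already present in the btree (peeked via bch2_btree_path_peek_slot_exact()) is folded into the pending key, and the accounting_accumulated flag ensures this happens exactly once even when the caller's loop restarts the transaction. A hedged sketch of that pattern, with try_insert() and the flat uint64_t value as stand-in assumptions:

#include <stdbool.h>
#include <stdint.h>

/* Stand-in for the btree insert; in the real code this can fail with
 * a transaction restart and be retried by the caller's loop. */
static int try_insert(uint64_t v) { (void)v; return 0; }

static int flush_one_accounting(uint64_t existing, uint64_t *pending,
				bool *accumulated)
{
	if (!*accumulated) {
		*pending += existing;	/* fold in the current value... */
		*accumulated = true;	/* ...but only on the first pass */
	}
	return try_insert(*pending);	/* may restart and re-enter */
}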
@@ -259,8 +272,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 	struct journal *j = &c->journal;
 	struct btree_write_buffer *wb = &c->btree_write_buffer;
 	struct btree_iter iter = { NULL };
-	size_t skipped = 0, fast = 0, slowpath = 0;
+	size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
 	bool write_locked = false;
+	bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
 	int ret = 0;
 
 	bch2_trans_unlock(trans);
@@ -301,11 +315,22 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 
 		BUG_ON(!k->journal_seq);
 
+		if (!accounting_replay_done &&
+		    k->k.k.type == KEY_TYPE_accounting) {
+			slowpath++;
+			continue;
+		}
+
 		if (i + 1 < &darray_top(wb->sorted) &&
 		    wb_key_eq(i, i + 1)) {
 			struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
 
-			skipped++;
+			if (k->k.k.type == KEY_TYPE_accounting &&
+			    n->k.k.type == KEY_TYPE_accounting)
+				bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
+							   bkey_i_to_s_c_accounting(&k->k));
+
+			overwritten++;
 			n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
 			k->journal_seq = 0;
 			continue;
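This hunk changes how the flush deduplicates position-sorted entries: previously a newer key simply superseded an older one at the same position (counted as skipped), whereas now, when both are accounting keys, the older delta is folded into the newer key before the older entry is dropped, so no update is lost. A standalone sketch of that merge pass (struct pending and merge_dups() are illustrative, not bcachefs types):

#include <stddef.h>
#include <stdint.h>

/* Sketch: one pending update in a position-sorted flush batch. */
struct pending {
	uint64_t	pos;		/* btree position */
	int		is_acct;	/* accounting key? */
	int64_t		delta;		/* payload (simplified) */
	int		live;		/* still to be inserted? */
};

static void merge_dups(struct pending *v, size_t nr)
{
	for (size_t i = 0; i + 1 < nr; i++) {
		if (v[i].pos != v[i + 1].pos)
			continue;
		if (v[i].is_acct && v[i + 1].is_acct)
			v[i + 1].delta += v[i].delta;	/* accumulate */
		v[i].live = 0;		/* older entry dropped either way */
	}
}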
@@ -340,13 +365,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 			bch2_btree_iter_set_pos(&iter, k->k.k.p);
 			btree_iter_path(trans, &iter)->preserve = false;
 
+			bool accounting_accumulated = false;
 			do {
 				if (race_fault()) {
 					ret = -BCH_ERR_journal_reclaim_would_deadlock;
 					break;
 				}
 
-				ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
+				ret = wb_flush_one(trans, &iter, k, &write_locked,
+						   &accounting_accumulated, &fast);
 				if (!write_locked)
 					bch2_trans_begin(trans);
 			} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
@@ -387,8 +414,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 			if (!i->journal_seq)
 				continue;
 
-			bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
-						bch2_btree_write_buffer_journal_flush);
+			if (!accounting_replay_done &&
+			    i->k.k.type == KEY_TYPE_accounting) {
+				could_not_insert++;
+				continue;
+			}
+
+			if (!could_not_insert)
+				bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+							bch2_btree_write_buffer_journal_flush);
 
 			bch2_trans_begin(trans);
 
@@ -401,13 +435,45 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 					btree_write_buffered_insert(trans, i));
 			if (ret)
 				goto err;
+
+			i->journal_seq = 0;
 		}
+
+		/*
+		 * If journal replay hasn't finished with accounting keys we
+		 * can't flush accounting keys at all - condense them and leave
+		 * them for next time.
+		 *
+		 * Q: Can the write buffer overflow?
+		 * A: Shouldn't be any actual risk. It's just new accounting
+		 * updates that the write buffer can't flush, and those are only
+		 * going to be generated by interior btree node updates as
+		 * journal replay has to split/rewrite nodes to make room for
+		 * its updates.
+		 *
+		 * And for those new accounting updates, updates to the same
+		 * counters get accumulated as they're flushed from the journal
+		 * to the write buffer - see the patch for eytzinger tree
+		 * accumulation. So we could only overflow if the number of
+		 * distinct counters touched somehow was very large.
+		 */
+		if (could_not_insert) {
+			struct btree_write_buffered_key *dst = wb->flushing.keys.data;
+
+			darray_for_each(wb->flushing.keys, i)
+				if (i->journal_seq)
+					*dst++ = *i;
+			wb->flushing.keys.nr = dst - wb->flushing.keys.data;
+		}
 	}
 err:
+	if (ret || !could_not_insert) {
+		bch2_journal_pin_drop(j, &wb->flushing.pin);
+		wb->flushing.keys.nr = 0;
+	}
+
 	bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
-	trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
-	bch2_journal_pin_drop(j, &wb->flushing.pin);
-	wb->flushing.keys.nr = 0;
+	trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
 	return ret;
 }
 
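Note the journal pin handling that falls out of this hunk: bch2_journal_pin_update() is skipped once could_not_insert is set, and the pin is dropped at err: only when nothing was held back (or the flush failed outright). Held-back accounting keys exist only in the journal and the write buffer, so the journal entries that produced them must stay pinned until a later flush can finally insert them.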
fs/bcachefs/recovery.c
@@ -290,6 +290,8 @@ int bch2_journal_replay(struct bch_fs *c)
 		k->overwritten = true;
 	}
 
+	set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
 	/*
 	 * First, attempt to replay keys in sorted order. This is more
 	 * efficient - better locality of btree access - but some might fail if
@@ -1060,6 +1062,7 @@ int bch2_fs_initialize(struct bch_fs *c)
 	 * set up the journal.pin FIFO and journal.cur pointer:
 	 */
 	bch2_fs_journal_start(&c->journal, 1);
+	set_bit(BCH_FS_accounting_replay_done, &c->flags);
 	bch2_journal_set_replay_done(&c->journal);
 
 	ret = bch2_fs_read_write_early(c);
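In bch2_fs_initialize() the flag is set unconditionally because a freshly created filesystem has no journal to replay: accounting replay is trivially complete, so the write buffer may flush accounting keys from the very start.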