mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-06 05:02:31 +00:00
bcachefs: Fragmentation LRU
Now that we have much more efficient updates to the LRU btree, this patch adds a new LRU that indexes buckets by fragmentation. This means copygc no longer has to scan every bucket to find buckets that need to be evacuated.

Changes:
- A new field in bch_alloc_v4, fragmentation_lru - this corresponds to the bucket's position in the fragmentation LRU. We add a new field for this instead of calculating it as needed because we may make the fragmentation LRU optional; this field indicates whether a bucket is on the fragmentation LRU. Also, zoned devices will introduce variable bucket sizes; explicitly recording the LRU position will be safer for them.
- A new copygc path for using the fragmentation LRU instead of scanning every bucket and building up an in-memory heap.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
1b30ed5fd8
commit
80c3308578
@ -415,6 +415,8 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
|
||||
prt_newline(out);
|
||||
prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]);
|
||||
prt_newline(out);
|
||||
prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
|
||||
prt_newline(out);
|
||||
prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
|
||||
prt_newline(out);
|
||||
|
||||
@ -910,8 +912,8 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
|
||||
!new_a->io_time[READ])
|
||||
new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
|
||||
|
||||
old_lru = alloc_lru_idx(*old_a);
|
||||
new_lru = alloc_lru_idx(*new_a);
|
||||
old_lru = alloc_lru_idx_read(*old_a);
|
||||
new_lru = alloc_lru_idx_read(*new_a);
|
||||
|
||||
if (old_lru != new_lru) {
|
||||
ret = bch2_lru_change(trans, new->k.p.inode,
|
||||
@ -921,6 +923,18 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
|
||||
return ret;
|
||||
}
|
||||
|
||||
new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
|
||||
bch_dev_bkey_exists(c, new->k.p.inode));
|
||||
|
||||
if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
|
||||
ret = bch2_lru_change(trans,
|
||||
BCH_LRU_FRAGMENTATION_START,
|
||||
bucket_to_u64(new->k.p),
|
||||
old_a->fragmentation_lru, new_a->fragmentation_lru);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (old_a->gen != new_a->gen) {
|
||||
ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
|
||||
if (ret)
|
||||
@ -1777,7 +1791,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
|
||||
goto out;
|
||||
|
||||
/* We expect harmless races here due to the btree write buffer: */
|
||||
if (lru_pos_time(lru_iter->pos) != alloc_lru_idx(a->v))
|
||||
if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
|
||||
goto out;
|
||||
|
||||
BUG_ON(a->v.data_type != BCH_DATA_cached);
|
||||
|
@ -64,11 +64,24 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
|
||||
a.stripe, a, data_type);
|
||||
}
|
||||
|
||||
static inline u64 alloc_lru_idx(struct bch_alloc_v4 a)
|
||||
static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
|
||||
{
|
||||
return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
|
||||
}
|
||||
|
||||
static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
|
||||
struct bch_dev *ca)
|
||||
{
|
||||
if (a.data_type != BCH_DATA_btree &&
|
||||
a.data_type != BCH_DATA_user)
|
||||
return 0;
|
||||
|
||||
if (a.dirty_sectors >= ca->mi.bucket_size)
|
||||
return 0;
|
||||
|
||||
return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size);
|
||||
}
|
||||
|
||||
static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
|
||||
{
|
||||
return ((u64) alloc_gc_gen(a) >> 4) << 56;
|
||||
|
@ -927,7 +927,6 @@ struct bch_fs {
|
||||
|
||||
/* COPYGC */
|
||||
struct task_struct *copygc_thread;
|
||||
copygc_heap copygc_heap;
|
||||
struct write_point copygc_write_point;
|
||||
s64 copygc_wait;
|
||||
bool copygc_running;
|
||||
|
@ -992,6 +992,7 @@ struct bch_alloc_v4 {
|
||||
__u64 io_time[2];
|
||||
__u32 stripe;
|
||||
__u32 nr_external_backpointers;
|
||||
__u64 fragmentation_lru;
|
||||
} __packed __aligned(8);
|
||||
|
||||
#define BCH_ALLOC_V4_U64s_V0 6
|
||||
@ -1563,7 +1564,8 @@ struct bch_sb_field_journal_seq_blacklist {
|
||||
x(inode_v3, 23) \
|
||||
x(unwritten_extents, 24) \
|
||||
x(bucket_gens, 25) \
|
||||
x(lru_v2, 26)
|
||||
x(lru_v2, 26) \
|
||||
x(fragmentation_lru, 27)
|
||||
|
||||
enum bcachefs_metadata_version {
|
||||
bcachefs_metadata_version_min = 9,
|
||||
|
@ -89,15 +89,4 @@ struct disk_reservation {
|
||||
unsigned nr_replicas;
|
||||
};
|
||||
|
||||
struct copygc_heap_entry {
|
||||
u8 dev;
|
||||
u8 gen;
|
||||
u8 replicas;
|
||||
u32 fragmentation;
|
||||
u32 sectors;
|
||||
u64 bucket;
|
||||
};
|
||||
|
||||
typedef HEAP(struct copygc_heap_entry) copygc_heap;
|
||||
|
||||
#endif /* _BUCKETS_TYPES_H */
|
||||
|
@ -93,6 +93,13 @@ int bch2_lru_change(struct btree_trans *trans,
|
||||
bch2_lru_set(trans, lru_id, dev_bucket, new_time);
|
||||
}
|
||||
|
||||
static const char * const bch2_lru_types[] = {
|
||||
#define x(n) #n,
|
||||
BCH_LRU_TYPES()
|
||||
#undef x
|
||||
NULL
|
||||
};
|
||||
|
||||
static int bch2_check_lru_key(struct btree_trans *trans,
|
||||
struct btree_iter *lru_iter,
|
||||
struct bkey_s_c lru_k,
|
||||
@ -105,7 +112,9 @@ static int bch2_check_lru_key(struct btree_trans *trans,
|
||||
const struct bch_alloc_v4 *a;
|
||||
struct printbuf buf1 = PRINTBUF;
|
||||
struct printbuf buf2 = PRINTBUF;
|
||||
enum bch_lru_type type = lru_type(lru_k);
|
||||
struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
|
||||
u64 idx;
|
||||
int ret;
|
||||
|
||||
if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
|
||||
@ -121,9 +130,17 @@ static int bch2_check_lru_key(struct btree_trans *trans,
|
||||
|
||||
a = bch2_alloc_to_v4(k, &a_convert);
|
||||
|
||||
switch (type) {
|
||||
case BCH_LRU_read:
|
||||
idx = alloc_lru_idx_read(*a);
|
||||
break;
|
||||
case BCH_LRU_fragmentation:
|
||||
idx = a->fragmentation_lru;
|
||||
break;
|
||||
}
|
||||
|
||||
if (lru_k.k->type != KEY_TYPE_set ||
|
||||
a->data_type != BCH_DATA_cached ||
|
||||
a->io_time[READ] != lru_pos_time(lru_k.k->p)) {}
|
||||
lru_pos_time(lru_k.k->p) != idx) {
|
||||
if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) {
|
||||
*last_flushed_pos = lru_k.k->p;
|
||||
ret = bch2_btree_write_buffer_flush_sync(trans) ?:
|
||||
@ -131,17 +148,14 @@ static int bch2_check_lru_key(struct btree_trans *trans,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (fsck_err_on(lru_k.k->type != KEY_TYPE_set ||
|
||||
a->data_type != BCH_DATA_cached ||
|
||||
a->io_time[READ] != lru_pos_time(lru_k.k->p), c,
|
||||
"incorrect lru entry (time %llu) %s\n"
|
||||
" for %s",
|
||||
lru_pos_time(lru_k.k->p),
|
||||
(bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
|
||||
(bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
|
||||
if (fsck_err(c, "incorrect lru entry: lru %s time %llu\n"
|
||||
" %s\n"
|
||||
" for %s",
|
||||
bch2_lru_types[type],
|
||||
lru_pos_time(lru_k.k->p),
|
||||
(bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
|
||||
(bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
|
||||
ret = bch2_btree_delete_at(trans, lru_iter, 0);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
out:
|
||||
err:
|
||||
|
@ -22,6 +22,27 @@ static inline u64 lru_pos_time(struct bpos pos)
|
||||
return pos.inode & ~(~0ULL << LRU_TIME_BITS);
|
||||
}
|
||||
|
||||
/*
 * X-macro list of LRU types. Each x(name) entry is expanded by whatever
 * definition of x() is in effect at the point of use.
 */
#define BCH_LRU_TYPES()		\
	x(read)			\
	x(fragmentation)

/* One BCH_LRU_<name> enumerator per BCH_LRU_TYPES() entry, in list order. */
enum bch_lru_type {
#define x(_name)	BCH_LRU_##_name,
	BCH_LRU_TYPES()
#undef x
};

/* LRU id reserved for the fragmentation LRU: the largest 16-bit value. */
#define BCH_LRU_FRAGMENTATION_START	((1U << 16) - 1)
|
||||
|
||||
static inline enum bch_lru_type lru_type(struct bkey_s_c l)
|
||||
{
|
||||
u16 lru_id = l.k->p.inode >> 48;
|
||||
|
||||
if (lru_id == BCH_LRU_FRAGMENTATION_START)
|
||||
return BCH_LRU_fragmentation;
|
||||
return BCH_LRU_read;
|
||||
}
|
||||
|
||||
int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
|
||||
void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
|
||||
|
||||
|
@ -652,13 +652,13 @@ static noinline void verify_bucket_evacuated(struct btree_trans *trans, struct b
|
||||
printbuf_exit(&buf);
|
||||
}
|
||||
|
||||
int __bch2_evacuate_bucket(struct moving_context *ctxt,
|
||||
int __bch2_evacuate_bucket(struct btree_trans *trans,
|
||||
struct moving_context *ctxt,
|
||||
struct bpos bucket, int gen,
|
||||
struct data_update_opts _data_opts)
|
||||
{
|
||||
struct bch_fs *c = ctxt->c;
|
||||
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
|
||||
struct btree_trans trans;
|
||||
struct btree_iter iter;
|
||||
struct bkey_buf sk;
|
||||
struct bch_backpointer bp;
|
||||
@ -667,17 +667,17 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
|
||||
struct bkey_s_c k;
|
||||
struct data_update_opts data_opts;
|
||||
unsigned dirty_sectors, bucket_size;
|
||||
u64 fragmentation;
|
||||
u64 bp_offset = 0, cur_inum = U64_MAX;
|
||||
int ret = 0;
|
||||
|
||||
bch2_bkey_buf_init(&sk);
|
||||
bch2_trans_init(&trans, c, 0, 0);
|
||||
|
||||
bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc,
|
||||
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
|
||||
bucket, BTREE_ITER_CACHED);
|
||||
ret = lockrestart_do(&trans,
|
||||
ret = lockrestart_do(trans,
|
||||
bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
|
||||
bch2_trans_iter_exit(&trans, &iter);
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
if (ret) {
|
||||
bch_err(c, "%s: error looking up alloc key: %s", __func__, bch2_err_str(ret));
|
||||
@ -687,17 +687,18 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
|
||||
a = bch2_alloc_to_v4(k, &a_convert);
|
||||
dirty_sectors = a->dirty_sectors;
|
||||
bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
|
||||
fragmentation = a->fragmentation_lru;
|
||||
|
||||
ret = bch2_btree_write_buffer_flush(&trans);
|
||||
ret = bch2_btree_write_buffer_flush(trans);
|
||||
if (ret) {
|
||||
bch_err(c, "%s: error flushing btree write buffer: %s", __func__, bch2_err_str(ret));
|
||||
goto err;
|
||||
}
|
||||
|
||||
while (!(ret = move_ratelimit(&trans, ctxt))) {
|
||||
bch2_trans_begin(&trans);
|
||||
while (!(ret = move_ratelimit(trans, ctxt))) {
|
||||
bch2_trans_begin(trans);
|
||||
|
||||
ret = bch2_get_next_backpointer(&trans, bucket, gen,
|
||||
ret = bch2_get_next_backpointer(trans, bucket, gen,
|
||||
&bp_offset, &bp,
|
||||
BTREE_ITER_CACHED);
|
||||
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
||||
@ -712,7 +713,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
|
||||
struct bkey_s_c k;
|
||||
unsigned i = 0;
|
||||
|
||||
k = bch2_backpointer_get_key(&trans, &iter,
|
||||
k = bch2_backpointer_get_key(trans, &iter,
|
||||
bucket, bp_offset, bp);
|
||||
ret = bkey_err(k);
|
||||
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
||||
@ -725,9 +726,9 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
|
||||
bch2_bkey_buf_reassemble(&sk, c, k);
|
||||
k = bkey_i_to_s_c(sk.k);
|
||||
|
||||
ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
|
||||
ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
|
||||
if (ret) {
|
||||
bch2_trans_iter_exit(&trans, &iter);
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -741,15 +742,15 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
|
||||
i++;
|
||||
}
|
||||
|
||||
ret = bch2_move_extent(&trans, &iter, ctxt, io_opts,
|
||||
ret = bch2_move_extent(trans, &iter, ctxt, io_opts,
|
||||
bp.btree_id, k, data_opts);
|
||||
bch2_trans_iter_exit(&trans, &iter);
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
||||
continue;
|
||||
if (ret == -ENOMEM) {
|
||||
/* memory allocation failure, wait for some IO to finish */
|
||||
bch2_move_ctxt_wait_for_io(ctxt, &trans);
|
||||
bch2_move_ctxt_wait_for_io(ctxt, trans);
|
||||
continue;
|
||||
}
|
||||
if (ret)
|
||||
@ -761,7 +762,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
|
||||
} else {
|
||||
struct btree *b;
|
||||
|
||||
b = bch2_backpointer_get_node(&trans, &iter,
|
||||
b = bch2_backpointer_get_node(trans, &iter,
|
||||
bucket, bp_offset, bp);
|
||||
ret = PTR_ERR_OR_ZERO(b);
|
||||
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
|
||||
@ -773,8 +774,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
|
||||
if (!b)
|
||||
goto next;
|
||||
|
||||
ret = bch2_btree_node_rewrite(&trans, &iter, b, 0);
|
||||
bch2_trans_iter_exit(&trans, &iter);
|
||||
ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
||||
continue;
|
||||
@ -791,17 +792,16 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
|
||||
bp_offset++;
|
||||
}
|
||||
|
||||
trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, ret);
|
||||
trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
|
||||
|
||||
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) {
|
||||
bch2_trans_unlock(&trans);
|
||||
bch2_trans_unlock(trans);
|
||||
move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
|
||||
closure_sync(&ctxt->cl);
|
||||
if (!ctxt->write_error)
|
||||
verify_bucket_evacuated(&trans, bucket, gen);
|
||||
verify_bucket_evacuated(trans, bucket, gen);
|
||||
}
|
||||
err:
|
||||
bch2_trans_exit(&trans);
|
||||
bch2_bkey_buf_exit(&sk, c);
|
||||
return ret;
|
||||
}
|
||||
@ -814,12 +814,15 @@ int bch2_evacuate_bucket(struct bch_fs *c,
|
||||
struct write_point_specifier wp,
|
||||
bool wait_on_copygc)
|
||||
{
|
||||
struct btree_trans trans;
|
||||
struct moving_context ctxt;
|
||||
int ret;
|
||||
|
||||
bch2_trans_init(&trans, c, 0, 0);
|
||||
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
|
||||
ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts);
|
||||
ret = __bch2_evacuate_bucket(&trans, &ctxt, bucket, gen, data_opts);
|
||||
bch2_moving_ctxt_exit(&ctxt);
|
||||
bch2_trans_exit(&trans);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -66,7 +66,8 @@ int bch2_move_data(struct bch_fs *,
|
||||
bool,
|
||||
move_pred_fn, void *);
|
||||
|
||||
int __bch2_evacuate_bucket(struct moving_context *,
|
||||
int __bch2_evacuate_bucket(struct btree_trans *,
|
||||
struct moving_context *,
|
||||
struct bpos, int,
|
||||
struct data_update_opts);
|
||||
int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include "alloc_foreground.h"
|
||||
#include "btree_iter.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_write_buffer.h"
|
||||
#include "buckets.h"
|
||||
#include "clock.h"
|
||||
#include "disk_groups.h"
|
||||
@ -19,6 +20,7 @@
|
||||
#include "eytzinger.h"
|
||||
#include "io.h"
|
||||
#include "keylist.h"
|
||||
#include "lru.h"
|
||||
#include "move.h"
|
||||
#include "movinggc.h"
|
||||
#include "super-io.h"
|
||||
@ -31,138 +33,105 @@
|
||||
#include <linux/sort.h>
|
||||
#include <linux/wait.h>
|
||||
|
||||
static inline int fragmentation_cmp(copygc_heap *heap,
|
||||
struct copygc_heap_entry l,
|
||||
struct copygc_heap_entry r)
|
||||
static int bch2_bucket_is_movable(struct btree_trans *trans,
|
||||
struct bpos bucket, u64 time, u8 *gen)
|
||||
{
|
||||
return cmp_int(l.fragmentation, r.fragmentation);
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
struct bch_alloc_v4 _a;
|
||||
const struct bch_alloc_v4 *a;
|
||||
int ret;
|
||||
|
||||
if (bch2_bucket_is_open(trans->c, bucket.inode, bucket.offset))
|
||||
return 0;
|
||||
|
||||
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, 0);
|
||||
k = bch2_btree_iter_peek_slot(&iter);
|
||||
ret = bkey_err(k);
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
a = bch2_alloc_to_v4(k, &_a);
|
||||
*gen = a->gen;
|
||||
ret = (a->data_type == BCH_DATA_btree ||
|
||||
a->data_type == BCH_DATA_user) &&
|
||||
a->fragmentation_lru &&
|
||||
a->fragmentation_lru <= time;
|
||||
|
||||
if (ret) {
|
||||
struct printbuf buf = PRINTBUF;
|
||||
|
||||
bch2_bkey_val_to_text(&buf, trans->c, k);
|
||||
pr_debug("%s", buf.buf);
|
||||
printbuf_exit(&buf);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int find_buckets_to_copygc(struct bch_fs *c)
|
||||
static int bch2_copygc_next_bucket(struct btree_trans *trans,
|
||||
struct bpos *bucket, u8 *gen, struct bpos *pos)
|
||||
{
|
||||
copygc_heap *h = &c->copygc_heap;
|
||||
struct btree_trans trans;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret;
|
||||
|
||||
bch2_trans_init(&trans, c, 0, 0);
|
||||
ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
|
||||
bpos_max(*pos, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0)),
|
||||
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
|
||||
0, k, ({
|
||||
*bucket = u64_to_bucket(k.k->p.offset);
|
||||
|
||||
/*
|
||||
* Find buckets with lowest sector counts, skipping completely
|
||||
* empty buckets, by building a maxheap sorted by sector count,
|
||||
* and repeatedly replacing the maximum element until all
|
||||
* buckets have been visited.
|
||||
*/
|
||||
h->used = 0;
|
||||
bch2_bucket_is_movable(trans, *bucket, lru_pos_time(k.k->p), gen);
|
||||
}));
|
||||
|
||||
for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
|
||||
BTREE_ITER_PREFETCH, k, ret) {
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
|
||||
struct copygc_heap_entry e;
|
||||
struct bch_alloc_v4 a_convert;
|
||||
const struct bch_alloc_v4 *a;
|
||||
|
||||
a = bch2_alloc_to_v4(k, &a_convert);
|
||||
|
||||
if ((a->data_type != BCH_DATA_btree &&
|
||||
a->data_type != BCH_DATA_user) ||
|
||||
a->dirty_sectors >= ca->mi.bucket_size ||
|
||||
bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
|
||||
continue;
|
||||
|
||||
e = (struct copygc_heap_entry) {
|
||||
.dev = iter.pos.inode,
|
||||
.gen = a->gen,
|
||||
.replicas = 1 + a->stripe_redundancy,
|
||||
.fragmentation = div_u64((u64) a->dirty_sectors * (1ULL << 31),
|
||||
ca->mi.bucket_size),
|
||||
.sectors = a->dirty_sectors,
|
||||
.bucket = iter.pos.offset,
|
||||
};
|
||||
heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
|
||||
|
||||
}
|
||||
bch2_trans_iter_exit(&trans, &iter);
|
||||
|
||||
bch2_trans_exit(&trans);
|
||||
return ret;
|
||||
*pos = iter.pos;
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
return ret ? 0 : -ENOENT;
|
||||
}
|
||||
|
||||
static int bch2_copygc(struct bch_fs *c)
|
||||
{
|
||||
copygc_heap *h = &c->copygc_heap;
|
||||
struct copygc_heap_entry e;
|
||||
struct bch_move_stats move_stats;
|
||||
struct bch_dev *ca;
|
||||
unsigned dev_idx;
|
||||
size_t heap_size = 0;
|
||||
struct btree_trans trans;
|
||||
struct moving_context ctxt;
|
||||
struct data_update_opts data_opts = {
|
||||
.btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc,
|
||||
};
|
||||
struct bpos bucket;
|
||||
struct bpos pos;
|
||||
u8 gen = 0;
|
||||
unsigned nr_evacuated;
|
||||
int ret = 0;
|
||||
|
||||
bch2_move_stats_init(&move_stats, "copygc");
|
||||
|
||||
for_each_rw_member(ca, c, dev_idx)
|
||||
heap_size += ca->mi.nbuckets >> 7;
|
||||
|
||||
if (h->size < heap_size) {
|
||||
free_heap(&c->copygc_heap);
|
||||
if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) {
|
||||
bch_err(c, "error allocating copygc heap");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
ret = find_buckets_to_copygc(c);
|
||||
if (ret) {
|
||||
bch2_fs_fatal_error(c, "error walking buckets to copygc!");
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!h->used) {
|
||||
s64 wait = S64_MAX, dev_wait;
|
||||
u64 dev_min_wait_fragmented = 0;
|
||||
u64 dev_min_wait_allowed = 0;
|
||||
int dev_min_wait = -1;
|
||||
|
||||
for_each_rw_member(ca, c, dev_idx) {
|
||||
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
|
||||
s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
|
||||
ca->mi.bucket_size) >> 1);
|
||||
s64 fragmented = usage.d[BCH_DATA_user].fragmented;
|
||||
|
||||
dev_wait = max(0LL, allowed - fragmented);
|
||||
|
||||
if (dev_min_wait < 0 || dev_wait < wait) {
|
||||
dev_min_wait = dev_idx;
|
||||
dev_min_wait_fragmented = fragmented;
|
||||
dev_min_wait_allowed = allowed;
|
||||
}
|
||||
}
|
||||
|
||||
bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! dev %u fragmented %llu allowed %llu",
|
||||
dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed);
|
||||
return 0;
|
||||
}
|
||||
|
||||
heap_resort(h, fragmentation_cmp, NULL);
|
||||
|
||||
bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
|
||||
writepoint_ptr(&c->copygc_write_point),
|
||||
false);
|
||||
bch2_trans_init(&trans, c, 0, 0);
|
||||
|
||||
/* not correct w.r.t. device removal */
|
||||
while (h->used && !ret) {
|
||||
BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
|
||||
ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen,
|
||||
data_opts);
|
||||
ret = bch2_btree_write_buffer_flush(&trans);
|
||||
BUG_ON(ret);
|
||||
|
||||
for (nr_evacuated = 0, pos = POS_MIN;
|
||||
nr_evacuated < 32 && !ret;
|
||||
nr_evacuated++, pos = bpos_nosnap_successor(pos)) {
|
||||
ret = bch2_copygc_next_bucket(&trans, &bucket, &gen, &pos) ?:
|
||||
__bch2_evacuate_bucket(&trans, &ctxt, bucket, gen, data_opts);
|
||||
if (bkey_eq(pos, POS_MAX))
|
||||
break;
|
||||
}
|
||||
|
||||
bch2_trans_exit(&trans);
|
||||
bch2_moving_ctxt_exit(&ctxt);
|
||||
|
||||
/* no entries in LRU btree found, or got to end: */
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
|
||||
if (ret < 0 && !bch2_err_matches(ret, EROFS))
|
||||
bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
|
||||
|
||||
|
@ -1105,6 +1105,9 @@ int bch2_fs_recovery(struct bch_fs *c)
|
||||
c->opts.version_upgrade = true;
|
||||
c->opts.fsck = true;
|
||||
c->opts.fix_errors = FSCK_OPT_YES;
|
||||
} else if (c->sb.version < bcachefs_metadata_version_fragmentation_lru) {
|
||||
bch_info(c, "version prior to backpointers, upgrade required");
|
||||
c->opts.version_upgrade = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -487,7 +487,6 @@ static void __bch2_fs_free(struct bch_fs *c)
|
||||
kfree(rcu_dereference_protected(c->disk_groups, 1));
|
||||
kfree(c->journal_seq_blacklist_table);
|
||||
kfree(c->unused_inode_hints);
|
||||
free_heap(&c->copygc_heap);
|
||||
|
||||
if (c->io_complete_wq)
|
||||
destroy_workqueue(c->io_complete_wq);
|
||||
|
@ -723,8 +723,8 @@ TRACE_EVENT(move_data,
|
||||
TRACE_EVENT(evacuate_bucket,
|
||||
TP_PROTO(struct bch_fs *c, struct bpos *bucket,
|
||||
unsigned sectors, unsigned bucket_size,
|
||||
int ret),
|
||||
TP_ARGS(c, bucket, sectors, bucket_size, ret),
|
||||
u64 fragmentation, int ret),
|
||||
TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev )
|
||||
@ -732,6 +732,7 @@ TRACE_EVENT(evacuate_bucket,
|
||||
__field(u64, bucket )
|
||||
__field(u32, sectors )
|
||||
__field(u32, bucket_size )
|
||||
__field(u64, fragmentation )
|
||||
__field(int, ret )
|
||||
),
|
||||
|
||||
@ -741,14 +742,15 @@ TRACE_EVENT(evacuate_bucket,
|
||||
__entry->bucket = bucket->offset;
|
||||
__entry->sectors = sectors;
|
||||
__entry->bucket_size = bucket_size;
|
||||
__entry->fragmentation = fragmentation;
|
||||
__entry->ret = ret;
|
||||
),
|
||||
|
||||
TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i",
|
||||
TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->member, __entry->bucket,
|
||||
__entry->sectors, __entry->bucket_size,
|
||||
__entry->ret)
|
||||
__entry->fragmentation, __entry->ret)
|
||||
);
|
||||
|
||||
TRACE_EVENT(copygc,
|
||||
|
Loading…
Reference in New Issue
Block a user