More bcachefs updates for 6.7-rc1

- Assorted prep work for disk space accounting rewrite
- BTREE_TRIGGER_ATOMIC: after combining our trigger callbacks, this
  makes our trigger context more explicit
- A few fixes to avoid excessive transaction restarts on multithreaded
  workloads: fstests (in addition to ktest tests) are now checking
  slowpath counters, and that's shaking out a few bugs
- Assorted tracepoint improvements
- Starting to break up bcachefs_format.h and move on disk types so
  they're with the code they belong to; this will make room to start
  documenting the on disk format better.
- A few minor fixes
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKnAFLkS8Qha+jvQrE6szbY3KbnYFAmWtjOsACgkQE6szbY3K
 bnbyXRAAsx+yM81TFqsLzRRqf8oocRwf2dj5XzExz9Ig/lYQS5LIVROS2OxwDsAc
 DeaYQSTcph9dkOswCrNR96bBnEgmmZ1ClfVI6WRXvm6vs4rjhSMNbNaVyySrMUVn
 5p/Lsn1/RKl0lWMYlHrdryo+106zRcr6z1Hiv9QCXkXhzdkV8wFYDkfbMveShUsu
 KobC29wvd2EfZr04nqsIXS/y/iRIXhtZqJmFCiAguN70UWrwUwArpELHI5Ve+WPZ
 9VjgFXW6Ka3QxJs/20tX+t24DrC+eDXR44DzQmxwG5mPBBpXkcSk5UgRw/EUag5U
 5+mDZQ5Ei3gvZvUwrilMosVy3pIw0IuvqeqwDGFoFXs1cce01QCMN+NG/dBTQw9i
 KGGxJw5sOrZ8fIiFnypk1M+r9NVtA8MjriLNR5bJjCWPSpWqzkT2HzxFXc6HmTZu
 vsE/AxwC1RLA6B2HZlDEqLOdHE3cofkDiIzWM5ABvb4p118iyk9hE6HhAufk5UdE
 HaG646kGB8pUY/sCxBIOD6K2pgthDFv+fftTM7X+uIazD3bovvPQCEInu48/KAHn
 /KmslSPO0txyjnRFMbXFJvd4Fgfo44GcBCeqGpy3B79aEJ3nroyRZ0qNnnsqj0Gl
 picUWjTn4W561Q1zBXuE/6cLWEp+sfaqYQcM8L3CCitRTVDPaCQ=
 =yd+F
 -----END PGP SIGNATURE-----

Merge tag 'bcachefs-2024-01-21' of https://evilpiepirate.org/git/bcachefs

Pull more bcachefs updates from Kent Overstreet:
 "Some fixes, some refactoring, some minor features:

   - Assorted prep work for disk space accounting rewrite

   - BTREE_TRIGGER_ATOMIC: after combining our trigger callbacks, this
     makes our trigger context more explicit

   - A few fixes to avoid excessive transaction restarts on
     multithreaded workloads: fstests (in addition to ktest tests) are
     now checking slowpath counters, and that's shaking out a few bugs

   - Assorted tracepoint improvements

   - Starting to break up bcachefs_format.h and move on disk types so
     they're with the code they belong to; this will make room to start
     documenting the on disk format better.

   - A few minor fixes"

* tag 'bcachefs-2024-01-21' of https://evilpiepirate.org/git/bcachefs: (46 commits)
  bcachefs: Improve inode_to_text()
  bcachefs: logged_ops_format.h
  bcachefs: reflink_format.h
  bcachefs; extents_format.h
  bcachefs: ec_format.h
  bcachefs: subvolume_format.h
  bcachefs: snapshot_format.h
  bcachefs: alloc_background_format.h
  bcachefs: xattr_format.h
  bcachefs: dirent_format.h
  bcachefs: inode_format.h
  bcachefs; quota_format.h
  bcachefs: sb-counters_format.h
  bcachefs: counters.c -> sb-counters.c
  bcachefs: comment bch_subvolume
  bcachefs: bch_snapshot::btime
  bcachefs: add missing __GFP_NOWARN
  bcachefs: opts->compression can now also be applied in the background
  bcachefs: Prep work for variable size btree node buffers
  bcachefs: grab s_umount only if snapshotting
  ...
This commit is contained in:
Linus Torvalds 2024-01-21 14:01:12 -08:00
commit 35a4474b5c
78 changed files with 1629 additions and 1426 deletions

View File

@ -27,7 +27,6 @@ bcachefs-y := \
checksum.o \
clock.o \
compress.o \
counters.o \
darray.o \
debug.o \
dirent.o \
@ -71,6 +70,7 @@ bcachefs-y := \
reflink.o \
replicas.o \
sb-clean.o \
sb-counters.o \
sb-downgrade.o \
sb-errors.o \
sb-members.o \

View File

@ -273,7 +273,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
c, err, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
bch2_data_types[a.v->data_type]);
bch2_data_type_str(a.v->data_type));
break;
case BCH_DATA_cached:
bkey_fsck_err_on(!a.v->cached_sectors ||
@ -321,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
{
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
unsigned i;
prt_newline(out);
printbuf_indent_add(out, 2);
prt_printf(out, "gen %u oldest_gen %u data_type %s",
a->gen, a->oldest_gen,
a->data_type < BCH_DATA_NR
? bch2_data_types[a->data_type]
: "(invalid data type)");
prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
bch2_prt_data_type(out, a->data_type);
prt_newline(out);
prt_printf(out, "journal_seq %llu", a->journal_seq);
prt_newline(out);
@ -353,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
prt_newline(out);
prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
prt_newline(out);
if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
printbuf_indent_add(out, 2);
for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
prt_newline(out);
bch2_backpointer_to_text(out, &bps[i]);
}
printbuf_indent_sub(out, 2);
}
printbuf_indent_sub(out, 2);
}
@ -839,7 +818,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
}
if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) {
if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
u64 journal_seq = trans->journal_res.seq;
u64 bucket_journal_seq = new_a->journal_seq;
@ -1625,13 +1604,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
return ret;
}
/*
 * Running state for one pass of the discard worker: counters that end up in
 * trace_discard_buckets(), plus per-device bookkeeping that is maintained by
 * discard_buckets_next_dev().
 */
struct discard_buckets_state {
/* incremented at the out: label of bch2_discard_one_bucket() */
u64 seen;
/* buckets skipped because bch2_bucket_is_open_safe() returned true */
u64 open;
/* buckets skipped because bch2_bucket_needs_journal_commit() returned true */
u64 need_journal_commit;
/* incremented right after count_event(c, bucket_discard) */
u64 discarded;
/* device currently being processed; a reference on ca->ref is held while set */
struct bch_dev *ca;
/* incremented together with need_journal_commit, reset to 0 when ->ca changes */
u64 need_journal_commit_this_dev;
};
static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
{
if (s->ca == ca)
return;
if (s->ca && s->need_journal_commit_this_dev >
bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
bch2_journal_flush_async(&c->journal, NULL);
if (s->ca)
percpu_ref_put(&s->ca->ref);
if (ca)
percpu_ref_get(&ca->ref);
s->ca = ca;
s->need_journal_commit_this_dev = 0;
}
static int bch2_discard_one_bucket(struct btree_trans *trans,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
u64 *seen,
u64 *open,
u64 *need_journal_commit,
u64 *discarded)
struct discard_buckets_state *s)
{
struct bch_fs *c = trans->c;
struct bpos pos = need_discard_iter->pos;
@ -1643,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
int ret = 0;
ca = bch_dev_bkey_exists(c, pos.inode);
if (!percpu_ref_tryget(&ca->io_ref)) {
bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
return 0;
}
discard_buckets_next_dev(c, s, ca);
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
(*open)++;
s->open++;
goto out;
}
if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
c->journal.flushed_seq_ondisk,
pos.inode, pos.offset)) {
(*need_journal_commit)++;
s->need_journal_commit++;
s->need_journal_commit_this_dev++;
goto out;
}
@ -1732,9 +1738,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
goto out;
count_event(c, bucket_discard);
(*discarded)++;
s->discarded++;
out:
(*seen)++;
s->seen++;
bch2_trans_iter_exit(trans, &iter);
percpu_ref_put(&ca->io_ref);
printbuf_exit(&buf);
@ -1744,7 +1750,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
static void bch2_do_discards_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
int ret;
@ -1756,19 +1762,14 @@ static void bch2_do_discards_work(struct work_struct *work)
ret = bch2_trans_run(c,
for_each_btree_key(trans, iter,
BTREE_ID_need_discard, POS_MIN, 0, k,
bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
&seen,
&open,
&need_journal_commit,
&discarded)));
bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
if (need_journal_commit * 2 > seen)
bch2_journal_flush_async(&c->journal, NULL);
discard_buckets_next_dev(c, &s, NULL);
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
bch2_err_str(ret));
}
void bch2_do_discards(struct bch_fs *c)

View File

@ -0,0 +1,92 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
/*
 * First-generation alloc key value: a one-byte `fields` header, a one-byte
 * `gen`, then a variable-length byte array.
 *
 * NOTE(review): `fields` presumably records which BCH_ALLOC_FIELDS_V1()
 * entries are encoded in data[] - confirm against the unpack code.
 */
struct bch_alloc {
struct bch_val v;
__u8 fields;
__u8 gen;
__u8 data[];
} __packed __aligned(8);
/*
 * X-macro list of the v1 alloc fields as (name, bits) pairs; callers define
 * x() before expanding it and #undef it afterwards.
 */
#define BCH_ALLOC_FIELDS_V1() \
x(read_time, 16) \
x(write_time, 16) \
x(data_type, 8) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
x(oldest_gen, 8) \
x(stripe, 32) \
x(stripe_redundancy, 8)
/*
 * One BCH_ALLOC_FIELD_V1_<name> enumerator per list entry, numbered from 0 in
 * list order; the bit width argument is ignored here.
 */
enum {
#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
BCH_ALLOC_FIELDS_V1()
#undef x
};
/*
 * Second alloc key layout: gen, oldest_gen and data_type are fixed one-byte
 * members here (in the V1 field list, data_type and oldest_gen were list
 * entries), followed by a variable-length byte array.
 *
 * NOTE(review): nr_fields presumably counts the BCH_ALLOC_FIELDS_V2() entries
 * stored in data[] - confirm against the unpack code.
 */
struct bch_alloc_v2 {
struct bch_val v;
__u8 nr_fields;
__u8 gen;
__u8 oldest_gen;
__u8 data_type;
__u8 data[];
} __packed __aligned(8);
/*
 * X-macro list of the v2 alloc fields as (name, bits) pairs.  Compared with
 * BCH_ALLOC_FIELDS_V1(), data_type and oldest_gen are no longer listed and
 * the time and sector-count entries carry larger bit widths.
 */
#define BCH_ALLOC_FIELDS_V2() \
x(read_time, 64) \
x(write_time, 64) \
x(dirty_sectors, 32) \
x(cached_sectors, 32) \
x(stripe, 32) \
x(stripe_redundancy, 8)
/*
 * Third alloc key layout: the bch_alloc_v2 members preceded by an explicit
 * little-endian journal_seq and a little-endian flags word.
 */
struct bch_alloc_v3 {
struct bch_val v;
__le64 journal_seq;
__le32 flags;
__u8 nr_fields;
__u8 gen;
__u8 oldest_gen;
__u8 data_type;
__u8 data[];
} __packed __aligned(8);
/*
 * Accessors for bits in bch_alloc_v3::flags.
 * NOTE(review): the trailing two arguments look like [first bit, end bit) -
 * confirm against the LE32_BITMASK() definition.
 */
LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
/*
 * Fourth alloc key layout.  Unlike v1-v3 there is no variable-length data[]
 * member: every field is a fixed-width struct member.  The multi-byte members
 * are declared with plain __u64/__u32 rather than the __le types that
 * bch_alloc_v3 uses.
 *
 * NOTE(review): io_time[2] presumably holds the read and write times that
 * earlier versions listed as read_time/write_time - confirm index meaning.
 */
struct bch_alloc_v4 {
struct bch_val v;
__u64 journal_seq;
__u32 flags;
__u8 gen;
__u8 oldest_gen;
__u8 data_type;
__u8 stripe_redundancy;
__u32 dirty_sectors;
__u32 cached_sectors;
__u64 io_time[2];
__u32 stripe;
__u32 nr_external_backpointers;
__u64 fragmentation_lru;
} __packed __aligned(8);
/*
 * BCH_ALLOC_V4_U64s is the size of the current struct bch_alloc_v4 in 64-bit
 * words.
 * NOTE(review): BCH_ALLOC_V4_U64s_V0 is presumably the size, in the same
 * units, of the first v4 layout - confirm against its users.
 */
#define BCH_ALLOC_V4_U64s_V0 6
#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
/*
 * Accessors for bit ranges in bch_alloc_v4::flags.  The ranges listed are
 * contiguous: 0-1, 1-2, 2-8 and 8-14.
 * NOTE(review): assumes the trailing two arguments are [first bit, end bit) -
 * confirm against the BITMASK() definition.
 */
BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
/*
 * A bucket_gens key value holds KEY_TYPE_BUCKET_GENS_NR (1 << 8 = 256)
 * one-byte entries; KEY_TYPE_BUCKET_GENS_MASK (255) selects an index within
 * one such value.
 */
#define KEY_TYPE_BUCKET_GENS_BITS 8
#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
struct bch_bucket_gens {
struct bch_val v;
/* NOTE(review): spelled u8 while the other structs in this header use __u8 - confirm this is intended */
u8 gens[KEY_TYPE_BUCKET_GENS_NR];
} __packed __aligned(8);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */

View File

@ -1525,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str
unsigned data_type = ob->data_type;
barrier(); /* READ_ONCE() doesn't work on bitfields */
prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
prt_printf(out, "%zu ref %u ",
ob - c->open_buckets,
atomic_read(&ob->pin),
data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
atomic_read(&ob->pin));
bch2_prt_data_type(out, data_type);
prt_printf(out, " %u:%llu gen %u allocated %u/%u",
ob->dev, ob->bucket, ob->gen,
ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
if (ob->ec)

View File

@ -400,13 +400,24 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
return ret;
}
/*
 * Return true when @l and @r sit at the same position, report the same size
 * from bkey_bytes(), and have byte-identical values.  The checks run in that
 * order, so the values are only compared once the sizes are known to match.
 */
static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
{
	if (!bpos_eq(l.k->p, r.k->p))
		return false;

	if (bkey_bytes(l.k) != bkey_bytes(r.k))
		return false;

	return memcmp(l.v, r.v, bkey_val_bytes(l.k)) == 0;
}
/*
 * State shared by the helpers of the extents -> backpointers check, passed
 * down as a single pointer.
 */
struct extents_to_bp_state {
/* check_bp_exists() returns 0 without checking for buckets before bucket_start ... */
struct bpos bucket_start;
/* ... or after bucket_end */
struct bpos bucket_end;
/*
 * Copy of the key that check_bp_exists() stored just before returning
 * -BCH_ERR_transaction_restart_write_buffer_flush; compared against the
 * current key with bkey_and_val_eq() before taking that path again.
 */
struct bkey_buf last_flushed;
};
static int check_bp_exists(struct btree_trans *trans,
struct extents_to_bp_state *s,
struct bpos bucket,
struct bch_backpointer bp,
struct bkey_s_c orig_k,
struct bpos bucket_start,
struct bpos bucket_end,
struct bkey_buf *last_flushed)
struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
struct btree_iter bp_iter = { NULL };
@ -417,8 +428,8 @@ static int check_bp_exists(struct btree_trans *trans,
bch2_bkey_buf_init(&tmp);
if (bpos_lt(bucket, bucket_start) ||
bpos_gt(bucket, bucket_end))
if (bpos_lt(bucket, s->bucket_start) ||
bpos_gt(bucket, s->bucket_end))
return 0;
if (!bch2_dev_bucket_exists(c, bucket))
@ -433,11 +444,9 @@ static int check_bp_exists(struct btree_trans *trans,
if (bp_k.k->type != KEY_TYPE_backpointer ||
memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
if (!bpos_eq(orig_k.k->p, last_flushed->k->k.p) ||
bkey_bytes(orig_k.k) != bkey_bytes(&last_flushed->k->k) ||
memcmp(orig_k.v, &last_flushed->k->v, bkey_val_bytes(orig_k.k))) {
bch2_bkey_buf_reassemble(&tmp, c, orig_k);
bch2_bkey_buf_reassemble(&tmp, c, orig_k);
if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
if (bp.level) {
bch2_trans_unlock(trans);
bch2_btree_interior_updates_flush(c);
@ -447,7 +456,7 @@ static int check_bp_exists(struct btree_trans *trans,
if (ret)
goto err;
bch2_bkey_buf_copy(last_flushed, c, tmp.k);
bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
}
@ -475,10 +484,8 @@ static int check_bp_exists(struct btree_trans *trans,
}
static int check_extent_to_backpointers(struct btree_trans *trans,
struct extents_to_bp_state *s,
enum btree_id btree, unsigned level,
struct bpos bucket_start,
struct bpos bucket_end,
struct bkey_buf *last_flushed,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
@ -498,9 +505,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
bch2_extent_ptr_to_bp(c, btree, level,
k, p, &bucket_pos, &bp);
ret = check_bp_exists(trans, bucket_pos, bp, k,
bucket_start, bucket_end,
last_flushed);
ret = check_bp_exists(trans, s, bucket_pos, bp, k);
if (ret)
return ret;
}
@ -509,10 +514,8 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
}
static int check_btree_root_to_backpointers(struct btree_trans *trans,
struct extents_to_bp_state *s,
enum btree_id btree_id,
struct bpos bucket_start,
struct bpos bucket_end,
struct bkey_buf *last_flushed,
int *level)
{
struct bch_fs *c = trans->c;
@ -536,9 +539,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
*level = b->c.level;
k = bkey_i_to_s_c(&b->key);
ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1,
bucket_start, bucket_end,
last_flushed, k);
ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@ -559,7 +560,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
si_meminfo(&i);
mem_bytes = i.totalram * i.mem_unit;
return div_u64(mem_bytes >> 1, btree_bytes(c));
return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
}
static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
@ -610,43 +611,35 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
}
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct bpos bucket_start,
struct bpos bucket_end)
struct extents_to_bp_state *s)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
enum btree_id btree_id;
struct bkey_s_c k;
struct bkey_buf last_flushed;
int ret = 0;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
btree_id++) {
int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
check_btree_root_to_backpointers(trans, btree_id,
bucket_start, bucket_end,
&last_flushed, &level));
check_btree_root_to_backpointers(trans, s, btree_id, &level));
if (ret)
return ret;
while (level >= depth) {
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
level,
BTREE_ITER_PREFETCH);
while (1) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
struct bkey_s_c k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
ret = bkey_err(k) ?:
check_extent_to_backpointers(trans, btree_id, level,
bucket_start, bucket_end,
&last_flushed, k) ?:
check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
@ -668,7 +661,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
}
}
bch2_bkey_buf_exit(&last_flushed, c);
return 0;
}
@ -731,37 +723,43 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
struct bpos start = POS_MIN, end;
struct extents_to_bp_state s = { .bucket_start = POS_MIN };
int ret;
bch2_bkey_buf_init(&s.last_flushed);
bkey_init(&s.last_flushed.k->k);
while (1) {
ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
if (ret)
break;
if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
if ( bpos_eq(s.bucket_start, POS_MIN) &&
!bpos_eq(s.bucket_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
__func__, btree_nodes_fit_in_ram(c));
if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
if (!bpos_eq(s.bucket_start, POS_MIN) ||
!bpos_eq(s.bucket_end, SPOS_MAX)) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "check_extents_to_backpointers(): ");
bch2_bpos_to_text(&buf, start);
bch2_bpos_to_text(&buf, s.bucket_start);
prt_str(&buf, "-");
bch2_bpos_to_text(&buf, end);
bch2_bpos_to_text(&buf, s.bucket_end);
bch_verbose(c, "%s", buf.buf);
printbuf_exit(&buf);
}
ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
if (ret || bpos_eq(end, SPOS_MAX))
ret = bch2_check_extents_to_backpointers_pass(trans, &s);
if (ret || bpos_eq(s.bucket_end, SPOS_MAX))
break;
start = bpos_successor(end);
s.bucket_start = bpos_successor(s.bucket_end);
}
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
bch_err_fn(c, ret);
return ret;

View File

@ -2,6 +2,7 @@
#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"

View File

@ -1204,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c)
return c->opts.block_size >> 9;
}
static inline size_t btree_sectors(const struct bch_fs *c)
{
return c->opts.btree_node_size >> 9;
}
static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
{
return c->btree_key_cache_btrees & (1U << btree);

View File

@ -417,600 +417,12 @@ struct bch_set {
struct bch_val v;
};
/* Extents */
/*
* In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
* preceded by checksum/compression information (bch_extent_crc32 or
* bch_extent_crc64).
*
* One major determining factor in the format of extents is how we handle and
* represent extents that have been partially overwritten and thus trimmed:
*
* If an extent is not checksummed or compressed, when the extent is trimmed we
* don't have to remember the extent we originally allocated and wrote: we can
* merely adjust ptr->offset to point to the start of the data that is currently
* live. The size field in struct bkey records the current (live) size of the
* extent, and is also used to mean "size of region on disk that we point to" in
* this case.
*
* Thus an extent that is not checksummed or compressed will consist only of a
* list of bch_extent_ptrs, with none of the fields in
* bch_extent_crc32/bch_extent_crc64.
*
* When an extent is checksummed or compressed, it's not possible to read only
* the data that is currently live: we have to read the entire extent that was
* originally written, and then return only the part of the extent that is
* currently live.
*
* Thus, in addition to the current size of the extent in struct bkey, we need
* to store the size of the originally allocated space - this is the
* compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
* when the extent is trimmed, instead of modifying the offset field of the
* pointer, we keep a second smaller offset field - "offset into the original
* extent of the currently live region".
*
* The other major determining factor is replication and data migration:
*
* Each pointer may have its own bch_extent_crc32/64. When doing a replicated
* write, we will initially write all the replicas in the same format, with the
* same checksum type and compression format - however, when copygc runs later (or
* tiering/cache promotion, anything that moves data), it is not in general
* going to rewrite all the pointers at once - one of the replicas may be in a
* bucket on one device that has very little fragmentation while another lives
* in a bucket that has become heavily fragmented, and thus is being rewritten
* sooner than the rest.
*
* Thus it will only move a subset of the pointers (or in the case of
* tiering/cache promotion perhaps add a single pointer without dropping any
* current pointers), and if the extent has been partially overwritten it must
* write only the currently live portion (or copygc would not be able to reduce
* fragmentation!) - which necessitates a different bch_extent_crc format for
* the new pointer.
*
* But in the interests of space efficiency, we don't want to store one
* bch_extent_crc for each pointer if we don't have to.
*
* Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
* bch_extent_ptrs appended arbitrarily one after the other. We determine the
* type of a given entry with a scheme similar to utf8 (except we're encoding a
* type, not a size), encoding the type in the position of the first set bit:
*
* bch_extent_crc32 - 0b1
* bch_extent_ptr - 0b10
* bch_extent_crc64 - 0b100
*
* We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
* bch_extent_crc64 is the least constrained).
*
* Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
* until the next bch_extent_crc32/64.
*
* If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
* is neither checksummed nor compressed.
*/
/* 128 bits, sufficient for cryptographic MACs: */
struct bch_csum {
__le64 lo;
__le64 hi;
} __packed __aligned(8);
#define BCH_EXTENT_ENTRY_TYPES() \
x(ptr, 0) \
x(crc32, 1) \
x(crc64, 2) \
x(crc128, 3) \
x(stripe_ptr, 4) \
x(rebalance, 5)
#define BCH_EXTENT_ENTRY_MAX 6
enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
BCH_EXTENT_ENTRY_TYPES()
#undef x
};
/* Compressed/uncompressed size are stored biased by 1: */
struct bch_extent_crc32 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u32 type:2,
_compressed_size:7,
_uncompressed_size:7,
offset:7,
_unused:1,
csum_type:4,
compression_type:4;
__u32 csum;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u32 csum;
__u32 compression_type:4,
csum_type:4,
_unused:1,
offset:7,
_uncompressed_size:7,
_compressed_size:7,
type:2;
#endif
} __packed __aligned(8);
#define CRC32_SIZE_MAX (1U << 7)
#define CRC32_NONCE_MAX 0
struct bch_extent_crc64 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:3,
_compressed_size:9,
_uncompressed_size:9,
offset:9,
nonce:10,
csum_type:4,
compression_type:4,
csum_hi:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 csum_hi:16,
compression_type:4,
csum_type:4,
nonce:10,
offset:9,
_uncompressed_size:9,
_compressed_size:9,
type:3;
#endif
__u64 csum_lo;
} __packed __aligned(8);
#define CRC64_SIZE_MAX (1U << 9)
#define CRC64_NONCE_MAX ((1U << 10) - 1)
struct bch_extent_crc128 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:4,
_compressed_size:13,
_uncompressed_size:13,
offset:13,
nonce:13,
csum_type:4,
compression_type:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 compression_type:4,
csum_type:4,
nonce:13,
offset:13,
_uncompressed_size:13,
_compressed_size:13,
type:4;
#endif
struct bch_csum csum;
} __packed __aligned(8);
#define CRC128_SIZE_MAX (1U << 13)
#define CRC128_NONCE_MAX ((1U << 13) - 1)
/*
* @reservation - pointer hasn't been written to, just reserved
*/
struct bch_extent_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:1,
cached:1,
unused:1,
unwritten:1,
offset:44, /* 8 petabytes */
dev:8,
gen:8;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 gen:8,
dev:8,
offset:44,
unwritten:1,
unused:1,
cached:1,
type:1;
#endif
} __packed __aligned(8);
struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:5,
block:8,
redundancy:4,
idx:47;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 idx:47,
redundancy:4,
block:8,
type:5;
#endif
};
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:6,
unused:34,
compression:8, /* enum bch_compression_opt */
target:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 target:16,
compression:8,
unused:34,
type:6;
#endif
};
union bch_extent_entry {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
unsigned long type;
#elif __BITS_PER_LONG == 32
struct {
unsigned long pad;
unsigned long type;
};
#else
#error edit for your odd byteorder.
#endif
#define x(f, n) struct bch_extent_##f f;
BCH_EXTENT_ENTRY_TYPES()
#undef x
};
struct bch_btree_ptr {
struct bch_val v;
__u64 _data[0];
struct bch_extent_ptr start[];
} __packed __aligned(8);
struct bch_btree_ptr_v2 {
struct bch_val v;
__u64 mem_ptr;
__le64 seq;
__le16 sectors_written;
__le16 flags;
struct bpos min_key;
__u64 _data[0];
struct bch_extent_ptr start[];
} __packed __aligned(8);
LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
struct bch_extent {
struct bch_val v;
__u64 _data[0];
union bch_extent_entry start[];
} __packed __aligned(8);
struct bch_reservation {
struct bch_val v;
__le32 generation;
__u8 nr_replicas;
__u8 pad[3];
} __packed __aligned(8);
/* Maximum size (in u64s) a single pointer could be: */
#define BKEY_EXTENT_PTR_U64s_MAX\
((sizeof(struct bch_extent_crc128) + \
sizeof(struct bch_extent_ptr)) / sizeof(__u64))
/* Maximum possible size of an entire extent value: */
#define BKEY_EXTENT_VAL_U64s_MAX \
(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
/* Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
/* Btree pointers don't carry around checksums: */
#define BKEY_BTREE_PTR_VAL_U64s_MAX \
((sizeof(struct bch_btree_ptr_v2) + \
sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
#define BKEY_BTREE_PTR_U64s_MAX \
(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
/* Inodes */
#define BLOCKDEV_INODE_MAX 4096
#define BCACHEFS_ROOT_INO 4096
struct bch_inode {
struct bch_val v;
__le64 bi_hash_seed;
__le32 bi_flags;
__le16 bi_mode;
__u8 fields[];
} __packed __aligned(8);
struct bch_inode_v2 {
struct bch_val v;
__le64 bi_journal_seq;
__le64 bi_hash_seed;
__le64 bi_flags;
__le16 bi_mode;
__u8 fields[];
} __packed __aligned(8);
struct bch_inode_v3 {
struct bch_val v;
__le64 bi_journal_seq;
__le64 bi_hash_seed;
__le64 bi_flags;
__le64 bi_sectors;
__le64 bi_size;
__le64 bi_version;
__u8 fields[];
} __packed __aligned(8);
#define INODEv3_FIELDS_START_INITIAL 6
#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
struct bch_inode_generation {
struct bch_val v;
__le32 bi_generation;
__le32 pad;
} __packed __aligned(8);
/*
* bi_subvol and bi_parent_subvol are only set for subvolume roots:
*/
#define BCH_INODE_FIELDS_v2() \
x(bi_atime, 96) \
x(bi_ctime, 96) \
x(bi_mtime, 96) \
x(bi_otime, 96) \
x(bi_size, 64) \
x(bi_sectors, 64) \
x(bi_uid, 32) \
x(bi_gid, 32) \
x(bi_nlink, 32) \
x(bi_generation, 32) \
x(bi_dev, 32) \
x(bi_data_checksum, 8) \
x(bi_compression, 8) \
x(bi_project, 32) \
x(bi_background_compression, 8) \
x(bi_data_replicas, 8) \
x(bi_promote_target, 16) \
x(bi_foreground_target, 16) \
x(bi_background_target, 16) \
x(bi_erasure_code, 16) \
x(bi_fields_set, 16) \
x(bi_dir, 64) \
x(bi_dir_offset, 64) \
x(bi_subvol, 32) \
x(bi_parent_subvol, 32)
#define BCH_INODE_FIELDS_v3() \
x(bi_atime, 96) \
x(bi_ctime, 96) \
x(bi_mtime, 96) \
x(bi_otime, 96) \
x(bi_uid, 32) \
x(bi_gid, 32) \
x(bi_nlink, 32) \
x(bi_generation, 32) \
x(bi_dev, 32) \
x(bi_data_checksum, 8) \
x(bi_compression, 8) \
x(bi_project, 32) \
x(bi_background_compression, 8) \
x(bi_data_replicas, 8) \
x(bi_promote_target, 16) \
x(bi_foreground_target, 16) \
x(bi_background_target, 16) \
x(bi_erasure_code, 16) \
x(bi_fields_set, 16) \
x(bi_dir, 64) \
x(bi_dir_offset, 64) \
x(bi_subvol, 32) \
x(bi_parent_subvol, 32) \
x(bi_nocow, 8)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
x(data_checksum, 8) \
x(compression, 8) \
x(project, 32) \
x(background_compression, 8) \
x(data_replicas, 8) \
x(promote_target, 16) \
x(foreground_target, 16) \
x(background_target, 16) \
x(erasure_code, 16) \
x(nocow, 8)
enum inode_opt_id {
#define x(name, ...) \
Inode_opt_##name,
BCH_INODE_OPTS()
#undef x
Inode_opt_nr,
};
#define BCH_INODE_FLAGS() \
x(sync, 0) \
x(immutable, 1) \
x(append, 2) \
x(nodump, 3) \
x(noatime, 4) \
x(i_size_dirty, 5) \
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
x(backptr_untrusted, 8)
/* bits 20+ reserved for packed fields below: */
enum bch_inode_flags {
#define x(t, n) BCH_INODE_##t = 1U << n,
BCH_INODE_FLAGS()
#undef x
};
enum __bch_inode_flags {
#define x(t, n) __BCH_INODE_##t = n,
BCH_INODE_FLAGS()
#undef x
};
LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
LE64_BITMASK(INODEv3_FIELDS_START,
struct bch_inode_v3, bi_flags, 31, 36);
LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
/* Dirents */
/*
* Dirents (and xattrs) have to implement string lookups; since our b-tree
* doesn't support arbitrary length strings for the key, we instead index by a
* 64 bit hash (currently truncated sha1) of the string, stored in the offset
* field of the key - using linear probing to resolve hash collisions. This also
* provides us with the readdir cookie posix requires.
*
* Linear probing requires us to use whiteouts for deletions, in the event of a
* collision:
*/
struct bch_dirent {
struct bch_val v;
/* Target inode number: */
union {
__le64 d_inum;
struct { /* DT_SUBVOL */
__le32 d_child_subvol;
__le32 d_parent_subvol;
};
};
/*
* Copy of mode bits 12-15 from the target inode - so userspace can get
* the filetype without having to do a stat()
*/
__u8 d_type;
__u8 d_name[];
} __packed __aligned(8);
#define DT_SUBVOL 16
#define BCH_DT_MAX 17
#define BCH_NAME_MAX 512
/* Xattrs */
#define KEY_TYPE_XATTR_INDEX_USER 0
#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
#define KEY_TYPE_XATTR_INDEX_SECURITY 4
struct bch_xattr {
struct bch_val v;
__u8 x_type;
__u8 x_name_len;
__le16 x_val_len;
__u8 x_name[];
} __packed __aligned(8);
/* Bucket/allocation information: */
struct bch_alloc {
struct bch_val v;
__u8 fields;
__u8 gen;
__u8 data[];
} __packed __aligned(8);
#define BCH_ALLOC_FIELDS_V1() \
x(read_time, 16) \
x(write_time, 16) \
x(data_type, 8) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
x(oldest_gen, 8) \
x(stripe, 32) \
x(stripe_redundancy, 8)
enum {
#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
BCH_ALLOC_FIELDS_V1()
#undef x
};
struct bch_alloc_v2 {
struct bch_val v;
__u8 nr_fields;
__u8 gen;
__u8 oldest_gen;
__u8 data_type;
__u8 data[];
} __packed __aligned(8);
#define BCH_ALLOC_FIELDS_V2() \
x(read_time, 64) \
x(write_time, 64) \
x(dirty_sectors, 32) \
x(cached_sectors, 32) \
x(stripe, 32) \
x(stripe_redundancy, 8)
/*
 * Version 3 of the bucket/allocation key value: the version 2 layout with a
 * little endian journal sequence number and flags word added in front.
 */
struct bch_alloc_v3 {
struct bch_val v;
__le64 journal_seq;
/* Bit flags; accessors are generated by the LE32_BITMASK() lines below */
__le32 flags;
__u8 nr_fields;
__u8 gen;
__u8 oldest_gen;
__u8 data_type;
/* Flexible array member holding the variable-length payload */
__u8 data[];
} __packed __aligned(8);
/*
 * Flag accessors for bch_alloc_v3.flags. The trailing number pairs look like
 * [start, end) bit offsets (bit 0, then bit 1) - confirm against the
 * LE32_BITMASK() definition.
 */
LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
/*
 * Version 4 of the bucket/allocation key value. Unlike the earlier versions
 * this is a fixed-layout struct with no trailing variable-length data[]
 * member, and its integer members use the plain __uN types rather than
 * __leN.
 */
struct bch_alloc_v4 {
struct bch_val v;
__u64 journal_seq;
/* Bit flags; accessors are generated by the BITMASK() lines below */
__u32 flags;
__u8 gen;
__u8 oldest_gen;
__u8 data_type;
__u8 stripe_redundancy;
__u32 dirty_sectors;
__u32 cached_sectors;
/* Two timestamps; presumably indexed by READ/WRITE - TODO confirm */
__u64 io_time[2];
__u32 stripe;
__u32 nr_external_backpointers;
__u64 fragmentation_lru;
} __packed __aligned(8);
/*
 * NOTE(review): 6 u64s matches the members up to and including
 * nr_external_backpointers (48 bytes) if struct bch_val occupies no space,
 * i.e. the struct without fragmentation_lru - confirm.
 */
#define BCH_ALLOC_V4_U64s_V0 6
/* Size of the current struct in 64 bit words */
#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
/*
 * Flag accessors for bch_alloc_v4.flags. The trailing number pairs look like
 * [start, end) bit offsets - confirm against the BITMASK() definition.
 */
BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
/* Presumably the upper limit for the NR_BACKPOINTERS flag field - TODO confirm */
#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40
struct bch_backpointer {
struct bch_val v;
__u8 btree_id;
@ -1021,154 +433,6 @@ struct bch_backpointer {
struct bpos pos;
} __packed __aligned(8);
/*
 * Sizing for bucket_gens keys: 8 bits gives 256 entries per key, and the
 * mask (255) selects an index within one key.
 */
#define KEY_TYPE_BUCKET_GENS_BITS 8
#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
/* Value of a bucket_gens key: a fixed array of KEY_TYPE_BUCKET_GENS_NR one-byte entries */
struct bch_bucket_gens {
struct bch_val v;
u8 gens[KEY_TYPE_BUCKET_GENS_NR];
} __packed __aligned(8);
/* Quotas: */
/*
 * Quota types. QTYP_NR is the count and is used to size
 * bch_sb_field_quota.q[]. The names presumably stand for user, group and
 * project quotas - TODO confirm.
 */
enum quota_types {
QTYP_USR = 0,
QTYP_GRP = 1,
QTYP_PRJ = 2,
QTYP_NR = 3,
};
/*
 * Per-quota counters. Q_COUNTERS is the count and is used to size the c[]
 * arrays in struct bch_quota and struct bch_sb_quota_type. The names
 * presumably stand for space and inode counts - TODO confirm.
 */
enum quota_counters {
Q_SPC = 0,
Q_INO = 1,
Q_COUNTERS = 2,
};
/* One pair of little endian 64 bit limits (hard and soft) */
struct bch_quota_counter {
__le64 hardlimit;
__le64 softlimit;
};
/* Value of a quota key: one limit pair per entry of enum quota_counters */
struct bch_quota {
struct bch_val v;
struct bch_quota_counter c[Q_COUNTERS];
} __packed __aligned(8);
/* Erasure coding */
/*
 * Value of a stripe key (erasure coding): an 8 byte fixed header followed by
 * a variable number of extent pointers.
 */
struct bch_stripe {
struct bch_val v;
__le16 sectors;
__u8 algorithm;
__u8 nr_blocks;
__u8 nr_redundant;
__u8 csum_granularity_bits;
__u8 csum_type;
/* Pads the fixed header out to 8 bytes */
__u8 pad;
/* Flexible array member; presumably nr_blocks entries - TODO confirm */
struct bch_extent_ptr ptrs[];
} __packed __aligned(8);
/* Reflink: */
/*
 * Value of a reflink pointer key: an index plus front/back padding that
 * records the full range a reference was taken on.
 */
struct bch_reflink_p {
struct bch_val v;
/* Presumably the position of the indirect extent being referenced - TODO confirm */
__le64 idx;
/*
* A reflink pointer might point to an indirect extent which is then
* later split (by copygc or rebalance). If we only pointed to part of
* the original indirect extent, and then one of the fragments is
* outside the range we point to, we'd leak a refcount: so when creating
* reflink pointers, we need to store pad values to remember the full
* range we were taking a reference on.
*/
__le32 front_pad;
__le32 back_pad;
} __packed __aligned(8);
/*
 * Value of an indirect (reflink) extent key: a little endian 64 bit
 * refcount followed by variable-length data.
 */
struct bch_reflink_v {
struct bch_val v;
__le64 refcount;
/*
 * start[] is a zero-length array and _data[] a flexible array member, so
 * both name the same location directly after refcount: one typed as
 * extent entries, the other as raw 64 bit words.
 */
union bch_extent_entry start[0];
__u64 _data[];
} __packed __aligned(8);
/*
 * Key value holding a little endian 64 bit refcount followed by
 * variable-length bytes. Note: unlike most structs nearby, this one is not
 * declared __packed __aligned(8).
 */
struct bch_indirect_inline_data {
struct bch_val v;
__le64 refcount;
/* Flexible array member holding the data bytes */
u8 data[];
};
/* Inline data */
/* Key value consisting only of variable-length data bytes */
struct bch_inline_data {
struct bch_val v;
/* Flexible array member holding the data bytes */
u8 data[];
};
/* Subvolumes: */
/* Key position range for subvolumes: offsets 1 through S32_MAX at inode 0 */
#define SUBVOL_POS_MIN POS(0, 1)
#define SUBVOL_POS_MAX POS(0, S32_MAX)
/* ID of the root subvolume; equals the offset of SUBVOL_POS_MIN */
#define BCACHEFS_ROOT_SUBVOL 1
/*
 * Value of a subvolume key. Note: this struct is not declared
 * __packed __aligned(8).
 */
struct bch_subvolume {
struct bch_val v;
/* Bit flags; accessors are generated by the LE32_BITMASK() lines below */
__le32 flags;
__le32 snapshot;
__le64 inode;
/*
* Snapshot subvolumes form a tree, separate from the snapshot nodes
* tree - if this subvolume is a snapshot, this is the ID of the
* subvolume it was created from:
*/
__le32 parent;
__le32 pad;
bch_le128 otime;
};
/*
 * Flag accessors for bch_subvolume.flags. The trailing number pairs look
 * like [start, end) bit offsets - confirm against the LE32_BITMASK()
 * definition.
 */
LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
/*
* We need to know whether a subvolume is a snapshot so we can know whether we
* can delete it (or whether it should just be rm -rf'd)
*/
LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
/* Snapshots */
/*
 * Value of a snapshot key: one node of a snapshot tree, with a parent ID
 * and up to two child IDs. Note: this struct is not declared
 * __packed __aligned(8).
 */
struct bch_snapshot {
struct bch_val v;
/* Bit flags; accessors are generated by the LE32_BITMASK() lines below */
__le32 flags;
__le32 parent;
__le32 children[2];
__le32 subvol;
/* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
__le32 tree;
__le32 depth;
/* NOTE(review): presumably skip-list style ancestor IDs - confirm */
__le32 skip[3];
};
/*
 * Flag accessors for bch_snapshot.flags. The trailing number pairs look
 * like [start, end) bit offsets - confirm against the LE32_BITMASK()
 * definition.
 */
LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
/* True if a subvolume points to this snapshot node: */
LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
/*
* Snapshot trees:
*
* The snapshot_trees btree gives us a persistent identifier for each tree of
* bch_snapshot nodes, and allows us to record and easily find the root/master
* subvolume that other snapshots were created from:
*/
/*
 * Value of a snapshot_trees key: two little endian 32 bit IDs, one naming
 * the master subvolume and one naming the root snapshot node of the tree.
 */
struct bch_snapshot_tree {
struct bch_val v;
__le32 master_subvol;
__le32 root_snapshot;
};
/* LRU btree: */
struct bch_lru {
@ -1178,33 +442,6 @@ struct bch_lru {
#define LRU_ID_STRIPES (1U << 16)
/* Logged operations btree: */
/*
 * Logged operation record for a truncate: which inode (subvolume + inode
 * number) and the new size.
 */
struct bch_logged_op_truncate {
struct bch_val v;
__le32 subvol;
/* Keeps the following 64 bit members 8 byte aligned */
__le32 pad;
__le64 inum;
__le64 new_i_size;
};
/*
 * States of a logged finsert operation, numbered 0..2 in declaration order.
 * Presumably the value stored in bch_logged_op_finsert.state - TODO confirm.
 */
enum logged_op_finsert_state {
LOGGED_OP_FINSERT_start,
LOGGED_OP_FINSERT_shift_extents,
LOGGED_OP_FINSERT_finish,
};
/*
 * Logged operation record for an finsert: a state byte, the inode
 * (subvolume + inode number), destination/source offsets and a position.
 */
struct bch_logged_op_finsert {
struct bch_val v;
/* Presumably one of enum logged_op_finsert_state - TODO confirm */
__u8 state;
/* Pads state out to 4 bytes so subvol is 4 byte aligned */
__u8 pad[3];
__le32 subvol;
__le64 inum;
__le64 dst_offset;
__le64 src_offset;
/* NOTE(review): presumably the current progress position of the shift - confirm */
__le64 pos;
};
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@ -1230,6 +467,19 @@ struct bch_sb_field {
x(ext, 13) \
x(downgrade, 14)
#include "alloc_background_format.h"
#include "extents_format.h"
#include "reflink_format.h"
#include "ec_format.h"
#include "inode_format.h"
#include "dirent_format.h"
#include "xattr_format.h"
#include "quota_format.h"
#include "logged_ops_format.h"
#include "snapshot_format.h"
#include "subvolume_format.h"
#include "sb-counters_format.h"
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
BCH_SB_FIELDS()
@ -1465,23 +715,6 @@ struct bch_sb_field_replicas {
struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);
/* BCH_SB_FIELD_quota: */
/* One pair of little endian 32 bit limits (time and warning) */
struct bch_sb_quota_counter {
__le32 timelimit;
__le32 warnlimit;
};
/* Settings for one quota type: a flags word plus one limit pair per entry of enum quota_counters */
struct bch_sb_quota_type {
__le64 flags;
struct bch_sb_quota_counter c[Q_COUNTERS];
};
/* Superblock quota section: the common section header plus one bch_sb_quota_type per entry of enum quota_types */
struct bch_sb_field_quota {
struct bch_sb_field field;
struct bch_sb_quota_type q[QTYP_NR];
} __packed __aligned(8);
/* BCH_SB_FIELD_disk_groups: */
#define BCH_SB_LABEL_SIZE 32
@ -1500,101 +733,6 @@ struct bch_sb_field_disk_groups {
struct bch_disk_group entries[];
} __packed __aligned(8);
/* BCH_SB_FIELD_counters */
/*
 * X-macro table of persistent counters: x(name, number).
 * Callers define x() before expanding the table and #undef it afterwards.
 *
 * NOTE(review): the entries are in ascending numeric order except for
 * read_split (33), which is listed before read_retry (32). Any expansion
 * that ignores the number and relies on position will see those two
 * swapped - confirm this is intended before reordering.
 */
#define BCH_PERSISTENT_COUNTERS() \
x(io_read, 0) \
x(io_write, 1) \
x(io_move, 2) \
x(bucket_invalidate, 3) \
x(bucket_discard, 4) \
x(bucket_alloc, 5) \
x(bucket_alloc_fail, 6) \
x(btree_cache_scan, 7) \
x(btree_cache_reap, 8) \
x(btree_cache_cannibalize, 9) \
x(btree_cache_cannibalize_lock, 10) \
x(btree_cache_cannibalize_lock_fail, 11) \
x(btree_cache_cannibalize_unlock, 12) \
x(btree_node_write, 13) \
x(btree_node_read, 14) \
x(btree_node_compact, 15) \
x(btree_node_merge, 16) \
x(btree_node_split, 17) \
x(btree_node_rewrite, 18) \
x(btree_node_alloc, 19) \
x(btree_node_free, 20) \
x(btree_node_set_root, 21) \
x(btree_path_relock_fail, 22) \
x(btree_path_upgrade_fail, 23) \
x(btree_reserve_get_fail, 24) \
x(journal_entry_full, 25) \
x(journal_full, 26) \
x(journal_reclaim_finish, 27) \
x(journal_reclaim_start, 28) \
x(journal_write, 29) \
x(read_promote, 30) \
x(read_bounce, 31) \
x(read_split, 33) \
x(read_retry, 32) \
x(read_reuse_race, 34) \
x(move_extent_read, 35) \
x(move_extent_write, 36) \
x(move_extent_finish, 37) \
x(move_extent_fail, 38) \
x(move_extent_start_fail, 39) \
x(copygc, 40) \
x(copygc_wait, 41) \
x(gc_gens_end, 42) \
x(gc_gens_start, 43) \
x(trans_blocked_journal_reclaim, 44) \
x(trans_restart_btree_node_reused, 45) \
x(trans_restart_btree_node_split, 46) \
x(trans_restart_fault_inject, 47) \
x(trans_restart_iter_upgrade, 48) \
x(trans_restart_journal_preres_get, 49) \
x(trans_restart_journal_reclaim, 50) \
x(trans_restart_journal_res_get, 51) \
x(trans_restart_key_cache_key_realloced, 52) \
x(trans_restart_key_cache_raced, 53) \
x(trans_restart_mark_replicas, 54) \
x(trans_restart_mem_realloced, 55) \
x(trans_restart_memory_allocation_failure, 56) \
x(trans_restart_relock, 57) \
x(trans_restart_relock_after_fill, 58) \
x(trans_restart_relock_key_cache_fill, 59) \
x(trans_restart_relock_next_node, 60) \
x(trans_restart_relock_parent_for_fill, 61) \
x(trans_restart_relock_path, 62) \
x(trans_restart_relock_path_intent, 63) \
x(trans_restart_too_many_iters, 64) \
x(trans_restart_traverse, 65) \
x(trans_restart_upgrade, 66) \
x(trans_restart_would_deadlock, 67) \
x(trans_restart_would_deadlock_write, 68) \
x(trans_restart_injected, 69) \
x(trans_restart_key_cache_upgrade, 70) \
x(trans_traverse_all, 71) \
x(transaction_commit, 72) \
x(write_super, 73) \
x(trans_restart_would_deadlock_recursion_limit, 74) \
x(trans_restart_write_buffer_flush, 75) \
x(trans_restart_split_race, 76) \
x(write_buffer_flush_slowpath, 77) \
x(write_buffer_flush_sync, 78)
/*
 * Generates one BCH_COUNTER_<name> enumerator per BCH_PERSISTENT_COUNTERS()
 * entry. The x() expansion below discards the numeric argument, so
 * enumerator values are assigned by position in the table, not by the
 * listed number. BCH_COUNTER_NR therefore equals the number of entries.
 *
 * NOTE(review): read_split (33) is listed before read_retry (32) in
 * BCH_PERSISTENT_COUNTERS(), so for those two the positional value differs
 * from the listed number - confirm which is intended before changing this.
 */
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
BCH_PERSISTENT_COUNTERS()
#undef x
BCH_COUNTER_NR
};
/*
 * Superblock counters section: the common section header followed by a
 * variable number of little endian 64 bit values.
 */
struct bch_sb_field_counters {
struct bch_sb_field field;
/* Flexible array member; presumably indexed by enum bch_persistent_counters - TODO confirm */
__le64 d[];
};
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:

View File

@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out,
next_key_bits -= 64;
}
bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
if (!next_key_bits)
break;

View File

@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
}
/*
 * bkey_ops for cookie keys: the validation hook, the text formatter, and a
 * minimum value size of 8 (matching the 64 bit cookie that
 * key_type_cookie_to_text() prints).
 */
#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
.key_invalid = key_type_cookie_invalid, \
.val_to_text = key_type_cookie_to_text, \
.min_val_size = 8, \
})

View File

@ -83,9 +83,10 @@ enum btree_update_flags {
__BTREE_TRIGGER_NORUN,
__BTREE_TRIGGER_TRANSACTIONAL,
__BTREE_TRIGGER_ATOMIC,
__BTREE_TRIGGER_GC,
__BTREE_TRIGGER_INSERT,
__BTREE_TRIGGER_OVERWRITE,
__BTREE_TRIGGER_GC,
__BTREE_TRIGGER_BUCKET_INVALIDATE,
};
@ -107,6 +108,10 @@ enum btree_update_flags {
* causing us to go emergency read-only)
*/
#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL)
#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC)
/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
/* @new is entering the btree */
#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
@ -114,9 +119,6 @@ enum btree_update_flags {
/* @old is leaving the btree */
#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
/* signal from bucket invalidate path to alloc trigger */
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)

View File

@ -720,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
struct bkey_i min_key, max_key;
unsigned j, cacheline = 1;
unsigned cacheline = 1;
t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
bset_ro_tree_capacity(b, t));
@ -823,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
set_btree_bset(b, t, i);
}
void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
struct btree_node_entry *bne)
void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
{
struct bset *i = &bne->keys;
struct bset_tree *t;
BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(b->nsets >= MAX_BSETS);

View File

@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b,
void bch2_btree_keys_init(struct btree *);
void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct bch_fs *, struct btree *,
struct btree_node_entry *);
void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
void bch2_bset_insert(struct btree *, struct btree_node_iter *,

View File

@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
clear_btree_node_just_written(b);
kvpfree(b->data, btree_bytes(c));
kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
b->data = kvpmalloc(btree_bytes(c), gfp);
b->data = kvpmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
kvpfree(b->data, btree_bytes(c));
kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
bkey_btree_ptr_init(&b->key);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
b->byte_order = ilog2(btree_bytes(c));
b->byte_order = ilog2(c->opts.btree_node_size);
return b;
}
@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
kvpfree(c->verify_ondisk, btree_bytes(c));
kvpfree(c->verify_ondisk, c->opts.btree_node_size);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
@ -1192,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
" failed unpacked %zu\n",
b->unpack_fn_len,
b->nr.live_u64s * sizeof(u64),
btree_bytes(c) - sizeof(struct btree_node),
btree_buf_bytes(b) - sizeof(struct btree_node),
b->nr.live_u64s * 100 / btree_max_u64s(c),
b->sib_u64s[0],
b->sib_u64s[1],

View File

@ -74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b)
_iter = 0; _iter < (_tbl)->size; _iter++) \
rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
static inline size_t btree_bytes(struct bch_fs *c)
static inline size_t btree_buf_bytes(const struct btree *b)
{
return c->opts.btree_node_size;
return 1UL << b->byte_order;
}
static inline size_t btree_max_u64s(struct bch_fs *c)
static inline size_t btree_buf_max_u64s(const struct btree *b)
{
return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
}
static inline size_t btree_pages(struct bch_fs *c)
static inline size_t btree_max_u64s(const struct bch_fs *c)
{
return btree_bytes(c) / PAGE_SIZE;
return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
}
static inline unsigned btree_blocks(struct bch_fs *c)
static inline size_t btree_sectors(const struct bch_fs *c)
{
return c->opts.btree_node_size >> SECTOR_SHIFT;
}
static inline unsigned btree_blocks(const struct bch_fs *c)
{
return btree_sectors(c) >> c->block_bits;
}

View File

@ -597,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@ -615,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@ -637,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@ -649,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@ -664,8 +664,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[g->data_type],
bch2_data_types[data_type],
bch2_data_type_str(g->data_type),
bch2_data_type_str(data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (data_type == BCH_DATA_btree) {
@ -1238,11 +1238,11 @@ static int bch2_gc_done(struct bch_fs *c,
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(dev_usage_buckets_wrong,
d[i].buckets, "%s buckets", bch2_data_types[i]);
d[i].buckets, "%s buckets", bch2_data_type_str(i));
copy_dev_field(dev_usage_sectors_wrong,
d[i].sectors, "%s sectors", bch2_data_types[i]);
d[i].sectors, "%s sectors", bch2_data_type_str(i));
copy_dev_field(dev_usage_fragmented_wrong,
d[i].fragmented, "%s fragmented", bch2_data_types[i]);
d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
}
}
@ -1253,19 +1253,19 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
copy_fs_field(fs_usage_hidden_wrong,
hidden, "hidden");
b.hidden, "hidden");
copy_fs_field(fs_usage_btree_wrong,
btree, "btree");
b.btree, "btree");
if (!metadata_only) {
copy_fs_field(fs_usage_data_wrong,
data, "data");
b.data, "data");
copy_fs_field(fs_usage_cached_wrong,
cached, "cached");
b.cached, "cached");
copy_fs_field(fs_usage_reserved_wrong,
reserved, "reserved");
b.reserved, "reserved");
copy_fs_field(fs_usage_nr_inodes_wrong,
nr_inodes,"nr_inodes");
b.nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(fs_usage_persistent_reserved_wrong,
@ -1417,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
gc.gen,
bch2_data_types[new.data_type],
bch2_data_types[gc.data_type]))
bch2_data_type_str(new.data_type),
bch2_data_type_str(gc.data_type)))
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
@ -1428,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
gc.gen, \
bch2_data_types[gc.data_type], \
bch2_data_type_str(gc.data_type), \
new._f, gc._f)) \
new._f = gc._f; \

View File

@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
unsigned flags = memalloc_nofs_save();
void *p;
BUG_ON(size > btree_bytes(c));
BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
for (k = unwritten_whiteouts_start(c, b);
k != unwritten_whiteouts_end(c, b);
for (k = unwritten_whiteouts_start(b);
k != unwritten_whiteouts_end(b);
k = bkey_p_next(k))
*--ptrs = k;
@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
verify_no_dups(b, new_whiteouts,
(void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
memcpy_u64s(unwritten_whiteouts_start(c, b),
memcpy_u64s(unwritten_whiteouts_start(b),
new_whiteouts, b->whiteout_u64s);
btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
}
bytes = sorting_entire_node
? btree_bytes(c)
? btree_buf_bytes(b)
: __vstruct_bytes(struct btree_node, u64s);
out = btree_bounce_alloc(c, bytes, &used_mempool);
@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
if (sorting_entire_node) {
u64s = le16_to_cpu(out->keys.u64s);
BUG_ON(bytes != btree_bytes(c));
BUG_ON(bytes != btree_buf_bytes(b));
/*
* Our temporary buffer is the same size as the btree node's
@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
bch2_bset_init_next(c, b, bne);
bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
@ -1160,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ptr_written, b->written);
} else {
for (bne = write_block(b);
bset_byte_offset(b, bne) < btree_bytes(c);
bset_byte_offset(b, bne) < btree_buf_bytes(b);
bne = (void *) bne + block_bytes(c))
btree_err_on(bne->keys.seq == b->data->keys.seq &&
!bch2_journal_seq_is_blacklisted(c,
@ -1172,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"found bset signature after last bset");
}
sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
sorted->keys.u64s = 0;
set_btree_bset(b, b->set, &b->data->keys);
@ -1188,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(b->nr.live_u64s != u64s);
btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
if (updated_range)
bch2_btree_node_drop_keys_outside_node(b);
@ -1284,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work)
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
bio->bi_iter.bi_size = btree_buf_bytes(b);
if (rb->have_ioref) {
bio_set_dev(bio, ca->disk_sb.bdev);
@ -1512,7 +1512,7 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
}
if (best >= 0) {
memcpy(b->data, ra->buf[best], btree_bytes(c));
memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
} else {
ret = -1;
@ -1578,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
for (i = 0; i < ra->nr; i++) {
ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
ra->bio[i] = bio_alloc_bioset(NULL,
buf_pages(ra->buf[i], btree_bytes(c)),
buf_pages(ra->buf[i], btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@ -1598,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
rb->pick = pick;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@ -1665,7 +1665,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
bio = bio_alloc_bioset(NULL,
buf_pages(b->data, btree_bytes(c)),
buf_pages(b->data, btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@ -1679,7 +1679,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_end_io = btree_node_read_endio;
bch2_bio_map(bio, b->data, btree_bytes(c));
bch2_bio_map(bio, b->data, btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@ -2074,8 +2074,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
i->u64s = 0;
sort_iter_add(&sort_iter.iter,
unwritten_whiteouts_start(c, b),
unwritten_whiteouts_end(c, b));
unwritten_whiteouts_start(b),
unwritten_whiteouts_end(b));
SET_BSET_SEPARATE_WHITEOUTS(i, false);
b->whiteout_u64s = 0;
@ -2251,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
bch2_bset_init_next(c, b, bne);
bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);

View File

@ -1337,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
if (path->should_be_locked &&
!trans->restarted &&
(!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
(!dup || !bch2_btree_path_relock_norestart(trans, dup)))
return;
if (dup) {

View File

@ -819,6 +819,11 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
/*
* This should not be used in a fastpath, without first trying _do in
* nonblocking mode - it will cause excessive transaction restarts and
* potentially livelocking:
*/
#define drop_locks_do(_trans, _do) \
({ \
bch2_trans_unlock(_trans); \

View File

@ -631,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
}
__flatten
bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
{
struct get_locks_fail f;
@ -642,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
int __bch2_btree_path_relock(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
if (!bch2_btree_path_relock_norestart(trans, path)) {
trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
}
@ -759,12 +758,39 @@ int bch2_trans_relock(struct btree_trans *trans)
if (unlikely(trans->restarted))
return -((int) trans->restarted);
trans_for_each_path(trans, path, i)
trans_for_each_path(trans, path, i) {
struct get_locks_fail f;
if (path->should_be_locked &&
!bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
!btree_path_get_locks(trans, path, false, &f)) {
if (trace_trans_restart_relock_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bpos_to_text(&buf, path->pos);
prt_printf(&buf, " l=%u seq=%u node seq=",
f.l, path->l[f.l].lock_seq);
if (IS_ERR_OR_NULL(f.b)) {
prt_str(&buf, bch2_err_str(PTR_ERR(f.b)));
} else {
prt_printf(&buf, "%u", f.b->c.lock.seq);
struct six_lock_count c =
bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l);
prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
c = six_lock_counts(&f.b->c.lock);
prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
}
trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
printbuf_exit(&buf);
}
count_event(trans->c, trans_restart_relock);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
}
return 0;
}
@ -778,7 +804,7 @@ int bch2_trans_relock_notrace(struct btree_trans *trans)
trans_for_each_path(trans, path, i)
if (path->should_be_locked &&
!bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
!bch2_btree_path_relock_norestart(trans, path)) {
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
return 0;

View File

@ -312,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *,
/* relock: */
bool bch2_btree_path_relock_norestart(struct btree_trans *,
struct btree_path *, unsigned long);
bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
int __bch2_btree_path_relock(struct btree_trans *,
struct btree_path *, unsigned long);
@ -353,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
/* upgrade */
/*
 * Out-parameter describing where taking locks on a btree path failed.
 * In bch2_trans_relock() it is passed to btree_path_get_locks() and read
 * only when that call returns false.
 */
struct get_locks_fail {
/* Level; used as an index into path->l[] when reporting the failure */
unsigned l;
/* Node at that level; callers check it with IS_ERR_OR_NULL() before use */
struct btree *b;
};
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
struct btree_path *, unsigned,
struct get_locks_fail *);

View File

@ -139,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
EBUG_ON(insert->k.u64s >
bch_btree_keys_u64s_remaining(trans->c, b));
EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
k = bch2_btree_node_iter_peek_all(node_iter, b);
@ -160,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
push_whiteout(trans->c, b, insert->k.p);
push_whiteout(b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
@ -348,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
static inline int btree_key_can_insert(struct btree_trans *trans,
struct btree *b, unsigned u64s)
{
struct bch_fs *c = trans->c;
if (!bch2_btree_node_insert_fits(c, b, u64s))
if (!bch2_btree_node_insert_fits(b, u64s))
return -BCH_ERR_btree_insert_btree_node_full;
return 0;
@ -418,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_k))
return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
@ -448,9 +445,6 @@ static int run_one_mem_trigger(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_NORUN))
return 0;
if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
return 0;
if (old_ops->trigger == new_ops->trigger) {
ret = bch2_key_trigger(trans, i->btree_id, i->level,
old, bkey_i_to_s(new),
@ -586,9 +580,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
int ret = 0;
trans_for_each_update(trans, i) {
/*
* XXX: synchronization of cached update triggers with gc
@ -596,14 +587,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
*/
BUG_ON(i->cached || i->level);
if (gc_visited(c, gc_pos_btree_node(insert_l(trans, i)->b))) {
ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
if (ret)
break;
return ret;
}
}
return ret;
return 0;
}
static inline int
@ -680,6 +672,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;
/* XXX: we only want to run this if deltas are nonzero */
bch2_trans_account_disk_usage_change(trans);
h = trans->hooks;
while (h) {
ret = h->fn(trans, h);
@ -689,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
trans_for_each_update(trans, i)
if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
ret = run_one_mem_trigger(trans, i, i->flags);
if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags);
if (ret)
goto fatal_err;
}
@ -994,6 +989,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
!trans->journal_entries_u64s)
goto out_reset;
memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
ret = bch2_trans_commit_run_triggers(trans);
if (ret)
goto out_reset;

View File

@ -430,6 +430,9 @@ struct btree_trans {
struct journal_res journal_res;
u64 *journal_seq;
struct disk_reservation *disk_res;
struct bch_fs_usage_base fs_usage_delta;
unsigned journal_u64s;
unsigned extra_disk_res; /* XXX kill */
struct replicas_delta_list *fs_usage_deltas;
@ -653,7 +656,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
BIT_ULL(BKEY_TYPE_reflink)| \
BIT_ULL(BKEY_TYPE_btree))
#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
(BIT_ULL(BKEY_TYPE_alloc)| \
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
@ -661,7 +664,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
@ -738,4 +741,9 @@ enum btree_node_sibling {
btree_next_sib,
};
/*
* NOTE(review): no user of this struct is visible in this hunk. Judging only by
* the names, it presumably reports where taking btree locks failed - @l as the
* level/index and @b as the node involved. Confirm against the callers.
*/
struct get_locks_fail {
unsigned l;
struct btree *b;
};
#endif /* _BCACHEFS_BTREE_TYPES_H */

View File

@ -159,7 +159,7 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
{
size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
}
/* Btree node freeing/allocation: */
@ -1097,7 +1097,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
if (bch2_btree_node_insert_fits(path->l[update_level].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
@ -1401,7 +1401,7 @@ static void __btree_split_node(struct btree_update *as,
unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
nr_keys[i].val_u64s;
if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c))
if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
n[i]->data->format = b->format;
btree_node_set_format(n[i], n[i]->data->format);
@ -1703,7 +1703,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_btree_node_prep_for_write(trans, path, b);
if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
bch2_btree_node_unlock_write(trans, path, b);
goto split;
}

View File

@ -184,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b)
b->sib_u64s[1] = b->nr.live_u64s;
}
static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
static inline void *btree_data_end(struct btree *b)
{
return (void *) b->data + btree_bytes(c);
return (void *) b->data + btree_buf_bytes(b);
}
static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
struct btree *b)
static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
{
return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
}
static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
struct btree *b)
static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
{
return btree_data_end(c, b);
return btree_data_end(b);
}
static inline void *write_block(struct btree *b)
@ -221,13 +219,11 @@ static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
return __btree_addr_written(b, k);
}
static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
struct btree *b,
void *end)
static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
{
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
b->whiteout_u64s;
ssize_t total = c->opts.btree_node_size >> 3;
ssize_t total = btree_buf_bytes(b) >> 3;
/* Always leave one extra u64 for bch2_varint_decode: */
used++;
@ -235,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
return total - used;
}
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
struct btree *b)
static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
{
ssize_t remaining = __bch_btree_u64s_remaining(c, b,
ssize_t remaining = __bch2_btree_u64s_remaining(b,
btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(remaining < 0);
@ -260,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b)
return 8 << BTREE_WRITE_SET_U64s_BITS;
}
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree *b)
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
{
struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
__bch_btree_u64s_remaining(c, b, bne->keys.start);
__bch2_btree_u64s_remaining(b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
@ -281,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
return NULL;
}
static inline void push_whiteout(struct bch_fs *c, struct btree *b,
struct bpos pos)
static inline void push_whiteout(struct btree *b, struct bpos pos)
{
struct bkey_packed k;
BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
EBUG_ON(btree_node_just_written(b));
if (!bkey_pack_pos(&k, pos, b)) {
@ -299,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
k.needs_whiteout = true;
b->whiteout_u64s += k.u64s;
bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
bkey_p_copy(unwritten_whiteouts_start(b), &k);
}
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
struct btree *b, unsigned u64s)
static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
{
if (unlikely(btree_node_need_rewrite(b)))
return false;
return u64s <= bch_btree_keys_u64s_remaining(c, b);
return u64s <= bch2_btree_keys_u64s_remaining(b);
}
void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);

View File

@ -125,13 +125,12 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
struct btree_write_buffered_key *wb,
bool *write_locked, size_t *fast)
{
struct bch_fs *c = trans->c;
struct btree_path *path;
int ret;
EBUG_ON(!wb->journal_seq);
EBUG_ON(!c->btree_write_buffer.flushing.pin.seq);
EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
ret = bch2_btree_iter_traverse(iter);
if (ret)
@ -155,7 +154,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
*write_locked = true;
}
if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) {
if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
*write_locked = false;
return wb_flush_one_slowpath(trans, iter, wb);
}

View File

@ -25,7 +25,7 @@
#include <linux/preempt.h>
static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
enum bch_data_type data_type,
s64 sectors)
{
@ -54,20 +54,20 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
bch2_fs_usage_acc_to_base(c, i);
for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
usage->reserved += usage->persistent_reserved[i];
usage->b.reserved += usage->persistent_reserved[i];
for (unsigned i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
}
for_each_member_device(c, ca) {
struct bch_dev_usage dev = bch2_dev_usage_read(ca);
usage->hidden += (dev.d[BCH_DATA_sb].buckets +
dev.d[BCH_DATA_journal].buckets) *
usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
dev.d[BCH_DATA_journal].buckets) *
ca->mi.bucket_size;
}
@ -188,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out,
prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
prt_printf(out, "hidden:\t\t\t\t%llu\n",
fs_usage->u.hidden);
fs_usage->u.b.hidden);
prt_printf(out, "data:\t\t\t\t%llu\n",
fs_usage->u.data);
fs_usage->u.b.data);
prt_printf(out, "cached:\t\t\t\t%llu\n",
fs_usage->u.cached);
fs_usage->u.b.cached);
prt_printf(out, "reserved:\t\t\t%llu\n",
fs_usage->u.reserved);
fs_usage->u.b.reserved);
prt_printf(out, "nr_inodes:\t\t\t%llu\n",
fs_usage->u.nr_inodes);
fs_usage->u.b.nr_inodes);
prt_printf(out, "online reserved:\t\t%llu\n",
fs_usage->online_reserved);
@ -225,10 +225,10 @@ static u64 reserve_factor(u64 r)
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
{
return min(fs_usage->u.hidden +
fs_usage->u.btree +
fs_usage->u.data +
reserve_factor(fs_usage->u.reserved +
return min(fs_usage->u.b.hidden +
fs_usage->u.b.btree +
fs_usage->u.b.data +
reserve_factor(fs_usage->u.b.reserved +
fs_usage->online_reserved),
c->capacity);
}
@ -240,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
u64 data, reserved;
ret.capacity = c->capacity -
bch2_fs_usage_read_one(c, &c->usage_base->hidden);
bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);
data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
bch2_fs_usage_read_one(c, &c->usage_base->btree);
reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
percpu_u64_get(c->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);
return ret;
}
@ -284,7 +284,7 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
prt_newline(out);
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
prt_str(out, bch2_data_types[i]);
bch2_prt_data_type(out, i);
prt_tab(out);
prt_u64(out, usage->d[i].buckets);
prt_tab_rjust(out);
@ -308,9 +308,9 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
fs_usage = fs_usage_ptr(c, journal_seq, gc);
if (data_type_is_hidden(old->data_type))
fs_usage->hidden -= ca->mi.bucket_size;
fs_usage->b.hidden -= ca->mi.bucket_size;
if (data_type_is_hidden(new->data_type))
fs_usage->hidden += ca->mi.bucket_size;
fs_usage->b.hidden += ca->mi.bucket_size;
u = dev_usage_ptr(ca, journal_seq, gc);
@ -359,7 +359,7 @@ static inline int __update_replicas(struct bch_fs *c,
if (idx < 0)
return -1;
fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
return 0;
}
@ -394,7 +394,7 @@ int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
preempt_enable();
err:
@ -523,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on(g->data_type &&
g->data_type != data_type, c,
"different types of data in same bucket: %s, %s",
bch2_data_types[g->data_type],
bch2_data_types[data_type])) {
bch2_data_type_str(g->data_type),
bch2_data_type_str(data_type))) {
ret = -EIO;
goto err;
}
@ -532,7 +532,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
ca->dev_idx, b, g->gen,
bch2_data_types[g->data_type ?: data_type],
bch2_data_type_str(g->data_type ?: data_type),
g->dirty_sectors, sectors)) {
ret = -EIO;
goto err;
@ -575,7 +575,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@ -588,7 +588,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@ -603,7 +603,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"while marking %s",
ptr->dev, bucket_nr, b_gen,
*bucket_gen(ca, bucket_nr),
bch2_data_types[bucket_data_type ?: ptr_data_type],
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@ -624,8 +624,8 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type],
bch2_data_types[ptr_data_type],
bch2_data_type_str(bucket_data_type),
bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@ -638,7 +638,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
bucket_sectors, sectors,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@ -677,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
}
dst->nr_inodes -= deltas->nr_inodes;
dst->b.nr_inodes -= deltas->nr_inodes;
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
added -= deltas->persistent_reserved[i];
dst->reserved -= deltas->persistent_reserved[i];
dst->b.reserved -= deltas->persistent_reserved[i];
dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
}
@ -694,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
percpu_up_read(&c->mark_lock);
}
int bch2_trans_fs_usage_apply(struct btree_trans *trans,
struct replicas_delta_list *deltas)
void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
static int warned_disk_usage = 0;
bool warn = false;
u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
struct replicas_delta *d, *d2;
struct replicas_delta *top = (void *) deltas->d + deltas->used;
struct bch_fs_usage *dst;
s64 added = 0, should_not_have_added;
unsigned i;
percpu_down_read(&c->mark_lock);
preempt_disable();
dst = fs_usage_ptr(c, trans->journal_res.seq, false);
struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
struct bch_fs_usage_base *src = &trans->fs_usage_delta;
for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
switch (d->r.data_type) {
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
added += d->delta;
}
if (__update_replicas(c, dst, &d->r, d->delta))
goto need_mark;
}
dst->nr_inodes += deltas->nr_inodes;
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
added += deltas->persistent_reserved[i];
dst->reserved += deltas->persistent_reserved[i];
dst->persistent_reserved[i] += deltas->persistent_reserved[i];
}
s64 added = src->btree + src->data + src->reserved;
/*
* Not allowed to reduce sectors_available except by getting a
* reservation:
*/
should_not_have_added = added - (s64) disk_res_sectors;
s64 should_not_have_added = added - (s64) disk_res_sectors;
if (unlikely(should_not_have_added > 0)) {
u64 old, new, v = atomic64_read(&c->sectors_available);
@ -754,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
this_cpu_sub(*c->online_reserved, added);
}
dst->hidden += src->hidden;
dst->btree += src->btree;
dst->data += src->data;
dst->cached += src->cached;
dst->reserved += src->reserved;
dst->nr_inodes += src->nr_inodes;
preempt_enable();
percpu_up_read(&c->mark_lock);
@ -761,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"disk usage increased %lli more than %llu sectors reserved)",
should_not_have_added, disk_res_sectors);
}
int bch2_trans_fs_usage_apply(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
struct bch_fs *c = trans->c;
struct replicas_delta *d, *d2;
struct replicas_delta *top = (void *) deltas->d + deltas->used;
struct bch_fs_usage *dst;
unsigned i;
percpu_down_read(&c->mark_lock);
preempt_disable();
dst = fs_usage_ptr(c, trans->journal_res.seq, false);
for (d = deltas->d; d != top; d = replicas_delta_next(d))
if (__update_replicas(c, dst, &d->r, d->delta))
goto need_mark;
dst->b.nr_inodes += deltas->nr_inodes;
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
dst->b.reserved += deltas->persistent_reserved[i];
dst->persistent_reserved[i] += deltas->persistent_reserved[i];
}
preempt_enable();
percpu_up_read(&c->mark_lock);
return 0;
need_mark:
/* revert changes: */
@ -1084,7 +1096,7 @@ static int __trigger_reservation(struct btree_trans *trans,
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);
replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
fs_usage->reserved += sectors;
fs_usage->b.reserved += sectors;
fs_usage->persistent_reserved[replicas - 1] += sectors;
preempt_enable();
@ -1130,9 +1142,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
bch2_data_types[a->v.data_type],
bch2_data_types[type],
bch2_data_types[type]);
bch2_data_type_str(a->v.data_type),
bch2_data_type_str(type),
bch2_data_type_str(type));
ret = -EIO;
goto err;
}

View File

@ -356,6 +356,8 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
ret; \
})
void bch2_trans_account_disk_usage_change(struct btree_trans *);
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
@ -385,6 +387,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
return false;
}
/*
 * Return the name of @type from __bch2_data_types[].
 *
 * Values at or beyond BCH_DATA_NR are never used as an index; a fixed
 * placeholder string is returned for them instead.
 */
static inline const char *bch2_data_type_str(enum bch_data_type type)
{
	if (type >= BCH_DATA_NR)
		return "(invalid data type)";

	return __bch2_data_types[type];
}
/*
 * Print the name of @type to @out.
 *
 * Values at or beyond BCH_DATA_NR are not looked up in __bch2_data_types[];
 * they are printed as a placeholder that includes the raw numeric value.
 */
static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
{
	if (type >= BCH_DATA_NR) {
		prt_printf(out, "(invalid data type %u)", type);
		return;
	}

	prt_str(out, __bch2_data_types[type]);
}
/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,

View File

@ -45,23 +45,18 @@ struct bch_dev_usage {
} d[BCH_DATA_NR];
};
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
struct bch_fs_usage_base {
u64 hidden;
u64 btree;
u64 data;
u64 cached;
u64 reserved;
u64 nr_inodes;
};
/* XXX: add stats for compression ratio */
#if 0
u64 uncompressed;
u64 compressed;
#endif
/* broken out: */
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
struct bch_fs_usage_base b;
u64 persistent_reserved[BCH_REPLICAS_MAX];
u64 replicas[];
};

View File

@ -109,7 +109,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
while (1) {
do {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread && kthread_should_stop())
break;
@ -119,7 +119,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
schedule();
try_to_freeze();
}
} while (0);
__set_current_state(TASK_RUNNING);
del_timer_sync(&wait.cpu_timer);

View File

@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
/*
 * Print the name of @type to @out.
 *
 * Values at or beyond BCH_COMPRESSION_TYPE_NR are not looked up in
 * __bch2_compression_types[]; they are printed as a placeholder that includes
 * the raw numeric value.
 */
static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
{
	if (type >= BCH_COMPRESSION_TYPE_NR) {
		prt_printf(out, "(invalid compression type %u)", type);
		return;
	}

	prt_str(out, __bch2_compression_types[type]);
}
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,

View File

@ -285,9 +285,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
bch2_bkey_set_needs_rebalance(c, insert,
op->opts.background_target,
op->opts.background_compression) ?:
bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
@ -529,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_MOVE|
m->data_opts.write_flags;
m->op.compression_opt = io_opts.background_compression ?: io_opts.compression;
m->op.compression_opt = background_compression(io_opts);
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
bkey_for_each_ptr(ptrs, ptr)

View File

@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
return false;
bio = bio_alloc_bioset(ca->disk_sb.bdev,
buf_pages(n_sorted, btree_bytes(c)),
buf_pages(n_sorted, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
bch2_bio_map(bio, n_sorted, btree_bytes(c));
bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
submit_bio_wait(bio);
bio_put(bio);
percpu_ref_put(&ca->io_ref);
memcpy(n_ondisk, n_sorted, btree_bytes(c));
memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
v->written = 0;
if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
}
bio = bio_alloc_bioset(ca->disk_sb.bdev,
buf_pages(n_ondisk, btree_bytes(c)),
buf_pages(n_ondisk, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
bch2_bio_map(bio, n_ondisk, btree_bytes(c));
bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
ret = submit_bio_wait(bio);
if (ret) {
@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
kvpfree(n_ondisk, btree_bytes(c));
kvpfree(n_ondisk, btree_buf_bytes(b));
percpu_ref_put(&ca->io_ref);
}

View File

@ -0,0 +1,42 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DIRENT_FORMAT_H
#define _BCACHEFS_DIRENT_FORMAT_H
/*
* Dirents (and xattrs) have to implement string lookups; since our b-tree
* doesn't support arbitrary length strings for the key, we instead index by a
* 64 bit hash (currently truncated sha1) of the string, stored in the offset
* field of the key - using linear probing to resolve hash collisions. This also
* provides us with the readdir cookie POSIX requires.
*
* Linear probing requires us to use whiteouts for deletions, in the event of a
* collision:
*/
/*
* Value of a dirent key. Multi-byte fields are little-endian (__le32/__le64) and
* the struct is packed and 8-byte aligned.
*/
struct bch_dirent {
struct bch_val v;
/* Target inode number: */
/*
* The 64-bit inode number shares storage with the pair of 32-bit subvolume
* IDs; the existing tag on the inner struct ties that pair to DT_SUBVOL.
*/
union {
__le64 d_inum;
struct { /* DT_SUBVOL */
__le32 d_child_subvol;
__le32 d_parent_subvol;
};
};
/*
* Copy of mode bits 12-15 from the target inode - so userspace can get
* the filetype without having to do a stat()
*/
__u8 d_type;
/*
* Flexible array member holding the name; no length field is stored in
* this struct.
* NOTE(review): presumably the length is derived from the key's value
* size - confirm in the dirent code.
*/
__u8 d_name[];
} __packed __aligned(8);
#define DT_SUBVOL 16
#define BCH_DT_MAX 17
#define BCH_NAME_MAX 512
#endif /* _BCACHEFS_DIRENT_FORMAT_H */

View File

@ -190,7 +190,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
a->v.stripe_redundancy, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
iter.pos.inode, iter.pos.offset, a->v.gen,
bch2_data_types[a->v.data_type],
bch2_data_type_str(a->v.data_type),
a->v.dirty_sectors,
a->v.stripe, s.k->p.offset)) {
ret = -EIO;
@ -200,7 +200,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
iter.pos.inode, iter.pos.offset, a->v.gen,
bch2_data_types[a->v.data_type],
bch2_data_type_str(a->v.data_type),
a->v.dirty_sectors,
s.k->p.offset)) {
ret = -EIO;
@ -367,7 +367,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
}
}
if (!(flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))) {
if (flags & BTREE_TRIGGER_ATOMIC) {
struct stripe *m = genradix_ptr(&c->stripes, idx);
if (!m) {

19
fs/bcachefs/ec_format.h Normal file
View File

@ -0,0 +1,19 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_EC_FORMAT_H
#define _BCACHEFS_EC_FORMAT_H
/*
* Value of a stripe key: a fixed 8-byte header (after struct bch_val) followed
* by a variable number of extent pointers. Packed and 8-byte aligned.
*
* NOTE(review): per-field meanings are not established in this header; the
* notes below are inferred from the names only - confirm against the ec code.
*/
struct bch_stripe {
struct bch_val v;
__le16 sectors;
__u8 algorithm;
/* presumably total block count and how many of those are redundancy - verify */
__u8 nr_blocks;
__u8 nr_redundant;
__u8 csum_granularity_bits;
__u8 csum_type;
/* brings the fixed fields above to 8 bytes (2 + 6 * 1) */
__u8 pad;
/* flexible array member; no element count is stored next to it */
struct bch_extent_ptr ptrs[];
} __packed __aligned(8);
#endif /* _BCACHEFS_EC_FORMAT_H */

View File

@ -8,6 +8,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
@ -1018,12 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
bch2_csum_types[crc.csum_type],
bch2_compression_types[crc.compression_type]);
bch2_csum_types[crc.csum_type]);
bch2_prt_compression_type(out, crc.compression_type);
break;
}
case BCH_EXTENT_ENTRY_stripe_ptr: {
@ -1334,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
unsigned target, unsigned compression)
struct bch_io_opts *opts)
{
struct bkey_s k = bkey_i_to_s(_k);
struct bch_extent_rebalance *r;
unsigned target = opts->background_target;
unsigned compression = background_compression(*opts);
bool needs_rebalance;
if (!bkey_extent_is_direct_data(k.k))

View File

@ -708,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
unsigned, unsigned);
struct bch_io_opts *);
/* Generic extent code: */

View File

@ -0,0 +1,295 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_EXTENTS_FORMAT_H
#define _BCACHEFS_EXTENTS_FORMAT_H
/*
* In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
* preceded by checksum/compression information (bch_extent_crc32 or
* bch_extent_crc64).
*
* One major determining factor in the format of extents is how we handle and
* represent extents that have been partially overwritten and thus trimmed:
*
* If an extent is not checksummed or compressed, when the extent is trimmed we
* don't have to remember the extent we originally allocated and wrote: we can
* merely adjust ptr->offset to point to the start of the data that is currently
* live. The size field in struct bkey records the current (live) size of the
* extent, and is also used to mean "size of region on disk that we point to" in
* this case.
*
* Thus an extent that is not checksummed or compressed will consist only of a
* list of bch_extent_ptrs, with none of the fields in
* bch_extent_crc32/bch_extent_crc64.
*
* When an extent is checksummed or compressed, it's not possible to read only
* the data that is currently live: we have to read the entire extent that was
* originally written, and then return only the part of the extent that is
* currently live.
*
* Thus, in addition to the current size of the extent in struct bkey, we need
* to store the size of the originally allocated space - this is the
* compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
* when the extent is trimmed, instead of modifying the offset field of the
* pointer, we keep a second smaller offset field - "offset into the original
* extent of the currently live region".
*
* The other major determining factor is replication and data migration:
*
* Each pointer may have its own bch_extent_crc32/64. When doing a replicated
* write, we will initially write all the replicas in the same format, with the
* same checksum type and compression format - however, when copygc runs later (or
* tiering/cache promotion, anything that moves data), it is not in general
* going to rewrite all the pointers at once - one of the replicas may be in a
* bucket on one device that has very little fragmentation while another lives
* in a bucket that has become heavily fragmented, and thus is being rewritten
* sooner than the rest.
*
* Thus it will only move a subset of the pointers (or in the case of
* tiering/cache promotion perhaps add a single pointer without dropping any
* current pointers), and if the extent has been partially overwritten it must
* write only the currently live portion (or copygc would not be able to reduce
* fragmentation!) - which necessitates a different bch_extent_crc format for
* the new pointer.
*
* But in the interests of space efficiency, we don't want to store one
* bch_extent_crc for each pointer if we don't have to.
*
* Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
* bch_extent_ptrs appended arbitrarily one after the other. We determine the
* type of a given entry with a scheme similar to utf8 (except we're encoding a
* type, not a size), encoding the type in the position of the first set bit:
*
* bch_extent_ptr - 0b1
* bch_extent_crc32 - 0b10
* bch_extent_crc64 - 0b100
*
* We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
* bch_extent_crc64 is the least constrained).
*
* Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
* until the next bch_extent_crc32/64.
*
* If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
* is neither checksummed nor compressed.
*/
/*
* X-macro list of extent entry types as x(name, number). It is expanded below
* to generate enum bch_extent_entry_type, and again in union bch_extent_entry
* to generate one member per entry type.
*
* Each entry struct's leading 'type' bitfield is (number + 1) bits wide:
* ptr has type:1, crc32 type:2, ... rebalance type:6.
*/
#define BCH_EXTENT_ENTRY_TYPES() \
x(ptr, 0) \
x(crc32, 1) \
x(crc64, 2) \
x(crc128, 3) \
x(stripe_ptr, 4) \
x(rebalance, 5)
/* One past the highest entry number in the list above. */
#define BCH_EXTENT_ENTRY_MAX 6
/* BCH_EXTENT_ENTRY_<name> = <number> for every x() entry above. */
enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
BCH_EXTENT_ENTRY_TYPES()
#undef x
};
/* Compressed/uncompressed size are stored biased by 1: */
/*
* Smallest checksum/compression entry: one 32-bit word of bitfields
* (2 + 7 + 7 + 7 + 1 + 4 + 4 = 32) plus a 32-bit checksum. The big-endian
* branch declares the same fields in reverse order.
*/
struct bch_extent_crc32 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u32 type:2,
_compressed_size:7,
_uncompressed_size:7,
offset:7,
_unused:1,
csum_type:4,
compression_type:4;
__u32 csum;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u32 csum;
__u32 compression_type:4,
csum_type:4,
_unused:1,
offset:7,
_uncompressed_size:7,
_compressed_size:7,
type:2;
#endif
} __packed __aligned(8);
/* 7-bit size fields stored biased by 1 can represent 1..(1 << 7). */
#define CRC32_SIZE_MAX (1U << 7)
/* This variant has no nonce field. */
#define CRC32_NONCE_MAX 0
/*
* Mid-sized checksum/compression entry: one 64-bit word of bitfields
* (3 + 9 + 9 + 9 + 10 + 4 + 4 + 16 = 64) followed by a 64-bit csum_lo. The
* checksum is split: 16 bits in csum_hi inside the bitfield word, 64 bits in
* csum_lo. The big-endian branch declares the same fields in reverse order.
*/
struct bch_extent_crc64 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:3,
_compressed_size:9,
_uncompressed_size:9,
offset:9,
nonce:10,
csum_type:4,
compression_type:4,
csum_hi:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 csum_hi:16,
compression_type:4,
csum_type:4,
nonce:10,
offset:9,
_uncompressed_size:9,
_compressed_size:9,
type:3;
#endif
__u64 csum_lo;
} __packed __aligned(8);
/* Matches the 9-bit size fields above. */
#define CRC64_SIZE_MAX (1U << 9)
/* Largest value of the 10-bit nonce field. */
#define CRC64_NONCE_MAX ((1U << 10) - 1)
/*
 * Largest checksum/compression entry: one 64 bit word of bitfields followed
 * by a full struct bch_csum.
 *
 * NOTE(review): the _compressed_size/_uncompressed_size fields are presumably
 * biased by 1 as in bch_extent_crc32 - CRC128_SIZE_MAX of 1 << 13 for 13 bit
 * fields is consistent with that; confirm against the accessors.
 *
 * The big endian variant declares the same bitfields in reverse order.
 */
struct bch_extent_crc128 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:4,
_compressed_size:13,
_uncompressed_size:13,
offset:13,
nonce:13,
csum_type:4,
compression_type:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 compression_type:4,
csum_type:4,
nonce:13,
offset:13,
_uncompressed_size:13,
_compressed_size:13,
type:4;
#endif
struct bch_csum csum;
} __packed __aligned(8);
/* Size limit matching the 13 bit size fields */
#define CRC128_SIZE_MAX (1U << 13)
/* Largest value of the 13 bit nonce field */
#define CRC128_NONCE_MAX ((1U << 13) - 1)
/*
 * A single pointer entry, packed into one 64 bit word.
 *
 * @unwritten - pointer hasn't been written to, just reserved
 *
 * The big endian variant declares the same bitfields in reverse order.
 */
struct bch_extent_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:1,
cached:1,
unused:1,
unwritten:1,
offset:44, /* 8 petabytes */
dev:8,
gen:8;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 gen:8,
dev:8,
offset:44,
unwritten:1,
unused:1,
cached:1,
type:1;
#endif
} __packed __aligned(8);
/*
 * Stripe pointer entry, packed into one 64 bit word: a 5 bit type tag, an
 * 8 bit block, a 4 bit redundancy and a 47 bit idx.
 *
 * The big endian variant declares the same bitfields in reverse order.
 * Unlike the crc and ptr entries, this struct carries no
 * __packed/__aligned(8) attributes.
 */
struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:5,
block:8,
redundancy:4,
idx:47;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 idx:47,
redundancy:4,
block:8,
type:5;
#endif
};
/*
 * Rebalance entry, packed into one 64 bit word: a 6 bit type tag, 34 unused
 * bits, an 8 bit compression option and a 16 bit target.
 *
 * The big endian variant declares the same bitfields in reverse order.
 */
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:6,
unused:34,
compression:8, /* enum bch_compression_opt */
target:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 target:16,
compression:8,
unused:34,
type:6;
#endif
};
/*
 * Any one extent entry: a union with one member per entry type in
 * BCH_EXTENT_ENTRY_TYPES(), plus a `type` word overlaying the start of the
 * entry.
 *
 * On little endian, or on any 64 bit build, `type` is simply the first
 * unsigned long. On 32 bit big endian a pad word comes first, so `type`
 * overlays the second 32 bit word (the low-order half of a big endian 64 bit
 * value).
 */
union bch_extent_entry {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
unsigned long type;
#elif __BITS_PER_LONG == 32
struct {
unsigned long pad;
unsigned long type;
};
#else
#error edit for your odd byteorder.
#endif
/* Expands to: struct bch_extent_ptr ptr; struct bch_extent_crc32 crc32; ... */
#define x(f, n) struct bch_extent_##f f;
BCH_EXTENT_ENTRY_TYPES()
#undef x
};
/*
 * Btree pointer value: a bch_val header followed by a variable number of
 * bch_extent_ptrs. `_data` (zero length) and `start` (flexible array) both
 * name the same trailing storage, as raw u64s and as pointers respectively.
 */
struct bch_btree_ptr {
struct bch_val v;
__u64 _data[0];
struct bch_extent_ptr start[];
} __packed __aligned(8);
/*
 * Second version of the btree pointer value: fixed fields (mem_ptr, seq,
 * sectors_written, flags, min_key) followed by a variable number of
 * bch_extent_ptrs. `_data` and `start` name the same trailing storage.
 */
struct bch_btree_ptr_v2 {
struct bch_val v;
__u64 mem_ptr;
__le64 seq;
__le16 sectors_written;
__le16 flags;
struct bpos min_key;
__u64 _data[0];
struct bch_extent_ptr start[];
} __packed __aligned(8);
/*
 * NOTE(review): presumably declares accessors for bit 0 of `flags` (range
 * 0..1) - confirm against the LE16_BITMASK definition.
 */
LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
/*
 * Extent value: a bch_val header followed by a variable number of extent
 * entries. `_data` (zero length) and `start` (flexible array) both name the
 * same trailing storage, as raw u64s and as entries respectively.
 */
struct bch_extent {
struct bch_val v;
__u64 _data[0];
union bch_extent_entry start[];
} __packed __aligned(8);
/*
 * Maximum size (in u64s) a single pointer could be: the largest crc entry
 * (crc128) plus the pointer itself.
 */
#define BKEY_EXTENT_PTR_U64s_MAX\
((sizeof(struct bch_extent_crc128) + \
sizeof(struct bch_extent_ptr)) / sizeof(__u64))
/*
 * Maximum possible size of an entire extent value: one extra u64 plus room
 * for BCH_REPLICAS_MAX + 1 maximally sized pointers.
 */
#define BKEY_EXTENT_VAL_U64s_MAX \
(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
/* Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
/*
 * Btree pointers don't carry around checksums, so the value is just the
 * bch_btree_ptr_v2 header plus BCH_REPLICAS_MAX bare pointers:
 */
#define BKEY_BTREE_PTR_VAL_U64s_MAX \
((sizeof(struct bch_btree_ptr_v2) + \
sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
/* Maximum possible size of an entire btree pointer, key + value: */
#define BKEY_BTREE_PTR_U64s_MAX \
(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
/*
 * Reservation value: a generation and a replica count. pad[3] rounds the
 * three fields after the header up to 8 bytes.
 */
struct bch_reservation {
struct bch_val v;
__le32 generation;
__u8 nr_replicas;
__u8 pad[3];
} __packed __aligned(8);
/* Inline data value: a bch_val header followed directly by the data bytes. */
struct bch_inline_data {
struct bch_val v;
u8 data[];
};
#endif /* _BCACHEFS_EXTENTS_FORMAT_H */

View File

@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
}
#define eytzinger1_for_each(_i, _size) \
for ((_i) = eytzinger1_first((_size)); \
for (unsigned (_i) = eytzinger1_first((_size)); \
(_i) != 0; \
(_i) = eytzinger1_next((_i), (_size)))
@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
}
#define eytzinger0_for_each(_i, _size) \
for ((_i) = eytzinger0_first((_size)); \
for (unsigned (_i) = eytzinger0_first((_size)); \
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))

View File

@ -77,6 +77,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
/* bios must be 512 byte aligned: */
if ((offset|iter->count) & (SECTOR_SIZE - 1))
return -EINVAL;
ret = min_t(loff_t, iter->count,
max_t(loff_t, 0, i_size_read(&inode->v) - offset));

View File

@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
}
}
void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
u64 start, u64 end)
int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
u64 *start, u64 end,
bool nonblocking)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
pgoff_t index = start >> PAGE_SECTORS_SHIFT;
pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
struct folio_batch fbatch;
s64 i_sectors_delta = 0;
unsigned i, j;
int ret = 0;
if (end <= start)
return;
if (end <= *start)
return 0;
folio_batch_init(&fbatch);
while (filemap_get_folios(inode->v.i_mapping,
&index, end_index, &fbatch)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
if (!nonblocking)
folio_lock(folio);
else if (!folio_trylock(folio)) {
folio_batch_release(&fbatch);
ret = -EAGAIN;
break;
}
u64 folio_start = folio_sector(folio);
u64 folio_end = folio_end_sector(folio);
unsigned folio_offset = max(start, folio_start) - folio_start;
unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
struct bch_folio *s;
BUG_ON(end <= folio_start);
folio_lock(folio);
s = bch2_folio(folio);
*start = min(end, folio_end);
struct bch_folio *s = bch2_folio(folio);
if (s) {
unsigned folio_offset = max(*start, folio_start) - folio_start;
unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
spin_lock(&s->lock);
for (j = folio_offset; j < folio_offset + folio_len; j++) {
for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
i_sectors_delta -= s->s[j].state == SECTOR_dirty;
bch2_folio_sector_set(folio, s, j,
folio_sector_reserve(s->s[j].state));
@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
}
bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
return ret;
}
static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,

View File

@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
int bch2_get_folio_disk_reservation(struct bch_fs *,
struct bch_inode_info *,

View File

@ -675,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
drop_locks_do(trans,
(bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
if (bch2_mark_pagecache_reserved(inode, &hole_start,
iter.pos.offset, true))
drop_locks_do(trans,
bch2_mark_pagecache_reserved(inode, &hole_start,
iter.pos.offset, false));
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))

View File

@ -337,11 +337,12 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
create_flags |= BCH_CREATE_SNAPSHOT_RO;
/* why do we need this lock? */
down_read(&c->vfs_sb->s_umount);
if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
/* sync_inodes_sb enforce s_umount is locked */
down_read(&c->vfs_sb->s_umount);
sync_inodes_sb(c->vfs_sb);
up_read(&c->vfs_sb->s_umount);
}
retry:
if (arg.src_ptr) {
error = user_path_at(arg.dirfd,
@ -425,8 +426,6 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
goto retry;
}
err1:
up_read(&c->vfs_sb->s_umount);
return error;
}

View File

@ -506,22 +506,33 @@ int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
prt_printf(out, "mode=%o ", inode->bi_mode);
printbuf_indent_add(out, 2);
prt_printf(out, "mode=%o", inode->bi_mode);
prt_newline(out);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
prt_printf(out, " (%x)", inode->bi_flags);
prt_newline(out);
prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
inode->bi_journal_seq,
inode->bi_size,
inode->bi_sectors,
inode->bi_version);
prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq);
prt_newline(out);
prt_printf(out, "bi_size=%llu", inode->bi_size);
prt_newline(out);
prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
prt_newline(out);
prt_newline(out);
prt_printf(out, "bi_version=%llu", inode->bi_version);
#define x(_name, _bits) \
prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
prt_printf(out, #_name "=%llu", (u64) inode->_name); \
prt_newline(out);
BCH_INODE_FIELDS_v3()
#undef x
printbuf_indent_sub(out, 2);
}
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
@ -587,7 +598,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
}
}
if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) {
if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
BUG_ON(!trans->journal_res.seq);
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
@ -597,7 +608,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
struct bch_fs *c = trans->c;
percpu_down_read(&c->mark_lock);
this_cpu_add(c->usage_gc->nr_inodes, nr);
this_cpu_add(c->usage_gc->b.nr_inodes, nr);
percpu_up_read(&c->mark_lock);
}

166
fs/bcachefs/inode_format.h Normal file
View File

@ -0,0 +1,166 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_INODE_FORMAT_H
#define _BCACHEFS_INODE_FORMAT_H
/*
 * NOTE(review): presumably inode numbers below BLOCKDEV_INODE_MAX are set
 * aside for block devices, which is why the root inode number has the same
 * value - confirm against the users of these constants.
 */
#define BLOCKDEV_INODE_MAX 4096
#define BCACHEFS_ROOT_INO 4096
/*
 * Original inode value: fixed hash seed, 32 bit flags and mode, followed by
 * variable length packed fields. The upper bits of bi_flags are carved up by
 * the INODE_* bitmasks declared at the end of this header.
 */
struct bch_inode {
struct bch_val v;
__le64 bi_hash_seed;
__le32 bi_flags;
__le16 bi_mode;
__u8 fields[];
} __packed __aligned(8);
/*
 * Second inode value format: adds bi_journal_seq and widens bi_flags to
 * 64 bits compared to struct bch_inode. The upper bits of bi_flags are carved
 * up by the INODEv2_* bitmasks declared at the end of this header.
 */
struct bch_inode_v2 {
struct bch_val v;
__le64 bi_journal_seq;
__le64 bi_hash_seed;
__le64 bi_flags;
__le16 bi_mode;
__u8 fields[];
} __packed __aligned(8);
/*
 * Third inode value format: bi_sectors, bi_size and bi_version are fixed
 * 64 bit members here rather than packed fields, and there is no bi_mode
 * member (an INODEv3_MODE bitmask over bi_flags is declared at the end of
 * this header).
 */
struct bch_inode_v3 {
struct bch_val v;
__le64 bi_journal_seq;
__le64 bi_hash_seed;
__le64 bi_flags;
__le64 bi_sectors;
__le64 bi_size;
__le64 bi_version;
__u8 fields[];
} __packed __aligned(8);
/*
 * Offset of `fields`, in u64s. _INITIAL is the hard-coded value 6 (the six
 * __le64 members above); _CUR is computed from the struct as it is currently
 * declared.
 */
#define INODEv3_FIELDS_START_INITIAL 6
#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
/* Inode generation value: a 32 bit generation number padded to 8 bytes. */
struct bch_inode_generation {
struct bch_val v;
__le32 bi_generation;
__le32 pad;
} __packed __aligned(8);
/*
 * X-macro listing the variable length inode fields for the v2 format as
 * x(name, bits).
 *
 * bi_subvol and bi_parent_subvol are only set for subvolume roots:
 */
#define BCH_INODE_FIELDS_v2() \
x(bi_atime, 96) \
x(bi_ctime, 96) \
x(bi_mtime, 96) \
x(bi_otime, 96) \
x(bi_size, 64) \
x(bi_sectors, 64) \
x(bi_uid, 32) \
x(bi_gid, 32) \
x(bi_nlink, 32) \
x(bi_generation, 32) \
x(bi_dev, 32) \
x(bi_data_checksum, 8) \
x(bi_compression, 8) \
x(bi_project, 32) \
x(bi_background_compression, 8) \
x(bi_data_replicas, 8) \
x(bi_promote_target, 16) \
x(bi_foreground_target, 16) \
x(bi_background_target, 16) \
x(bi_erasure_code, 16) \
x(bi_fields_set, 16) \
x(bi_dir, 64) \
x(bi_dir_offset, 64) \
x(bi_subvol, 32) \
x(bi_parent_subvol, 32)
/*
 * X-macro listing the variable length inode fields for the v3 format as
 * x(name, bits).
 *
 * Compared to BCH_INODE_FIELDS_v2(), bi_size and bi_sectors are absent (they
 * are fixed members of struct bch_inode_v3) and bi_nocow is added at the end.
 */
#define BCH_INODE_FIELDS_v3() \
x(bi_atime, 96) \
x(bi_ctime, 96) \
x(bi_mtime, 96) \
x(bi_otime, 96) \
x(bi_uid, 32) \
x(bi_gid, 32) \
x(bi_nlink, 32) \
x(bi_generation, 32) \
x(bi_dev, 32) \
x(bi_data_checksum, 8) \
x(bi_compression, 8) \
x(bi_project, 32) \
x(bi_background_compression, 8) \
x(bi_data_replicas, 8) \
x(bi_promote_target, 16) \
x(bi_foreground_target, 16) \
x(bi_background_target, 16) \
x(bi_erasure_code, 16) \
x(bi_fields_set, 16) \
x(bi_dir, 64) \
x(bi_dir_offset, 64) \
x(bi_subvol, 32) \
x(bi_parent_subvol, 32) \
x(bi_nocow, 8)
/*
 * Per-inode options as x(name, bits): a subset of BCH_INODE_FIELDS_v3(), with
 * each name spelled without the bi_ prefix and the same bit width.
 */
#define BCH_INODE_OPTS() \
x(data_checksum, 8) \
x(compression, 8) \
x(project, 32) \
x(background_compression, 8) \
x(data_replicas, 8) \
x(promote_target, 16) \
x(foreground_target, 16) \
x(background_target, 16) \
x(erasure_code, 16) \
x(nocow, 8)
/*
 * Inode_opt_<name> for every entry of BCH_INODE_OPTS(), numbered from 0 in
 * list order; Inode_opt_nr is the number of options.
 */
enum inode_opt_id {
#define x(name, ...) \
Inode_opt_##name,
BCH_INODE_OPTS()
#undef x
Inode_opt_nr,
};
/* X-macro listing the inode flags as x(name, bit number). */
#define BCH_INODE_FLAGS() \
x(sync, 0) \
x(immutable, 1) \
x(append, 2) \
x(nodump, 3) \
x(noatime, 4) \
x(i_size_dirty, 5) \
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
x(backptr_untrusted, 8)
/* bits 20+ reserved for packed fields below: */
/* BCH_INODE_<name> is the flag as a mask: 1U << bit number */
enum bch_inode_flags {
#define x(t, n) BCH_INODE_##t = 1U << n,
BCH_INODE_FLAGS()
#undef x
};
/* __BCH_INODE_<name> is the flag's bit number */
enum __bch_inode_flags {
#define x(t, n) __BCH_INODE_##t = n,
BCH_INODE_FLAGS()
#undef x
};
/*
 * Sub-fields packed into bi_flags at bit 20 and above, one group per inode
 * format.
 *
 * NOTE(review): the last two arguments look like a [start, end) bit range
 * within the named member - confirm against the LE32_BITMASK/LE64_BITMASK
 * definitions.
 */
/* struct bch_inode: 32 bit bi_flags */
LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
/* struct bch_inode_v2: 64 bit bi_flags, same STR_HASH/NR_FIELDS ranges */
LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
/* struct bch_inode_v3: additionally packs FIELDS_START and MODE */
LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
LE64_BITMASK(INODEv3_FIELDS_START,
struct bch_inode_v3, bi_flags, 31, 36);
LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
#endif /* _BCACHEFS_INODE_FORMAT_H */

View File

@ -442,9 +442,7 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
ret = bch2_bkey_set_needs_rebalance(c, copy,
opts.background_target,
opts.background_compression) ?:
ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:

View File

@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
ret = bch2_bkey_set_needs_rebalance(c, sk.k,
op->opts.background_target,
op->opts.background_compression) ?:
ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
@ -1447,10 +1445,11 @@ static void __bch2_write(struct bch_write_op *op)
op->flags |= BCH_WRITE_DONE;
if (ret < 0) {
bch_err_inum_offset_ratelimited(c,
op->pos.inode,
op->pos.offset << 9,
"%s(): error: %s", __func__, bch2_err_str(ret));
if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
bch_err_inum_offset_ratelimited(c,
op->pos.inode,
op->pos.offset << 9,
"%s(): error: %s", __func__, bch2_err_str(ret));
op->error = ret;
break;
}

View File

@ -27,6 +27,47 @@ static const char * const bch2_journal_errors[] = {
NULL
};
/*
 * Print a summary of one journal buffer to @out.
 *
 * @out: printbuf to append to; columns are separated with prt_tab()
 * @j:   journal that owns the buffer
 * @seq: journal sequence number; the buffer slot is seq & JOURNAL_BUF_MASK
 *
 * Emits the sequence number, then indented refcount / size / expires lines,
 * each as "label:<tab>value".
 */
static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
{
	/* Read the reservation state once; the refcount below comes from this snapshot */
	union journal_res_state s = READ_ONCE(j->reservations);
	unsigned i = seq & JOURNAL_BUF_MASK;
	struct journal_buf *buf = j->buf + i;

	prt_printf(out, "seq:");
	prt_tab(out);
	prt_printf(out, "%llu", seq);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_printf(out, "refcount:");
	prt_tab(out);
	prt_printf(out, "%u", journal_state_count(s, i));
	prt_newline(out);

	prt_printf(out, "size:");
	prt_tab(out);
	prt_human_readable_u64(out, vstruct_bytes(buf->data));
	prt_newline(out);

	/* Label ends in ':' like every other field printed by this function */
	prt_printf(out, "expires:");
	prt_tab(out);
	/* Printed as a signed difference relative to the current jiffies */
	prt_printf(out, "%li jiffies", buf->expires - jiffies);
	prt_newline(out);

	printbuf_indent_sub(out, 2);
}
/*
 * Print every journal buffer from the last unwritten sequence number up to
 * and including the current one, oldest first.
 *
 * If @out has no tabstops yet, one is pushed at column 24 so the per-buffer
 * "label:<tab>value" lines have somewhere to align to.
 */
static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
{
	if (out->nr_tabstops == 0)
		printbuf_tabstop_push(out, 24);

	u64 cur = journal_last_unwritten_seq(j);

	/* The upper bound is re-read on every iteration */
	while (cur <= journal_cur_seq(j)) {
		bch2_journal_buf_to_text(out, j, cur);
		cur++;
	}
}
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
@ -156,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
* We don't close a journal_buf until the next journal_buf is finished writing,
* and can be opened again - this also initializes the next journal_buf:
*/
static void __journal_entry_close(struct journal *j, unsigned closed_val)
static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
@ -185,7 +226,17 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
trace_journal_entry_close(c, vstruct_bytes(buf->data));
if (trace_journal_entry_close_enabled() && trace) {
struct printbuf pbuf = PRINTBUF;
pbuf.atomic++;
prt_str(&pbuf, "entry size: ");
prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
prt_newline(&pbuf);
bch2_prt_task_backtrace(&pbuf, current, 1);
trace_journal_entry_close(c, pbuf.buf);
printbuf_exit(&pbuf);
}
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
@ -225,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
void bch2_journal_halt(struct journal *j)
{
spin_lock(&j->lock);
__journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
__journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
if (!j->err_seq)
j->err_seq = journal_cur_seq(j);
journal_wake(j);
@ -239,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j)
/* Don't close it yet if we already have a write in flight: */
if (ret)
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
else if (nr_unwritten_journal_entries(j)) {
struct journal_buf *buf = journal_cur_buf(j);
@ -406,7 +457,7 @@ static void journal_write_work(struct work_struct *work)
if (delta > 0)
mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
else
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
unlock:
spin_unlock(&j->lock);
}
@ -463,13 +514,21 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
ret = journal_entry_open(j);
if (ret == JOURNAL_ERR_max_in_flight) {
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
&j->max_in_flight_start, true);
trace_and_count(c, journal_entry_full, c);
if (trace_journal_entry_full_enabled()) {
struct printbuf buf = PRINTBUF;
buf.atomic++;
bch2_journal_bufs_to_text(&buf, j);
trace_journal_entry_full(c, buf.buf);
printbuf_exit(&buf);
}
count_event(c, journal_entry_full);
}
unlock:
can_discard = j->can_discard;
@ -549,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
/*
* Not enough room in current journal entry, have to flush it:
*/
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
} else {
journal_cur_buf(j)->u64s_reserved += d;
}
@ -606,7 +665,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
struct journal_res res = { 0 };
if (journal_entry_is_open(j))
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
spin_unlock(&j->lock);
@ -786,7 +845,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
if (buf->need_flush_to_write_buffer) {
if (seq == journal_cur_seq(j))
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
union journal_res_state s;
s.v = atomic64_read_acquire(&j->reservations.counter);
@ -1339,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
}
prt_newline(out);
for (u64 seq = journal_cur_seq(j);
seq >= journal_last_unwritten_seq(j);
--seq) {
unsigned i = seq & JOURNAL_BUF_MASK;
prt_printf(out, "unwritten entry:");
prt_tab(out);
prt_printf(out, "%llu", seq);
prt_newline(out);
printbuf_indent_add(out, 2);
prt_printf(out, "refcount:");
prt_tab(out);
prt_printf(out, "%u", journal_state_count(s, i));
prt_newline(out);
prt_printf(out, "sectors:");
prt_tab(out);
prt_printf(out, "%u", j->buf[i].sectors);
prt_newline(out);
prt_printf(out, "expires");
prt_tab(out);
prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
prt_newline(out);
printbuf_indent_sub(out, 2);
}
prt_printf(out, "unwritten entries:");
prt_newline(out);
bch2_journal_bufs_to_text(out, j);
prt_printf(out,
"replay done:\t\t%i\n",

View File

@ -683,10 +683,7 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
for (i = 0; i < nr_types; i++) {
if (i < BCH_DATA_NR)
prt_printf(out, " %s", bch2_data_types[i]);
else
prt_printf(out, " (unknown data type %u)", i);
bch2_prt_data_type(out, i);
prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),

View File

@ -0,0 +1,30 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
#define _BCACHEFS_LOGGED_OPS_FORMAT_H
/*
 * Logged truncate operation: identifies an inode by (subvol, inum) and records
 * the new i_size. `pad` fills the 32 bit gap before the 64 bit inum.
 */
struct bch_logged_op_truncate {
struct bch_val v;
__le32 subvol;
__le32 pad;
__le64 inum;
__le64 new_i_size;
};
/*
 * States of a logged finsert operation, numbered 0..2 in declaration order.
 * NOTE(review): presumably the value stored in bch_logged_op_finsert.state -
 * confirm against the code that reads and writes it.
 */
enum logged_op_finsert_state {
LOGGED_OP_FINSERT_start,
LOGGED_OP_FINSERT_shift_extents,
LOGGED_OP_FINSERT_finish,
};
/*
 * Logged finsert operation: a one byte state, the inode as (subvol, inum),
 * destination and source offsets, and a position. pad[3] fills the gap
 * between the one byte state and the 32 bit subvol.
 */
struct bch_logged_op_finsert {
struct bch_val v;
__u8 state;
__u8 pad[3];
__le32 subvol;
__le64 inum;
__le64 dst_offset;
__le64 src_offset;
__le64 pos;
};
#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */

View File

@ -6,9 +6,11 @@
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
@ -34,12 +36,46 @@ const char * const bch2_data_ops_strs[] = {
NULL
};
static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
/*
 * Print the options of a data update to @out, one "label:<tab>value" line
 * per option.
 *
 * @out:       printbuf to append to; a tabstop at column 20 is pushed
 *             unconditionally
 * @c:         filesystem, used to print the target
 * @io_opts:   io options; only the effective background compression is read
 * @data_opts: rewrite_ptrs and kill_ptrs (printed in base 2), target and
 *             extra_replicas
 *
 * No newline is emitted after the final "extra replicas" line.
 */
static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
printbuf_tabstop_push(out, 20);
prt_str(out, "rewrite ptrs:");
prt_tab(out);
bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
prt_newline(out);
prt_str(out, "kill ptrs: ");
prt_tab(out);
bch2_prt_u64_base2(out, data_opts->kill_ptrs);
prt_newline(out);
prt_str(out, "target: ");
prt_tab(out);
bch2_target_to_text(out, c, data_opts->target);
prt_newline(out);
prt_str(out, "compression: ");
prt_tab(out);
/* background_compression() falls back to the foreground setting when unset */
bch2_compression_opt_to_text(out, background_compression(*io_opts));
prt_newline(out);
prt_str(out, "extra replicas: ");
prt_tab(out);
prt_u64(out, data_opts->extra_replicas);
}
static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
if (trace_move_extent_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
trace_move_extent(c, buf.buf);
printbuf_exit(&buf);
}
@ -111,6 +147,15 @@ static void move_write(struct moving_io *io)
return;
}
if (trace_move_extent_write_enabled()) {
struct bch_fs *c = io->write.op.c;
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
trace_move_extent_write(c, buf.buf);
printbuf_exit(&buf);
}
closure_get(&io->write.ctxt->cl);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
atomic_inc(&io->write.ctxt->write_ios);
@ -241,9 +286,10 @@ int bch2_move_extent(struct moving_context *ctxt,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
trace_move_extent2(c, k, &io_opts, &data_opts);
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
@ -759,6 +805,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
if (!b)
goto next;
unsigned sectors = btree_ptr_sectors_written(&b->key);
ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
bch2_trans_iter_exit(trans, &iter);
@ -768,11 +816,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
goto err;
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate,
c->opts.btree_node_size >> 9);
bch2_ratelimit_increment(ctxt->rate, sectors);
if (ctxt->stats) {
atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
atomic64_add(sectors, &ctxt->stats->sectors_seen);
atomic64_add(sectors, &ctxt->stats->sectors_moved);
}
}
next:
@ -1083,9 +1130,9 @@ int bch2_data_job(struct bch_fs *c,
void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
prt_printf(out, "%s: data type=%s pos=",
stats->name,
bch2_data_types[stats->data_type]);
prt_printf(out, "%s: data type==", stats->name);
bch2_prt_data_type(out, stats->data_type);
prt_str(out, " pos=");
bch2_bbpos_to_text(out, stats->pos);
prt_newline(out);
printbuf_indent_add(out, 2);

View File

@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = {
NULL
};
const char * const bch2_compression_types[] = {
const char * const __bch2_compression_types[] = {
BCH_COMPRESSION_TYPES()
NULL
};
@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = {
NULL
};
const char * const bch2_data_types[] = {
const char * const __bch2_data_types[] = {
BCH_DATA_TYPES()
NULL
};

View File

@ -18,11 +18,11 @@ extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
extern const char * const bch2_compression_types[];
extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const bch2_data_types[];
extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
extern const char * const bch2_jset_entry_types[];
extern const char * const bch2_fs_usage_types[];
@ -564,6 +564,11 @@ struct bch_io_opts {
#undef x
};
/*
 * Effective background compression option: the background_compression
 * setting when it is nonzero, otherwise the regular compression setting.
 */
static inline unsigned background_compression(struct bch_io_opts opts)
{
	if (opts.background_compression)
		return opts.background_compression;

	return opts.compression;
}
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);

View File

@ -0,0 +1,47 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_QUOTA_FORMAT_H
#define _BCACHEFS_QUOTA_FORMAT_H
/* KEY_TYPE_quota: */
/* Quota types; QTYP_NR is the number of types. */
enum quota_types {
QTYP_USR = 0,
QTYP_GRP = 1,
QTYP_PRJ = 2,
QTYP_NR = 3,
};
/*
 * Quota counters; Q_COUNTERS is the number of counters and sizes the c[]
 * arrays in struct bch_quota and struct bch_sb_quota_type.
 */
enum quota_counters {
Q_SPC = 0,
Q_INO = 1,
Q_COUNTERS = 2,
};
/* Hard and soft limit for a single quota counter. */
struct bch_quota_counter {
__le64 hardlimit;
__le64 softlimit;
};
/* Quota key value: one bch_quota_counter per enum quota_counters entry. */
struct bch_quota {
struct bch_val v;
struct bch_quota_counter c[Q_COUNTERS];
} __packed __aligned(8);
/* BCH_SB_FIELD_quota: */
/* Time and warning limit for a single quota counter. */
struct bch_sb_quota_counter {
__le32 timelimit;
__le32 warnlimit;
};
/* Settings for one quota type: flags plus one entry per quota counter. */
struct bch_sb_quota_type {
__le64 flags;
struct bch_sb_quota_counter c[Q_COUNTERS];
};
/* Superblock quota field: one bch_sb_quota_type per enum quota_types entry. */
struct bch_sb_field_quota {
struct bch_sb_field field;
struct bch_sb_quota_type q[QTYP_NR];
} __packed __aligned(8);
#endif /* _BCACHEFS_QUOTA_FORMAT_H */

View File

@ -177,8 +177,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
prt_str(&buf, "target=");
bch2_target_to_text(&buf, c, r->target);
prt_str(&buf, " compression=");
struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
prt_str(&buf, bch2_compression_opts[opt.type]);
bch2_compression_opt_to_text(&buf, r->compression);
prt_str(&buf, " ");
bch2_bkey_val_to_text(&buf, c, k);
@ -254,13 +253,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
if (k.k->p.inode) {
target = io_opts->background_target;
compression = io_opts->background_compression ?: io_opts->compression;
compression = background_compression(*io_opts);
} else {
const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
target = r ? r->target : io_opts->background_target;
compression = r ? r->compression :
(io_opts->background_compression ?: io_opts->compression);
compression = r ? r->compression : background_compression(*io_opts);
}
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
@ -371,6 +369,7 @@ static int do_rebalance(struct moving_context *ctxt)
!kthread_should_stop() &&
!atomic64_read(&r->work_stats.sectors_seen) &&
!atomic64_read(&r->scan_stats.sectors_seen)) {
bch2_moving_ctxt_flush_all(ctxt);
bch2_trans_unlock_long(trans);
rebalance_wait(c);
}
@ -385,7 +384,6 @@ static int bch2_rebalance_thread(void *arg)
struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct moving_context ctxt;
int ret;
set_freezable();
@ -393,8 +391,7 @@ static int bch2_rebalance_thread(void *arg)
writepoint_ptr(&c->rebalance_write_point),
true);
while (!kthread_should_stop() &&
!(ret = do_rebalance(&ctxt)))
while (!kthread_should_stop() && !do_rebalance(&ctxt))
;
bch2_moving_ctxt_exit(&ctxt);

View File

@ -280,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_inodes:
c->usage_base->nr_inodes = le64_to_cpu(u->v);
c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_key_version:
atomic64_set(&c->key_version,

View File

@ -292,10 +292,10 @@ static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *f
}
}
int bch2_trans_mark_reflink_v(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
unsigned flags)
int bch2_trigger_reflink_v(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
unsigned flags)
{
if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
(flags & BTREE_TRIGGER_INSERT))
@ -324,7 +324,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
min(datalen, 32U), d.v->data);
}
int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
unsigned flags)
@ -486,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
if (dst_inum.inum < src_inum.inum) {
/* Avoid some lock cycle transaction restarts */
ret = bch2_btree_iter_traverse(&dst_iter);
if (ret)
continue;
}
dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
bch2_btree_iter_set_pos(&src_iter, src_want);
@ -538,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
opts.background_target,
opts.background_compression) ?:
ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?:
bch2_extent_update(trans, dst_inum, &dst_iter,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,

View File

@ -24,14 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
.trigger = bch2_trans_mark_reflink_v, \
.trigger = bch2_trigger_reflink_v, \
.min_val_size = 8, \
})
@ -39,7 +39,7 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
int bch2_trigger_indirect_inline_data(struct btree_trans *,
enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
unsigned);
@ -47,7 +47,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
.key_invalid = bch2_indirect_inline_data_invalid, \
.val_to_text = bch2_indirect_inline_data_to_text, \
.trigger = bch2_trans_mark_indirect_inline_data, \
.trigger = bch2_trigger_indirect_inline_data, \
.min_val_size = 8, \
})

View File

@ -0,0 +1,33 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REFLINK_FORMAT_H
#define _BCACHEFS_REFLINK_FORMAT_H
/*
 * Value of a reflink pointer key, as stored in this format header.
 *
 * All integer fields are fixed-width little-endian types and the struct is
 * packed with 8-byte alignment, so field order and sizes must not change.
 */
struct bch_reflink_p {
struct bch_val v;
/* NOTE(review): presumably the position of the referenced indirect extent - confirm */
__le64 idx;
/*
* A reflink pointer might point to an indirect extent which is then
* later split (by copygc or rebalance). If we only pointed to part of
* the original indirect extent, and then one of the fragments is
* outside the range we point to, we'd leak a refcount: so when creating
* reflink pointers, we need to store pad values to remember the full
* range we were taking a reference on.
*/
__le32 front_pad;
__le32 back_pad;
} __packed __aligned(8);
/*
 * Value of a reflink_v key: a little-endian reference count followed by
 * variable-length trailing data.
 *
 * The struct is packed with 8-byte alignment; do not reorder fields.
 */
struct bch_reflink_v {
struct bch_val v;
/* NOTE(review): presumably counts the reflink pointers referencing this extent - confirm */
__le64 refcount;
/*
 * Zero-length array: occupies no space, it only gives the start of the
 * trailing data a name typed as extent entries.
 */
union bch_extent_entry start[0];
/*
 * Flexible array member covering the trailing data as raw 64-bit words;
 * because start[] has zero size, both members begin at the same offset.
 */
__u64 _data[];
} __packed __aligned(8);
/*
 * Value of an indirect_inline_data key: a little-endian reference count
 * followed by a variable number of raw data bytes.
 *
 * NOTE(review): unlike bch_reflink_p and bch_reflink_v in this header, this
 * struct carries no __packed/__aligned attribute - presumably intentional,
 * confirm before changing.
 */
struct bch_indirect_inline_data {
struct bch_val v;
__le64 refcount;
/* flexible array member: the inline data bytes; length is not stored in this struct */
u8 data[];
};
#endif /* _BCACHEFS_REFLINK_FORMAT_H */

View File

@ -9,6 +9,12 @@
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/*
 * Plain-function wrapper around memcmp(): some (buggy!) compilers refuse to
 * take memcmp's address, so comparison callbacks are given this instead.
 *
 * Returns whatever memcmp() returns for the first @size bytes of @l and @r.
 */
static int bch2_memcmp(const void *l, const void *r, size_t size)
{
	int cmp = memcmp(l, r, size);

	return cmp;
}
/* Replicas tracking - in memory: */
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
@ -33,21 +39,16 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
struct bch_replicas_entry_v0 *e)
{
unsigned i;
if (e->data_type < BCH_DATA_NR)
prt_printf(out, "%s", bch2_data_types[e->data_type]);
else
prt_printf(out, "(invalid data type %u)", e->data_type);
bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u [", e->nr_devs);
for (i = 0; i < e->nr_devs; i++)
for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
@ -55,15 +56,10 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
void bch2_replicas_entry_to_text(struct printbuf *out,
struct bch_replicas_entry_v1 *e)
{
unsigned i;
if (e->data_type < BCH_DATA_NR)
prt_printf(out, "%s", bch2_data_types[e->data_type]);
else
prt_printf(out, "(invalid data type %u)", e->data_type);
bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
for (i = 0; i < e->nr_devs; i++)
for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
@ -831,7 +827,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
bch2_memcmp, NULL);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =

View File

@ -207,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = BCH_FS_USAGE_inodes;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
u->v = cpu_to_le64(c->usage_base->b.nr_inodes);
}
{

View File

@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "super-io.h"
#include "counters.h"
#include "sb-counters.h"
/* BCH_SB_FIELD_counters */

View File

@ -1,11 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_COUNTERS_H
#define _BCACHEFS_COUNTERS_H
#ifndef _BCACHEFS_SB_COUNTERS_H
#define _BCACHEFS_SB_COUNTERS_H
#include "bcachefs.h"
#include "super-io.h"
int bch2_sb_counters_to_cpu(struct bch_fs *);
int bch2_sb_counters_from_cpu(struct bch_fs *);
@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
#endif // _BCACHEFS_COUNTERS_H
#endif // _BCACHEFS_SB_COUNTERS_H

View File

@ -0,0 +1,98 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
#define _BCACHEFS_SB_COUNTERS_FORMAT_H
/*
 * X-macro table of the persistent counters: every x(name, id) entry pairs a
 * counter name with a number. Users define x() before expanding this macro
 * and #undef it afterwards.
 *
 * NOTE(review): read_split (33) and read_retry (32) are listed out of numeric
 * order. Any expansion that numbers entries by listing position and ignores
 * the second argument will therefore number these two differently from the
 * ids given here. Presumably the ids are stable and must not be renumbered
 * or reordered - confirm before touching.
 */
#define BCH_PERSISTENT_COUNTERS() \
x(io_read, 0) \
x(io_write, 1) \
x(io_move, 2) \
x(bucket_invalidate, 3) \
x(bucket_discard, 4) \
x(bucket_alloc, 5) \
x(bucket_alloc_fail, 6) \
x(btree_cache_scan, 7) \
x(btree_cache_reap, 8) \
x(btree_cache_cannibalize, 9) \
x(btree_cache_cannibalize_lock, 10) \
x(btree_cache_cannibalize_lock_fail, 11) \
x(btree_cache_cannibalize_unlock, 12) \
x(btree_node_write, 13) \
x(btree_node_read, 14) \
x(btree_node_compact, 15) \
x(btree_node_merge, 16) \
x(btree_node_split, 17) \
x(btree_node_rewrite, 18) \
x(btree_node_alloc, 19) \
x(btree_node_free, 20) \
x(btree_node_set_root, 21) \
x(btree_path_relock_fail, 22) \
x(btree_path_upgrade_fail, 23) \
x(btree_reserve_get_fail, 24) \
x(journal_entry_full, 25) \
x(journal_full, 26) \
x(journal_reclaim_finish, 27) \
x(journal_reclaim_start, 28) \
x(journal_write, 29) \
x(read_promote, 30) \
x(read_bounce, 31) \
x(read_split, 33) \
x(read_retry, 32) \
x(read_reuse_race, 34) \
x(move_extent_read, 35) \
x(move_extent_write, 36) \
x(move_extent_finish, 37) \
x(move_extent_fail, 38) \
x(move_extent_start_fail, 39) \
x(copygc, 40) \
x(copygc_wait, 41) \
x(gc_gens_end, 42) \
x(gc_gens_start, 43) \
x(trans_blocked_journal_reclaim, 44) \
x(trans_restart_btree_node_reused, 45) \
x(trans_restart_btree_node_split, 46) \
x(trans_restart_fault_inject, 47) \
x(trans_restart_iter_upgrade, 48) \
x(trans_restart_journal_preres_get, 49) \
x(trans_restart_journal_reclaim, 50) \
x(trans_restart_journal_res_get, 51) \
x(trans_restart_key_cache_key_realloced, 52) \
x(trans_restart_key_cache_raced, 53) \
x(trans_restart_mark_replicas, 54) \
x(trans_restart_mem_realloced, 55) \
x(trans_restart_memory_allocation_failure, 56) \
x(trans_restart_relock, 57) \
x(trans_restart_relock_after_fill, 58) \
x(trans_restart_relock_key_cache_fill, 59) \
x(trans_restart_relock_next_node, 60) \
x(trans_restart_relock_parent_for_fill, 61) \
x(trans_restart_relock_path, 62) \
x(trans_restart_relock_path_intent, 63) \
x(trans_restart_too_many_iters, 64) \
x(trans_restart_traverse, 65) \
x(trans_restart_upgrade, 66) \
x(trans_restart_would_deadlock, 67) \
x(trans_restart_would_deadlock_write, 68) \
x(trans_restart_injected, 69) \
x(trans_restart_key_cache_upgrade, 70) \
x(trans_traverse_all, 71) \
x(transaction_commit, 72) \
x(write_super, 73) \
x(trans_restart_would_deadlock_recursion_limit, 74) \
x(trans_restart_write_buffer_flush, 75) \
x(trans_restart_split_race, 76) \
x(write_buffer_flush_slowpath, 77) \
x(write_buffer_flush_sync, 78)
/*
 * One BCH_COUNTER_<name> enumerator per entry of BCH_PERSISTENT_COUNTERS(),
 * generated in listing order. The numeric second argument of x() is discarded
 * here, so each enumerator's value is its position in the table, not the
 * number written next to it.
 */
enum bch_persistent_counters {
/* keep only the name; the id and any further arguments are ignored */
#define x(t, n, ...) BCH_COUNTER_##t,
BCH_PERSISTENT_COUNTERS()
#undef x
/* number of entries in the table (one past the last generated enumerator) */
BCH_COUNTER_NR
};
/*
 * Superblock field for the counters: a generic field header followed by a
 * variable number of little-endian 64-bit values.
 */
struct bch_sb_field_counters {
struct bch_sb_field field;
/* NOTE(review): presumably one slot per persistent counter - confirm how it is indexed */
__le64 d[];
};
#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */

View File

@ -251,7 +251,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Data allowed:");
prt_tab(out);
if (BCH_MEMBER_DATA_ALLOWED(&m))
prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
else
prt_printf(out, "(none)");
prt_newline(out);
@ -259,7 +259,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Has data:");
prt_tab(out);
if (data_have)
prt_bitflags(out, bch2_data_types, data_have);
prt_bitflags(out, __bch2_data_types, data_have);
else
prt_printf(out, "(none)");
prt_newline(out);

View File

@ -1053,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
n->v.tree = cpu_to_le32(tree);
n->v.depth = cpu_to_le32(depth);
n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
n->v.btime.hi = 0;
for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
@ -1681,5 +1683,5 @@ int bch2_snapshots_read(struct bch_fs *c)
void bch2_fs_snapshots_exit(struct bch_fs *c)
{
kfree(rcu_dereference_protected(c->snapshots, true));
kvfree(rcu_dereference_protected(c->snapshots, true));
}

View File

@ -0,0 +1,36 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
#define _BCACHEFS_SNAPSHOT_FORMAT_H
/*
 * Value of a snapshot node key. All integer fields are fixed-width
 * little-endian types; named bits of 'flags' are declared with
 * LE32_BITMASK() right after the struct.
 */
struct bch_snapshot {
struct bch_val v;
__le32 flags;
/* NOTE(review): presumably the ID of the parent snapshot node - confirm */
__le32 parent;
/* NOTE(review): presumably the IDs of up to two child snapshot nodes - confirm */
__le32 children[2];
__le32 subvol;
/* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
__le32 tree;
__le32 depth;
/* NOTE(review): presumably skiplist entries pointing at ancestor nodes - confirm */
__le32 skip[3];
/* NOTE(review): presumably the creation time of this snapshot node - confirm */
bch_le128 btime;
};
/* bit 0 of flags; NOTE(review): presumably marks the node as deleted - confirm */
LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
/* True if a subvolume points to this snapshot node: */
LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
/*
* Snapshot trees:
*
* The snapshot_trees btree gives us a persistent identifier for each tree of
* bch_snapshot nodes, and allows us to record and easily find the root/master
* subvolume that other snapshots were created from:
*/
/* Value of a snapshot_trees key: identifies one tree of bch_snapshot nodes. */
struct bch_snapshot_tree {
struct bch_val v;
/* NOTE(review): presumably the root/master subvolume the tree's snapshots were created from - confirm */
__le32 master_subvol;
/* NOTE(review): presumably the ID of the tree's root bch_snapshot node - confirm */
__le32 root_snapshot;
};
#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */

View File

@ -0,0 +1,35 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
#define _BCACHEFS_SUBVOLUME_FORMAT_H
/*
 * Lowest and highest key positions used for subvolumes: first POS() argument
 * 0, second argument ranging from 1 to S32_MAX.
 */
#define SUBVOL_POS_MIN POS(0, 1)
#define SUBVOL_POS_MAX POS(0, S32_MAX)
/* NOTE(review): presumably the subvolume ID of the filesystem's root subvolume - confirm */
#define BCACHEFS_ROOT_SUBVOL 1
/*
 * Value of a subvolume key. Integer fields are fixed-width little-endian
 * types; named bits of 'flags' are declared with LE32_BITMASK() right after
 * the struct.
 */
struct bch_subvolume {
struct bch_val v;
__le32 flags;
/* NOTE(review): presumably the ID of the snapshot node this subvolume points to - confirm */
__le32 snapshot;
/* NOTE(review): presumably the inode number of the subvolume's root - confirm */
__le64 inode;
/*
* Snapshot subvolumes form a tree, separate from the snapshot nodes
* tree - if this subvolume is a snapshot, this is the ID of the
* subvolume it was created from:
*
* This is _not_ necessarily the subvolume of the directory containing
* this subvolume:
*/
__le32 parent;
/* NOTE(review): looks like explicit padding between parent and otime - confirm */
__le32 pad;
/* NOTE(review): presumably a creation timestamp - confirm */
bch_le128 otime;
};
/* bit 0 of flags; NOTE(review): presumably marks the subvolume read-only - confirm */
LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
/*
* We need to know whether a subvolume is a snapshot so we can know whether we
* can delete it (or whether it should just be rm -rf'd)
*/
LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
/* bit 2 of flags; NOTE(review): presumably set once the subvolume has been unlinked - confirm */
LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */

View File

@ -2,7 +2,6 @@
#include "bcachefs.h"
#include "checksum.h"
#include "counters.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
@ -13,6 +12,7 @@
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
#include "sb-counters.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "sb-members.h"
@ -1321,7 +1321,9 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Superblock size:");
prt_tab(out);
prt_printf(out, "%zu", vstruct_bytes(sb));
prt_units_u64(out, vstruct_bytes(sb));
prt_str(out, "/");
prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
prt_newline(out);
prt_printf(out, "Clean:");

View File

@ -23,7 +23,6 @@
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "counters.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
@ -49,6 +48,7 @@
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-counters.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
@ -883,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
@ -1625,7 +1625,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (data) {
struct printbuf data_has = PRINTBUF;
prt_bitflags(&data_has, bch2_data_types, data);
prt_bitflags(&data_has, __bch2_data_types, data);
bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
printbuf_exit(&data_has);
ret = -EBUSY;

View File

@ -21,6 +21,7 @@
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "inode.h"
@ -247,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
mutex_lock(&c->btree_cache.lock);
list_for_each_entry(b, &c->btree_cache.live, list)
ret += btree_bytes(c);
ret += btree_buf_bytes(b);
mutex_unlock(&c->btree_cache.lock);
return ret;
@ -330,7 +331,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
prt_newline(out);
for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
prt_str(out, bch2_compression_types[i]);
bch2_prt_compression_type(out, i);
prt_tab(out);
prt_human_readable_u64(out, s[i].sectors_compressed << 9);
@ -725,8 +726,10 @@ STORE(bch2_fs_opts_dir)
bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
if ((id == Opt_background_target ||
id == Opt_background_compression) && v)
if (v &&
(id == Opt_background_target ||
id == Opt_background_compression ||
(id == Opt_compression && !c->opts.background_compression)))
bch2_set_rebalance_needs_scan(c, 0);
ret = size;
@ -883,7 +886,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
for (i = 1; i < BCH_DATA_NR; i++)
prt_printf(out, "%-12s:%12llu\n",
bch2_data_types[i],
bch2_data_type_str(i),
percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
}
}
@ -908,7 +911,7 @@ SHOW(bch2_dev)
}
if (attr == &sysfs_has_data) {
prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
prt_char(out, '\n');
}

View File

@ -46,7 +46,7 @@ DECLARE_EVENT_CLASS(fs_str,
__assign_str(str, str);
),
TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
);
DECLARE_EVENT_CLASS(trans_str,
@ -273,28 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full,
TP_ARGS(c)
);
DEFINE_EVENT(bch_fs, journal_entry_full,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
DEFINE_EVENT(fs_str, journal_entry_full,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
TRACE_EVENT(journal_entry_close,
TP_PROTO(struct bch_fs *c, unsigned bytes),
TP_ARGS(c, bytes),
TP_STRUCT__entry(
__field(dev_t, dev )
__field(u32, bytes )
),
TP_fast_assign(
__entry->dev = c->dev;
__entry->bytes = bytes;
),
TP_printk("%d,%d entry bytes %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->bytes)
DEFINE_EVENT(fs_str, journal_entry_close,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(bio, journal_write,
@ -542,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail,
__entry->level = path->level;
TRACE_BPOS_assign(pos, path->pos);
c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level),
c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
__entry->self_read_count = c.n[SIX_LOCK_read];
__entry->self_intent_count = c.n[SIX_LOCK_intent];
@ -827,40 +813,28 @@ TRACE_EVENT(bucket_evacuate,
);
DEFINE_EVENT(fs_str, move_extent,
TP_PROTO(struct bch_fs *c, const char *k),
TP_ARGS(c, k)
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_read,
TP_PROTO(struct bch_fs *c, const char *k),
TP_ARGS(c, k)
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_write,
TP_PROTO(struct bch_fs *c, const char *k),
TP_ARGS(c, k)
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_finish,
TP_PROTO(struct bch_fs *c, const char *k),
TP_ARGS(c, k)
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
TRACE_EVENT(move_extent_fail,
TP_PROTO(struct bch_fs *c, const char *msg),
TP_ARGS(c, msg),
TP_STRUCT__entry(
__field(dev_t, dev )
__string(msg, msg )
),
TP_fast_assign(
__entry->dev = c->dev;
__assign_str(msg, msg);
),
TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
DEFINE_EVENT(fs_str, move_extent_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_start_fail,
@ -1039,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race,
__entry->level = b->c.level;
__entry->written = b->written;
__entry->blocks = btree_blocks(trans->c);
__entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b);
__entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
),
TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
@ -1146,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
TP_ARGS(trans, caller_ip, path)
);
struct get_locks_fail;
TRACE_EVENT(trans_restart_upgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@ -1195,11 +1167,9 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->node_seq)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
struct btree_path *path),
TP_ARGS(trans, caller_ip, path)
DEFINE_EVENT(trans_str, trans_restart_relock,
TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
TP_ARGS(trans, caller_ip, str)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,

View File

@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
{
while (nr_bits)
prt_char(out, '0' + ((v >> --nr_bits) & 1));
}
/*
 * Print @v to @out in base 2 using as many digits as its highest set bit
 * requires; a value of zero is printed with a single digit.
 */
void bch2_prt_u64_base2(struct printbuf *out, u64 v)
{
	/* fls64() yields 0 when no bit is set; still emit one digit then */
	unsigned nr_bits = fls64(v);

	if (!nr_bits)
		nr_bits = 1;

	bch2_prt_u64_base2_nbits(out, v, nr_bits);
}
void bch2_print_string_as_lines(const char *prefix, const char *lines)
{
const char *p;
@ -1186,7 +1191,9 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret)
{
darray_init(ret);
char *dev_name = kstrdup(_dev_name, GFP_KERNEL), *s = dev_name;
char *dev_name, *s, *orig;
dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
if (!dev_name)
return -ENOMEM;
@ -1201,10 +1208,10 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret)
}
}
kfree(dev_name);
kfree(orig);
return 0;
err:
bch2_darray_str_exit(ret);
kfree(dev_name);
kfree(orig);
return -ENOMEM;
}

View File

@ -342,7 +342,8 @@ bool bch2_is_zero(const void *, size_t);
u64 bch2_read_flag_list(char *, const char * const[]);
void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2(struct printbuf *, u64);
void bch2_print_string_as_lines(const char *prefix, const char *lines);

View File

@ -590,8 +590,9 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
mutex_unlock(&inode->ei_update_lock);
if (value &&
(opt_id == Opt_background_compression ||
opt_id == Opt_background_target))
(opt_id == Opt_background_target ||
opt_id == Opt_background_compression ||
(opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
return bch2_err_class(ret);

View File

@ -0,0 +1,19 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_XATTR_FORMAT_H
#define _BCACHEFS_XATTR_FORMAT_H
/*
 * Numeric xattr index values, one per kind of extended attribute.
 * NOTE(review): presumably these are the values stored in bch_xattr.x_type
 * and must stay stable - confirm.
 */
#define KEY_TYPE_XATTR_INDEX_USER 0
#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
#define KEY_TYPE_XATTR_INDEX_SECURITY 4
/*
 * Value of an xattr key: a small fixed header followed by variable-length
 * bytes. The struct is packed with 8-byte alignment; do not reorder fields.
 */
struct bch_xattr {
struct bch_val v;
/* NOTE(review): presumably one of the KEY_TYPE_XATTR_INDEX_* values - confirm */
__u8 x_type;
/* NOTE(review): presumably the length in bytes of the name held in x_name[] - confirm */
__u8 x_name_len;
/* little-endian; NOTE(review): presumably the length in bytes of the attribute value - confirm */
__le16 x_val_len;
/* flexible array member; NOTE(review): presumably the value bytes follow the name here - confirm */
__u8 x_name[];
} __packed __aligned(8);
#endif /* _BCACHEFS_XATTR_FORMAT_H */